<a href="https://colab.research.google.com/github/Rakeshatla/Movie-Recommendation/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using Cosine Similarity

In [None]:
import numpy as np
import pandas as pd
import zipfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data: read each CSV directly from inside its zip archive
with zipfile.ZipFile('/content/tmdb_5000_credits.csv.zip') as z, \
        z.open('tmdb_5000_credits.csv') as f:
    credits = pd.read_csv(f)

with zipfile.ZipFile('/content/tmdb_5000_movies.csv.zip') as z, \
        z.open('tmdb_5000_movies.csv') as f:
    movies = pd.read_csv(f)

# Data preprocessing: join the two tables on the title and keep only the
# columns that feed the recommender.
# NOTE(review): if titles are not unique, a join on 'title' pairs every
# same-titled row with every other one -- confirm this is acceptable.
movies = pd.merge(movies, credits, on='title')
movies = movies[[
    'movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
]]

import ast
def convert(text):
    """Return the 'name' value of every dict in a stringified list.

    `text` is parsed with ast.literal_eval, so it must be a Python literal
    such as "[{'id': 1, 'name': 'Action'}]".
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

# Discard rows with any missing field, then turn the stringified lists
# into plain lists of names.
movies.dropna(inplace=True)
movies['keywords'] = movies['keywords'].apply(convert)
movies['genres'] = movies['genres'].apply(convert)

# Process cast and crew data: keep only the first three cast names
movies['cast'] = movies['cast'].apply(lambda raw: convert(raw)[:3])

def fetch_director(text):
    """Return the names of all crew entries whose job is 'Director'.

    `text` is a stringified list of dicts parsed with ast.literal_eval;
    each dict is expected to carry 'job' and 'name' keys.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]
# Replace each raw crew string with the list of its directors' names
movies['crew'] = movies['crew'].map(fetch_director)

# Collapse and clean data
def collapse(L):
    """Return a new list with every space removed from each string in L."""
    return [item.replace(" ", "") for item in L]

# Remove the spaces inside multi-word names so each name is one token
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Drop unnecessary columns.  The earlier dropna() removed rows without
# renumbering the index, so renumber it here: the similarity matrix built
# below is addressed by row position, and the labels of `new` must line up
# with those positions for label-based lookups to hit the right row.
new = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew']).reset_index(drop=True)
new['tags'] = new['tags'].apply(" ".join)

# Apply CountVectorizer to create the feature vector
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new['tags']).toarray()

# Calculate similarity matrix (Cosine Similarity in this case);
# row/column i corresponds to row i of `new`.
similarity = cosine_similarity(vector)

# Function to recommend movies
def recommend(movie, k=5):
    """Return the titles of the k movies most similar to `movie`.

    Args:
        movie: Exact title to look up in new['title'].
        k: Number of recommendations to return.

    Returns:
        A list of up to k titles, most similar first. The query movie
        itself is never included.

    Raises:
        IndexError: If no row of `new` has the given title.
    """
    # `similarity` is addressed by row position, whereas new.index holds
    # labels that need not be 0..n-1 (dropna() removes rows without
    # renumbering).  Resolve the title to a position, not a label.
    try:
        position = list(new['title']).index(movie)
    except ValueError:
        raise IndexError(f"Movie '{movie}' not found.") from None

    ranked = sorted(
        enumerate(similarity[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with an equal top score could otherwise push the query
    # itself into the results.
    picks = [idx for idx, _ in ranked if idx != position][:k]
    return [new['title'].iloc[idx] for idx in picks]

# Ground truth list of similar movies for "Gandhi"
# NOTE(review): the list contains the query title 'Gandhi' itself, which
# recommend() skips, so precision@5 against it looks capped at 0.8 --
# confirm whether that entry is intended.
ground_truth = [
    'The Wind That Shakes the Barley',
    'A Passage to India',
    'Gandhi',
    'Guiana 1838',
    'Ramanujan',
]

# Precision at K function
def precision_at_k(recommended_movies, ground_truth, k=5):
    """Return the share of the first k recommendations found in ground_truth.

    The hit count is always divided by k, even when fewer than k
    recommendations are supplied.
    """
    hits = sum(1 for title in recommended_movies[:k] if title in ground_truth)
    return hits / k

# Get recommended movies for "Gandhi"
recommended_movies = recommend(movie='Gandhi', k=5)
print(recommended_movies)

# Calculate Precision at K
precision = precision_at_k(
    recommended_movies=recommended_movies,
    ground_truth=ground_truth,
    k=5,
)
print(f"Precision at 5: {precision}")


['Gandhi, My Father', 'The Wind That Shakes the Barley', 'A Passage to India', 'Guiana 1838', 'Ramanujan']
Precision at 5: 0.8


# Using Manhattan Distance

In [None]:
import numpy as np
import pandas as pd
import zipfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
import ast

# Load the datasets: read each CSV directly from inside its zip archive
with zipfile.ZipFile('/content/tmdb_5000_credits.csv.zip') as z, \
        z.open('tmdb_5000_credits.csv') as f:
    credits = pd.read_csv(f)

with zipfile.ZipFile('/content/tmdb_5000_movies.csv.zip') as z, \
        z.open('tmdb_5000_movies.csv') as f:
    movies = pd.read_csv(f)

# Merge datasets and keep necessary columns
# NOTE(review): if titles are not unique, a join on 'title' pairs every
# same-titled row with every other one -- confirm this is acceptable.
movies = pd.merge(movies, credits, on='title')
movies = movies[[
    'movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
]]

# Helper functions for data transformation
def convert(text):
    """Return the 'name' value of every dict in a stringified list.

    `text` is parsed with ast.literal_eval, so it must be a Python literal
    such as "[{'id': 1, 'name': 'Action'}]".
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

def fetch_director(text):
    """Return a one-element list naming the first 'Director', or [].

    `text` is a stringified list of crew dicts parsed with
    ast.literal_eval. Scanning stops at the first entry whose 'job' is
    'Director'.
    """
    director_names = (
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    )
    # The generator is lazy, so entries after the first match are never read.
    for name in director_names:
        return [name]
    return []

def collapse(L):
    """Return a new list with every space removed from each string in L."""
    squeezed = []
    for item in L:
        squeezed.append(item.replace(" ", ""))
    return squeezed

# Process the data
movies.dropna(inplace=True)
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x: convert(x)[:3])  # Limit to top 3 cast members
movies['crew'] = movies['crew'].apply(fetch_director)

# Remove spaces in tag lists and create a 'tags' column
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['overview'] = movies['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new DataFrame with the 'tags' column.  Take an explicit copy so
# the assignment below writes to an independent frame rather than to a
# slice of `movies`, and renumber the rows: dropna() left gaps in the
# index, while the distance matrix below is addressed by row position.
new = movies[['movie_id', 'title', 'tags']].copy().reset_index(drop=True)
new['tags'] = new['tags'].apply(" ".join)

# Vectorize the 'tags' column into raw term counts (the counts are NOT
# binarized), limited to a 1000-term vocabulary
cv = CountVectorizer(max_features=1000, stop_words='english')
vector = cv.fit_transform(new['tags']).toarray()

# Calculate Manhattan distance; row/column i corresponds to row i of `new`
manhattan_distance = pairwise_distances(vector, metric="manhattan")

# Convert Manhattan distance to a similarity measure (inverse):
# distance 0 maps to 1.0 and larger distances map towards 0
manhattan_similarity = 1 / (1 + manhattan_distance)

# Recommendation function using Manhattan similarity
def recommend(movie, k=5):
    """Return the titles of the k movies closest to `movie`.

    Closeness is read from the precomputed `manhattan_similarity` matrix.

    Args:
        movie: Exact title to look up in new['title'].
        k: Number of recommendations to return (default 5).

    Returns:
        A list of up to k titles, most similar first, never including the
        query movie itself. Returns None, after printing a message, when
        the title is not present.
    """
    # `manhattan_similarity` is addressed by row position, whereas
    # new.index holds labels that need not be 0..n-1 (dropna() removes rows
    # without renumbering).  Resolve the title to a position, not a label.
    try:
        position = list(new['title']).index(movie)
    except ValueError:
        print(f"Movie '{movie}' not found.")
        return None

    ranked = sorted(
        enumerate(manhattan_similarity[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query row explicitly instead of assuming it sorts first:
    # a row with an identical feature vector also scores 1.0 and could
    # otherwise push the query itself into the results.
    picks = [idx for idx, _ in ranked if idx != position][:k]
    return [new['title'].iloc[idx] for idx in picks]

# Precision at K function
def precision_at_k(recommended_movies, ground_truth, k=5):
    """Return the share of the first k recommendations found in ground_truth.

    Args:
        recommended_movies: Ordered list of recommended titles, or None.
        ground_truth: Collection of titles considered relevant.
        k: Cut-off rank; the hit count is always divided by k, even when
            fewer than k recommendations are supplied.

    Returns:
        Precision at k as a float; 0.0 when recommended_movies is None.
    """
    # recommend() in this cell returns None for an unknown title; score
    # that as "no relevant recommendations" instead of failing on None[:k].
    if recommended_movies is None:
        return 0.0
    hits = sum(1 for title in recommended_movies[:k] if title in ground_truth)
    return hits / k

# Example: Ground truth for "Gandhi" (adjust as needed)
ground_truth_gandhi = [
    'The Wind That Shakes the Barley',
    'A Passage to India',
    'Gandhi',
    'Guiana 1838',
    'Ramanujan',
]

# Get recommended movies for "Gandhi"
recommended_movies = recommend(movie='Gandhi')
print(recommended_movies)

# Calculate Precision at K (assuming k=5)
precision = precision_at_k(
    recommended_movies=recommended_movies,
    ground_truth=ground_truth_gandhi,
    k=5,
)
print(f"Precision at 5 for 'Gandhi': {precision}")


['Mr. Turner', 'Chariots of Fire', 'Ben-Hur', 'Seabiscuit', 'Ramanujan']
Precision at 5 for 'Gandhi': 0.2
