In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ast

In [None]:
# Load Dataset
# Read the three CSV files used by the notebook. Only the metadata file is read
# with low_memory=False; the other two use the pandas defaults.
movies = pd.read_csv("movies_metadata.csv", low_memory=False)
credits, keywords = (
    pd.read_csv(csv_path) for csv_path in ("credits.csv", "keywords.csv")
)

In [None]:
# Preview the first rows of the freshly loaded metadata table.
movies.head()

In [None]:
# Print the per-column summary (dtypes, non-null counts) of the metadata table.
movies.info()

In [None]:
# Show the (rows, columns) dimensions of the metadata table.
movies.shape

In [None]:
# List the column labels available in the metadata table.
movies.columns

In [None]:
# Count the missing values in each column.
movies.isnull().sum()

In [None]:
# Basic Data Cleaning
# Keep only the columns used downstream and drop rows with any missing value.
# Chaining .dropna().copy() (instead of dropna(inplace=True) on a column slice)
# yields an independent frame, so the 'id' assignment below neither raises
# SettingWithCopyWarning nor risks writing to a temporary object.
movies = movies[['id', 'title', 'overview', 'genres']].dropna().copy()

# Cast the join key to str in all three frames so the merges compare values of
# the same type.
movies['id'] = movies['id'].astype(str)
credits['id'] = credits['id'].astype(str)
keywords['id'] = keywords['id'].astype(str)

In [None]:
# Merge Metadata
# Inner-join the cast/crew and keyword tables onto the movies by 'id'.
# Each frame is de-duplicated on the join key first: a repeated id on either
# side of a merge emits one output row per matching pair, which would put the
# same movie into the table (and later into the recommendations) several times.
# When an id is repeated, the first occurrence is kept.
movies = (
    movies.drop_duplicates(subset='id')
    .merge(credits.drop_duplicates(subset='id'), on='id')
    .merge(keywords.drop_duplicates(subset='id'), on='id')
)

In [None]:
#Convert JSON-Like Columns
def extract_names(text):
    """Join the 'name' fields of a stringified list of dicts into one string.

    Parameters
    ----------
    text : str
        Python-literal text such as "[{'id': 1, 'name': 'Action'}, ...]".

    Returns
    -------
    str
        The 'name' values separated by single spaces, or "" when `text`
        cannot be parsed or does not have the expected list-of-dicts shape.
    """
    try:
        data = ast.literal_eval(text)
        return " ".join([i['name'] for i in data])
    # Only data problems are treated as "no names": unparsable text
    # (ValueError / SyntaxError), a non-iterable or non-dict payload or a
    # non-string name (TypeError), and a missing 'name' key (KeyError).
    # A bare `except:` would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, KeyError):
        return ""

# Replace each stringified list-of-dicts column with its space-joined names,
# in the same order as before: genres, keywords, cast, crew.
for _col in ('genres', 'keywords', 'cast', 'crew'):
    movies[_col] = movies[_col].apply(extract_names)

In [None]:
#Create a Combined Feature Column
# Concatenate the text columns element-wise, separated by single spaces, into
# one document per movie. Only overview, genres, keywords and cast are used;
# the 'crew' column is not part of the combined text.
movies['combined_features'] = (
    movies['overview'] + " " +
    movies['genres'] + " " +
    movies['keywords'] + " " +
    movies['cast']
)

In [None]:
#TF-IDF Vectorization
# Fit a TF-IDF vectorizer (English stop words removed) on the combined text.
# `tfidf` is the fitted vectorizer; `tfidf_matrix` holds one row per row of
# `movies`, in the same order.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
#Movie Recommendation Function
import pandas as pd

# Title -> row position lookup used to address rows of the TF-IDF matrix.
# The values are explicit positions (0..n-1) because both the matrix rows and
# the later `.iloc` lookups are positional.
indices = pd.Series(range(len(movies)), index=movies['title'])
# Series.drop_duplicates() compares the *values* (row positions, all distinct)
# and therefore never removes anything. De-duplicating on the index keeps one
# position (the first) per title, so `indices[title]` always yields a scalar.
indices = indices[~indices.index.duplicated(keep='first')]

def recommend_movies(title, tfidf_matrix=tfidf_matrix, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title to look up in `indices`.
    tfidf_matrix : sparse matrix, optional
        Document-term matrix whose rows line up with the rows of `movies`.
        Defaults to the matrix that exists when this function is defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, most similar first, never including the query
        row itself.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    if title not in indices.index:
        raise KeyError(f"Title not found in dataset: {title!r}")

    idx = indices[title]
    # A title that occurs more than once in the index makes the lookup return
    # a Series of positions; use the first match so `idx` is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Compute similarity ONLY for one movie (one row against the whole matrix)
    sim_scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Rank every row by descending score, then drop the query row explicitly.
    # Skipping rank 0 blindly would drop the wrong row whenever another movie
    # ties with (or outranks) the query.
    ranked = sim_scores.argsort()[::-1]
    sim_indices = ranked[ranked != idx][:top_n]

    return movies['title'].iloc[sim_indices]

In [None]:
# Test Recommendations
# The call must start at column 0: an indented top-level statement after an
# unindented comment raises "IndentationError: unexpected indent".
recommend_movies("Avatar")

In [None]:
#Ground-Truth
def get_genres(title):
    """Return the set of whitespace-separated genre tokens for `title`.

    The first row of `movies` whose 'title' equals `title` is used; an
    IndexError is raised when no row matches.
    """
    title_mask = movies['title'] == title
    genre_text = movies.loc[title_mask, 'genres'].values[0]
    return set(genre_text.split())

def evaluate_recommendations(movie_title, top_n=10):
    """Label the recommendations for `movie_title` using genre overlap.

    A recommended movie counts as relevant (ground truth 1) when it shares at
    least one genre token with `movie_title`, otherwise 0. Every returned
    recommendation is, by definition, predicted positive (1).

    Parameters
    ----------
    movie_title : str
        Title whose recommendations are evaluated.
    top_n : int, optional
        Maximum number of recommendations to evaluate (default 10); the slice
        cannot yield more items than `recommend_movies` returns.

    Returns
    -------
    tuple[list[int], list[int]]
        `(y_true, y_pred)` in recommendation order, ready for sklearn metrics.
        With these labels, precision is the share of recommendations that are
        relevant; recall is not informative because only recommended items
        are scored.
    """
    true_genres = get_genres(movie_title)
    recs = recommend_movies(movie_title)[:top_n]

    y_true = []
    y_pred = []

    for rec in recs:
        rec_genres = get_genres(rec)
        # Ground truth comes from the genre overlap. Hard-coding it to 1 and
        # storing the overlap as the prediction (the previous behaviour) swaps
        # the two roles and makes precision trivially 1.0.
        y_true.append(1 if len(true_genres & rec_genres) > 0 else 0)
        # The model's prediction: every recommended item is a positive.
        y_pred.append(1)

    return y_true, y_pred


In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Compute the labels in this cell: no earlier cell defines y_true / y_pred, so
# on a fresh kernel (Restart & Run All) the plot would fail with a NameError.
y_true, y_pred = evaluate_recommendations("Avatar")

# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in the
# data; otherwise the matrix would collapse to a single cell.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
# Precision, Recall, Accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Label the recommendations for "Avatar", then print each score on its own
# line in the order accuracy, precision, recall.
y_true, y_pred = evaluate_recommendations("Avatar")

for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(label, scorer(y_true, y_pred))


In [None]:
# Precision–Recall Heatmap
# Recompute the three scores from the current y_true / y_pred and collect them
# under their display names (column order: Precision, Recall, Accuracy).
metrics = {
    metric_name: scorer(y_true, y_pred)
    for metric_name, scorer in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("Accuracy", accuracy_score),
    )
}

# Render the scores as a single-row annotated heatmap.
plt.figure(figsize=(6, 3))
sns.heatmap(
    pd.DataFrame(metrics, index=["Score"]),
    annot=True,
    cmap="Greens",
)
plt.title("Evaluation Metrics Heatmap")
plt.show()


In [None]:
# Model Save (TF-IDF + Metadata)
import pickle

# Serialise the fitted vectorizer, the TF-IDF matrix, the movie table and the
# title lookup together as one 4-tuple, in that order.
with open("movie_recommender.pkl", mode="wb") as model_file:
    pickle.dump(
        (tfidf, tfidf_matrix, movies, indices),
        model_file,
    )
