In [1]:
import pandas as pd
import numpy as np
import ast
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ========== LOAD & PREPROCESS ========== #
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = movies.merge(credits, on="title")
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

def convert(obj):
    try:
        return [i["name"] for i in ast.literal_eval(obj)]
    except:
        return []

def get_director(obj):
    try:
        return [i["name"] for i in ast.literal_eval(obj) if i["job"] == "Director"]
    except:
        return []

movies["genres"] = movies["genres"].apply(convert)
movies["keywords"] = movies["keywords"].apply(convert)
movies["cast"] = movies["cast"].apply(lambda x: convert(x)[:3])
movies["crew"] = movies["crew"].apply(get_director)
movies["overview"] = movies["overview"].fillna("")

# Weighted tags: cast, crew, keywords count more
movies["tags"] = (
    movies["overview"] + " " +
    movies["genres"].apply(lambda x: " ".join(x)) + " " +
    movies["keywords"].apply(lambda x: " ".join(x)) * 2 + " " +
    movies["cast"].apply(lambda x: " ".join(x)) * 2 + " " +
    movies["crew"].apply(lambda x: " ".join(x)) * 2
).str.lower()

# ========== TF-IDF VECTORIZATION ========== #
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies["tags"]).toarray()

# ========== RECOMMENDER FUNCTION ========== #
def recommend_filtered(title, tfidf_matrix, movie_df, top_n=5):
    if title not in movie_df["title"].values:
        return [], []
    index = movie_df[movie_df["title"] == title].index[0]
    vector = tfidf_matrix[index].reshape(1, -1)
    cosine_sim_vector = cosine_similarity(vector, tfidf_matrix)[0]

    input_genres = set(movie_df.iloc[index]["genres"])
    valid_indices = [
        i for i in range(len(movie_df))
        if i != index and len(set(movie_df.iloc[i]["genres"]) & input_genres) > 0
    ]

    valid_scores = [(i, cosine_sim_vector[i]) for i in valid_indices]
    sorted_scores = sorted(valid_scores, key=lambda x: x[1], reverse=True)[:top_n]

    return [movie_df.iloc[i[0]]["title"] for i in sorted_scores], [i[0] for i in sorted_scores]

# ========== METRIC FUNCTIONS ========== #
def precision_at_k(actual, predicted, k):
    actual_set = set(actual)
    pred_set = set(predicted[:k])
    return len(actual_set & pred_set) / k

def recall_at_k(actual, predicted, k):
    actual_set = set(actual)
    pred_set = set(predicted[:k])
    return len(actual_set & pred_set) / len(actual_set) if actual_set else 0

def hit_rate_at_k(actual, predicted, k):
    return 1.0 if len(set(actual) & set(predicted[:k])) > 0 else 0

def ndcg_at_k(actual, predicted, k):
    dcg = 0.0
    for i, p in enumerate(predicted[:k]):
        if p in actual:
            dcg += 1.0 / np.log2(i + 2)
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(actual), k)))
    return dcg / idcg if idcg > 0 else 0

def intra_list_diversity(indices, tfidf_matrix):
    if len(indices) < 2:
        return 0
    sims = cosine_similarity(tfidf_matrix[indices])
    total_sim = np.sum(np.triu(sims, k=1))
    count = len(indices) * (len(indices) - 1) / 2
    return 1 - (total_sim / count) if count else 0

# ========== RUN EXAMPLE ========== #
sample_movie = "The Dark Knight"
recommended_titles, recommended_indices = recommend_filtered(sample_movie, tfidf_matrix, movies, top_n=5)

# Simulate that user liked 3 out of the 5
simulated_likes = random.sample(recommended_titles, k=3)

# Evaluate
metrics = {
    "Precision@5": precision_at_k(simulated_likes, recommended_titles, 5),
    "Recall@5": recall_at_k(simulated_likes, recommended_titles, 5),
    "Hit Rate@5": hit_rate_at_k(simulated_likes, recommended_titles, 5),
    "NDCG@5": ndcg_at_k(simulated_likes, recommended_titles, 5),
    "Diversity": intra_list_diversity(recommended_indices, tfidf_matrix)
}

# Output
print(f"\n🎬 Top 5 recommendations for '{sample_movie}':")
for i, movie in enumerate(recommended_titles, 1):
    print(f"{i}. {movie}")

print("\n📊 Evaluation Metrics:")
for key, val in metrics.items():
    print(f"{key}: {val:.4f}")



🎬 Top 5 recommendations for 'The Dark Knight':
1. Batman Begins
2. Superman
3. Superman III
4. Batman v Superman: Dawn of Justice
5. Batman

📊 Evaluation Metrics:
Precision@5: 0.6000
Recall@5: 1.0000
Hit Rate@5: 1.0000
NDCG@5: 0.7328
Diversity: 0.5444


In [2]:
import pickle

# Save the TF-IDF vectorizer
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

# Save the TF-IDF matrix (optional, useful if large and pre-computed)
with open("tfidf_matrix.pkl", "wb") as f:
    pickle.dump(tfidf_matrix, f)

# Save the processed movie DataFrame
movies.to_pickle("processed_movies.pkl")

print("✅ Vectorizer, TF-IDF matrix, and processed movie data saved successfully.")


✅ Vectorizer, TF-IDF matrix, and processed movie data saved successfully.
