In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler


In [2]:
ratings = pd.read_csv("/content/ratings.csv")      # MovieLens
movies_ml = pd.read_csv("/content/movies.csv")

tmdb_movies = pd.read_csv("/content/tmdb_5000_movies.csv")
tmdb_credits = pd.read_csv("/content/tmdb_5000_credits.csv")


In [3]:
import ast

def parse(x):
    return " ".join([i['name'] for i in ast.literal_eval(x)])

tmdb_credits.rename(columns={'movie_id':'id'}, inplace=True)
tmdb = tmdb_movies.merge(tmdb_credits, on='id')

tmdb['tags'] = (
    tmdb['overview'].fillna('') + ' ' +
    tmdb['genres'].apply(parse) + ' ' +
    tmdb['keywords'].apply(parse)
)


In [4]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
content_vectors = tfidf.fit_transform(tmdb['tags'])
content_similarity = cosine_similarity(content_vectors)


In [27]:
title_col= 'title_x' if 'title_x' in tmdb.columns else 'original_title'
tmdb_index = pd.Series(tmdb.index, index=tmdb[title_col])


In [28]:
user_movie = ratings.pivot_table(
    index='userId', columns='movieId', values='rating'
).fillna(0)


In [29]:
item_sim = cosine_similarity(user_movie.T)
item_sim_df = pd.DataFrame(
    item_sim,
    index=user_movie.columns,
    columns=user_movie.columns
)


In [30]:
svd = TruncatedSVD(n_components=100, random_state=42)
user_latent = svd.fit_transform(user_movie)
item_latent = svd.components_

scaler = MinMaxScaler()


In [54]:
def mf_scores(user_index):
    scores = np.dot(user_latent[user_index], item_latent)
    return scaler.fit_transform(scores.reshape(-1, 1)).flatten()



In [55]:
popularity = ratings.groupby('movieId')['rating'].count()
popularity = popularity / popularity.max()


In [56]:
def hybrid_recommend(
    user_index,
    seed_movie_title,
    k=10,
    w_content=0.30,
    w_item=0.30,
    w_mf=0.25,
    w_pop=0.15
):
    idx = tmdb_index[seed_movie_title]
    content_scores = list(enumerate(content_similarity[idx]))
    content_scores = sorted(content_scores, key=lambda x: x[1], reverse=True)[1:50]

    mf_score_array = mf_scores(user_index)
    final_scores = {}

    for tmdb_idx, c_score in content_scores:
        title = tmdb.iloc[tmdb_idx][title_col]
        ml_match = movies_ml[movies_ml['title'].str.contains(title, case=False)]

        if ml_match.empty:
            continue

        movie_id = ml_match.iloc[0]['movieId']
        item_score = item_sim_df.loc[movie_id].mean()
        mf_score = mf_score_array[item_sim_df.columns.get_loc(movie_id)]
        pop_score = popularity.get(movie_id, 0)

        final_scores[movie_id] = (
            w_content * c_score +
            w_item * item_score +
            w_mf * mf_score +
            w_pop * pop_score
        )

    top_ids = sorted(final_scores, key=final_scores.get, reverse=True)[:k]
    return [(mid, movieid_to_title[mid]) for mid in top_ids]



In [44]:
def precision_recall_at_k(user_id, k=10, threshold=4.0):
    true_movies = ratings[
        (ratings['userId'] == user_id) & (ratings['rating'] >= threshold)
    ]['movieId'].values

    if len(true_movies) == 0:
        return None, None

    recommended = hybrid_recommend(
        user_id=user_id,
        movie_title=tmdb.iloc[0][title_col],
        k=k
    )['movieId'].values

    relevant = set(true_movies)
    retrieved = set(recommended)

    precision = len(relevant & retrieved) / k
    recall = len(relevant & retrieved) / len(relevant)

    return precision, recall


In [45]:
precision, recall = precision_recall_at_k(user_id=10, k=10)
precision, recall


(0.8, 0.02622950819672131)

In [46]:
movieid_to_title = dict(
    zip(movies_ml['movieId'], movies_ml['title'])
)


In [47]:
def recommend_movies(
    user_id,
    seed_movie_title,
    k=10
):
    # Get recommendations (MovieLens IDs)
    rec_df = hybrid_recommend(
        user_id=user_id,
        movie_title=seed_movie_title,
        k=k
    )

    # Convert IDs → Titles
    rec_df = rec_df.copy()
    rec_df['recommended_title'] = rec_df['movieId'].map(movieid_to_title)

    return rec_df[['movieId', 'recommended_title']]


In [48]:
recommend_movies(
    user_id=10,
    seed_movie_title="Avatar",
    k=10
)


Unnamed: 0,movieId,recommended_title
325,329,Star Trek: Generations (1994)
770,780,Independence Day (ID4) (1996)
912,924,2001: A Space Odyssey (1968)
1182,1200,Aliens (1986)
1355,1376,Star Trek IV: The Voyage Home (1986)
1607,1653,Gattaca (1997)
1630,1676,Starship Troopers (1997)
1952,2021,Dune (1984)
2460,2529,Planet of the Apes (1968)
2964,3033,Spaceballs (1987)


In [51]:
def print_recommendations(df):
    print("\nRecommended Movies:\n")
    for i, title in enumerate(df['recommended_title'], 1):
        print(f"{i}. {title}")


In [52]:
print_recommendations(
    recommend_movies(10, "Avatar", 10)
)



Recommended Movies:

1. Star Trek: Generations (1994)
2. Independence Day (ID4) (1996)
3. 2001: A Space Odyssey (1968)
4. Aliens (1986)
5. Star Trek IV: The Voyage Home (1986)
6. Gattaca (1997)
7. Starship Troopers (1997)
8. Dune (1984)
9. Planet of the Apes (1968)
10. Spaceballs (1987)


In [53]:
import os
import pickle

os.makedirs("artifacts", exist_ok=True)

# --- TMDB CONTENT MODEL ---
pickle.dump(tmdb, open("artifacts/tmdb_df.pkl", "wb"))
pickle.dump(title_col, open("artifacts/title_col.pkl", "wb"))
pickle.dump(tfidf, open("artifacts/tfidf.pkl", "wb"))
pickle.dump(content_similarity, open("artifacts/content_similarity.pkl", "wb"))

# --- MOVIELENS METADATA ---
pickle.dump(movies_ml, open("artifacts/movies_ml.pkl", "wb"))

# --- COLLABORATIVE FILTERING ---
pickle.dump(item_sim_df, open("artifacts/item_similarity.pkl", "wb"))

# --- MATRIX FACTORIZATION ---
pickle.dump(svd, open("artifacts/svd.pkl", "wb"))
pickle.dump(user_latent, open("artifacts/user_latent.pkl", "wb"))
pickle.dump(item_latent, open("artifacts/item_latent.pkl", "wb"))

# --- POPULARITY ---
pickle.dump(popularity, open("artifacts/popularity.pkl", "wb"))

print("✅ All hybrid recommender artifacts saved successfully.")


✅ All hybrid recommender artifacts saved successfully.


In [None]:
import pickle

# --- LOAD TMDB CONTENT MODEL ---
tmdb = pickle.load(open("artifacts/tmdb_df.pkl", "rb"))
title_col = pickle.load(open("artifacts/title_col.pkl", "rb"))
tfidf = pickle.load(open("artifacts/tfidf.pkl", "rb"))
content_similarity = pickle.load(open("artifacts/content_similarity.pkl", "rb"))

# --- LOAD MOVIELENS ---
movies_ml = pickle.load(open("artifacts/movies_ml.pkl", "rb"))

# --- LOAD COLLABORATIVE FILTERING ---
item_sim_df = pickle.load(open("artifacts/item_similarity.pkl", "rb"))

# --- LOAD MATRIX FACTORIZATION ---
svd = pickle.load(open("artifacts/svd.pkl", "rb"))
user_latent = pickle.load(open("artifacts/user_latent.pkl", "rb"))
item_latent = pickle.load(open("artifacts/item_latent.pkl", "rb"))

# --- LOAD POPULARITY ---
popularity = pickle.load(open("artifacts/popularity.pkl", "rb"))

print("✅ All hybrid recommender artifacts loaded successfully.")
