In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd. read_csv('movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [5]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [6]:
m = vote_counts.quantile(0.95)
m

434.0

In [7]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [8]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [9]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

qualified['wr'] = qualified.apply(weighted_rating, axis=1)
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [10]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


In [11]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [12]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988,834,8,14.177005,7.744878
19901,Paperman,2012,734,8,7.198633,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.994281,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


In [3]:
md = pd.read_csv('movies_metadata.csv')
links_small = pd.read_csv('links_small.csv')
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

md = md.drop([19730, 29503, 35587])

md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

md['id'] = md['id'].astype('int')
credits['id'] = credits['id'].astype(int)
keywords['id'] = keywords['id'].astype(int)

md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 27)

In [4]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + ' ' + smd['tagline']
smd['description'] = smd['description'].fillna('')

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

cosine_sim_desc = linear_kernel(tfidf_matrix, tfidf_matrix)

smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

s = s.value_counts()

s = s[s > 1]

stemmer = SnowballStemmer('english')

def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

cosine_sim_soup = cosine_similarity(count_matrix, count_matrix)

cosine_sim_hybrid = (cosine_sim_desc * 0.5) + (cosine_sim_soup * 0.5)

In [5]:
smd = smd.reset_index(drop=True)
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_hybrid[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:]
    
    movie_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    return pd.DataFrame({
        "title": titles.iloc[movie_indices],
        "description + tagline": smd['description'].iloc[movie_indices],
        "keywords": smd['keywords'].iloc[movie_indices],
        "cast": smd['cast'].iloc[movie_indices],
        "director": smd['director'].iloc[movie_indices],
        "genres": smd['genres'].iloc[movie_indices],
        "score": scores
    })

In [6]:
user_input = input("Enter a movie title: ")
print(get_recommendations(user_input).head(10))

                              title  \
982          The Godfather: Part II   
3600  Tucker: The Man and His Dream   
1338                  The Rainmaker   
1594        The Godfather: Part III   
3292               Gardens of Stone   
3689                The Cotton Club   
4490             One from the Heart   
2990               The Conversation   
5831                    Rumble Fish   
1984          Peggy Sue Got Married   

                                  description + tagline  \
982   In the continuing saga of the Corleone crime f...   
3600  Based on a true story. Shortly after World War...   
1338  When Rudy Baylor (Matt Damon), a young attorne...   
1594  In the midst of trying to legitimize his busin...   
3292  A sergeant must deal with his desires to save ...   
3689  The story of the people that frequented Harlem...   
4490  Hank and Frannie don't seem to be able to live...   
2990  Surveillance expert Harry Caul (Gene Hackman) ...   
5831  Rusty James, an absent-minded str

In [17]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [18]:
ratings[(ratings['userId'] == 1) & (ratings['movieId'] == 1167)]

Unnamed: 0,userId,movieId,rating,timestamp


In [19]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8961  0.8884  0.8993  0.9042  0.8948  0.8966  0.0052  
MAE (testset)     0.6876  0.6855  0.6921  0.6981  0.6898  0.6906  0.0044  
Fit time          0.88    0.92    0.96    1.01    1.00    0.95    0.05    
Test time         0.20    0.08    0.08    0.40    0.09    0.17    0.12    


{'test_rmse': array([0.89612109, 0.88841673, 0.89933353, 0.90421751, 0.89477959]),
 'test_mae': array([0.68756511, 0.68548951, 0.69209777, 0.69813198, 0.68981116]),
 'fit_time': (0.8845324516296387,
  0.9200954437255859,
  0.9553546905517578,
  1.0065603256225586,
  0.9987566471099854),
 'test_time': (0.19922995567321777,
  0.08358073234558105,
  0.07978963851928711,
  0.402402400970459,
  0.09443449974060059)}

In [20]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15e4945dc70>

In [21]:
svd.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.81596801885865, details={'was_impossible': False})

In [22]:
# Make sure 'md' has 'id' (movieId) and 'title'
movie_map = md.set_index('id')['title'].to_dict()  # movieId → title

def get_top_n_movies(user_id, top_n=10):
    # Build anti-testset (all movies the user hasn't rated)
    testset = trainset.build_anti_testset()
    
    # Predict ratings for all unseen movies
    predictions = svd.test(testset)
    
    # Filter predictions for the requested user AND movies with valid titles
    user_preds = [pred for pred in predictions 
                  if int(pred.uid) == int(user_id) and pred.iid in movie_map]
    
    # Sort by estimated rating descending
    user_preds.sort(key=lambda x: x.est, reverse=True)
    
    # Take top-N
    top_movies = pd.DataFrame(
        [(pred.iid, pred.est) for pred in user_preds[:top_n]],
        columns=['movieId', 'est_rating']
    )
    
    # Map movieId → title
    top_movies['title'] = top_movies['movieId'].map(movie_map)

    # Set movieId as index
    top_movies = top_movies.set_index('movieId')[['title', 'est_rating']]
    top_movies.index.name = None
    
    return top_movies

In [23]:
pd.set_option('display.max_colwidth', None)

user_input = int(input("Enter a user id: "))
print(get_top_n_movies(user_input))

                                   title  est_rating
912              The Thomas Crown Affair    3.755500
2692                       The Red Elvis    3.749014
922                             Dead Man    3.631933
296   Terminator 3: Rise of the Machines    3.625612
318             The Million Dollar Hotel    3.604890
899                      Broken Blossoms    3.576174
1254               Don't Worry, I'm Fine    3.556840
1247                   The Good Shepherd    3.548078
2959                      License to Wed    3.531350
898                  Birdman of Alcatraz    3.491579


In [24]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

indices_map = id_map.set_index('id')

def hybrid(userId, title):
    # Get index and TMDB/movie mapping
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    
    # Compute similarity scores
    sim_scores = list(enumerate(cosine_sim_hybrid[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    # Collect movies info
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    
    # Estimate rating for the user
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    
    # Sort by estimated rating
    movies = movies.sort_values('est', ascending=False)
    
    return movies.head(10)

In [25]:
hybrid(1, 'Avatar')

KeyError: "['year'] not in index"

In [None]:
# Make sure convert_int function exists
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

# Load links_small
links_small = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
links_small['tmdbId'] = links_small['tmdbId'].apply(convert_int)

# Merge with smd to get titles
movie_titles = links_small.merge(smd[['id', 'title']], left_on='tmdbId', right_on='id', how='left')
movie_titles = movie_titles.set_index('movieId')['title'].to_dict()

# Map titles in ratings
ratings['title'] = ratings['movieId'].map(movie_titles)

# Filter user 1
user_ratings = ratings[ratings['userId'] == 1]
print(user_ratings[['movieId', 'title', 'rating']])

    movieId                            title  rating
0        31                  Dangerous Minds     2.5
1      1029                            Dumbo     3.0
2      1061                         Sleepers     3.0
3      1129             Escape from New York     2.0
4      1172                  Cinema Paradiso     4.0
5      1263                  The Deer Hunter     2.0
6      1287                          Ben-Hur     2.0
7      1293                           Gandhi     2.0
8      1339                          Dracula     3.5
9      1343                        Cape Fear     2.0
10     1371    Star Trek: The Motion Picture     2.5
11     1405  Beavis and Butt-Head Do America     1.0
12     1953            The French Connection     4.0
13     2105                             Tron     4.0
14     2150           The Gods Must Be Crazy     3.0
15     2193                           Willow     2.0
16     2294                             Antz     2.0
17     2455                          The Fly  

In [26]:
# ----------------------------
# LOAD DATA
# ----------------------------
md = pd.read_csv('movies_metadata.csv')
links_small = pd.read_csv('links_small.csv')
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype(int)

# Drop problematic rows
md = md.drop([19730, 29503, 35587])

# ----------------------------
# PREPROCESS METADATA
# ----------------------------
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
md['id'] = md['id'].astype(int)
credits['id'] = credits['id'].astype(int)
keywords['id'] = keywords['id'].astype(int)

# Merge credits and keywords
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

# Filter smd to movies in links_small
smd = md[md['id'].isin(links_small)].copy()
smd = smd.reset_index(drop=True)

# ----------------------------
# YEAR EXTRACTION
# ----------------------------
md['release_date'] = pd.to_datetime(md['release_date'], errors='coerce')
# Fix known wrong data
md.loc[md['title']=="The Terminator",'release_date'] = pd.to_datetime('1984-10-26')
md['year'] = md['release_date'].dt.year

# Unique mapping id -> year
year_map = md.drop_duplicates(subset='id').set_index('id')['year']
smd['year'] = smd['id'].map(year_map)

# ----------------------------
# CONTENT-BASED FEATURES
# ----------------------------
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = (smd['overview'].fillna('') + ' ' + smd['tagline']).fillna('')

tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
cosine_sim_desc = linear_kernel(tfidf_matrix, tfidf_matrix)

smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)

def get_director(x):
    for i in x:
        if i['job']=='Director':
            return i['name']
    return np.nan

smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x][:3] if isinstance(x,list) else [])
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])

# Lowercase & clean
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
smd['director'] = smd['director'].astype(str).apply(lambda x: str.lower(x.replace(" ",""))).apply(lambda x: [x]*3)

# Filter keywords and stem
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().value_counts()
s = s[s>1]
stemmer = SnowballStemmer('english')

def filter_keywords(x):
    return [stemmer.stem(i.lower().replace(" ","")) for i in x if i in s]

smd['keywords'] = smd['keywords'].apply(filter_keywords)

# Soup for CountVectorizer
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

count = CountVectorizer(analyzer='word', ngram_range=(1,2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])
cosine_sim_soup = cosine_similarity(count_matrix, count_matrix)

# Hybrid similarity
cosine_sim_hybrid = (cosine_sim_desc * 0.5) + (cosine_sim_soup * 0.5)

# ----------------------------
# MAPPINGS
# ----------------------------
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

id_map = pd.read_csv('links_small.csv')[['movieId','tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(lambda x: int(x) if pd.notnull(x) else np.nan)
id_map.columns = ['movieId','id']
id_map = id_map.merge(smd[['title','id']], on='id').set_index('title')
indices_map = id_map.set_index('id')

# ----------------------------
# COLLABORATIVE FILTERING
# ----------------------------
ratings = pd.read_csv('ratings_small.csv')
reader = Reader(rating_scale=(0.5,5.0))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)
svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

# ----------------------------
# 1️⃣ CONTENT-BASED RECOMMENDATION
# ----------------------------
def content_based(title, top_n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_hybrid[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    return pd.DataFrame({
        'title': smd['title'].iloc[movie_indices],
        'year': smd['year'].iloc[movie_indices],
        'score': scores
    })

# ----------------------------
# 2️⃣ COLLABORATIVE FILTERING
# ----------------------------
def collaborative(userId, top_n=10):
    testset = trainset.build_anti_testset()
    predictions = svd.test(testset)
    
    movie_map = smd.set_index('id')['title'].to_dict()
    user_preds = [pred for pred in predictions if int(pred.uid)==int(userId) and pred.iid in movie_map]
    user_preds.sort(key=lambda x: x.est, reverse=True)
    
    top_movies = pd.DataFrame({
        'title': [movie_map[pred.iid] for pred in user_preds[:top_n]],
        'est_rating': [pred.est for pred in user_preds[:top_n]],
        'year': [smd[smd['id']==pred.iid]['year'].values[0] for pred in user_preds[:top_n]]
    })
    return top_movies

# ----------------------------
# 3️⃣ HYBRID RECOMMENDATION
# ----------------------------
def hybrid(userId, title, top_n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_hybrid[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title','year','id']].copy()
    movies['est_rating'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est_rating', ascending=False).head(top_n)
    
    return movies[['title','year','est_rating']]


In [27]:
movie_title = "Avatar"
user_id = 1

In [28]:
print("Content-based recommendations:")
display(content_based(movie_title))

Content-based recommendations:


Unnamed: 0,title,year,score
962,Aliens,1986.0,0.175974
999,The Terminator,1984.0,0.153062
522,Terminator 2: Judgment Day,1991.0,0.149564
910,The Abyss,1989.0,0.145526
4323,Piranha Part Two: The Spawning,1981.0,0.134285
344,True Lies,1994.0,0.129832
8357,Star Trek Into Darkness,2013.0,0.113652
1368,Titanic,1997.0,0.106366
3208,Dungeons & Dragons,2000.0,0.085517
8692,Jupiter Ascending,2015.0,0.085517


In [29]:
print("Collaborative filtering recommendations:")
display(collaborative(user_id))

Collaborative filtering recommendations:


Unnamed: 0,title,est_rating,year
0,The Million Dollar Hotel,3.67239,2000.0
1,Madagascar,3.669956,2005.0
2,The 39 Steps,3.656842,1935.0
3,The Thomas Crown Affair,3.599354,1999.0
4,While You Were Sleeping,3.563833,1995.0
5,The Thomas Crown Affair,3.553987,1968.0
6,Mission: Impossible,3.551622,1996.0
7,Beverly Hills Cop III,3.523753,1994.0
8,Sleepless in Seattle,3.4995,1993.0
9,Frankenstein,3.481987,1931.0


In [30]:
print("Hybrid recommendations:")
display(hybrid(user_id, movie_title))

Hybrid recommendations:


Unnamed: 0,title,year,est_rating
522,Terminator 2: Judgment Day,1991.0,3.478669
999,The Terminator,1984.0,3.237543
962,Aliens,1986.0,3.14346
8357,Star Trek Into Darkness,2013.0,3.113299
6147,A Trip to the Moon,1902.0,3.087636
2826,Predator,1987.0,2.949901
344,True Lies,1994.0,2.821303
910,The Abyss,1989.0,2.795049
1660,Return from Witch Mountain,1978.0,2.762973
4001,Hawk the Slayer,1980.0,2.661242
