In [1]:
import pandas as pd 
import numpy as np 

In [4]:
# loading the dataset 
movies_df = pd.read_csv('../../movie_dataset/movies_metadata.csv', low_memory=False)
movies_df.columns 

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
# creating a df with only movie id and the title 
id_map = movies_df[['id', 'title']]
id_to_title = id_map.set_index('id')
title_to_id = id_map.set_index('title')
id_to_title.head(2), title_to_id.head(2)

(          title
 id             
 862   Toy Story
 8844    Jumanji,
              id
 title          
 Toy Story   862
 Jumanji    8844)

In [13]:
from surprise import SVD, Reader, Dataset 
import surprise 

In [14]:
reader = Reader()
ratings = pd.read_csv('../../movie_dataset/ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],reader)

svd = SVD()
svd 

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16289df2990>

In [21]:
ratings.head(2) , ratings.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


(   userId  movieId  rating   timestamp
 0       1       31     2.5  1260759144
 1       1     1029     3.0  1260759179,
 None)

In [15]:
surprise.model_selection.cross_validate(svd, data, measures = ["RMSE"], cv=5)

{'test_rmse': array([0.90100891, 0.8952684 , 0.89293555, 0.90409824, 0.88824767]),
 'fit_time': (0.7699284553527832,
  0.6979999542236328,
  0.7541947364807129,
  0.7191648483276367,
  0.7028579711914062),
 'test_time': (0.10700106620788574,
  0.10224461555480957,
  0.05728936195373535,
  0.061135292053222656,
  0.05694127082824707)}

In [18]:

trainset = data.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16289df2990>

In [22]:
svd.predict(1,856).est 

2.699173969889621

In [23]:
# loading the cosine similarity of the movies
import pickle 

with open("cosine_sim.pkl", "rb") as f: 
    cosine_sim = pickle.load(f) 

cosine_sim.shape 

(46628, 46628)

In [25]:
with open("indices.pkl", "rb") as f: 
    indices_cont = pickle.load(f) 
indices_cont.shape 

(46628,)

In [37]:
def content_recommender(title, cosine_sim=cosine_sim, df=movies_df,indices=indices_cont, top_n = 20):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    try:
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    except:
        pass         
    # top n movies 
    sim_scores = sim_scores[1:top_n]
    movie_indices = [i[0] for i in sim_scores]
    
    return df['title'].iloc[movie_indices].tolist()

In [38]:
content_recommender('Toy Story', cosine_sim, movies_df,indices_cont,10) 

['It Takes Two',
 'Balto',
 'Now and Then',
 'Father of the Bride Part II',
 'Othello',
 'Sabrina',
 'Restoration',
 'Grumpier Old Men',
 'Waiting to Exhale']

In [36]:
movies_df['title'].iloc[37]

'It Takes Two'

In [56]:
# using cosine sim for movies list with similar content 
## then using svd for collaborative recommendation 

def hybrid_recommend(user_id, title,svd, cosine_sim, movies_df, indices_cont, top_n = 10): 
    # using content based rec
    top_movies_titles  = content_recommender(title, cosine_sim, movies_df, indices_cont, top_n)
    
    top_movies = []
    
    for i in top_movies_titles: 
        idx = movies_df.index[movies_df['title'] == i].tolist()[0]
        
        score = svd.predict(user_id, idx).est
        top_movies.append((score, movies_df['title'].iloc[idx]))
    
    # Sort the recommendations based on SVD scores
    top_movies.sort(reverse=True)
    
    return top_movies

In [58]:
hybrid_recommend(13, 'Toy Story', svd, cosine_sim, movies_df, indices_cont, top_n=30)

[(3.9690322452128886, 'Jumanji'),
 (3.7694667419430368, 'Sense and Sensibility'),
 (3.7641787154535424, 'Nixon'),
 (3.7448179174551743, 'Now and Then'),
 (3.6917628738631443, 'Dracula: Dead and Loving It'),
 (3.6622675075760456, 'Sabrina'),
 (3.6274706306659525, 'Four Rooms'),
 (3.569372741708889, 'Othello'),
 (3.565795965448871, 'Copycat'),
 (3.478722564019268, 'Clueless'),
 (3.4760037101124546, 'Sudden Death'),
 (3.4498802678192892, 'Babe'),
 (3.428055087036806, 'Pocahontas'),
 (3.427317232634702, 'Heat'),
 (3.4113229862482273, 'Grumpier Old Men'),
 (3.399265274908767, 'Cutthroat Island'),
 (3.363268264300846, 'It Takes Two'),
 (3.3106186435903693, 'Ace Ventura: When Nature Calls'),
 (3.268206224546398, 'Tom and Huck'),
 (3.2015637369332834, 'The American President'),
 (3.1755474277389197, 'GoldenEye'),
 (3.1316981716249863, 'Waiting to Exhale'),
 (2.9533535648647358, 'To Die For'),
 (2.9476961034649656, 'Balto'),
 (2.8108823960505642, 'Money Train'),
 (2.7614474950144574, 'Casino'),