In [72]:
import pandas as pd 
import numpy as np 
import random 

In [16]:
# loading the dataset 
movies_df = pd.read_csv('../../movie_dataset/movies_metadata.csv', low_memory=False)
movies_df.columns 

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [17]:
# creating a df with only movie id and the title 
id_map = movies_df[['id', 'title']]
id_to_title = id_map.set_index('id')
title_to_id = id_map.set_index('title')
id_to_title.head(2), title_to_id.head(2)

(          title
 id             
 862   Toy Story
 8844    Jumanji,
              id
 title          
 Toy Story   862
 Jumanji    8844)

In [18]:
from surprise import SVD, Reader, Dataset 
import surprise 

In [19]:
reader = Reader()
ratings = pd.read_csv('../../movie_dataset/ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],reader)

svd = SVD()
svd 

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2162d2dd910>

In [20]:
ratings.head(2) , ratings.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


(   userId  movieId  rating   timestamp
 0       1       31     2.5  1260759144
 1       1     1029     3.0  1260759179,
 None)

In [21]:
surprise.model_selection.cross_validate(svd, data, measures = ["RMSE"], cv=5)

{'test_rmse': array([0.90660804, 0.8957668 , 0.89732883, 0.89175318, 0.8919353 ]),
 'fit_time': (2.7585561275482178,
  2.81941294670105,
  2.9234230518341064,
  2.8841300010681152,
  2.870241403579712),
 'test_time': (0.29331183433532715,
  0.616814374923706,
  0.2983400821685791,
  0.2945382595062256,
  0.5261836051940918)}

In [22]:

trainset = data.build_full_trainset()
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2162d2dd910>

In [23]:
import pickle 

with open("svd.pkl", "wb") as f: 
    pickle.dump(svd, f) 

In [24]:
svd.predict(1,856).est 

2.6511383807318225

In [25]:
# loading the cosine similarity of the movies
import pickle 

with open("cosine_sim.pkl", "rb") as f: 
    cosine_sim = pickle.load(f) 

cosine_sim.shape 

(46628, 46628)

In [26]:
import pickle 

with open("svd.pkl", "rb") as f: 
    svd2 = pickle.load(f) 
svd2.predict(1,856).est 

2.6511383807318225

In [27]:
with open("indices.pkl", "rb") as f: 
    indices_cont = pickle.load(f) 
indices_cont.shape 

(46628,)

In [28]:
def content_recommender(title, cosine_sim=cosine_sim, df=movies_df,indices=indices_cont, top_n = 20):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    try:
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    except:
        pass         
    # top n movies 
    sim_scores = sim_scores[1:top_n]
    movie_indices = [i[0] for i in sim_scores]
    
    return df['title'].iloc[movie_indices].tolist()

In [29]:
content_recommender('Toy Story', cosine_sim, movies_df,indices_cont,10) 

['It Takes Two',
 'Balto',
 'Now and Then',
 'Father of the Bride Part II',
 'Othello',
 'Sabrina',
 'Restoration',
 'Grumpier Old Men',
 'Waiting to Exhale']

In [30]:
movies_df['title'].iloc[37]

'It Takes Two'

In [32]:
movies_df['title'].tolist()[:4], len(movies_df['title'].tolist()) 

(['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale'], 45466)

In [73]:
def collaborative_recommend(user_id, movies_df, top_n = 10): 
    top_movies = [] 
    movies = random.sample(movies_df['title'].tolist(), 200)
    for movie in movies:
        try:
            idx = movies_df.index[movies_df['title'] == movie].tolist()[0]
        
            score = svd.predict(user_id, idx).est 
            top_movies.append((score, movies_df['title'].iloc[idx]))
        except: 
            continue 
    top_movies.sort(reverse=True)
    return [movie[1] for movie in top_movies][:top_n] 

In [75]:
collaborative_recommend(109, movies_df) 

['Sgt. Bilko',
 'The Negotiator',
 'The Woman Next Door',
 'Wild Wild West',
 'S.O.B.',
 'Zardoz',
 'Citizen Kane',
 'Samurai Vendetta',
 'JLG/JLG: Self-Portrait in December',
 'The Impostors']

In [81]:
# using cosine sim for movies list with similar content 
## then using svd for collaborative recommendation 

def hybrid_recommend(user_id, title,svd, cosine_sim, movies_df, indices_cont, top_n = 10): 
    # using content based rec
    top_movies_titles  = content_recommender(title, cosine_sim, movies_df, indices_cont, top_n=100)
    print(top_movies_titles) 
    top_movies = []
    
    for i in top_movies_titles: 
        idx = movies_df.index[movies_df['title'] == i].tolist()[0]
        
        score = svd.predict(user_id, idx).est
        top_movies.append((score, movies_df['title'].iloc[idx]))
    
    # Sort the recommendations based on SVD scores
    top_movies.sort(reverse=True)
    
    return top_movies

In [82]:
hybrid_recommend(1984, 'Bed of Roses', svd, cosine_sim, movies_df, indices_cont, top_n=30)

['Jumanji']


[(3.8934401625079516, 'Jumanji')]

In [83]:
content_recommender('Bed of Roses') 

['Jumanji']