<a href="https://colab.research.google.com/github/VIVEK-JADHAV/MovieRecommendation/blob/main/Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing libraries

import pandas as pd
import numpy as np
import os
from ast import literal_eval
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.sparse import csr_matrix,coo_matrix

import warnings
warnings.filterwarnings('ignore')

In [None]:
#Loading the data

os.mkdir('/content/data')
os.chdir('/content/data')
print(os.getcwd())

!unzip '/content/drive/MyDrive/CaseStudies/RecommendationSystem/archive.zip'

/content/data
Archive:  /content/drive/MyDrive/CaseStudies/RecommendationSystem/archive.zip
  inflating: credits.csv             
  inflating: keywords.csv            
  inflating: links.csv               
  inflating: links_small.csv         
  inflating: movies_metadata.csv     
  inflating: ratings.csv             
  inflating: ratings_small.csv       


### Models


In [None]:
#Building a COO sparse matrix of ratings: userId values are used as row indices and movieId values as column indices
# NOTE(review): `ratings` is not created in any cell visible in this notebook; it must already exist in the kernel.

user_movie_coords=(ratings['userId'].values,ratings['movieId'].values)
ratings_sparse=coo_matrix((ratings['rating'].values,user_movie_coords))
print('The shape of sparse train matrix is ',ratings_sparse.shape)

The shape of sparse train matrix is  (672, 160719)


In [None]:
#Wrapping the sparse ratings matrix in a DataFrame (rows = userId coordinate, columns = movieId coordinate)
#Columns without any rating are removed in the next cell, not here
ratings_dense=pd.DataFrame.sparse.from_spmatrix(ratings_sparse)
ratings_dense.shape

(672, 160719)

In [None]:
#Keeping only the movie columns whose ratings do not sum to zero
rated_columns=ratings_dense.sum(axis=0).ne(0)
ratings_dense=ratings_dense.loc[:,rated_columns]

print('The shape of dense  matrix is ',ratings_dense.shape)

The shape of dense  matrix is  (672, 2830)


In [None]:
#Keeping the row labels equal to userId
# The sparse matrix was built with userId itself as the row coordinate, so row
# label i already holds the ratings of user i; the only extra row is the empty
# label 0 that exists because userId starts at 1. Do not shift the labels by +1:
# that would make ratings_dense.loc[uid] return the ratings of user uid-1 (and
# would shift again on every re-run). Dropping the rows without any rating
# removes the placeholder row and is safe to execute repeatedly.
ratings_dense=ratings_dense.loc[ratings_dense.sum(axis=1)!=0]


#### User-User Similarity

In [None]:
#Computing similar users based on cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

def userSimilarity(uid,top):
  '''Return the labels of the `top` users most similar to user `uid`.

  Similarity is the cosine similarity between rows of the notebook-level
  `ratings_dense` frame (rows are users, columns are movies).

  Parameters
  ----------
  uid : row label of the query user in `ratings_dense`.
  top : maximum number of similar users to return; values below 0 are
        treated as 0.

  Returns
  -------
  numpy array of row labels ordered from most to least similar, never
  containing `uid` itself, or the string "User does not exist" when `uid`
  is not a row label of `ratings_dense`.
  '''

  if uid not in ratings_dense.index.values:
    return "User does not exist"

  #Cosine similarity between the given user and every user (itself included)
  sim=cosine_similarity(ratings_dense.loc[[uid]],ratings_dense).ravel()

  #Exclude the query user by label instead of assuming it is ranked first:
  #another user can tie with it at similarity 1, and a user whose row is all
  #zeros has similarity 0 with itself, so "drop the best match" is not reliable
  labels=ratings_dense.index.values
  others=labels!=uid

  #Sort descending; the stable sort keeps tied users in label order so the
  #result is deterministic
  order=np.argsort(-sim[others],kind='stable')[:max(top,0)]
  return labels[others][order]



In [None]:
def common(row,mlist):
  '''Tell whether the 'id' entry of `row` is one of the values in `mlist`.'''
  return row['id'] in mlist


def recommendUser(uid,top):
  '''Recommend movies rated by the users most similar to `uid`.

  Parameters
  ----------
  uid : userId to recommend for.
  top : number of similar users whose rated movies are considered.

  Returns
  -------
  The rows of the notebook-level `movies` frame whose 'id' was rated by at
  least one of the similar users but not by `uid`; or the error string
  returned by `userSimilarity` when the user is unknown.
  '''

  #Obtain the most similar users (a string signals an unknown user)
  users=userSimilarity(uid,top)
  if isinstance(users,str):
    return users

  #Movies rated by any of the similar users, selected in one vectorised filter
  neighbour_movies=set(ratings.loc[ratings['userId'].isin(users),'movieId'])

  #Movies the target user has already rated
  seen_movies=set(ratings.loc[ratings['userId']==uid,'movieId'])

  #Candidates are the neighbours' movies the user has not rated yet
  unseen_movies=list(neighbour_movies-seen_movies)

  #isin does the membership test in one pass (same approach as recommendMovie)
  #instead of scanning the candidate list once per row of `movies`
  return movies[movies['id'].isin(unseen_movies)]

In [None]:
#First rows of the recommendations for userId=1 using its 10 most similar users
recommendUser(1,10).head()


Unnamed: 0,title,genres,vote_average,vote_count,original_language,id
17,Four Rooms,"[Crime, Comedy]",6.5,539.0,en,5
109,Taxi Driver,"[Crime, Drama]",8.1,2632.0,en,103
249,Interview with the Vampire,"[Horror, Romance]",7.2,1558.0,en,628
302,Three Colors: Red,"[Drama, Mystery, Romance]",7.8,246.0,fr,110
324,Star Trek: Generations,"[Science Fiction, Action, Adventure, Thriller]",6.4,461.0,en,193


In [None]:
#Ratings given by userId=1, for comparison with the recommendations above
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,vote_average,vote_count,original_language
0,1,1371,2.5,1970-01-01,Rocky III,[Drama],6.6,894.0,en
47,1,1405,1.0,1970-01-01,Greed,"[Drama, History]",7.5,25.0,en
93,1,2105,4.0,1970-01-01,American Pie,"[Comedy, Romance]",6.4,2358.0,en
140,1,2193,2.0,1970-01-01,My Tutor,"[Comedy, Drama, Romance]",5.8,17.0,en
182,1,2294,2.0,1970-01-01,Jay and Silent Bob Strike Back,[Comedy],6.4,491.0,en
235,1,2455,2.5,1970-01-01,Confidentially Yours,"[Drama, Comedy, Crime]",7.1,25.0,fr


#### Item-Item Similarity

In [None]:
#Computing similar movies based on cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

def itemSimilarity(iid,top):
  '''Return the labels of the `top` movies most similar to movie `iid`.

  Similarity is the cosine similarity between columns of the notebook-level
  `ratings_dense` frame (rows are users, columns are movies).

  Parameters
  ----------
  iid : column label of the query movie in `ratings_dense`.
  top : maximum number of similar movies to return; values below 0 are
        treated as 0.

  Returns
  -------
  numpy array of column labels ordered from most to least similar, never
  containing `iid` itself, or the string "Movie does not exist" when `iid`
  is not a column of `ratings_dense`.
  '''
  if iid not in ratings_dense.columns:
    return "Movie does not exist"

  #Cosine similarity between the given movie's column and every movie column
  #(transposed so that each movie becomes a row vector)
  sim=cosine_similarity(ratings_dense.loc[:,[iid]].transpose(),ratings_dense.transpose()).ravel()

  #Exclude the query movie by label instead of assuming it is ranked first:
  #another movie with a proportional rating column ties with it at similarity 1
  labels=ratings_dense.columns.values
  others=labels!=iid

  #Sort descending; the stable sort keeps tied movies in label order so the
  #result is deterministic
  order=np.argsort(-sim[others],kind='stable')[:max(top,0)]
  return labels[others][order]



In [None]:

def recommendMovie(iid,top):
  '''Look up the movies most similar to movie `iid`.

  Prints the value returned by `itemSimilarity`, then returns the rows of the
  notebook-level `movies` frame whose 'id' is among those similar movies. When
  `itemSimilarity` returns a string (unknown movie), that string is returned
  unchanged.
  '''

  #Similar movie ids, or an error string for an unknown movie
  items=itemSimilarity(iid,top)
  print(items)

  #Unknown movie: hand the error string straight back to the caller
  if isinstance(items,str):
    return items

  #Rows of `movies` whose id is one of the similar movies
  similar_rows=movies[movies['id'].isin(items)]
  return similar_rows

In [None]:
#The 5 movies most similar to movie id 628
recommendMovie(628,5)

[832 786 805  14 802]


Unnamed: 0,title,genres,vote_average,vote_count,original_language,id
1215,M,"[Drama, Action, Thriller, Crime]",8.0,465.0,de,832
2051,Rosemary's Baby,"[Horror, Drama, Mystery]",7.5,892.0,en,805
2614,Lolita,"[Drama, Romance]",7.3,409.0,en,802
2742,American Beauty,[Drama],7.9,3438.0,en,14
3773,Almost Famous,"[Drama, Music]",7.4,807.0,en,786


In [None]:
#Details of the query movie (id 628), for comparison with the similar movies above
movies[movies['id']==628]

Unnamed: 0,title,genres,vote_average,vote_count,original_language,id
249,Interview with the Vampire,"[Horror, Romance]",7.2,1558.0,en,628


#### SVD

In [None]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 4.0MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1618242 sha256=488a7d27ad2a60a563650a57f99d9d1d68be1502d5125c90210d1f874c0278f0
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [None]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split,GridSearchCV
from surprise import SVD


In [None]:
#Loading data from Dataframe
# Reader() without arguments assumes ratings lie in (1, 5). The ratings here use
# half-star steps, so take the scale from the data itself; otherwise estimates
# are clipped to a range that may not cover the lowest observed ratings.
reader = Reader(rating_scale=(ratings['rating'].min(),ratings['rating'].max()))
data=Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)



In [None]:
#Grid search with 3-fold cross-validation over the SVD epochs, learning rate and regularisation

param_grid = {
    'n_epochs': [5, 10, 15],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.4, 0.6, 1.0],
}

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=1,
    return_train_measures=True,
)

gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  2.3min finished


In [None]:
#Parameter combination that achieved the best RMSE in the grid search
gs.best_params['rmse']

{'lr_all': 0.01, 'n_epochs': 15, 'reg_all': 0.4}

In [None]:
#Best cross-validated RMSE found by the grid search
gs.best_score['rmse']

0.9050775344171739

In [None]:
#Taking the estimator configured with the best-RMSE parameters and fitting it on the full dataset
algo = gs.best_estimator['rmse']
#The trainset is built from all of `data`; no ratings are held out for testing
data_set = data.build_full_trainset()
algo.fit(data_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f963a62dd68>

In [None]:
#Ratings given by userId=1, used to pick a known rating to compare a prediction against
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,vote_average,vote_count,original_language
0,1,1371,2.5,1970-01-01,Rocky III,[Drama],6.6,894.0,en
47,1,1405,1.0,1970-01-01,Greed,"[Drama, History]",7.5,25.0,en
93,1,2105,4.0,1970-01-01,American Pie,"[Comedy, Romance]",6.4,2358.0,en
140,1,2193,2.0,1970-01-01,My Tutor,"[Comedy, Drama, Romance]",5.8,17.0,en
182,1,2294,2.0,1970-01-01,Jay and Silent Bob Strike Back,[Comedy],6.4,491.0,en
235,1,2455,2.5,1970-01-01,Confidentially Yours,"[Drama, Comedy, Crime]",7.1,25.0,fr


In [None]:
#Estimated rating of userId=1 for movieId=1371; the third argument (2.5) is the known true rating
algo.predict(1,1371,2.5)

Prediction(uid=1, iid=1371, r_ui=2.5, est=2.6914739810814696, details={'was_impossible': False})

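For userId=1 and movieId=1371, the true rating was 2.5 and the estimated rating was 2.69, so the two are close. Note that this rating was part of the full trainset the model was fitted on, so the match shows how well the model fits a training example rather than how it generalises; the cross-validated RMSE reported above (about 0.905) is the better indicator of accuracy on unseen ratings.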