In [None]:
#Supressing the warning messages
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset archive stored there can be read by the following cell.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Unzip the dataset archive from Drive into the Colab working directory.
# The context manager closes the archive even if extraction fails part-way.

import zipfile as z
with z.ZipFile("/content/drive/My Drive/the-movies-dataset.zip", "r") as zip_ref:
    zip_ref.extractall("/content/")

We will use **Surprise**, a Python scikit for building and analyzing recommender systems that deal with explicit rating data.

In [None]:
!pip install surprise  #installing surprise library

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 7.4MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617530 sha256=4a3d76cb6e5657c7638679e3621012d2d7cad58d2f1456af9777d00185b0f53a
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


# Importing the Libraries

In [None]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
# The Reader class tells Surprise how to interpret the ratings. Its default
# rating scale is (1, 5), but the ratings loaded below contain half-star
# values (e.g. 2.5), so the scale is set explicitly.
# NOTE(review): assumes the MovieLens 0.5-5.0 scale - confirm with
# ratings['rating'].min() / .max() after loading.
reader = Reader(rating_scale=(0.5, 5.0))

In [None]:
import pandas as pd
# Load the user ratings extracted from the archive and preview the first rows.
ratings = pd.read_csv('/content/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


# Matrix Factorization

In [None]:
# Build the Surprise dataset from the user id, movie id and rating columns
# (passed in that order), parsed according to `reader`.
data = Dataset.load_from_df(
    df=ratings[['userId', 'movieId', 'rating']],
    reader=reader,
)

In [11]:
# Instantiate the SVD algorithm with its default settings.
svd = SVD()

# Run 5-fold cross-validation on it, reporting RMSE and MAE per fold together
# with the fit and test times (verbose=True prints the summary table).
cross_validate(
    algo=svd,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9043  0.8920  0.8976  0.8948  0.8979  0.8973  0.0041  
MAE (testset)     0.6971  0.6865  0.6907  0.6913  0.6911  0.6913  0.0034  
Fit time          4.94    5.10    4.96    5.16    4.97    5.02    0.09    
Test time         0.19    0.29    0.28    0.19    0.28    0.25    0.04    


{'fit_time': (4.944015979766846,
  5.095363140106201,
  4.958854675292969,
  5.158387899398804,
  4.967019081115723),
 'test_mae': array([0.69706053, 0.68646728, 0.69066337, 0.69125432, 0.69106968]),
 'test_rmse': array([0.90426093, 0.89204333, 0.8975704 , 0.89484094, 0.89789031]),
 'test_time': (0.19055747985839844,
  0.28804564476013184,
  0.2792184352874756,
  0.19393110275268555,
  0.280078649520874)}

# Training

In [12]:
# Build the model from the whole dataset instead of a train/test split

trainset = data.build_full_trainset()
svd.fit(trainset)

# Last expression of the cell: displays the class of the trainset object
type(trainset)

surprise.trainset.Trainset

# Testing

In [13]:
# Estimate the rating that user 3 would give to movie 302 with the fitted model.
svd.predict(uid=3, iid=302)

Prediction(uid=3, iid=302, r_ui=None, est=3.5822118493003012, details={'was_impossible': False})

# Hybrid Recommender

Here I combine description-based recommendation with collaborative filtering.

## Description Based Selection

In [45]:
# Load the movie metadata extracted from the archive.
md = pd.read_csv('/content/movies_metadata.csv')


In [46]:
# Drop three rows by index label.
# NOTE(review): presumably these are malformed records whose 'id' cannot be
# cast to int in the next cell - confirm against the raw file.
# errors='ignore' keeps this cell re-runnable once the rows are already gone.
md = md.drop([19730, 29503, 35587], errors='ignore')

In [47]:
# Keep only the movies whose TMDB id is listed in links_small.csv.
# links_small is built in this cell so that it exists on a fresh kernel:
# the non-missing tmdbId values, as integers.
links_small = pd.read_csv('/content/links_small.csv')['tmdbId'].dropna().astype('int')

md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 24)

In [54]:
# Renumber the rows 0..n-1 (the previous index is kept as a column) so that
# row positions can be used to address the similarity matrix built later.
# Note: not idempotent - re-running this cell resets the index again.
smd = smd.reset_index()
titles = smd['title']
# Lookup table: movie title -> row position in smd
indices = pd.Series(smd.index, index=titles)

### Creating a new description column

In [66]:
# Build one text field per movie from its overview and tagline.
# Missing values are filled *before* concatenating, so a movie that has only a
# tagline keeps it (NaN + str would otherwise yield NaN), and a space separates
# the two parts so the last word of the overview does not fuse with the first
# word of the tagline.
smd['tagline'] = smd['tagline'].fillna(' ')
smd['description'] = smd['overview'].fillna('') + ' ' + smd['tagline']

### Creating a Vectorizer

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over single words with English stop words removed.
# min_df=1 keeps every term (no document-frequency filtering), which is what
# min_df=0 was meant to do; an integer 0 is rejected by the parameter
# validation of recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [68]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all description vectors; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [79]:
# Description-based recommendation
def get_recommendations(title, top_n=30):
    """Return the titles of the movies whose descriptions are most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title; must be a label of the global `indices` Series.
    top_n : int, optional
        Maximum number of titles to return (default 30).

    Returns
    -------
    pandas.Series
        Titles taken from the global `titles`, most similar first.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices`.
    """
    idx = indices[title]
    # Several movies can share a title, in which case the lookup yields a
    # Series of row positions instead of a scalar; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every movie by similarity to the query, highest first.
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by position rather than assuming it sorts first:
    # a movie with an identical description ties with it at the top.
    movie_indices = [i for i, _ in sim_scores if i != idx][:top_n]
    return titles.iloc[movie_indices]

In [80]:
import numpy as np

### Mapping and merging data based on id from different datasets

In [72]:
def convert_int(x):
    """Return `x` converted to int, or NaN when it cannot be converted (e.g. a missing value)."""
    try:
        return int(x)
    except (TypeError, ValueError):
        return np.nan


# Table linking each movieId (the id used in the ratings) to the 'id' used in
# smd, restricted to the movies present in smd and indexed by title.
id_map = pd.read_csv('/content/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [73]:
# Same mapping re-indexed by 'id' instead of 'title', so that a movieId can be
# looked up from an 'id' value (used by hybrid()).
indices_map = id_map.set_index('id')

In [84]:
# Combine the description-based selection with collaborative filtering on top of it.

def hybrid(userId, title, n_candidates=25, top_n=10):
    """Recommend movies similar to `title`, ranked by the rating predicted for `userId`.

    The `n_candidates` movies whose descriptions are most similar to `title`
    are selected with `cosine_sim`; the fitted `svd` model then estimates the
    rating `userId` would give to each, and the best `top_n` are returned.

    Parameters
    ----------
    userId : int
        User id passed to `svd.predict`.
    title : str
        Movie title; must be a label of the global `indices` Series.
    n_candidates : int, optional
        Number of description-similar movies to score (default 25).
    top_n : int, optional
        Number of rows to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'vote_count', 'vote_average', 'id' and 'est',
        sorted by 'est' in descending order.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices`, or a candidate's id is
        missing from `indices_map`.
    """
    idx = indices[title]
    # Several movies can share a title, in which case the lookup yields a
    # Series of row positions instead of a scalar; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    def _estimate(tmdb_id):
        # Translate the id used in smd into the movieId the model was trained on.
        movie_id = indices_map.loc[tmdb_id]['movieId']
        # A duplicated id makes the lookup return several rows; use the first.
        if isinstance(movie_id, pd.Series):
            movie_id = movie_id.iloc[0]
        return svd.predict(userId, movie_id).est

    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by position rather than assuming it sorts first.
    movie_indices = [i for i, _ in sim_scores if i != idx][:n_candidates]

    # Copy the selection so that adding 'est' does not write into a view of smd.
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']].copy()
    movies['est'] = movies['id'].apply(_estimate)
    return movies.sort_values('est', ascending=False).head(top_n)

## Testing

In [77]:
# Recommendations related to 'Avatar', ranked for user 1
hybrid(userId=1, title='Avatar')

Unnamed: 0,title,vote_count,vote_average,id,est
6105,A Trip to the Moon,314.0,7.9,775,3.162621
2059,The Matrix,9079.0,7.9,603,3.07275
975,A Grand Day Out,199.0,7.4,530,3.008423
3360,The Dish,62.0,6.6,5257,2.92594
1898,A Simple Plan,191.0,6.9,10223,2.854865
7460,Green Zone,730.0,6.4,22972,2.81263
2910,Pandora and the Flying Dutchman,19.0,6.5,38688,2.812011
3018,Rocketship X-M,15.0,5.1,37744,2.797777
5044,The Men,18.0,6.5,1882,2.783397
4506,Tears of the Sun,582.0,6.4,9567,2.741272


In [78]:
# Same query for user 500, to compare how the ranking changes between users
hybrid(userId=500, title='Avatar')

Unnamed: 0,title,vote_count,vote_average,id,est
6105,A Trip to the Moon,314.0,7.9,775,3.420215
975,A Grand Day Out,199.0,7.4,530,3.411042
1898,A Simple Plan,191.0,6.9,10223,3.27389
7460,Green Zone,730.0,6.4,22972,3.256048
5229,Ambush,13.0,6.3,49320,3.209608
2854,The Hidden,85.0,6.7,12476,3.1552
4506,Tears of the Sun,582.0,6.4,9567,3.085752
4328,Dog Soldiers,227.0,6.7,11880,3.021765
7050,Pride and Glory,243.0,6.3,13150,2.94548
3927,Birthday Girl,104.0,6.1,2084,2.941077
