In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle



In [2]:
movies = pd.read_csv('../datasets/tmdb_5000_movies.csv')

df = pd.read_csv('../datasets/tmdb_5000_credits.csv')
df.columns = ['id', 'tittle', 'cast', 'crew']
movies = movies.merge(df, on='id')

In [3]:
movies.info(8)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

#### Content based recommendation engine - TF-IDF Vectorizer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
# removing english stop word like a, and , the 
tfidf = TfidfVectorizer(analyzer = 'word',stop_words = 'english')
#NaN -> ‘’
movies['overview'] = movies['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies['overview'])
tfidf_matrix.shape

(4803, 20978)

#### Calculation of similarity score by using COSINE SIMILARITY algorithm
The shape of the TF-IDF matrix is (4803, 20978). which means that here are 20978 different words are used to describe a 4803 movies.

Now, we will find similarity score of this matrix.

As we have a TF_IDF vectorizer, calculating directly a dot product will give us a cosine similarity. here we are using cosine similarity score since it is relatively easy and fast to calculate.

In [5]:
from sklearn.metrics.pairwise import linear_kernel
cosin_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

Next,we perform reverse map for the indices of the movies and titles.

In [6]:
index_of_movies = pd.Series(movies.index,   index=movies['title']).drop_duplicates()

Now, let’s write function for recommendation.

Step 1: Fetch the title
Step 2: Compute the similarity score for the movie from cosin_sim matrix
Step 3: Sort the similarity score
Step 4: OUTPUT- Return top movie base on the input

In [7]:
def get_recommendations(title, cosin_sim=cosin_sim):
    idx = index_of_movies[title]
    
    sim_scores = list(enumerate(cosin_sim[idx]))
    # sorting of moviesidx based on similarity score
    sim_scores = sorted(sim_scores, key = lambda x:x[1], reverse = True)
    # get top 10 of sorted 
    sim_scores = sim_scores[1:31]
    
    movies_idx = [i[0] for i in sim_scores]
    
    return movies['title'].iloc[movies_idx]

In [8]:
get_recommendations('Blood Ties').head(10)

3727          Easy Money
3623                Made
3337       The Godfather
3020          Sugar Hill
2389         Renaissance
3142      American Heist
3284      Brooklyn Rules
2801         The Funeral
1916    Lords of Dogtown
3760      The Jerky Boys
Name: title, dtype: object

#### Improvement of recommender with the help of other metadatas

First, we get the cast, crew, keywords and genres column data. then we will put some preprocessing on that data to get the most useful information for example we will get Director from the ‘crew’ column.

We will create a soup of these information. and apply the CountVectorizer.

One important difference is that we use the CountVectorizer() instead of TF-IDF. This is because we do not want to down-weight the presence of an actor/director if he or she has acted or directed in relatively more movies. It doesn’t make much intuitive sense.

Next step is to compute a Cosine Similarity matrix based on the Count matrix.

In [9]:
from ast import literal_eval
features = ['cast', 'crew', 'keywords', 'genres']
for f in features:
    movies[f] = movies[f].apply(literal_eval)
# to get director from job
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
# get top 3 elements of list
def get_list(x):
    if isinstance(x, list):
        names = [ i['name'] for i in x]
        
        if len(names)  > 3:
            names = names[:3]
        return names
    return []
#apply all functions
movies['director'] = movies['crew'].apply(get_director)
features = ['cast', 'keywords', 'genres']
for f in features:
    movies[f] = movies[f].apply(get_list)
#striping
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(' ', '')) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(' ', ''))
        else:
            return ''
features = ['cast', 'keywords', 'director', 'genres']
for f in features:
    movies[f] = movies[f].apply(clean_data)
#creating a SOUP
def create_soup(x):
    return ' '.join(x['keywords'])+' '+' '.join(x['cast'])+' '+x['director']+' '+' '.join(x['genres'])
movies['soup'] = movies.apply(create_soup, axis=1)
#count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words = 'english')
count_matrix = count.fit_transform(movies['soup'])
# finding similarity matrix
from sklearn.metrics.pairwise import cosine_similarity
cosin_sim2 = cosine_similarity(count_matrix, count_matrix)

In [10]:
get_recommendations('Blood Ties', cosin_sim2)

975                  The International
918                         Inside Man
3789                             Trust
2915                             Trash
4638          Amidst the Devil's Wings
2502                              Zulu
2649                 The Son of No One
3322    Betty Fisher and Other Stories
4488                        Adulterers
4630                        Compliance
4780                       Dutch Kills
3637                Kill the Messenger
3727                        Easy Money
4205                          N-Secure
4449                London to Brighton
4531                       Civil Brand
1948                 Little White Lies
2030                          Derailed
2793              The Killer Inside Me
4589                            Fabled
283         The Taking of Pelham 1 2 3
351                       The Departed
444                  Road to Perdition
510                    Children of Men
714                         Collateral
717                      

### COLLABORATIVE FILTERING - Python's Surprise library 

In [11]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
reader = Reader()
ratings = pd.read_csv('../datasets/ratings_small.csv')

In [12]:
ratings.head(6)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151


#### Cross validation of our data

In [13]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
reader = Reader(rating_scale=(0,5))



In [14]:
data = Dataset.load_from_df(ratings[['userId', 'movieId','rating']], reader)
svd = SVD()

In [15]:
# Run 5 fold cross validation
cross_validate(svd, data,measures=['RMSE'], cv=5,verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9012  0.8925  0.9022  0.8933  0.8954  0.8969  0.0040  
Fit time          0.65    0.65    0.68    0.70    0.64    0.67    0.02    
Test time         0.14    0.06    0.06    0.07    0.13    0.09    0.03    


{'test_rmse': array([0.90121636, 0.89254361, 0.90222344, 0.89326496, 0.89539238]),
 'fit_time': (0.6509079933166504,
  0.6505331993103027,
  0.6848042011260986,
  0.7018105983734131,
  0.6440434455871582),
 'test_time': (0.13632512092590332,
  0.0640103816986084,
  0.06399869918823242,
  0.07197761535644531,
  0.13122177124023438)}

We got a Root Mean Square Error of 0.89 approx which is more than good enough for our case. 
Let us now train on our dataset and arrive at predictions.

In [16]:
train = data.build_full_trainset()
svd.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a6f70f7310>

Let’s predict the user 1’s rating on the movie Id=302

In [17]:
svd.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7505568985162703, details={'was_impossible': False})

Here , est=2.6280 means that user 1 might  give rating of 2.63 to movie which has Id 302.

That is how we can predict the movie rating based on the users profile and recommend the best movie to them without knowing the past behaviour of the User. 

This is called a  collaborative filtering.

#### HYBRID RECOMMENDER 

A hybrid model for recommendation of movies to users with the best possible efficiency and precision can be designed as mentioned below:


Let's put our content based and CF based together and make a strong recommender.

In [18]:
movie_id = pd.read_csv('../datasets/links_small.csv')

movie_id.dropna()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9120,162672,3859980,402672.0
9121,163056,4262980,315011.0
9122,163949,2531318,391698.0
9123,164977,27660,137608.0


In [19]:
movie_id.head(10)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0


In [20]:


new_movies = movies.filter(['id', 'title','tagline'])

In [21]:
new_movies.head(5)


Unnamed: 0,id,title,tagline
0,19995,Avatar,Enter the World of Pandora.
1,285,Pirates of the Caribbean: At World's End,"At the end of the world, the adventure begins."
2,206647,Spectre,A Plan No One Escapes
3,49026,The Dark Knight Rises,The Legend Ends
4,49529,John Carter,"Lost in our world, found in another."


In [22]:
movie_id.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [23]:
# convert float val to int
def conv_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [24]:

movieid = pd.read_csv('../datasets/links_small.csv')[['movieId', 'tmdbId']]
movieid['tmdbId'] = movie_id['tmdbId'].apply(conv_int)
movieid.columns = ['movieId', 'id']
movieid = movieid.merge(new_movies[['title', 'id']], on='id').set_index('title')
print(movieid.shape)
movieid

(3404, 2)


Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
GoldenEye,10,710.0
The American President,11,9087.0
Nixon,14,10858.0
Cutthroat Island,15,1408.0
...,...,...
The Maid's Room,160440,278348.0
The Legend of Tarzan,160563,258489.0
The Purge: Election Year,160565,316727.0
Nerve,160954,328387.0


Next, we make a index_map to find a index of a movie.

In [26]:
index_map = movieid.set_index('id')




#### Count Vectorization

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words='english')
count_matrix = cv.fit_transform(movies['soup']).toarray()

In [28]:
count_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(count_matrix)


In [30]:
similarity

array([[1.        , 0.3       , 0.21081851, ..., 0.        , 0.        ,
        0.        ],
       [0.3       , 1.        , 0.21081851, ..., 0.        , 0.        ,
        0.        ],
       [0.21081851, 0.21081851, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [31]:
len(similarity[0])

4803

In [32]:
sorted(list(enumerate(similarity[0])), reverse=True, key = lambda x:x[1])

[(0, 0.9999999999999999),
 (786, 0.4743416490252569),
 (1438, 0.4242640687119285),
 (206, 0.4),
 (3993, 0.3651483716701108),
 (71, 0.3585685828003181),
 (131, 0.3585685828003181),
 (2655, 0.3585685828003181),
 (81, 0.33541019662496846),
 (160, 0.33541019662496846),
 (208, 0.33541019662496846),
 (215, 0.33541019662496846),
 (241, 0.33541019662496846),
 (292, 0.33541019662496846),
 (315, 0.33541019662496846),
 (466, 0.33541019662496846),
 (618, 0.33541019662496846),
 (1686, 0.33541019662496846),
 (1892, 0.33541019662496846),
 (1939, 0.33541019662496846),
 (2343, 0.33541019662496846),
 (2390, 0.33541019662496846),
 (2444, 0.33541019662496846),
 (3494, 0.33541019662496846),
 (9, 0.31622776601683794),
 (10, 0.31622776601683794),
 (20, 0.31622776601683794),
 (50, 0.31622776601683794),
 (103, 0.31622776601683794),
 (121, 0.31622776601683794),
 (129, 0.31622776601683794),
 (332, 0.31622776601683794),
 (403, 0.31622776601683794),
 (472, 0.31622776601683794),
 (486, 0.31622776601683794),
 (587, 

####  Below function is the main function i.e.., hybrid recommender function which has the power of two recommendation techniques(Content-based filtering and Collaborative filtering) combined

In [33]:


def recommend_for(userid, title):
   index = index_of_movies[title]
   tmdbId = movieid.loc[title]['id']
   
   
   
    #content based
   sim_scores = list(enumerate(cosin_sim2[int(index)]))
   sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
   sim_scores = sim_scores[1:10]
   movie_indices = [i[0] for i in sim_scores]

   mv = movies.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']]
   mv = mv[mv['id'].isin(movieid['id'])]
    
   
  
   #collaborative filtering - svd
   

   mv['est'] = mv['id'].apply(lambda x: svd.predict(userid, index_map.loc[x]['movieId']).est)

   mv = mv.sort_values('est', ascending=False)
   
   
   
   return mv.head(10)

In [34]:
recommend_for(343, "Avatar")

Unnamed: 0,title,vote_count,vote_average,id,est
103,The Sorcerer's Apprentice,1470,5.8,27022,3.577596
131,G-Force,510,5.1,19585,3.571658
1,Pirates of the Caribbean: At World's End,4500,6.9,285,3.5242
71,The Mummy: Tomb of the Dragon Emperor,1387,5.2,1735,3.444162
206,Clash of the Titans,2233,5.6,18823,3.221492
715,The Scorpion King,779,5.3,9334,3.136128
215,Fantastic 4: Rise of the Silver Surfer,2589,5.4,1979,3.115914
466,The Time Machine,631,5.8,2135,3.098434


In [35]:
import pickle
pickle.dump(new_movies, open('../model/movies_list.pkl', 'wb'))
pickle.dump(similarity, open('../model/similarity.pkl', 'wb'))