# Recommender Systems Walk Through

In [1]:
import pandas as pd
import numpy as np 
import Utils as ut

Using TensorFlow backend.


In [2]:
# Load the cleaned item table, drop the 'Unnamed: 0' column and keep the
# 5000 rows with the highest `score`.
df = (
    pd.read_csv('Clean_Item_Data')
    .drop(columns='Unnamed: 0')
    .sort_values('score', ascending=False)
    .head(5000)
)
df.head(5)

Unnamed: 0,title,overview,genres,vote_average,budget,runtime,adult,vote_count,movieId,imdbId,tmdbId,cast,keywords,cast_size,crew_size,director,score
136,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,18,8.5,25000000,142.0,0,8358.0,318,111161,278,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton']","['prison', 'corruption', 'police brutality', '...",42,90,Frank Darabont,8.372739
307,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",18,8.5,6000000,175.0,0,6024.0,858,68646,238,"['Marlon Brando', 'Al Pacino', 'James Caan']","['italy', 'love at first sight', 'loss of fath...",58,42,Francis Ford Coppola,8.327148
5002,The Dark Knight,Batman raises the stakes in his war on crime. ...,18,8.3,185000000,152.0,0,12269.0,58559,468569,155,"['Christian Bale', 'Michael Caine', 'Heath Led...","['dc comics', 'crime fighter', 'secret identit...",134,81,Christopher Nolan,8.219441
1238,Fight Club,A ticking-time-bomb insomniac and a slippery s...,18,8.3,63000000,139.0,0,9678.0,2959,137523,550,"['Edward Norton', 'Brad Pitt', 'Meat Loaf']","['support group', 'dual identity', 'nihilism',...",77,107,David Fincher,8.198915
126,Pulp Fiction,"A burger-loving hit man, his philosophical par...",53,8.3,8000000,154.0,0,8670.0,296,110912,680,"['John Travolta', 'Samuel L. Jackson', 'Uma Th...","['transporter', 'brothel', 'drug dealer', 'box...",54,87,Quentin Tarantino,8.187793


In [3]:
# Load the ratings table; later cells read its userId, movieId and rating columns.
rating = pd.read_csv('ratings.csv')

In [4]:
# Keep only the ratings made by the users that ut.random_sample returns.
# NOTE(review): the sample size and seeding of random_sample are not visible here —
# confirm it is seeded, otherwise every run works on a different set of users.
rating = rating[rating.userId.isin(ut.random_sample(rating.userId.unique()))]

In [5]:
# Make the two tables mutually consistent: first keep only ratings of movies
# present in df, then keep only movies that still have at least one rating.
rating = rating.loc[rating['movieId'].isin(df['movieId'].unique())]
df = df.loc[df['movieId'].isin(rating['movieId'].unique())]

In [6]:
# Renumber both tables from 0 after filtering. The recommendation cells pass
# df.index as the title -> index lookup, so it has to be contiguous.
df, rating = df.reset_index(drop=True), rating.reset_index(drop=True)

In [7]:
# (rows, columns) of the item table after restricting it to movies that have ratings.
df.shape

(4575, 17)

In [8]:
# Very naive baseline: a non-personalised list of the movies with the highest `score`.
# TODO: to do this properly, the number of votes must be taken into account, not just the average vote.

df.sort_values('score', ascending=False).head(5)

Unnamed: 0,title,overview,genres,vote_average,budget,runtime,adult,vote_count,movieId,imdbId,tmdbId,cast,keywords,cast_size,crew_size,director,score
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,18,8.5,25000000,142.0,0,8358.0,318,111161,278,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton']","['prison', 'corruption', 'police brutality', '...",42,90,Frank Darabont,8.372739
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",18,8.5,6000000,175.0,0,6024.0,858,68646,238,"['Marlon Brando', 'Al Pacino', 'James Caan']","['italy', 'love at first sight', 'loss of fath...",58,42,Francis Ford Coppola,8.327148
2,The Dark Knight,Batman raises the stakes in his war on crime. ...,18,8.3,185000000,152.0,0,12269.0,58559,468569,155,"['Christian Bale', 'Michael Caine', 'Heath Led...","['dc comics', 'crime fighter', 'secret identit...",134,81,Christopher Nolan,8.219441
3,Fight Club,A ticking-time-bomb insomniac and a slippery s...,18,8.3,63000000,139.0,0,9678.0,2959,137523,550,"['Edward Norton', 'Brad Pitt', 'Meat Loaf']","['support group', 'dual identity', 'nihilism',...",77,107,David Fincher,8.198915
4,Pulp Fiction,"A burger-loving hit man, his philosophical par...",53,8.3,8000000,154.0,0,8670.0,296,110912,680,"['John Travolta', 'Samuel L. Jackson', 'Uma Th...","['transporter', 'brothel', 'drug dealer', 'box...",54,87,Quentin Tarantino,8.187793


## Content Based Filtering 

In [9]:
# Derive two cleaned views of every overview with ut.clean_text:
# `overview_toke` is produced with toke=True, `overview_clean` with the defaults.
df['overview_toke'] = df['overview'].apply(lambda text: ut.clean_text(text, toke=True))
df['overview_clean'] = df['overview'].apply(ut.clean_text)

In [10]:
# Preview the table to inspect the new overview_toke / overview_clean columns.
df.head(3)

Unnamed: 0,title,overview,genres,vote_average,budget,runtime,adult,vote_count,movieId,imdbId,tmdbId,cast,keywords,cast_size,crew_size,director,score,overview_toke,overview_clean
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,18,8.5,25000000,142.0,0,8358.0,318,111161,278,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton']","['prison', 'corruption', 'police brutality', '...",42,90,Frank Darabont,8.372739,"[frame, 1940, doubl, murder, hi, wife, lover, ...",frame 1940 doubl murder hi wife lover upstand ...
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",18,8.5,6000000,175.0,0,6024.0,858,68646,238,"['Marlon Brando', 'Al Pacino', 'James Caan']","['italy', 'love at first sight', 'loss of fath...",58,42,Francis Ford Coppola,8.327148,"[span, year, 1945, 1955, chronicl, fiction, it...",span year 1945 1955 chronicl fiction italian-a...
2,The Dark Knight,Batman raises the stakes in his war on crime. ...,18,8.3,185000000,152.0,0,12269.0,58559,468569,155,"['Christian Bale', 'Michael Caine', 'Heath Led...","['dc comics', 'crime fighter', 'secret identit...",134,81,Christopher Nolan,8.219441,"[batman, rais, stake, hi, war, crime, help, lt...",batman rais stake hi war crime help lt jim gor...


## TF-IDF

In [11]:
# Similarity matrix computed by ut.TF_IDF from the cleaned overview strings.
# NOTE(review): presumably `ngram = 5` is the largest n-gram size used by the vectoriser — confirm in Utils.TF_IDF.
tfidf_cosine_sim = ut.TF_IDF(df['overview_clean'], ngram = 5)

In [12]:
# Recommendations for one title using the TF-IDF similarities; the third
# argument is the title -> df.index lookup.
ut.top_rec(
    "The Dark Knight",
    tfidf_cosine_sim,
    pd.Series(data=df.index, index=df['title']),
    df,
)

104                       The Dark Knight Rises
611                                      Batman
2304                  In Order of Disappearance
1513                             Batman Returns
580                                         JFK
564                  Batman: Under the Red Hood
1514                           Batman: Year One
421     Batman: The Dark Knight Returns, Part 2
146                               Batman Begins
525          Sherlock Holmes: A Game of Shadows
Name: title, dtype: object

## Word2Vec

In [13]:
# Similarity matrix computed by ut.Word2Vec_Hybrid from the tokenised overviews.
# NOTE(review): vector_size / window / epochs are presumably forwarded to the
# underlying Word2Vec training — confirm in Utils.
word2vec_cosine_sim = ut.Word2Vec_Hybrid(
                                      df['overview_toke'], 
                                      vector_size = 300,
                                      window = 7, 
                                      epochs = 100)

100%|██████████| 17423/17423 [00:11<00:00, 1468.02it/s] 


In [14]:
# Recommendations for the same title, now using the Word2Vec similarities.
ut.top_rec(
    "The Dark Knight",
    word2vec_cosine_sim,
    pd.Series(data=df.index, index=df['title']),
    df,
)

104                         The Dark Knight Rises
1513                               Batman Returns
564                    Batman: Under the Red Hood
611                                        Batman
1514                             Batman: Year One
146                                 Batman Begins
3452                 Batman & Mr. Freeze: SubZero
3627    The Batman Superman Movie: World's Finest
421       Batman: The Dark Knight Returns, Part 2
1239                 Batman: Mask of the Phantasm
Name: title, dtype: object

In [15]:
# Exact repeat of the earlier Word2Vec_Hybrid call with identical arguments; it
# overwrites word2vec_cosine_sim. The recommendation lists printed after the two
# runs are not identical, so the result is evidently not deterministic.
# NOTE(review): fix a seed (if Utils allows it) or drop this duplicate cell.
word2vec_cosine_sim = ut.Word2Vec_Hybrid(
                                      df['overview_toke'], 
                                      vector_size = 300,
                                      window = 7, 
                                      epochs = 100)

100%|██████████| 17423/17423 [00:11<00:00, 1494.99it/s] 


In [16]:
# Recommendations from the re-trained Word2Vec similarities.
ut.top_rec(
    "The Dark Knight",
    word2vec_cosine_sim,
    pd.Series(data=df.index, index=df['title']),
    df,
)

104                         The Dark Knight Rises
1513                               Batman Returns
611                                        Batman
564                    Batman: Under the Red Hood
3627    The Batman Superman Movie: World's Finest
1514                             Batman: Year One
146                                 Batman Begins
3452                 Batman & Mr. Freeze: SubZero
2057         Justice League: Crisis on Two Earths
3011              Superman/Batman: Public Enemies
Name: title, dtype: object

### Word embedding

In [17]:
# Similarity matrix computed by ut.Doc2Word_embed, which receives both the
# cleaned overview strings and the token lists.
# NOTE(review): how the two inputs are combined is defined in Utils — confirm there.
doc2vec_cosine_sim = ut.Doc2Word_embed(df['overview_clean'], 
                                    df['overview_toke'], 
                                    vector_size = 300,
                                    window = 15,
                                    epochs = 100)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [18]:
# Recommendations for the same title using the Doc2Word_embed similarities.
ut.top_rec(
    "The Dark Knight",
    doc2vec_cosine_sim,
    pd.Series(data=df.index, index=df['title']),
    df,
)

104                         The Dark Knight Rises
611                                        Batman
2057         Justice League: Crisis on Two Earths
564                    Batman: Under the Red Hood
146                                 Batman Begins
3011              Superman/Batman: Public Enemies
1513                               Batman Returns
3627    The Batman Superman Movie: World's Finest
3870                          Running Out of Time
1511           Batman Beyond: Return of the Joker
Name: title, dtype: object

## Hybrid Content based model

We want to combine the NLP models (i.e. TF_IDF, Word2Vec_Hybrid and Doc2Word_embed).


In [51]:
# Assuming the cosine similarities of the three NLP models are on comparable
# scales, combine them with an unweighted element-wise mean.
avg_nlp_sim = (doc2vec_cosine_sim + word2vec_cosine_sim + tfidf_cosine_sim) / 3

# Recommendations from the averaged similarities.
ut.top_rec(
    "The Dark Knight",
    avg_nlp_sim,
    pd.Series(data=df.index, index=df['title']),
    df,
)

104                         The Dark Knight Rises
611                                        Batman
564                    Batman: Under the Red Hood
1513                               Batman Returns
146                                 Batman Begins
2057         Justice League: Crisis on Two Earths
3627    The Batman Superman Movie: World's Finest
3011              Superman/Batman: Public Enemies
1514                             Batman: Year One
1511           Batman Beyond: Return of the Joker
Name: title, dtype: object

## Collaborative Filtering

### Item based

This is collaborative filtering, although we don't actually map users here. We just find the cosine similarity between movie rating vectors.

For example, if a lot of the people who rate MasterChef highly also rate Bake Off highly, these two shows will have a high similarity score.

In [60]:
# Build the item x user rating matrix used for collaborative filtering.
# Each movie is keyed by its df.index value (stored as `index1`) so that the rows
# of the matrix can be mapped back to df.
df['index1'] = df.index
new = pd.merge(
    rating,
    df[['index1', 'movieId']],  # only the join key and the row key are needed
    how='inner',
    on='movieId',
)

new = new[['userId', 'index1', 'rating']]
# One row per movie, one column per user. If a user rated a movie more than once
# the highest rating is kept. The string 'max' selects pandas' built-in
# aggregation; passing the np.max callable raises a FutureWarning in recent pandas.
# fill_value=0 stores 0 for every (movie, user) pair that has no rating.
x = pd.pivot_table(
    new,
    values='rating',
    index=['index1'],
    columns=['userId'],
    aggfunc='max',
    fill_value=0,
)
new.head(2)

Unnamed: 0,userId,index1,rating
0,12,90,4.0
1,400,90,3.0


In [61]:
# Movie-to-movie similarity computed by ut.Rating2Vec from the rows of x (one row per movie).
item_cosine_sim = ut.Rating2Vec(x)

In [62]:
# Recommendations for the same title using the rating-based item similarities.
ut.top_rec(
    "The Dark Knight",
    item_cosine_sim,
    pd.Series(data=df.index, index=df['title']),
    df,
)

12                                         Inception
176                                         Iron Man
146                                    Batman Begins
104                            The Dark Knight Rises
70                                            WALL·E
17     The Lord of the Rings: The Return of the King
3                                         Fight Club
46                              Inglourious Basterds
37                                      The Prestige
59                                      The Departed
Name: title, dtype: object

In [65]:
# Sanity check: expected to be square, with one row/column per movie in df.
item_cosine_sim.shape

(4575, 4575)

### User based

In [63]:
# Same computation on the transposed matrix, so that each row passed to
# ut.Rating2Vec is a user instead of a movie.
user_cosine_sim = ut.Rating2Vec(x.T)

In [64]:
# Sanity check: expected to be square, with one row/column per user column of x.
user_cosine_sim.shape

(9935, 9935)

In [None]:
def top_rec(title, sim, indices=None, movies=None, n=10):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the movie to find recommendations for.
    sim : 2-D indexable (e.g. a numpy array)
        Similarity matrix; ``sim[i]`` holds the similarity of movie ``i`` to
        every movie, in the row order of ``movies``.
    indices : pandas.Series, optional
        Lookup from title to row index. Built from ``movies`` at call time when
        omitted, so it always reflects the current table (the previous default
        was evaluated once, when the function was defined).
    movies : pandas.DataFrame, optional
        Item table with a ``title`` column. Defaults to the notebook-level
        ``df``. Accepting it as the fourth argument matches the way
        ``ut.top_rec`` is called elsewhere in this notebook.
    n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if movies is None:
        movies = df  # notebook-level item table, resolved when the function is called
    if indices is None:
        indices = pd.Series(movies.index, index=movies['title'])

    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several movies share this title; fall back to the first match.
        idx = idx.iloc[0]

    # Exclude the query movie explicitly rather than assuming it sorts first:
    # another movie with an equal score could otherwise push it into the results.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    movie_indices = [pos for pos, _ in ranked[:n]]
    return movies['title'].iloc[movie_indices]

In [None]:
# Condensed one-line form of the ranking: enumerate one row of the similarity matrix,
# sort by score descending, skip the first entry and keep the positions of the next 10.
# NOTE(review): similarity_matrix, indices and title_of_movie are not defined in any cell
# shown in this notebook, so this cell raises NameError unless they are defined elsewhere.
movie_indices = [i[0] for i in sorted(list(enumerate(similarity_matrix[indices[title_of_movie]])), key=lambda x: x[1], reverse=True)[1:11]]

to do:
- Evaluation method
- Finish simple memory based collab
- do a model based one

Look into how a neural network could combine everything.

Hybrid

Factorisation machines?
