In [68]:
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficientcy. 
import matplotlib.pyplot as plt
import seaborn as sns

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
imb = pd.read_csv('imdb_data.csv')
movies = pd.read_csv('movies.csv')

In [4]:
train.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739


In [5]:
test.head(2)

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144


In [6]:
imb.head(2)

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game


In [7]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [8]:
##merging data frames
def merge_df(df):
    merge_tr = pd.merge(df,imb[['movieId','director']],how='left',on='movieId')
    merge_tr = pd.merge(merge_tr,movies,on='movieId',how='left')
    return merge_tr


In [9]:
df_train = merge_df(train).drop(['rating','timestamp'],axis=1)

In [10]:
df_train

Unnamed: 0,userId,movieId,director,title,genres
0,5163,57669,Martin McDonagh,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,106343,5,Albert Hackett,Father of the Bride Part II (1995),Comedy
2,146790,5459,Lowell Cunningham,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
3,106362,32296,Marc Lawrence,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime
4,9041,366,Wes Craven,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller
...,...,...,...,...,...
10000033,136395,99114,Quentin Tarantino,Django Unchained (2012),Action|Drama|Western
10000034,140078,553,Kevin Jarre,Tombstone (1993),Action|Drama|Western
10000035,154807,56782,Paul Thomas Anderson,There Will Be Blood (2007),Drama|Western
10000036,85805,327,Alan Martin,Tank Girl (1995),Action|Comedy|Sci-Fi


In [11]:
df_test = merge_df(test)

In [12]:
df_test

Unnamed: 0,userId,movieId,director,title,genres
0,1,2011,,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi
1,1,4144,Kar-Wai Wong,In the Mood For Love (Fa yeung nin wa) (2000),Drama|Romance
2,1,5767,,Teddy Bear (Mis) (1981),Comedy|Crime
3,1,6711,Sofia Coppola,Lost in Translation (2003),Comedy|Drama|Romance
4,1,7318,Benedict Fitzgerald,"Passion of the Christ, The (2004)",Drama
...,...,...,...,...,...
5000014,162541,4079,,Amazon Women on the Moon (1987),Comedy|Sci-Fi
5000015,162541,4467,,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy
5000016,162541,4980,Chris Matheson,Bill & Ted's Bogus Journey (1991),Adventure|Comedy|Fantasy|Sci-Fi
5000017,162541,5689,E.L. Doctorow,Billy Bathgate (1991),Crime|Drama


In [13]:
df_test.director.isnull().sum()

1484532

In [14]:
df_train.director.isnull().sum()

2969695

In [15]:
def drop_director(df):
    return df.drop('director',axis=1)


In [16]:
df_test = drop_director(df_test)
df_train = drop_director(df_train)

In [17]:
df_test.head(2)

Unnamed: 0,userId,movieId,title,genres
0,1,2011,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi
1,1,4144,In the Mood For Love (Fa yeung nin wa) (2000),Drama|Romance


In [18]:
df_train.head(2)

Unnamed: 0,userId,movieId,title,genres
0,5163,57669,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,106343,5,Father of the Bride Part II (1995),Comedy


In [19]:
df_all = pd.concat([df_train,df_test])

In [20]:
df_all

Unnamed: 0,userId,movieId,title,genres
0,5163,57669,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,106343,5,Father of the Bride Part II (1995),Comedy
2,146790,5459,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
3,106362,32296,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime
4,9041,366,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller
...,...,...,...,...
5000014,162541,4079,Amazon Women on the Moon (1987),Comedy|Sci-Fi
5000015,162541,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy
5000016,162541,4980,Bill & Ted's Bogus Journey (1991),Adventure|Comedy|Fantasy|Sci-Fi
5000017,162541,5689,Billy Bathgate (1991),Crime|Drama


In [21]:
len(df_test)

5000019

In [22]:
len(df_train)+len(df_test)

15000057

In [23]:
len(df_all)

15000057

In [24]:
df_all.isnull().sum()

userId     0
movieId    0
title      0
genres     0
dtype: int64

In [25]:
df_all

Unnamed: 0,userId,movieId,title,genres
0,5163,57669,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,106343,5,Father of the Bride Part II (1995),Comedy
2,146790,5459,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
3,106362,32296,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime
4,9041,366,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller
...,...,...,...,...
5000014,162541,4079,Amazon Women on the Moon (1987),Comedy|Sci-Fi
5000015,162541,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy
5000016,162541,4980,Bill & Ted's Bogus Journey (1991),Adventure|Comedy|Fantasy|Sci-Fi
5000017,162541,5689,Billy Bathgate (1991),Crime|Drama


In [26]:
titles = df_all.title.unique()

In [27]:
titles = pd.Series(titles)

In [28]:
titles

0                                         In Bruges (2008)
1                       Father of the Bride Part II (1995)
2        Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...
3           Miss Congeniality 2: Armed and Fabulous (2005)
4        Wes Craven's New Nightmare (Nightmare on Elm S...
                               ...                        
53178                                      Don Juan (1998)
53179                                   Vinterviken (1996)
53180                         Young, Single & Angry (2006)
53181                           Il pesce innamorato (1999)
53182                     The Spy Who Fell to Earth (2019)
Length: 53183, dtype: object

In [29]:
df_all['genres']=df_all['genres'].apply(lambda x:' '.join(x.split('|')))

In [30]:
df = df_all[['title','genres']]

In [31]:
df = df.drop_duplicates(keep='first')

In [32]:
df

Unnamed: 0,title,genres
0,In Bruges (2008),Comedy Crime Drama Thriller
1,Father of the Bride Part II (1995),Comedy
2,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action Comedy Sci-Fi
3,Miss Congeniality 2: Armed and Fabulous (2005),Adventure Comedy Crime
4,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama Horror Mystery Thriller
...,...,...
4991539,Don Juan (1998),Comedy Romance
4993240,Vinterviken (1996),Crime Drama
4993241,"Young, Single & Angry (2006)",Comedy Romance
4994083,Il pesce innamorato (1999),(no genres listed)


In [33]:
a=titles.values

In [34]:
a

array(['In Bruges (2008)', 'Father of the Bride Part II (1995)',
       'Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002)', ...,
       'Young, Single & Angry (2006)', 'Il pesce innamorato (1999)',
       'The Spy Who Fell to Earth (2019)'], dtype=object)

In [35]:
ls =[]
for i in a:
    ls.append(df[df['title']==i]['genres'].values[0])

In [36]:
df[df['title']=='In Bruges (2008)']['genres'].values[0]

'Comedy Crime Drama Thriller'

In [37]:
len(ls)

53183

In [38]:
len(titles)

53183

In [39]:
df = pd.DataFrame({'title':a,'genre':ls})

In [40]:
df

Unnamed: 0,title,genre
0,In Bruges (2008),Comedy Crime Drama Thriller
1,Father of the Bride Part II (1995),Comedy
2,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action Comedy Sci-Fi
3,Miss Congeniality 2: Armed and Fabulous (2005),Adventure Comedy Crime
4,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama Horror Mystery Thriller
...,...,...
53178,Don Juan (1998),Comedy Romance
53179,Vinterviken (1996),Crime Drama
53180,"Young, Single & Angry (2006)",Comedy Romance
53181,Il pesce innamorato (1999),(no genres listed)


In [41]:
ind_titles=pd.Series(df.index,index=titles)

In [42]:
ind_titles

In Bruges (2008)                                                                              0
Father of the Bride Part II (1995)                                                            1
Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002)                                           2
Miss Congeniality 2: Armed and Fabulous (2005)                                                3
Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)        4
                                                                                          ...  
Don Juan (1998)                                                                           53178
Vinterviken (1996)                                                                        53179
Young, Single & Angry (2006)                                                              53180
Il pesce innamorato (1999)                                                                53181
The Spy Who Fell to Earth (2019)        

In [43]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2),
                     min_df=0, stop_words='english')

In [44]:
tf_gen_matrix = tf.fit_transform(df['genre'])

In [45]:
import pickle

In [46]:
cosine_sim_authTags = cosine_similarity(tf_gen_matrix, 
                                        tf_gen_matrix)

In [None]:
with open('matrix','wb') as f:
    pickle.dump(cosine_sim_authTags,f)

In [None]:
with open('matrix','rb') as f:
    cosine_sim_authTags=pickle.load(f)

In [47]:
cosine_sim_authTags[:5]

array([[1.        , 0.24198093, 0.04985315, ..., 0.09315748, 0.        ,
        0.        ],
       [0.24198093, 1.        , 0.20602099, ..., 0.3849786 , 0.        ,
        0.        ],
       [0.04985315, 0.20602099, 1.        , ..., 0.07931367, 0.        ,
        0.        ],
       [0.48119564, 0.24542135, 0.05056195, ..., 0.09448197, 0.        ,
        0.        ],
       [0.11742754, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [48]:
cosine_sim_authTags.shape

(53183, 53183)

In [50]:
def content_generate_top_N_recommendations(movie_title, N=10):
    m_idx = ind_titles[movie_title]
    sim_scores = list(enumerate(cosine_sim_authTags[m_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:N] 
    movie_indices = [i[0] for i in sim_scores] 
    return titles.iloc[movie_indices]

In [51]:
content_generate_top_N_recommendations("In Bruges (2008)", N=10)

81                                    Pulp Fiction (1994)
140                                        Freeway (1996)
276                                          Fargo (1996)
722     Man Bites Dog (C'est arrivé près de chez vous)...
2967                               Informant!, The (2009)
3600                               Leaves of Grass (2009)
3835                           Beautiful Creatures (2000)
8964                                 Party Monster (2003)
9174                                     Nobel Son (2007)
dtype: object

In [52]:
content_generate_top_N_recommendations("Father of the Bride Part II (1995)", N=10)

10                      Orange County (2002)
29                          Airplane! (1980)
60          Robin Hood: Men in Tights (1993)
88     Ace Ventura: When Nature Calls (1995)
92                      Cool Runnings (1993)
101              Mon Oncle (My Uncle) (1958)
108                  House Bunny, The (2008)
133                         Liar Liar (1997)
158      Monty Python's Life of Brian (1979)
dtype: object

In [53]:
"Father of the Bride Part II (1995)" in a

True

In [54]:
len(list(set(list(test['userId'].values))))

162350

In [58]:
df_train1 = merge_df(train).drop('timestamp',axis=1)

In [59]:
df_train1

Unnamed: 0,userId,movieId,rating,director,title,genres
0,5163,57669,4.0,Martin McDonagh,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,106343,5,4.5,Albert Hackett,Father of the Bride Part II (1995),Comedy
2,146790,5459,5.0,Lowell Cunningham,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
3,106362,32296,2.0,Marc Lawrence,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime
4,9041,366,3.0,Wes Craven,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller
...,...,...,...,...,...,...
10000033,136395,99114,5.0,Quentin Tarantino,Django Unchained (2012),Action|Drama|Western
10000034,140078,553,3.0,Kevin Jarre,Tombstone (1993),Action|Drama|Western
10000035,154807,56782,4.0,Paul Thomas Anderson,There Will Be Blood (2007),Drama|Western
10000036,85805,327,4.0,Alan Martin,Tank Girl (1995),Action|Comedy|Sci-Fi


In [60]:
df_train1 = drop_director(df_train1)

In [61]:
df_train1

Unnamed: 0,userId,movieId,rating,title,genres
0,5163,57669,4.0,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,106343,5,4.5,Father of the Bride Part II (1995),Comedy
2,146790,5459,5.0,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Action|Comedy|Sci-Fi
3,106362,32296,2.0,Miss Congeniality 2: Armed and Fabulous (2005),Adventure|Comedy|Crime
4,9041,366,3.0,Wes Craven's New Nightmare (Nightmare on Elm S...,Drama|Horror|Mystery|Thriller
...,...,...,...,...,...
10000033,136395,99114,5.0,Django Unchained (2012),Action|Drama|Western
10000034,140078,553,3.0,Tombstone (1993),Action|Drama|Western
10000035,154807,56782,4.0,There Will Be Blood (2007),Drama|Western
10000036,85805,327,4.0,Tank Girl (1995),Action|Comedy|Sci-Fi


In [62]:
df_test

Unnamed: 0,userId,movieId,title,genres
0,1,2011,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi
1,1,4144,In the Mood For Love (Fa yeung nin wa) (2000),Drama|Romance
2,1,5767,Teddy Bear (Mis) (1981),Comedy|Crime
3,1,6711,Lost in Translation (2003),Comedy|Drama|Romance
4,1,7318,"Passion of the Christ, The (2004)",Drama
...,...,...,...,...
5000014,162541,4079,Amazon Women on the Moon (1987),Comedy|Sci-Fi
5000015,162541,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy
5000016,162541,4980,Bill & Ted's Bogus Journey (1991),Adventure|Comedy|Fantasy|Sci-Fi
5000017,162541,5689,Billy Bathgate (1991),Crime|Drama


In [66]:
def content_generate_rating_estimate(book_title, user, rating_data, k=20, threshold=0.0):
    # Convert the book title to a numeric index for our 
    # similarity matrix
    b_idx = ind_titles[book_title]
    neighbors = [] # <-- Stores our collection of similarity values 
     
    # Gather the similarity ratings between each book the user has rated
    # and the reference book 
    for index, row in rating_data[rating_data['userId']==user].iterrows():
        sim = cosine_sim_authTags[b_idx-1, ind_titles[row['title']]-1]
        neighbors.append((sim, row['rating']))
    # Select the top-N values from our collection
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # Compute the weighted average using similarity scoress and 
    # user item ratings. 
    simTotal, weightedSum = 0, 0
    for (simScore, rating) in k_neighbors:
        # Ensure that similarity ratings are above a given threshold
        if (simScore > threshold):
            simTotal += simScore
            weightedSum += simScore * rating
    try:
        predictedRating = weightedSum / simTotal
    except ZeroDivisionError:
        # Cold-start problem - No ratings given by user. 
        # We use the average rating for the reference item as a proxy in this case 
        predictedRating = np.mean(rating_data[rating_data['title']==book_title]['rating'])
    return predictedRating

In [69]:
t='Back to the Future Part II (1989)'
content_generate_rating_estimate(book_title=t,user=5163,rating_data=df_train1)

3.941152519747739

In [77]:
t='In Bruges (2008)'
content_generate_rating_estimate(book_title=t,user=1,rating_data=df_train1)

4.25347534868987

In [75]:
user_title_pair = []
for i,r in df_test.iterrows():
    user_title_pair.append((r['title'],r['userId']))

In [73]:
user_title_pair

[('Back to the Future Part II (1989)', 1)]

In [82]:
user_title_pair[:5]

[('Back to the Future Part II (1989)', 1),
 ('In the Mood For Love (Fa yeung nin wa) (2000)', 1),
 ('Teddy Bear (Mis) (1981)', 1),
 ('Lost in Translation (2003)', 1),
 ('Passion of the Christ, The (2004)', 1)]

In [76]:
len(user_title_pair)

5000019

In [None]:
predictions_ratings=[]
for i in user_title_pair:
    predictions_ratings.append(content_generate_rating_estimate(i[0],i[1],rating_data=df_train1))

In [84]:
predictions_ratings

[3.941152519747739]

In [None]:
sample_subm = pd.read_csv('sample_submission.csv')

In [None]:
sample_subm['rating']=np.round(np.array(predictions_ratings),1)

In [None]:
sample_subm.head()

In [None]:
sample_subm.to_csv('submission_similarity.csv',index=False)