In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [51]:
def remove_year(val):
    """Return the part of a title that precedes its first "(".

    Titles in this notebook look like ``"Toy Story (1995)"``; everything from
    the first opening parenthesis onward is dropped, together with any
    whitespace just before it.

    Args:
        val: Title string, optionally followed by a parenthesised suffix.

    Returns:
        The text before the first "(" with trailing whitespace removed, or
        ``val`` unchanged when it contains no "(".
    """
    paren_at = val.find("(")
    if paren_at == -1:
        # str.find returns -1 when there is no "(", and slicing with that
        # value would silently chop characters off the end of the title.
        return val
    # rstrip instead of a fixed "-1" offset: correct whether zero, one or
    # several spaces separate the title from the parenthesis.
    return val[:paren_at].rstrip()


def splitting_gen(val):
    """Convert a "|"-delimited genre string into space-separated words.

    The result always starts with a single space so it can be appended
    directly to another piece of text.
    """
    genre_words = val.split("|")
    # An empty first element makes the join emit the leading space.
    return " ".join(["", *genre_words])


In [73]:
# Load the movie catalogue from its gzip-compressed CSV and preview it.
movies_df = pd.read_csv(
    "movies_df.gzip",
    compression="gzip",
)
movies_df

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [74]:
# Load the user ratings from their gzip-compressed CSV and preview them.
rating_df = pd.read_csv(
    "rating_df.gzip",
    compression="gzip",
)
rating_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [53]:
# Text used for similarity: the title without its parenthesised suffix,
# followed by the genres as space-separated words.
movies_df["Search_Criteria"] = (
    movies_df["Title"].apply(remove_year)
    + movies_df["Genres"].apply(splitting_gen)
)

In [54]:
# Vectorise the combined title/genre text, dropping English stop words.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(raw_documents=movies_df["Search_Criteria"])

# Pairwise dot products between all movie vectors.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised (believed to be the TfidfVectorizer default) - confirm.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)


In [55]:
# Wrap the similarity matrix in a DataFrame whose columns are labelled by title.
cos_similarty_df = pd.DataFrame(
    data=cosine_sim,
    columns=movies_df["Title"],
)

In [56]:
# Label the rows by movie title as well, matching the column labels.
cos_similarty_df.index = movies_df["Title"]

In [57]:
# Preview the title-by-title similarity matrix.
cos_similarty_df

Title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000000,0.102821,0.030133,0.034019,0.034027,0.000000,0.044319,0.092887,0.0,0.000000,...,0.045126,0.044135,0.232825,0.000000,0.000000,0.035022,0.000000,0.000000,0.000000,0.000000
Jumanji (1995),0.102821,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.191842,0.0,0.113210,...,0.000000,0.000000,0.188368,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Grumpier Old Men (1995),0.030133,0.000000,1.000000,0.030418,0.030426,0.000000,0.120626,0.000000,0.0,0.000000,...,0.040350,0.039464,0.000000,0.000000,0.000000,0.031315,0.000000,0.000000,0.000000,0.000000
Waiting to Exhale (1995),0.034019,0.000000,0.030418,1.000000,0.034350,0.000000,0.044739,0.000000,0.0,0.000000,...,0.045553,0.078034,0.000000,0.032112,0.000000,0.035354,0.027940,0.034493,0.032076,0.032565
Father of the Bride Part II (1995),0.034027,0.000000,0.030426,0.034350,1.000000,0.000000,0.044750,0.000000,0.0,0.000000,...,0.045564,0.044564,0.000000,0.000000,0.000000,0.035362,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Meet the Parents (2000),0.035022,0.000000,0.031315,0.035354,0.035362,0.000000,0.046058,0.000000,0.0,0.000000,...,0.046896,0.045867,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
Requiem for a Dream (2000),0.000000,0.000000,0.000000,0.027940,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.036248,0.000000,0.034766,0.000000,0.000000,1.000000,0.037344,0.034727,0.035257
Tigerland (2000),0.000000,0.000000,0.000000,0.034493,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.044750,0.000000,0.042920,0.000000,0.000000,0.037344,1.000000,0.042871,0.043526
Two Family House (2000),0.000000,0.000000,0.000000,0.032076,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.041614,0.000000,0.039912,0.000000,0.000000,0.034727,0.042871,1.000000,0.040475


In [58]:
# Display-only view rounded to three decimals; the result is not stored.
cos_similarty_df.round(3)

Title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000,0.103,0.030,0.034,0.034,0.000,0.044,0.093,0.0,0.000,...,0.045,0.044,0.233,0.000,0.000,0.035,0.000,0.000,0.000,0.000
Jumanji (1995),0.103,1.000,0.000,0.000,0.000,0.000,0.000,0.192,0.0,0.113,...,0.000,0.000,0.188,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Grumpier Old Men (1995),0.030,0.000,1.000,0.030,0.030,0.000,0.121,0.000,0.0,0.000,...,0.040,0.039,0.000,0.000,0.000,0.031,0.000,0.000,0.000,0.000
Waiting to Exhale (1995),0.034,0.000,0.030,1.000,0.034,0.000,0.045,0.000,0.0,0.000,...,0.046,0.078,0.000,0.032,0.000,0.035,0.028,0.034,0.032,0.033
Father of the Bride Part II (1995),0.034,0.000,0.030,0.034,1.000,0.000,0.045,0.000,0.0,0.000,...,0.046,0.045,0.000,0.000,0.000,0.035,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Meet the Parents (2000),0.035,0.000,0.031,0.035,0.035,0.000,0.046,0.000,0.0,0.000,...,0.047,0.046,0.000,0.000,0.000,1.000,0.000,0.000,0.000,0.000
Requiem for a Dream (2000),0.000,0.000,0.000,0.028,0.000,0.000,0.000,0.000,0.0,0.000,...,0.000,0.036,0.000,0.035,0.000,0.000,1.000,0.037,0.035,0.035
Tigerland (2000),0.000,0.000,0.000,0.034,0.000,0.000,0.000,0.000,0.0,0.000,...,0.000,0.045,0.000,0.043,0.000,0.000,0.037,1.000,0.043,0.044
Two Family House (2000),0.000,0.000,0.000,0.032,0.000,0.000,0.000,0.000,0.0,0.000,...,0.000,0.042,0.000,0.040,0.000,0.000,0.035,0.043,1.000,0.040


In [78]:
# Persist the similarity matrix (title index included) as a gzip-compressed CSV.
cos_similarty_df.to_csv(
    "user_review_cols.gzip",
    compression={"method": "gzip"},
)

In [37]:
# Reload the saved similarity matrix, using the "Title" column as the row index.
df_1 = pd.read_csv(
    "user_review_cols.gzip",
    compression="gzip",
    index_col="Title",
)

In [46]:
# Zero each movie's similarity to itself so a title never recommends itself.
# The earlier `round(x) == 1` test also wiped every score above 0.5, i.e. the
# strongest matches, so only the diagonal is cleared here.
# Assumes df_1 is the square similarity matrix loaded from
# 'user_review_cols.gzip', with rows and columns in the same title order.
_sim_values = df_1.to_numpy(dtype=float, copy=True)
np.fill_diagonal(_sim_values, 0.0)
# Rebuild the frame with the original labels; re-running this cell is a no-op.
df_1 = pd.DataFrame(_sim_values, index=df_1.index, columns=df_1.columns)

In [49]:
# Rank every title by its similarity to the requested movie, best match first.
requested_movie = "Meet the Parents (2000)"
df_1[requested_movie].sort_values(ascending=False)

Title
Meet the Deedles (1998)          0.412928
Meet Joe Black (1998)            0.368518
Meet Wally Sparks (1997)         0.346875
Meet John Doe (1941)             0.324201
Meet Me in St. Louis (1944)      0.314482
                                   ...   
Tetsuo II: Body Hammer (1992)    0.000000
Contempt (Le Mépris) (1963)      0.000000
Face/Off (1997)                  0.000000
Fall (1997)                      0.000000
Contender, The (2000)            0.000000
Name: Meet the Parents (2000), Length: 3883, dtype: float64

In [77]:
# Re-read the saved similarity file to inspect what was written to disk.
pd.read_csv(
    "user_review_cols.gzip",
    compression="gzip",
    index_col="Title",
)

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000000,0.102821,0.030133,0.034019,0.034027,0.000000,0.044319,0.092887,0.0,0.000000,...,0.045126,0.044135,0.232825,0.000000,0.000000,0.035022,0.000000,0.000000,0.000000,0.000000
Jumanji (1995),0.102821,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.191842,0.0,0.113210,...,0.000000,0.000000,0.188368,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Grumpier Old Men (1995),0.030133,0.000000,1.000000,0.030418,0.030426,0.000000,0.120626,0.000000,0.0,0.000000,...,0.040350,0.039464,0.000000,0.000000,0.000000,0.031315,0.000000,0.000000,0.000000,0.000000
Waiting to Exhale (1995),0.034019,0.000000,0.030418,1.000000,0.034350,0.000000,0.044739,0.000000,0.0,0.000000,...,0.045553,0.078034,0.000000,0.032112,0.000000,0.035354,0.027940,0.034493,0.032076,0.032565
Father of the Bride Part II (1995),0.034027,0.000000,0.030426,0.034350,1.000000,0.000000,0.044750,0.000000,0.0,0.000000,...,0.045564,0.044564,0.000000,0.000000,0.000000,0.035362,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Meet the Parents (2000),0.035022,0.000000,0.031315,0.035354,0.035362,0.000000,0.046058,0.000000,0.0,0.000000,...,0.046896,0.045867,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
Requiem for a Dream (2000),0.000000,0.000000,0.000000,0.027940,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.036248,0.000000,0.034766,0.000000,0.000000,1.000000,0.037344,0.034727,0.035257
Tigerland (2000),0.000000,0.000000,0.000000,0.034493,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.044750,0.000000,0.042920,0.000000,0.000000,0.037344,1.000000,0.042871,0.043526
Two Family House (2000),0.000000,0.000000,0.000000,0.032076,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.041614,0.000000,0.039912,0.000000,0.000000,0.034727,0.042871,1.000000,0.040475


In [76]:
# Show the in-memory similarity matrix again, for comparison with the file re-read from disk.
cos_similarty_df

Title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000000,0.102821,0.030133,0.034019,0.034027,0.000000,0.044319,0.092887,0.0,0.000000,...,0.045126,0.044135,0.232825,0.000000,0.000000,0.035022,0.000000,0.000000,0.000000,0.000000
Jumanji (1995),0.102821,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.191842,0.0,0.113210,...,0.000000,0.000000,0.188368,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Grumpier Old Men (1995),0.030133,0.000000,1.000000,0.030418,0.030426,0.000000,0.120626,0.000000,0.0,0.000000,...,0.040350,0.039464,0.000000,0.000000,0.000000,0.031315,0.000000,0.000000,0.000000,0.000000
Waiting to Exhale (1995),0.034019,0.000000,0.030418,1.000000,0.034350,0.000000,0.044739,0.000000,0.0,0.000000,...,0.045553,0.078034,0.000000,0.032112,0.000000,0.035354,0.027940,0.034493,0.032076,0.032565
Father of the Bride Part II (1995),0.034027,0.000000,0.030426,0.034350,1.000000,0.000000,0.044750,0.000000,0.0,0.000000,...,0.045564,0.044564,0.000000,0.000000,0.000000,0.035362,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Meet the Parents (2000),0.035022,0.000000,0.031315,0.035354,0.035362,0.000000,0.046058,0.000000,0.0,0.000000,...,0.046896,0.045867,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
Requiem for a Dream (2000),0.000000,0.000000,0.000000,0.027940,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.036248,0.000000,0.034766,0.000000,0.000000,1.000000,0.037344,0.034727,0.035257
Tigerland (2000),0.000000,0.000000,0.000000,0.034493,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.044750,0.000000,0.042920,0.000000,0.000000,0.037344,1.000000,0.042871,0.043526
Two Family House (2000),0.000000,0.000000,0.000000,0.032076,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.041614,0.000000,0.039912,0.000000,0.000000,0.034727,0.042871,1.000000,0.040475


In [81]:
# Load a similarity matrix from 'cosine_similarity.gzip'.
# NOTE(review): no visible cell writes this file, so it presumably comes from
# another run or notebook - confirm its provenance before relying on it.
df_kwds = pd.read_csv(
    "cosine_similarity.gzip",
    compression="gzip",
)

In [82]:
# Preview the frame loaded from 'cosine_similarity.gzip'.
df_kwds

Unnamed: 0.1,Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
0,0,0.000000,0.102821,0.030133,0.034019,0.034027,0.000000,0.044319,0.092887,0.0,...,0.045126,0.044135,0.232825,0.000000,0.000000,0.035022,0.000000,0.000000,0.000000,0.000000
1,1,0.102821,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.191842,0.0,...,0.000000,0.000000,0.188368,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2,0.030133,0.000000,0.000000,0.030418,0.030426,0.000000,0.120626,0.000000,0.0,...,0.040350,0.039464,0.000000,0.000000,0.000000,0.031315,0.000000,0.000000,0.000000,0.000000
3,3,0.034019,0.000000,0.030418,0.000000,0.034350,0.000000,0.044739,0.000000,0.0,...,0.045553,0.078034,0.000000,0.032112,0.000000,0.035354,0.027940,0.034493,0.032076,0.032565
4,4,0.034027,0.000000,0.030426,0.034350,0.000000,0.000000,0.044750,0.000000,0.0,...,0.045564,0.044564,0.000000,0.000000,0.000000,0.035362,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3878,0.035022,0.000000,0.031315,0.035354,0.035362,0.000000,0.046058,0.000000,0.0,...,0.046896,0.045867,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3879,3879,0.000000,0.000000,0.000000,0.027940,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.036248,0.000000,0.034766,0.000000,0.000000,0.000000,0.037344,0.034727,0.035257
3880,3880,0.000000,0.000000,0.000000,0.034493,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.044750,0.000000,0.042920,0.000000,0.000000,0.037344,0.000000,0.042871,0.043526
3881,3881,0.000000,0.000000,0.000000,0.032076,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.041614,0.000000,0.039912,0.000000,0.000000,0.034727,0.042871,0.000000,0.040475
