## Collobrative Filtering

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
pd.set_option('display.max_rows', 500)

In [2]:
def standardize(row):
    new_row = (row - row.mean()) / (row.max() - row.min())
    return new_row

In [3]:
movies = pd.read_csv("ml-latest-small/movies.csv")
ratings = pd.read_csv("ml-latest-small/ratings.csv")
df = movies.merge(ratings, left_on="movieId", right_on="movieId")
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [4]:
df.isna().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [5]:
user_ratings = df.pivot_table(index=['userId'], columns=['title'], values='rating')
user_ratings

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [6]:
## Drop movies with less than 10 user ratings
user_ratings = user_ratings.dropna(thresh=15, axis=1).fillna(0)
user_ratings

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Young Guns (1988),Zack and Miri Make a Porno (2008),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.5,3.5,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Building the similarity matrix
sim_matrix = user_ratings.corr(method="pearson")
sim_matrix.head(100)


title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Young Guns (1988),Zack and Miri Make a Porno (2008),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.063117,0.143482,0.011998,0.087931,0.224052,0.034223,0.009277,0.008331,0.0497,...,0.248535,0.017477,0.134701,0.153158,0.101301,0.049897,0.003233,0.187953,0.062174,0.353194
(500) Days of Summer (2009),0.063117,1.0,0.273989,0.19396,0.148903,0.142141,0.159756,0.135486,0.200135,0.297152,...,0.073476,0.374515,0.068407,0.414585,0.355723,0.252226,0.216007,0.053614,0.241092,0.125905
10 Things I Hate About You (1999),0.143482,0.273989,1.0,0.24467,0.223481,0.211473,0.011784,0.091964,0.043383,0.321071,...,0.152333,0.243118,0.13246,0.091853,0.158637,0.281934,0.050031,0.121029,0.130813,0.110612
"10,000 BC (2008)",0.011998,0.19396,0.24467,1.0,0.234459,0.119132,0.059187,-0.025882,0.089328,0.167098,...,0.065201,0.260261,0.094913,0.184521,0.242299,0.240231,0.094773,0.088045,0.203002,0.083518
101 Dalmatians (1996),0.087931,0.148903,0.223481,0.234459,1.0,0.285112,0.119843,0.072399,0.029967,0.188467,...,0.033582,0.114968,0.096294,0.067134,0.113224,0.184324,0.054024,0.047804,0.156932,0.078734
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.224052,0.142141,0.211473,0.119132,0.285112,1.0,0.134037,0.017264,-0.046277,0.218406,...,0.143006,0.120302,0.049818,0.08365,0.171654,0.27426,0.077594,0.085606,0.24882,0.171118
12 Angry Men (1957),0.034223,0.159756,0.011784,0.059187,0.119843,0.134037,1.0,0.132979,0.058862,-0.027672,...,0.139655,0.104518,0.079905,0.241435,0.144652,0.122107,0.056742,-0.001708,0.074306,0.102744
12 Years a Slave (2013),0.009277,0.135486,0.091964,-0.025882,0.072399,0.017264,0.132979,1.0,0.249931,0.092133,...,-0.009081,0.024045,0.013786,0.190366,0.10415,0.017351,0.063325,0.002528,0.037469,0.004213
127 Hours (2010),0.008331,0.200135,0.043383,0.089328,0.029967,-0.046277,0.058862,0.249931,1.0,0.043314,...,0.035045,0.223135,0.012907,0.364841,0.198926,0.091416,0.225747,0.128638,0.153335,0.002912
13 Going on 30 (2004),0.0497,0.297152,0.321071,0.167098,0.188467,0.218406,-0.027672,0.092133,0.043314,1.0,...,0.072272,0.103892,0.024181,0.134812,0.10823,0.225685,0.069602,0.010007,0.139022,0.021094


In [8]:
def get_similar_movies(movie, rating):
    """
    :movie: movie for which you want similar reccomendation
    :rating: rating of the movie
    """

    row = (sim_matrix[movie]*(rating - 2.5)).sort_values(ascending=False)
    return row

In [24]:
ratings = [("21 Jump Street (2012)", 4), ("22 Jump Street (2014)", 4.5), ("Rush Hour (1998)", 4)]
# ratings = ["21 Jump Street (2012)", "22 Jump Street (2014)", "Rush Hour (1998)"]
# ratings = ['Fast and the Furious, The (2001)', 'Avengers, The (2012)', 'Avengers: Age of Ultron (2015)']

similar_movies = pd.DataFrame()
for movie, rating in ratings:
    similar_movies = pd.concat([similar_movies, get_similar_movies(movie, 5)], axis=1)

similar_movies = similar_movies.T
similar_movies.sum().sort_values(ascending=False)



22 Jump Street (2014)                 4.385343
21 Jump Street (2012)                 4.306370
Rush Hour (1998)                      3.325966
Spy (2015)                            3.180482
Other Guys, The (2010)                2.973965
                                        ...   
Madness of King George, The (1994)   -0.306591
Dead Man Walking (1995)              -0.353290
Piano, The (1993)                    -0.387106
Postman, The (Postino, Il) (1994)    -0.398106
Disclosure (1994)                    -0.405538
Length: 1650, dtype: float64