In [60]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

### Prepare Dataset

In [61]:
#Not using link.csv -> links to imdbId
movies = pd.read_csv("movies.csv")
rating = pd.read_csv("ratings.csv")
tag = pd.read_csv("tags.csv")

#Combine movie and ratings
movie_rate = movies.merge(rating, on = "movieId", how = "left")
#Rename timestamp
movie_rate = movie_rate.rename(columns = {"timestamp":"rating_timestamp"})

#Combine movie, ratings and tags
movie_rate_tag = movie_rate.merge(tag, on = ["userId", "movieId"], how = "left")
#Rename tag 
data = movie_rate_tag.rename(columns = {"timestamp":"tag_timestamp"})


In [62]:
data[0:2]

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0,,
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847434962.0,,


### Pre-processing

In [63]:
#NaN in rating_timestamp
data["rating_timestamp"] = data["rating_timestamp"].fillna(0)

#NaN in tag
data["tag"] = data["tag"].fillna("")

#NaN in tag_timestamp
data["tag_timestamp"] = data["tag_timestamp"].fillna(0)

In [64]:
print(data.shape)
data[0:2]

(102695, 8)


Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0,,0.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847434962.0,,0.0


In [65]:
#Show userId NA
data[data['userId'].isna() == True]

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
23232,1076,"Innocents, The (1961)",Drama|Horror|Thriller,,,0.0,,0.0
50184,2939,Niagara (1953),Drama|Thriller,,,0.0,,0.0
54298,3338,For All Mankind (1989),Documentary,,,0.0,,0.0
55216,3456,"Color of Paradise, The (Rang-e khoda) (1999)",Drama,,,0.0,,0.0
61360,4194,I Know Where I'm Going! (1945),Drama|Romance|War,,,0.0,,0.0
69332,5721,"Chosen, The (1981)",Drama,,,0.0,,0.0
72884,6668,"Road Home, The (Wo de fu qin mu qin) (1999)",Drama|Romance,,,0.0,,0.0
73426,6849,Scrooge (1970),Drama|Fantasy|Musical,,,0.0,,0.0
74363,7020,Proof (1991),Comedy|Drama|Romance,,,0.0,,0.0
76528,7792,"Parallax View, The (1974)",Thriller,,,0.0,,0.0


In [66]:
#Drop NA in userId
data.dropna(subset = ["userId"], inplace = True)
data.shape

(102677, 8)

In [67]:
#Change genre | to ,
data['genres'] = data['genres'].str.replace('|', ', ')
data[0:2]

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy",1.0,4.0,964982703.0,,0.0
1,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy",5.0,4.0,847434962.0,,0.0


### Statistics of Rows

In [68]:
#Current Dataset Stats
print('Number of Movies:', len(data['movieId'].unique()))
print('Number of Unique Genres:', len(data['genres'].unique()))
print('Number of Users:', len(data['userId'].unique()))
print('Number of Unique Tags:', len(data['tag'].unique()))

Number of Movies: 9724
Number of Unique Genres: 951
Number of Users: 610
Number of Unique Tags: 1544


In [69]:
print(data.dtypes)

movieId               int64
title                object
genres               object
userId              float64
rating              float64
rating_timestamp    float64
tag                  object
tag_timestamp       float64
dtype: object


### Content Based Recommender

In [70]:
sample_data = data.sample(n = 50000, random_state = 42)
sample_data[0:10]

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
75505,7316,Confessions of a Teenage Drama Queen (2004),Comedy,509.0,3.0,1436000000.0,,0.0
76470,7705,Pat and Mike (1952),"Comedy, Romance",474.0,4.0,1137522000.0,sports,1137522000.0
97471,103372,"Heat, The (2013)","Action, Comedy, Crime",111.0,3.5,1516152000.0,,0.0
61768,4239,Blow (2001),"Crime, Drama",246.0,3.5,1355956000.0,,0.0
60541,4025,Miss Congeniality (2000),"Comedy, Crime",590.0,2.5,1258423000.0,,0.0
102581,183197,Dave Chappelle: Equanimity (2017),Comedy,111.0,3.5,1517441000.0,,0.0
47522,2706,American Pie (1999),"Comedy, Romance",267.0,5.0,959807200.0,,0.0
30045,1297,Real Genius (1985),Comedy,555.0,3.0,978822200.0,,0.0
60817,4047,Gettysburg (1993),"Drama, War",28.0,3.5,1242031000.0,,0.0
60568,4027,"O Brother, Where Art Thou? (2000)","Adventure, Comedy, Crime",140.0,2.0,1012506000.0,,0.0


In [71]:
#Current Sampled Dataset Stats
print('Number of Movies:', len(sample_data['movieId'].unique()))
print('Number of Unique Genres:', len(sample_data['genres'].unique()))
print('Number of Users:', len(sample_data['userId'].unique()))
print('Number of Unique Tags:', len(sample_data['tag'].unique()))

Number of Movies: 7497
Number of Unique Genres: 865
Number of Users: 610
Number of Unique Tags: 964


### TF-IDF on genres

In [72]:
#td-idf on genres

sample_data["genres"].unique()[0:20]

array(['Comedy', 'Comedy, Romance', 'Action, Comedy, Crime',
       'Crime, Drama', 'Comedy, Crime', 'Drama, War',
       'Adventure, Comedy, Crime', 'Action, Adventure, Sci-Fi, IMAX',
       'Action, Sci-Fi, Thriller', 'Action, Adventure, Sci-Fi',
       'Drama, Musical, Romance', 'Drama, Romance, Sci-Fi',
       'Action, Thriller', 'Adventure, Fantasy', 'Action, Drama, War',
       'Action, Crime, Thriller', 'Comedy, Drama, Romance, Thriller',
       'Comedy, Drama, Sci-Fi, War', 'Comedy, Drama',
       'Action, Comedy, Romance, War'], dtype=object)

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words = 'english')
#changed from tag to genres
tfidf_matrix = tfidf.fit_transform(sample_data['genres'])#.astype(np.float32)
tfidf_matrix.shape

(50000, 23)

In [74]:
tfidf.get_feature_names()

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'fi',
 'film',
 'genres',
 'horror',
 'imax',
 'listed',
 'musical',
 'mystery',
 'noir',
 'romance',
 'sci',
 'thriller',
 'war',
 'western']

In [75]:
#Cosine Similarity 
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

(50000, 50000)

In [76]:
#Extract unique movies' index
get_idx = pd.Series(sample_data.index, index = sample_data['title'])
get_idx = get_idx[~get_idx.index.duplicated(keep = "first")].reset_index()
get_idx = pd.Series(get_idx.index, index = get_idx['title'])
get_idx[0:5]

title
Confessions of a Teenage Drama Queen (2004)    0
Pat and Mike (1952)                            1
Heat, The (2013)                               2
Blow (2001)                                    3
Miss Congeniality (2000)                       4
dtype: int64

### Implementation of Content-Based Filtering

In [77]:
#Implementation of Content-Based Filtering

def extract_recommend(title, cosine_sim = cosine_sim):
    #Get index of wanted movie
    idx = get_idx[title]
    
    #Get only similarities vs wanted movie  
    get_sim = list(enumerate(cosine_sim[idx]))
    
    #Sort by highest similarity 
    get_sim = sorted(get_sim, key = lambda x: x[1], reverse = True)
    
    #Retrieve List of Movie Index
    top_idx = [i[0] for i in get_sim]
        
    #Get Top 10 Unique Titles of List 
    titles = sample_data["title"].iloc[top_idx].unique()[1:11]
    ids = sample_data["movieId"].iloc[top_idx].unique()[1:11]
    
    
    return pd.DataFrame([titles, ids]).transpose()

### Results of Content-Based Filtering

In [78]:
#Example 1

extract_recommend('X-Men Origins: Wolverine (2009)')

Unnamed: 0,0,1
0,Johnny Mnemonic (1995),172
1,Paycheck (2003),7163
2,Blade Runner (1982),541
3,"Matrix, The (1999)",2571
4,Predator 2 (1990),3697
5,Predators (2010),79057
6,Predator (1987),3527
7,"6th Day, The (2000)",3986
8,"Terminator, The (1984)",1240
9,Godzilla (1998),1882


In [79]:
#Example 2

extract_recommend('How to Train Your Dragon (2010)')

Unnamed: 0,0,1
0,"Polar Express, The (2004)",8965
1,Rise of the Guardians (2012),98243
2,Toy Story 3 (2010),78499
3,"Ant Bully, The (2006)",47124
4,Shrek Forever After (a.k.a. Shrek: The Final C...,78637
5,Cloudy with a Chance of Meatballs (2009),71264
6,"Christmas Carol, A (2009)",72294
7,Legend of the Guardians: The Owls of Ga'Hoole ...,80615
8,Happy Feet (2006),49274
9,Cars 2 (2011),87876


### Memory-Based Collaborative Filtering

In [80]:
##Collaborative Filtering

#Choose random id
#rand_id = np.random.choice(sample_data['userId'].values) #76.0
r_id = 76.0

#Info from id
r_info = sample_data[sample_data['userId'] == r_id]

#Keep movieId, title, userId, rating


In [81]:
r_info

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
24596,1148,Wallace & Gromit: The Wrong Trousers (1993),"Animation, Children, Comedy, Crime",76.0,0.5,1439169000.0,,0.0
92847,79132,Inception (2010),"Action, Crime, Drama, Mystery, Sci-Fi, Thrille...",76.0,4.0,1439167000.0,,0.0
92521,77455,Exit Through the Gift Shop (2010),"Comedy, Documentary",76.0,3.5,1439168000.0,,0.0
83205,44195,Thank You for Smoking (2006),"Comedy, Drama",76.0,4.5,1439169000.0,,0.0
42510,2318,Happiness (1998),"Comedy, Drama",76.0,4.5,1439169000.0,,0.0
21372,923,Citizen Kane (1941),"Drama, Mystery",76.0,1.5,1439169000.0,,0.0
88206,58559,"Dark Knight, The (2008)","Action, Crime, Drama, IMAX",76.0,3.0,1439168000.0,,0.0
28104,1247,"Graduate, The (1967)","Comedy, Drama, Romance",76.0,2.0,1439169000.0,,0.0
15815,588,Aladdin (1992),"Adventure, Animation, Children, Comedy, Musical",76.0,0.5,1439168000.0,,0.0
8005,296,Pulp Fiction (1994),"Comedy, Crime, Drama, Thriller",76.0,4.5,1439166000.0,,0.0


In [82]:
userRatings = sample_data.pivot_table(index=['userId'],columns=['title'],values='rating')
userRatings.head()
print(userRatings.shape)

#Fill NA -> 0
userRatings = userRatings.fillna(0, axis = 1)
print(userRatings.shape)

(610, 7494)
(610, 7494)


In [83]:
userRatings.head()

title,'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...And Justice for All (1979),...,Zulu (1964),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
#Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(userRatings)
cos_sim.shape

(610, 610)

In [85]:
#Creation of Matrix of userId and Movies

pred_ratings = np.dot(cos_sim, userRatings) 
total = np.sum(cos_sim, axis = 1) #
total = total.reshape(len(total), 1) #

pred_ratings = pred_ratings / total #

print(pred_ratings.shape)

(610, 7494)


In [86]:
y_n = userRatings.copy()

y_n = pd.DataFrame(np.where(y_n.values > 0.0, 0.0, 1.0), index = userRatings.index)
y_n.columns = userRatings.columns

In [87]:
y_n

title,'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...And Justice for All (1979),...,Zulu (1964),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
607.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
608.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
609.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [88]:
#Calculate with average weight of ratings

get_user_sim = np.multiply(pred_ratings, y_n)
get_user_sim.head()

title,'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...And Justice for All (1979),...,Zulu (1964),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.011175,0.001514,0.005992,0.003977,0.062633,0.007581,0.140514,0.037984,0.000865,0.009375,...,0.027092,0.023106,0.013975,0.008634,0.0,0.09307,0.049899,0.009842,0.0,0.003085
2.0,0.0,0.0,0.0,0.010666,0.04064,0.004279,0.26885,0.026,0.0,0.0,...,0.01417,0.0,0.005935,0.005935,0.02053,0.029681,0.072944,0.00796,0.087972,0.0
3.0,0.018029,0.025552,0.0,0.000853,0.178981,0.006652,0.089083,0.010036,0.014601,0.051979,...,0.033776,0.025935,0.016275,0.016275,0.0,0.082449,0.055622,0.013808,0.143696,0.004925
4.0,0.007972,0.0,0.016442,0.002843,0.053505,0.013125,0.155976,0.046821,0.0,0.004864,...,0.034202,0.019015,0.010362,0.006289,0.000725,0.105788,0.044722,0.007414,0.099815,0.002943
5.0,0.01422,0.0,0.002791,0.001391,0.03247,0.005866,0.078482,0.031778,0.0,0.006611,...,0.011926,0.008484,0.012188,0.003636,0.0,0.050929,0.025436,0.00246,0.058015,0.000929


In [89]:
#Examine recommendations for last user
get_user_sim.iloc[-1].sort_values(ascending = False)[0:10]

title
Shawshank Redemption, The (1994)                                                  1.331158
Star Wars: Episode IV - A New Hope (1977)                                         1.190208
Fight Club (1999)                                                                 1.121254
Silence of the Lambs, The (1991)                                                  1.110723
Seven (a.k.a. Se7en) (1995)                                                       1.093754
Star Wars: Episode V - The Empire Strikes Back (1980)                             1.031182
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    1.003512
Saving Private Ryan (1998)                                                        0.966067
Inception (2010)                                                                  0.964198
Usual Suspects, The (1995)                                                        0.953299
Name: 610.0, dtype: float64

In [90]:
#Examine recommedations for userid 550
get_user_sim.loc[550.0].sort_values(ascending = False)[0:10]

title
Shawshank Redemption, The (1994)                                                  1.314692
Fight Club (1999)                                                                 1.080301
Pulp Fiction (1994)                                                               1.067495
Lord of the Rings: The Fellowship of the Ring, The (2001)                         1.053656
Schindler's List (1993)                                                           0.984708
Lord of the Rings: The Two Towers, The (2002)                                     0.940251
Star Wars: Episode IV - A New Hope (1977)                                         0.936904
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    0.918097
Silence of the Lambs, The (1991)                                                  0.878003
Seven (a.k.a. Se7en) (1995)                                                       0.856858
Name: 550.0, dtype: float64

### Example of Memory-Based Collaborative Filtering

In [91]:
#Figure out how to input column of movies by movieid or title
newUser = [("WALL-E (2008)", 5), ("Toy Story (1995)", 5), ("Iron Man 2 (2010)", 2)]

action_lover = [("Amazing Spider-Man, The (2012)",5),
                ("Mission: Impossible III (2006)",4),
                ("Toy Story 3 (2010)",2),
                ("2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",4)]

romantic_lover = [("(500) Days of Summer (2009)",5),
                  ("Alice in Wonderland (2010)",3),
                  ("Paris, I Love You (Paris, je t'aime) (2006)",4),
                  ("2001: A Space Odyssey (1968)",2)]


In [92]:
sample_data[0:5]

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
75505,7316,Confessions of a Teenage Drama Queen (2004),Comedy,509.0,3.0,1436000000.0,,0.0
76470,7705,Pat and Mike (1952),"Comedy, Romance",474.0,4.0,1137522000.0,sports,1137522000.0
97471,103372,"Heat, The (2013)","Action, Comedy, Crime",111.0,3.5,1516152000.0,,0.0
61768,4239,Blow (2001),"Crime, Drama",246.0,3.5,1355956000.0,,0.0
60541,4025,Miss Congeniality (2000),"Comedy, Crime",590.0,2.5,1258423000.0,,0.0


In [93]:
#Function to allow input of newuser and its ratings

def find_similar(sample_data, newUser):

    toby = sample_data[["title", "userId", "rating"]]
    createUserId = toby['userId'].max() + 1

    for movie, rating in newUser:
        toby = toby.append(pd.Series([movie, createUserId, rating], index = toby.columns), ignore_index = True)

    userRatings = toby.pivot_table(index=['userId'],columns=['title'],values='rating')

    #Fill NA -> 0
    userRatings = userRatings.fillna(0, axis = 1)

    cos_sim = cosine_similarity(userRatings)

    pred_ratings = np.dot(cos_sim, userRatings) 
    total = np.sum(np.absolute(cos_sim), axis = 1) #
    total = total.reshape(len(total), 1) #

    pred_ratings = pred_ratings / total #

    y_n = userRatings.copy()

    y_n = pd.DataFrame(np.where(y_n.values > 0.0, 0.0, 1.0), index = userRatings.index)
    y_n.columns = userRatings.columns

    get_user_sim = np.multiply(pred_ratings, y_n)

    #Get the last user
    return get_user_sim.iloc[-1].sort_values(ascending = False)

### Results of Content-Based Filtering

In [94]:
#Recommedation for new user

find_similar(sample_data, newUser)[0:20]

title
Forrest Gump (1994)                                                               1.028224
Lion King, The (1994)                                                             0.985866
Star Wars: Episode IV - A New Hope (1977)                                         0.979045
Pulp Fiction (1994)                                                               0.970909
Matrix, The (1999)                                                                0.930379
Shawshank Redemption, The (1994)                                                  0.832485
Mission: Impossible (1996)                                                        0.825140
Independence Day (a.k.a. ID4) (1996)                                              0.799676
Finding Nemo (2003)                                                               0.776265
Jurassic Park (1993)                                                              0.770580
Inception (2010)                                                                  0.

In [95]:
#Recommedation for action lover

find_similar(sample_data, action_lover)[0:20]

title
Lord of the Rings: The Fellowship of the Ring, The (2001)                         1.288574
Usual Suspects, The (1995)                                                        1.143147
Star Wars: Episode VI - Return of the Jedi (1983)                                 1.134741
Forrest Gump (1994)                                                               1.110013
Lord of the Rings: The Two Towers, The (2002)                                     1.063807
Shawshank Redemption, The (1994)                                                  1.032498
Inception (2010)                                                                  1.021776
Godfather, The (1972)                                                             1.005815
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    0.995986
Lord of the Rings: The Return of the King, The (2003)                             0.988677
Gladiator (2000)                                                                  0.

In [96]:
#Recommedation for romance lover

find_similar(sample_data, romantic_lover)[0:20]

title
Lord of the Rings: The Fellowship of the Ring, The (2001)    1.158871
Forrest Gump (1994)                                          1.083031
Eternal Sunshine of the Spotless Mind (2004)                 0.979239
Shrek (2001)                                                 0.941198
Lord of the Rings: The Return of the King, The (2003)        0.932182
Finding Nemo (2003)                                          0.930622
Star Wars: Episode IV - A New Hope (1977)                    0.904776
Star Wars: Episode VI - Return of the Jedi (1983)            0.883551
Godfather, The (1972)                                        0.848002
Lord of the Rings: The Two Towers, The (2002)                0.838575
WALL·E (2008)                                                0.825907
Blade Runner (1982)                                          0.806339
Schindler's List (1993)                                      0.800021
Pulp Fiction (1994)                                          0.784244
Seven (a.k.a. 

### Evaluation of Memory-Based Collaborative Filtering

In [97]:
##Evaluate Collaborative Filtering

#Random userid 30% 
#Listed random movies (one of each userid) with ratings
#List of original ratings
#Turn ratings to 0
#Predict ratings
#Compare prediction to original

In [98]:
import random
random.seed(41)

testUsers = random.sample(sample_data['userId'].unique().tolist(), 183)

data = sample_data.copy()

In [99]:
savedOneMovie = []
savedUsers = []

#Get cleaned training and testing set
for user in testUsers:
    #Gets one movie of all users
    oneMovie = random.sample(data[data['userId'] == user][['movieId', 'title', 'userId', 'rating']].values.tolist(), 1)
    savedOneMovie.append(oneMovie)
    
    #Delete one movie from testUsers
    idx = data[(data['userId'] == oneMovie[0][2]) & (data['movieId'] == oneMovie[0][0])].index
    data = data.drop(idx)
    
    #Get saved Users
    savedUsers.append(data[data['userId'] == user][['movieId', 'title', 'userId', 'rating']].to_records())  #not sure if list is needed
    
    #Delete users from training
    user_idx = data[data['userId'] == user].index
    data = data.drop(user_idx) 

In [100]:
#Calc and Get all Ratings for train and Test
def find_similar_pred(dataset, user, testRatings):
    trainValues = []
    testValues = []
    
    #Loops thru all users
    for u in range(len(user)):
        
        #Each user is cleaned
        clean = []

        for i in user[u]:
            clean.append((i[2], i[4]))
            
        #Each users' recommendations
        recommend = find_similar(dataset, clean)
        
        #Each users' ratings
        avg_rate = np.mean([rating[1] for rating in clean])
        
        #Calculate avg weighting
        weight_rating = recommend + avg_rate
        
        #Test Values
        calcValue = weight_rating[[weight_rating.index] == testRatings[u][0][1]]
        testValues.append(calcValue)
        
        #Train Values
        Value = testRatings[u][0][3]
        trainValues.append(Value)
    
    return trainValues, testValues

In [101]:
a, b = find_similar_pred(data, savedUsers, savedOneMovie)


In [102]:
a

[2.0,
 2.5,
 5.0,
 4.5,
 4.0,
 3.5,
 3.0,
 1.0,
 4.5,
 4.0,
 3.0,
 2.5,
 5.0,
 3.0,
 4.0,
 3.0,
 0.5,
 3.0,
 1.5,
 4.0,
 4.5,
 3.0,
 3.0,
 4.0,
 5.0,
 3.5,
 4.5,
 4.0,
 4.0,
 4.5,
 4.0,
 3.0,
 4.0,
 3.0,
 5.0,
 4.5,
 2.0,
 3.5,
 5.0,
 3.5,
 4.0,
 3.5,
 3.0,
 4.0,
 3.5,
 2.0,
 5.0,
 3.5,
 4.0,
 4.0,
 2.5,
 4.0,
 4.0,
 4.0,
 4.5,
 4.0,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 5.0,
 2.0,
 3.0,
 5.0,
 4.0,
 4.0,
 5.0,
 4.5,
 3.5,
 1.0,
 4.0,
 3.5,
 4.0,
 2.5,
 5.0,
 4.5,
 4.0,
 5.0,
 4.0,
 3.0,
 4.0,
 3.5,
 4.0,
 4.5,
 4.0,
 5.0,
 5.0,
 2.0,
 3.0,
 4.0,
 3.0,
 1.0,
 2.0,
 3.0,
 4.0,
 3.0,
 3.0,
 4.0,
 2.0,
 3.5,
 3.5,
 5.0,
 2.0,
 4.0,
 4.0,
 4.0,
 2.0,
 3.0,
 4.0,
 2.0,
 5.0,
 4.0,
 4.5,
 3.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 4.0,
 4.5,
 3.0,
 4.5,
 2.0,
 4.5,
 2.5,
 2.0,
 3.0,
 4.5,
 3.0,
 3.0,
 3.5,
 4.0,
 1.0,
 5.0,
 1.0,
 3.0,
 3.0,
 4.0,
 2.0,
 4.0,
 3.5,
 3.5,
 5.0,
 2.0,
 2.0,
 4.0,
 4.5,
 5.0,
 4.5,
 3.0,
 5.0,
 3.0,
 3.0,
 4.0,
 3.5,
 1.0,
 3.0,
 4.5,
 3.0,
 0.5,
 3.5,
 5.0,
 2.5

In [103]:
b

[4.765105603292618,
 5.0869598570711965,
 5.287210099074429,
 4.65218038964046,
 4.748078996615658,
 5.0113713114054725,
 5.203869200496507,
 5.690413196913492,
 5.489557823156235,
 5.21707914418452,
 5.242487324723241,
 4.3141027682454265,
 4.670732037445756,
 5.435651810105964,
 5.0467647997232445,
 4.921360956415395,
 4.459196268994027,
 4.497811748563789,
 5.000126329713032,
 4.766739935234284,
 4.686820355522763,
 4.541824064761794,
 4.587484195683145,
 4.897029706531638,
 5.77278941148055,
 5.281110474019629,
 5.298574972669275,
 5.110912332025951,
 5.570702200450245,
 4.047674838615503,
 4.820134845201181,
 4.923118481573738,
 5.347476159951405,
 4.99084092862747,
 5.380740457145748,
 5.377166568282783,
 4.540734233044854,
 5.57562846737039,
 5.215417465635521,
 4.113914550641285,
 5.185928386388574,
 4.817777753366564,
 4.487940213769373,
 5.279817322415762,
 4.605379384326481,
 5.588192274653425,
 5.377770816715508,
 4.404972361282926,
 3.989770989400809,
 5.3837851776816645,


In [104]:
from sklearn.metrics import mean_absolute_error

b = np.where(np.array(b) > 5.0, 5.0, np.array(b))

error = mean_absolute_error(a, b)
print('Mean Absolute Error:', error)
print('RMSE:', np.sqrt(error))

Mean Absolute Error: 1.2562213244746696
RMSE: 1.1208127963556938


### Model-Based Collaborative Filtering

In [105]:
pip install scikit-surprise

Note: you may need to restart the kernel to use updated packages.


In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly
from surprise.model_selection import cross_validate
%matplotlib inline

In [107]:
data.head()

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
75505,7316,Confessions of a Teenage Drama Queen (2004),Comedy,509.0,3.0,1436000000.0,,0.0
76470,7705,Pat and Mike (1952),"Comedy, Romance",474.0,4.0,1137522000.0,sports,1137522000.0
97471,103372,"Heat, The (2013)","Action, Comedy, Crime",111.0,3.5,1516152000.0,,0.0
61768,4239,Blow (2001),"Crime, Drama",246.0,3.5,1355956000.0,,0.0
60541,4025,Miss Congeniality (2000),"Comedy, Crime",590.0,2.5,1258423000.0,,0.0


In [108]:
#use data
from surprise import Dataset
from surprise import Reader
reader = Reader(rating_scale=(1, 5))
surp_data = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)


In [109]:
#add person to data
#predict for all movies
#rate by highest

In [110]:
#Try all algorithms to compare rmse

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(n_epochs=10), NMF(n_epochs=10), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), NormalPredictor(), SlopeOne()]:
    # Perform cross validation
    results = cross_validate(algorithm, surp_data, measures=['RMSE', 'mae'], cv=3, verbose=False)
 
 # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


In [111]:
#Evaluation of all algorithms

pd.DataFrame(benchmark)

Unnamed: 0,test_rmse,test_mae,fit_time,test_time,Algorithm
0,0.907808,0.701452,0.454335,0.047072,SVD
1,0.991081,0.755018,0.222285,0.039743,NMF
2,0.937386,0.71602,0.038357,0.283017,KNNBaseline
3,1.015026,0.777397,0.011672,0.196818,KNNBasic
4,0.963161,0.732994,0.022502,0.223956,KNNWithMeans
5,0.966996,0.730228,0.041025,0.242388,KNNWithZScore
6,0.900534,0.695799,0.027633,0.033479,BaselineOnly
7,1.405245,1.124591,0.023115,0.081616,NormalPredictor
8,0.982543,0.752774,0.753563,0.822065,SlopeOne


In [112]:
#Use of KNNBasic

knn = KNNBasic()

trainset = surp_data.build_full_trainset()
knn.fit(trainset)

#userid and movieid
knn.predict(uid=10, iid=100)

Computing the msd similarity matrix...
Done computing similarity matrix.


Prediction(uid=10, iid=100, r_ui=None, est=3.08467579584468, details={'actual_k': 4, 'was_impossible': False})

In [113]:
#data['movieId'].unique()

data.head()

Unnamed: 0,movieId,title,genres,userId,rating,rating_timestamp,tag,tag_timestamp
75505,7316,Confessions of a Teenage Drama Queen (2004),Comedy,509.0,3.0,1436000000.0,,0.0
76470,7705,Pat and Mike (1952),"Comedy, Romance",474.0,4.0,1137522000.0,sports,1137522000.0
97471,103372,"Heat, The (2013)","Action, Comedy, Crime",111.0,3.5,1516152000.0,,0.0
61768,4239,Blow (2001),"Crime, Drama",246.0,3.5,1355956000.0,,0.0
60541,4025,Miss Congeniality (2000),"Comedy, Crime",590.0,2.5,1258423000.0,,0.0


In [114]:
#function to add new user and get predictions

def find_similar(sample_data, newUser):

    new_data = sample_data[["movieId", "title", "userId", "rating"]]
    createUserId = new_data['userId'].max() + 1

    for movie, rating in newUser:
        new_data = new_data.append(pd.Series([movie, createUserId, rating], index = new_data.columns), ignore_index = True)

    userRatings = new_data.pivot_table(index=['userId'],columns=['title'],values='rating')

    #Fill NA -> 0
    userRatings = userRatings.fillna(0, axis = 1)

    
    #Get the last user
    return get_user_sim.iloc[-1].sort_values(ascending = False)

In [115]:
#Input of new User

newUser = [("Finding Nemo (2003)", 4), 
           ("Toy Story (1995)", 5), 
           ("Iron Man 2 (2010)", 5), 
           ("Titanic (1953)", 2)]

In [116]:
#function to input data and KNN

def newUserKNNModel(userSeenMovies, data):
    
    new_data = data[["movieId", "title", "userId", "rating"]]
    createUserId = new_data['userId'].max() + 1
    movie_list = data[["movieId", "title"]].drop_duplicates()

    alreadyWatched = []

    for movie, rating in userSeenMovies:
        movieID = int(movie_list[movie_list["title"] == movie]['movieId'].values)
        alreadyWatched.append(movieID)
        new_data = new_data.append(pd.Series([movieID, movie, createUserId, rating], index = new_data.columns), ignore_index = True)

    reader = Reader(rating_scale=(1, 5))
    surp_data = Dataset.load_from_df(new_data[['userId', 'movieId', 'rating']], reader)

    newUserPrediction = []

    knn = KNNBasic()

    trainset = surp_data.build_full_trainset()
    knn.fit(trainset)

    #userid and movieid
    newMoviesId = movie_list[~movie_list['movieId'].isin(alreadyWatched)]['movieId'].values

    for i in newMoviesId:
        rate = round(knn.predict(uid = new_data['userId'].max(), iid = i).est, 3)
        newUserPrediction.append([i, rate])


    newUserRated = pd.DataFrame(newUserPrediction, columns = ['movieId', 'ratings']).sort_values(by=['ratings'], ascending = False)
    newUser_top = newUserRated.iloc[0:20]

    findtitle = movie_list[movie_list['movieId'].isin(newUser_top['movieId'].values)]
    newUser_top = newUser_top.merge(findtitle, on = "movieId", how = "left")
    
    return newUser_top

In [117]:
newUserKNNModel(newUser, data)

Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,movieId,ratings,title
0,3096,5.0,My Man Godfrey (1957)
1,136447,5.0,George Carlin: You Are All Diseased (1999)
2,4278,5.0,Triumph of the Will (Triumph des Willens) (1934)
3,106100,5.0,Dallas Buyers Club (2013)
4,175397,5.0,"In the blue sea, in the white foam. (1984)"
5,2511,5.0,"Long Goodbye, The (1973)"
6,71379,5.0,Paranormal Activity (2009)
7,104780,5.0,"Mystery of the Third Planet, The (Tayna tretey..."
8,120478,5.0,The Salt of the Earth (2014)
9,26326,5.0,"Holy Mountain, The (Montaña sagrada, La) (1973)"
