In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pickle
from joblib import dump

In [182]:
ratings = pd.read_csv('ratings_new.csv')
movies = pd.read_csv('movies_new.csv')

In [183]:
movies['genres'] = movies['genres'].apply(lambda x: x.replace('|',','))

In [184]:
'''
<============== top rated recommendations ==============>
'''



In [185]:
def weighted_rating(v,m,R,C):
    '''
    Calculate the weighted rating
    
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)
    '''
    return ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

In [186]:
# pre processing
vote_count = (
    ratings
    .groupby('movieId',as_index=False)
    .agg( {'userId':'count', 'rating':'mean'} )
    )
vote_count.columns = ['movieId', 'vote_count', 'avg_rating']

# calcuate input parameters
C = np.mean(vote_count['avg_rating'])
m = np.percentile(vote_count['vote_count'], 70)
vote_count = vote_count[vote_count['vote_count'] >= m]
R = vote_count['avg_rating']
v = vote_count['vote_count']
vote_count['weighted_rating'] = weighted_rating(v,m,R,C)

# post processing
vote_count = vote_count.merge(movies, on = ['movieId'], how = 'left')
popular_items = vote_count.loc[:,['movieId','title', 'genres', 'overview','poster_path','tmdbId', 'imdbId', 'weighted_rating',]]
popular_items = popular_items.sort_values('weighted_rating',ascending=False)
popular_items = popular_items.head(10)

file = open('top_rated.pkl', 'wb')
pickle.dump(popular_items, file)
file.close()

In [187]:
f = open('top_rated.pkl', 'rb')
data = pickle.load(f)
file.close()

In [188]:
def get_top_rated():
    return data

In [189]:
get_top_rated()

Unnamed: 0,movieId,title,genres,overview,poster_path,tmdbId,imdbId,weighted_rating
97,278,"Shawshank Redemption, The (1994)","Crime,Drama",Framed in the 1940s for the double murder of h...,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,278.0,111161.0,4.770554
260,658,"Godfather, The (1972)","Crime,Drama","Spanning the years 1945 to 1955, a chronicle o...",/rPdtLWNsZmAtoZl9PK7S2wE3qiS.jpg,238.0,68646.0,4.539675
1853,4801,"Lord of the Rings: The Return of the King, The...","Action,Adventure,Drama,Fantasy",Aragorn is revealed as the heir to the ancient...,/uexxR7Kw1qYbZk0RYaF9Rx5ykbj.jpg,122.0,167260.0,4.373008
1426,3636,"Lord of the Rings: The Fellowship of the Ring,...","Adventure,Fantasy","Young hobbit Frodo Baggins, after inheriting a...",/6oom5QYQ2yQTMJIbnvbkBL9cHo6.jpg,120.0,120737.0,4.316009
2292,7362,Inception (2010),"Action,Crime,Drama,Mystery,Sci-Fi,Thriller,IMAX","Cobb, a skilled thief who commits corporate es...",/edv5CZvWj09upOsy2Y6IwDhK8bt.jpg,27205.0,1375666.0,4.13419
773,1939,"Matrix, The (1999)","Action,Sci-Fi,Thriller","Set in the 22nd century, The Matrix tells the ...",/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg,603.0,133093.0,4.039675
1569,3981,Spirited Away (Sen to Chihiro no kamikakushi) ...,"Adventure,Animation,Fantasy",A ten year old girl who wanders away from her ...,/ynXoOxmDHNQ4UAy0oU6avW71HVW.jpg,129.0,245429.0,4.039675
1024,2628,Me Myself I (2000),"Comedy,Romance","Pamela Drury is unhappy, and alone. On her bir...",/6GTzoxfOa60Hu4sPqrZh0jqIdow.jpg,49721.0,183503.0,3.619024
77,197,Dolores Claiborne (1995),"Drama,Thriller",Dolores Claiborne was accused of killing her a...,/ewmVWV0TP8LTlYZ4OzCcguwC9d1.jpg,11929.0,109642.0,3.492032
82,219,Jefferson in Paris (1995),Drama,"His wife having recently died, Thomas Jefferso...",/qKr3Mgo1HB3HkGTAVoVEpOSWlp2.jpg,87729.0,113463.0,3.492032


In [157]:
'''
<============== content based recommendations ==============>
'''



In [158]:
from sklearn.metrics.pairwise import cosine_similarity

In [159]:
# preprocessing
ratedMovies = movies.loc[movies['movieId'].isin(ratings['movieId'])].copy()

In [191]:
# extract the genre
genre = ratedMovies['genres'].str.split(",", expand=True)

# get all possible genre
all_genre = set()
for c in genre.columns:
    distinct_genre = genre[c].str.lower().str.strip().unique()
    all_genre.update(distinct_genre)
all_genre.remove(None)
# all_genre.remove('(no genres listed)')

# dump matrix
f1 = open('genre.pkl', 'wb')
pickle.dump(all_genre, f1)
f1.close()

In [192]:
# create item-genre matrix
item_genre_mat = movies[['movieId', 'genres']].copy()
item_genre_mat['genres'] = item_genre_mat['genres'].str.lower().str.strip()

# OHE the genres column
for genre in all_genre:
    item_genre_mat[genre] = np.where(item_genre_mat['genres'].str.contains(genre), 1, 0)
item_genre_mat = item_genre_mat.drop(['genres'], axis=1)
item_genre_mat = item_genre_mat.set_index('movieId')

# compute similarity matix
corr_mat = cosine_similarity(item_genre_mat)

# dump matrix
f1 = open('corr_mat.pkl', 'wb')
pickle.dump(corr_mat, f1)
f1.close()
f2 = open('item_genre_mat.pkl', 'wb')
pickle.dump(item_genre_mat, f2)
f2.close()

In [193]:
r = open('item_genre_mat.pkl', 'rb')
item_genre_mat = pickle.load(r)
r.close()
r2 = open('corr_mat.pkl', 'rb')
corr_mat1 = pickle.load(r2)
r2.close()

In [194]:
def top_k_items(item_id, top_k, corr_mat, map_name):
    
    # sort correlation value ascendingly and select top_k item_id
    top_items = corr_mat[item_id,:].argsort()[-top_k:][::-1] 
    top_items = [map_name[e] for e in top_items] 

    return top_items

In [195]:
# get top-k similar items
ind2name = {ind:name for ind,name in enumerate(item_genre_mat.index)}
name2ind = {v:k for k,v in ind2name.items()}

In [196]:
movieId = 1

In [197]:
similar_items = top_k_items(name2ind[movieId],
                            top_k = 10,
                            corr_mat = corr_mat1,
                            map_name = ind2name)
if movieId in similar_items:
    similar_items.remove(movieId)

In [198]:
similar_items

[2808, 9397, 3566, 6187, 2999, 7746, 6938, 2354, 8203]

In [199]:
display(movies.loc[movies['movieId'].isin(similar_items)])

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,g2,overview,poster_path,tagline
1073,2354,Toy Story 2 (1999),"Adventure,Animation,Children,Comedy,Fantasy",120363,863,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Andy heads off to Cowboy Camp, leaving his toy...",/3CmK3XurcLeUyMifCR28ibzupbB.jpg,The toys are back!
1208,2808,"Adventures of Rocky and Bullwinkle, The (2000)","Adventure,Animation,Children,Comedy,Fantasy",131704,17711,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",Rocky and Bullwinkle have been living off the ...,/pjzt6S76Uumm0aZtxNAVhDCLQ0P.jpg,This summer it's not the same old bull.
1304,2999,"Emperor's New Groove, The (2000)","Adventure,Animation,Children,Comedy,Fantasy",120917,11688,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",Kuzco is a self-centered emperor who summons P...,/klhsrqyhvFaoM6Rwjo9dFPYJQSK.jpg,It's All About.....ME!
1521,3566,"Monsters, Inc. (2001)","Adventure,Animation,Children,Comedy,Fantasy",198781,585,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","James Sullivan and Mike Wazowski are monsters,...",/93Y9BGx8blzmZOPSoivkFfaifqU.jpg,We Scare Because We Care.
2882,6187,"Wild, The (2006)","Adventure,Animation,Children,Comedy,Fantasy",405469,9904,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",An adolescent lion is accidentally shipped fro...,/uxTtgh4tpSWVulXLCDqKU10cWXo.jpg,Start spreading the newspaper.
3590,6938,"Tale of Despereaux, The (2008)","Adventure,Animation,Children,Comedy,Fantasy",420238,10199,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",Once upon a time... in the far away kingdom of...,/8Nge4rXAQzU5w9U8OvnXuuJltL9.jpg,Small Mouse. Big Dreams
4339,7746,Asterix and the Vikings (Astérix et les Viking...,"Adventure,Animation,Children,Comedy,Fantasy",371552,9642,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",Asterix and Obelix have been given a tough mis...,/e9IzoqXzxKljDwQrisEiUs056iv.jpg,
4750,8203,Turbo (2013),"Adventure,Animation,Children,Comedy,Fantasy",1860353,77950,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",The tale of an ordinary garden snail who dream...,/kajPlPUgobxci8ME5oqvzjN430q.jpg,SLO NO MO
5818,9397,Moana (2016),"Adventure,Animation,Children,Comedy,Fantasy",3521164,277834,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...","In Ancient Polynesia, when a terrible curse in...",/z4x0Bp48ar3Mda8KiPD1vwSY3D8.jpg,The ocean is calling.


In [128]:
'''
<============== collaborative filtering memory based recommendations ==============>
'''



In [200]:
from scipy.sparse import csr_matrix

NUM_USERS = 1010
NUM_MOVIES = 10000


# preprocess data
row = ratings['userId']
col = ratings['movieId']
data = ratings['rating']

# init user-item matrix
mat = csr_matrix((data, (row, col)))
mat.eliminate_zeros()

# calculate sparsity
sparsity = float(len(mat.nonzero()[0]))
sparsity /= (mat.shape[0] * mat.shape[1])
sparsity *= 100
print(f'Sparsity: {sparsity:4.2f}%. This means that {sparsity:4.2f}% of the user-item ratings have a value.')

# compute similarity
item_corr_mat = cosine_similarity(mat.T)


# dump matrix
f1 = open('cf_memory.pkl', 'wb')
pickle.dump(item_corr_mat, f1)
f1.close()

Sparsity: 0.04%. This means that 0.04% of the user-item ratings have a value.


In [201]:
r = open('cf_memory.pkl', 'rb')
movie_cf_mat = pickle.load(r)
r.close()

In [202]:
movieId = 1

# get top k item
similar_items = top_k_items(name2ind[movieId],
                            top_k = 11,
                            corr_mat = movie_cf_mat,
                            map_name = ind2name)
if movieId in similar_items:
    similar_items.remove(movieId)

display(movies.loc[movies['movieId'].isin(similar_items)])

KeyError: 7362

In [132]:
'''
<============== collaborative filtering model based recommendations ==============>
'''



In [203]:
from sklearn.decomposition import TruncatedSVD

epsilon = 1e-9
n_latent_factors = 10

# calculate item latent matrix
item_svd = TruncatedSVD(n_components = n_latent_factors)
item_features = item_svd.fit_transform(mat.transpose()) + epsilon

# calculate user latent matrix
user_svd = TruncatedSVD(n_components = n_latent_factors)
user_features = user_svd.fit_transform(mat) + epsilon

# compute similarity
item_corr_mat = cosine_similarity(item_features)


# dump matrix
f1 = open('cf_model.pkl', 'wb')
pickle.dump(item_corr_mat, f1)
f1.close()

def top_k_items2(item_id, top_k, corr_mat, map_name):
    
    # sort correlation value ascendingly and select top_k item_id
    top_items = corr_mat[item_id,:].argsort()[-top_k:][::-1] 
    top_items = [map_name[e] for e in top_items] 

    return top_items


# get top k item
similar_items = top_k_items2(name2ind[7594],
                            top_k = 10,
                            corr_mat = item_corr_mat,
                            map_name = ind2name)

# print(similar_items)
display(movies.loc[movies['movieId'].isin(similar_items)])

del user_features
gc.collect();

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,g2,overview,poster_path,tagline
3057,6380,"Hitcher, The (2007)","Action,Horror,Thriller",455960,8398,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",While driving through the New Mexico Desert du...,/mIPCt79baN62Xv6TAEwLDxLUqBz.jpg,Never pick up strangers.
3059,6382,Bridge to Terabithia (2007),"Adventure,Children,Fantasy",398808,1265,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",Jesse Aarons trained all summer to become the ...,/gihMBO2SmUprI1ecYe7Eo7Eg1yq.jpg,"Close your eyes, but keep your mind wide open."
3063,6386,Waitress (2007),"Comedy,Drama,Romance",473308,10758,"[{'id': 35, 'name': 'Comedy'}]","Jenna is a pregnant, unhappily married waitres...",/88FWNhsEvnCGELvUgQEpZ1E5nd5.jpg,If only life were as easy as pie
3064,6388,Catch and Release (2006),"Comedy,Drama,Romance",395495,13668,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Gray Wheeler just lost everything. But it coul...,/zd9B1CwZxAE4Y5h5dya0c2YqQ8q.jpg,Life is messy. Love is messier
3065,6389,Smokin' Aces (2006),"Action,Crime,Drama,Thriller",475394,7516,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",When a Las Vegas performer-turned-snitch named...,/sIxQqvlhqWRiw0czLNuGS8q3P4F.jpg,Nobody gets away clean.
3066,6390,Blood and Chocolate (2007),"Drama,Fantasy,Horror,Romance",397044,10075,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",A young teenage werewolf is torn between honor...,/g4PQ1tNyA7EMi9bxhP1R3Lo9byh.jpg,The hunt never tasted so sweet.
3067,6391,Epic Movie (2007),"Adventure,Comedy",799949,9760,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","When Edward, Peter, Lucy and Susan each follow...",/b4iOOIzR19TnqtUEetXBFxJ54YV.jpg,We know it's big. We measured.
3068,6392,"Messengers, The (2007)","Drama,Horror,Thriller",425430,9966,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",When the Solomons trade in the craziness of bi...,/x2Z5iXhHaoK22vwfxju5vq25apS.jpg,There is evidence to suggest that children are...
3069,6393,Because I Said So (2007),"Comedy,Drama,Romance",490084,1257,"[{'id': 35, 'name': 'Comedy'}]",In an effort to prevent family history from re...,/a7fBuvCoPPP5RQMZuPN25OT4rk9.jpg,"She's just your normal, overprotective, overbe..."
3681,7036,Sweeney Todd (2006),"Crime,Drama,Horror,Thriller",479760,37924,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...","A BBC adaptation of the Victorian ""penny dread...",/40ntPvTqjqEJlME1AtBEzwVUPHC.jpg,The Demon Barber of Fleet Street.
