In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pickle

In [87]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [88]:
movies['genres'] = movies['genres'].apply(lambda x: x.replace('|',','))

In [89]:
'''
<============== top rated recommendations ==============>
'''



In [90]:
def weighted_rating(v,m,R,C):
    '''
    Calculate the weighted rating
    
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)
    '''
    return ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

In [91]:
# pre processing
vote_count = (
    ratings
    .groupby('movieId',as_index=False)
    .agg( {'userId':'count', 'rating':'mean'} )
    )
vote_count.columns = ['movieId', 'vote_count', 'avg_rating']

# calcuate input parameters
C = np.mean(vote_count['avg_rating'])
m = np.percentile(vote_count['vote_count'], 70)
vote_count = vote_count[vote_count['vote_count'] >= m]
R = vote_count['avg_rating']
v = vote_count['vote_count']
vote_count['weighted_rating'] = weighted_rating(v,m,R,C)

# post processing
vote_count = vote_count.merge(movies, on = ['movieId'], how = 'left')
popular_items = vote_count.loc[:,['movieId','title', 'genres', 'overview','poster_path','vote_count', 'avg_rating', 'weighted_rating']]
popular_items = popular_items.sort_values('weighted_rating',ascending=False)
popular_items = popular_items.head(10)

file = open('top_rated', 'wb')
pickle.dump(popular_items, file)
file.close()

In [92]:
f = open('top_rated', 'rb')
data = pickle.load(f)
file.close()

In [93]:
def get_top_rated():
    return data

In [94]:
'''
<============== content based recommendations ==============>
'''



In [95]:
from sklearn.metrics.pairwise import cosine_similarity

In [96]:
# preprocessing
ratedMovies = movies.loc[movies['movieId'].isin(ratings['movieId'])].copy()

In [98]:
# extract the genre
genre = ratedMovies['genres'].str.split(",", expand=True)

# get all possible genre
all_genre = set()
for c in genre.columns:
    distinct_genre = genre[c].str.lower().str.strip().unique()
    all_genre.update(distinct_genre)
all_genre.remove(None)
all_genre.remove('(no genres listed)')

# dump matrix
f1 = open('genre', 'wb')
pickle.dump(all_genre, f1)
f1.close()

In [78]:
# create item-genre matrix
item_genre_mat = ratedMovies[['movieId', 'genres']].copy()
item_genre_mat['genres'] = item_genre_mat['genres'].str.lower().str.strip()

# OHE the genres column
for genre in all_genre:
    item_genre_mat[genre] = np.where(item_genre_mat['genres'].str.contains(genre), 1, 0)
item_genre_mat = item_genre_mat.drop(['genres'], axis=1)
item_genre_mat = item_genre_mat.set_index('movieId')

# compute similarity matix
corr_mat = cosine_similarity(item_genre_mat)

# dump matrix
f1 = open('corr_mat', 'wb')
pickle.dump(corr_mat, f1)
f1.close()
f2 = open('item_genre_mat', 'wb')
pickle.dump(item_genre_mat, f2)
f2.close()

  item_genre_mat[genre] = np.where(item_genre_mat['genres'].str.contains(genre), 1, 0)
  item_genre_mat[genre] = np.where(item_genre_mat['genres'].str.contains(genre), 1, 0)


In [79]:
r = open('item_genre_mat', 'rb')
item_genre_mat = pickle.load(r)
r.close()
r2 = open('corr_mat', 'rb')
corr_mat = pickle.load(r2)
r2.close()

In [80]:
def top_k_items(item_id, top_k, corr_mat, map_name):
    
    # sort correlation value ascendingly and select top_k item_id
    top_items = corr_mat[item_id,:].argsort()[-top_k:][::-1] 
    top_items = [map_name[e] for e in top_items] 

    return top_items

In [81]:
# get top-k similar items
ind2name = {ind:name for ind,name in enumerate(item_genre_mat.index)}
name2ind = {v:k for k,v in ind2name.items()}

In [82]:
movieId = 110102

In [83]:
similar_items = top_k_items(name2ind[movieId],
                            top_k = 10,
                            corr_mat = corr_mat,
                            map_name = ind2name)
similar_items.remove(movieId)

In [84]:
similar_items

[95510, 103228, 106487, 68358, 69526, 72998, 106002, 5378, 102445]

In [53]:
display(movies.loc[movies['movieId'].isin(similar_items)])

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,imdbId,tmdbId,g2,overview,poster_path,tagline
3828,3828,5378,Star Wars: Episode II - Attack of the Clones (...,"Action,Adventure,Sci-Fi,IMAX",121765,1894.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","Ten years after the invasion of Naboo, the gal...",/2vcNFtrZXNwIcBgH5e2xXCmVR8t.jpg,A Jedi Shall Not Know Anger. Nor Hatred. Nor L...
5260,5260,8636,Spider-Man 2 (2004),"Action,Adventure,Sci-Fi,IMAX",316654,558.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",Peter Parker is going through a major identity...,/qtBFrsEQ4oXW8sKvRxkKnYuPLg.jpg,There's a hero in all of us.
6230,6230,46530,Superman Returns (2006),"Action,Adventure,Sci-Fi,IMAX",348150,1452.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Superman returns to discover his 5-year absenc...,/e3aLTaD5ppxo3en0GAGceekEPAe.jpg,
7007,7007,68358,Star Trek (2009),"Action,Adventure,Sci-Fi,IMAX",796366,13475.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",The fate of the galaxy rests in the hands of b...,/6V0CY7pwdDOCDS2XqNWahmIlVYh.jpg,The future begins.
7053,7053,69526,Transformers: Revenge of the Fallen (2009),"Action,Adventure,Sci-Fi,IMAX",1055369,8373.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Sam Witwicky leaves the Autobots behind for a ...,/kVISXAXDYhjQCfu50QZeCCzzbPv.jpg,Revenge is coming.
7201,7201,72998,Avatar (2009),"Action,Adventure,Sci-Fi,IMAX",499549,19995.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","In the 22nd century, a paraplegic Marine is di...",/kmcqlZGaSh20zpTbuoF0Cdn07dT.jpg,Enter the World of Pandora.
7472,7472,82461,Tron: Legacy (2010),"Action,Adventure,Sci-Fi,IMAX",1104001,20526.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","Sam Flynn, the tech-savvy and daring son of Ke...",/vllvystwQjmXzy5OvBKnGl1JREF.jpg,The Game Has Changed.
7911,7911,95510,"Amazing Spider-Man, The (2012)","Action,Adventure,Sci-Fi,IMAX",948470,1930.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",Peter Parker is an outcast high schooler aband...,/eA2D86Y6VPWuUzZyatiLBwpTilQ.jpg,The untold story begins.
8270,8270,106002,Ender's Game (2013),"Action,Adventure,Sci-Fi,IMAX",1731141,80274.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",Based on the classic novel by Orson Scott Card...,/vuZ8uSqyOM5698yNraF7OJRM3mS.jpg,This is not a game.


In [None]:
'''
<============== collaborative filtering recommendations ==============>
'''

In [54]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   10000 non-null  int64  
 1   movieId  10000 non-null  int64  
 2   rating   10000 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 234.5 KB


In [56]:
# preprocess data
row = ratings['userId']
col = ratings['movieId']
data = ratings['rating']

# NUM_USERS = len(ratings['userId'].unique())
NUM_ITEMS = len(ratings['movieId'].unique())

# init user-item matrix
mat = csr_matrix((data, (row, col)), shape=(NUM_ITEMS, NUM_ITEMS))
mat.eliminate_zeros()

ValueError: column index exceeds matrix dimensions

In [38]:

# calculate sparsity
sparsity = float(len(mat.nonzero()[0]))
sparsity /= (mat.shape[0] * mat.shape[1])
sparsity *= 100
print(f'Sparsity: {sparsity:4.2f}%. This means that {sparsity:4.2f}% of the user-item ratings have a value.')

Sparsity: 0.09%. This means that 0.09% of the user-item ratings have a value.


In [39]:

# compute similarity
item_corr_mat = cosine_similarity(mat.T)

# get top k item
print("\nThe top-k similar movie to item_id 99")
similar_items = top_k_items(name2ind['99'],
                            top_k = 10,
                            corr_mat = item_corr_mat,
                            map_name = ind2name)

display(items.loc[items[ITEM_COL].isin(similar_items)])

MemoryError: Unable to allocate 279. GiB for an array with shape (193610, 193610) and data type float64