In [1]:
# Import our regular old heroes 
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficientcy. 
import matplotlib.pyplot as plt
import seaborn as sns

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

In [2]:
test = pd.read_csv(r"C:\Users\AmantleMmokwa\Documents\Amantle\predict\test.csv", nrows=80000)
train = pd.read_csv(r"C:\Users\AmantleMmokwa\Documents\Amantle\predict\train.csv", nrows=80000)

In [3]:
moviesfile = pd.read_csv(r"C:\Users\AmantleMmokwa\Documents\Amantle\predict\movies.csv", nrows=80000)
tags = pd.read_csv(r"C:\Users\AmantleMmokwa\Documents\Amantle\predict\tags.csv", nrows=80000)
imdb = pd.read_csv(r"C:\Users\AmantleMmokwa\Documents\Amantle\predict\imdb_data.csv", nrows=80000)

In [4]:
train.isnull()

Unnamed: 0,userId,movieId,rating,timestamp
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
79995,False,False,False,False
79996,False,False,False,False
79997,False,False,False,False
79998,False,False,False,False


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [6]:
moviesfile.isnull()

Unnamed: 0,movieId,title,genres
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
62418,False,False,False
62419,False,False,False
62420,False,False,False
62421,False,False,False


In [7]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [8]:
moviesfile.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
table = pd.merge(train,moviesfile, on = 'movieId', how = 'outer')

In [10]:
table.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,5163.0,57669,4.0,1518350000.0,In Bruges (2008),Comedy|Crime|Drama|Thriller
1,87388.0,57669,3.5,1237455000.0,In Bruges (2008),Comedy|Crime|Drama|Thriller
2,137050.0,57669,4.0,1425632000.0,In Bruges (2008),Comedy|Crime|Drama|Thriller
3,120490.0,57669,4.5,1408229000.0,In Bruges (2008),Comedy|Crime|Drama|Thriller
4,50616.0,57669,4.5,1446942000.0,In Bruges (2008),Comedy|Crime|Drama|Thriller


In [11]:
print (f'Number of ratings in dataset: {train.shape[0]}')

Number of ratings in dataset: 80000


In [12]:
util_matrix = table.pivot_table(index=['userId'], columns=['title'], values='rating')
util_matrix.shape

(47557, 9353)

In [13]:
len(train)

80000

In [14]:
util_matrix = book_ratings.pivot_table(index=['user_id'],
                                       columns=['title'],
                                       values='rating')
util_matrix.shape

NameError: name 'book_ratings' is not defined

In [None]:
# Normalize each row (a given user's ratings) of the utility matrix
util_matrix_norm = util_matrix.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
# Fill Nan values with 0's, transpose matrix, and drop users with no ratings
util_matrix_norm.fillna(0, inplace=True)
util_matrix_norm = util_matrix_norm.T
util_matrix_norm = util_matrix_norm.loc[:, (util_matrix_norm != 0).any(axis=0)]
# Save the utility matrix in scipy's sparse matrix format
util_matrix_sparse = sp.sparse.csr_matrix(util_matrix_norm.values)

In [None]:
# Compute the similarity matrix using the cosine similarity metric
user_similarity = cosine_similarity(util_matrix_sparse.T)
# Save the matrix as a dataframe to allow for easier indexing  
user_sim_df = pd.DataFrame(user_similarity,
                           index = util_matrix_norm.columns,
                           columns = util_matrix_norm.columns)

# Review a small portion of the constructed similartiy matrix  
user_sim_df[:5]

In [None]:
def collab_generate_top_N_recommendations(user, N=10, k=20):
                                                       # Cold-start problem - no ratings given by the reference user. 
                                                      # With no further user data, we solve this by simply recommending
                                                      # the top-N most popular books in the item catalog. 
    if user not in user_sim_df.columns:
        return table.groupby('title').mean().sort_values(by='rating',
                                        ascending=False).index[:N].to_list()
 
                                               # Gather the k users which are most similar to the reference user 
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:k+1]
    favorite_user_items = []                    # <-- List of highest rated items gathered from the k users  
    most_common_favorites = {}               # <-- Dictionary of highest rated items in common for the k users

    for i in sim_users:
                                                  # Maximum rating given by the current user to an item 
        max_score = util_matrix_norm.loc[:, i].max()
        # Save the names of items maximally rated by the current user   
        favorite_user_items.append(util_matrix_norm[util_matrix_norm.loc[:, i]==max_score].index.tolist())

    # Loop over each user's favorite items and tally which ones are 
    # most popular overall.
    for item_collection in range(len(favorite_user_items)):
        for item in favorite_user_items[item_collection]:
            if item in most_common_favorites:
                most_common_favorites[item] += 1
            else:
                most_common_favorites[item] = 1
    # Sort the overall most popular items and return the top-N instances
    sorted_list = sorted(most_common_favorites.items(), key=operator.itemgetter(1), reverse=True)[:N]
    top_N = [x[0] for x in sorted_list]
    return top_N

In [None]:
# Our recommended list for user 3
collab_generate_top_N_recommendations(3)

In [None]:
# User 3 history
table[table['userId'] == 3][:][['title','rating']].sort_values(by='rating', ascending=False)[:10]

In [None]:
def collab_generate_rating_estimate(movie_title, user, k=20, threshold=0.0):
    # Gather the k users which are most similar to the reference user 
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:k+1]
    # Store the corresponding user's similarity values 
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:k+1]
    rating_list = [] # <-- List of k user's ratings for the reference item
    weight_list = [] # <-- List of k user's similarities to the reference user

    # Create a weighted sum for each of the k users who have rated the 
    # reference item.
    for sim_idx, user_id in enumerate(sim_users):
        # User's rating of the item
        rating = util_matrix.loc[user_id, movie_title]
        # User's similarity to the reference user 
        similarity = user_values[sim_idx]
        # Skip the user if they have not rated the item, or are too dissimilar to 
        # the reference user
        if (np.isnan(rating)) or (similarity < threshold):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    try:
        # Return the weighted sum as the predicted rating for the reference item
        predicted_rating = sum(rating_list)/sum(weight_list)
    except ZeroDivisionError:
        # If no ratings for the reference item can be collected, return the average 
        # rating given by all users for the item.  
        predicted_rating = np.mean(util_matrix[movie_title])
    return predicted_rating

In [None]:
title = "Planet of the Apes (2001)"
actual_rating = table[(table['userId'] == 3) & (table['title'] == title)]['rating'].values[0]
pred_rating = collab_generate_rating_estimate(movie_title = title, user = 3)
print (f"Title - {title}")
print ("---")
print (f"Actual rating: \t\t {actual_rating}")
print (f"Predicted rating: \t {pred_rating}")

In [None]:
title = "Toy Story (1995)"
pred_rating = collab_generate_rating_estimate(movie_title = title, user = 3)
print (f"Title - {title}")
print ("---")
print (f"Actual rating: \t\t ?")
print (f"Predicted rating: \t {pred_rating}")

In [None]:
title = "Grumpier Old Men (1995)"
pred_rating = collab_generate_rating_estimate(movie_title = title, user = 3)
print (f"Title - {title}")
print ("---")
print (f"Actual rating: \t\t ?")
print (f"Predicted rating: \t {pred_rating}")