In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 4/SENTIMENT ANALYSIS AND RECOMMENDER SYSTEMS PART 4 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 9: Loading the packages  ####

import os
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error




In [None]:
#=================================================-
#### Slide 10: Loading the packages  ####

from math import sqrt
from scipy.sparse.linalg import svds
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate




In [None]:
#=================================================-
#### Slide 11: Directory settings  ####

# Project layout, resolved from the directory the notebook is run from:
#   home_dir - the working directory itself
#   main_dir - its parent, i.e. the project folder
#   data_dir - the "data" folder inside the project folder
from pathlib import Path

home_dir = Path(".").resolve()
main_dir = home_dir.parent
# data_dir stays a plain string because later cells build file paths from it
# by string concatenation.
data_dir = "{}{}".format(main_dir, "/data")




In [None]:
#=================================================-
#### Slide 12: Load the subset of data  ####

# Load the ratings subset from the data folder.
rating_subset = pd.read_csv(data_dir + '/ratings-subset.csv')
# Load the movies subset, keeping only the movie ID and the title.
movies_subset = pd.read_csv(data_dir + '/movies-subset.csv')[['movieId', 'title']]
# Attach the titles to the ratings. No join key is given, so pandas joins on
# the column names the two frames have in common (presumably 'movieId').
rating_df = movies_subset.merge(rating_subset)
# Preview the first rows of the merged dataframe.
print(rating_df.head())




In [None]:
#=================================================-
#### Slide 13: Item-based recommender implementation  ####

# Reshape the long ratings table into a user x title matrix: one row per
# userId, one column per movie title, cell value = rating. Titles a user has
# not rated come out as NaN.
userRating = pd.pivot_table(rating_df, values = 'rating',
                            index = ['userId'], columns = ['title'])

print(userRating.head())




In [None]:
#=================================================-
#### Slide 14: Item correlation matrix  ####

# The item-item correlation matrix is loaded from a CSV in the data folder
# rather than recomputed here. The two commented-out lines are the recipe that
# computes it from userRating and writes it to disk:
# corrMatrix = userRating.corr(method = 'pearson', min_periods = 100)
# corrMatrix.to_csv('corrMatrix.csv', index = True, encoding = 'utf-8')
corr_from_csv = pd.read_csv(data_dir+ '/corrMatrix.csv')
print(corr_from_csv.head())

# Use the 'title' column as the row index, so that selecting a film's column
# later yields a Series labelled by movie title.
corrMatrix = corr_from_csv.set_index('title')




In [None]:
#=================================================-
#### Slide 16: Suggest movies to user  ####

user_id = 25

# NOTE(review): `user_id` is passed to `.iloc`, so it selects a row *position*
# of userRating, not the row labelled userId == 25 -- confirm this is intended.
# Films this user has rated (NaN = not rated), as a title -> rating Series.
rated_films = userRating.iloc[user_id].dropna()

# For every rated film, take that film's correlations with all other films and
# weight them by the rating the user gave it. The pieces are collected in a
# list and joined with one pd.concat: Series.append no longer exists in
# pandas 2.0, and appending inside a loop re-copies the data on every pass.
weighted_corrs = [corrMatrix[film].dropna() * rated_films[film]
                  for film in rated_films.index]

# pd.concat raises on an empty list, so fall back to an empty float Series
# when the user has no rated films.
if weighted_corrs:
    user_corr = pd.concat(weighted_corrs)
else:
    user_corr = pd.Series(dtype = 'float64')

# The same title appears once per rated film; group by title and sum the
# weighted correlations to get a single score per candidate movie.
user_corr = user_corr.groupby(user_corr.index).sum()




In [None]:
#=================================================-
#### Slide 17: Suggest movies to user  ####

# Movies the user has already rated should not be suggested again: collect the
# rated titles that are present in user_corr and drop them from it.
rated_titles = userRating.iloc[user_id].dropna().index
title_list = [title for title in rated_titles if title in user_corr]
user_corr = user_corr.drop(title_list)




In [None]:
#=================================================-
#### Slide 18: Suggest movies to user  ####

# First list the films the user has already rated ...
print('Hi! Based on the films that you have seen, you might like: \n')
for seen_title in userRating.iloc[user_id].dropna().index:
    print(seen_title)

# ... then the 10 candidates with the highest summed weighted correlation.
print('\n I would suggest that you watch: \n')
ranked_candidates = user_corr.sort_values(ascending = False)
for suggested_title in ranked_candidates.index[:10]:
    print(suggested_title)




In [None]:
#=================================================-
#### Slide 20: Exercise 1  ####






In [None]:
#=================================================-
#### Slide 27: Find total users and movies  ####

# Read the tab-separated ratings file, keeping only the three columns that the
# following cells use.
ratings = pd.read_csv(
    data_dir+ '/ratings.csv',
    sep = '\t',
    encoding = 'latin-1',
    usecols = ['user_id', 'movie_id', 'rating'],
)
# Count the distinct users and the distinct movies that appear in the ratings.
n_users = len(ratings['user_id'].unique())
n_movies = len(ratings['movie_id'].unique())
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))




In [None]:
#=================================================-
#### Slide 28: Data preparation for SVD  ####

# Build the user x movie rating matrix (rows: user_id, columns: movie_id).
# Movies a user has not rated come out of the pivot as NaN and are set to 0.
Ratings = ratings.pivot(index = 'user_id', columns = 'movie_id', values = 'rating')
Ratings = Ratings.fillna(0)

print(Ratings.head())




In [None]:
#=================================================-
#### Slide 29: De-normalize the data and check the amount of sparsity  ####

# Center every user's row: subtract that user's mean rating from each entry.
# The mean is taken over the whole row of R, i.e. it includes the zeros that
# stand in for unrated movies.
R = Ratings.to_numpy()

user_ratings_mean = R.mean(axis = 1)
Ratings_demeaned = R - user_ratings_mean[:, np.newaxis]

# Sparsity = share of user/movie pairs that have no rating, rounded to three
# decimals and reported as a percentage.
n_possible_ratings = float(n_users * n_movies)
sparsity = round(1.0 - len(ratings) / n_possible_ratings, 3)
print('The sparsity level of MovieLens1M dataset is ' + str(sparsity * 100) + '%')




In [None]:
#=================================================-
#### Slide 30: SVD implementation  ####

# Truncated SVD of the de-meaned rating matrix, keeping k = 50 singular values.
U, singular_values, Vt = svds(Ratings_demeaned, k = 50)

# The singular values come back as a 1-D array; place them on the diagonal of
# a square matrix so they can be used in a matrix product.
sigma = np.diag(singular_values)




In [None]:
#=================================================-
#### Slide 31: SVD implementation  ####

# Rebuild the rating matrix from the three SVD factors, then add each user's
# mean rating back onto that user's row.
low_rank_ratings = U @ sigma @ Vt
all_user_predicted_ratings = low_rank_ratings + user_ratings_mean.reshape(-1, 1)




In [None]:
#=================================================-
#### Slide 32: SVD implementation  ####

# Wrap the predicted ratings in a dataframe whose columns are the movie IDs of
# the Ratings matrix; the rows get a default 0-based index.
preds = pd.DataFrame(data = all_user_predicted_ratings,
                     columns = Ratings.columns)
print(preds.head())




In [None]:
#=================================================-
#### Slide 33: SVD implementation  ####

def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    """Return the movies a user has rated and the best-predicted unrated movies.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted ratings with one row per user and one column per movie ID.
        The row for `userID` is selected by position (`userID - 1`), so the
        rows are assumed to be ordered by user ID, starting at 1, with no gaps.
    userID : int
        1-based ID of the user to recommend for.
    movies : pandas.DataFrame
        Movie metadata; must contain a 'movie_id' column.
    original_ratings : pandas.DataFrame
        Observed ratings; must contain 'user_id', 'movie_id' and 'rating'.
    num_recommendations : int
        Maximum number of recommendations to return.

    Returns
    -------
    user_full : pandas.DataFrame
        The user's ratings joined with the movie metadata, highest rating first.
    recommendations : pandas.DataFrame
        Rows of `movies` the user has not rated, ordered by predicted rating
        (highest first) and cut to `num_recommendations` rows. The predicted
        rating itself is not included in the returned columns.

    Also prints a two-line summary of what is being recommended.
    """
    user_row_number = userID - 1 # User ID starts at 1, not 0
    # Read from the `predictions` argument rather than a notebook-level
    # variable, so the function works with whatever frame the caller passes.
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending = False)
    # Build the lookup table with explicit column names, so it does not depend
    # on how the columns or rows of `predictions` happen to be labelled.
    predicted = pd.DataFrame({'movie_id': sorted_user_predictions.index,
                              'Predictions': sorted_user_predictions.values})
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.user_id == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movie_id', right_on = 'movie_id').sort_values(['rating'], ascending=False))
    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already     rated.'.format(num_recommendations))
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    not_yet_rated = movies[~movies['movie_id'].isin(user_full['movie_id'])]
    recommendations = (not_yet_rated.
         merge(predicted, how = 'left',
               left_on = 'movie_id', right_on = 'movie_id').
         sort_values('Predictions', ascending = False).
         # 'Predictions' is the last column after the merge; ':-1' drops it.
         iloc[:num_recommendations, :-1])
    return user_full, recommendations




In [None]:
#=================================================-
#### Slide 34: Recommend movies using SVD  ####

# Read the tab-separated movies file (ID, title and genres only).
movies = pd.read_csv(
    data_dir+ '/movies.csv',
    sep = '\t',
    encoding = 'latin-1',
    usecols = ['movie_id', 'title', 'genres'],
)
# For user 1310: the movies already rated, plus the 20 unrated movies with the
# highest predicted rating.
already_rated, predictions = recommend_movies(
    predictions = preds,
    userID = 1310,
    movies = movies,
    original_ratings = ratings,
    num_recommendations = 20,
)




In [None]:
#=================================================-
#### Slide 35: Recommend movies using SVD  ####

# The movies that user 1310 has already rated, as returned by recommend_movies.
print(already_rated.loc[:, ['user_id', 'title']])




In [None]:
#=================================================-
#### Slide 36: Recommend movies using SVD  ####

# The 20 movies recommended to user 1310 (not yet rated by that user).
print(predictions.loc[:, ['movie_id', 'title']])




In [None]:
#=================================================-
#### Slide 40: Model evaluation  ####

# Create a surprise Reader with its default settings.
reader = Reader()

# Wrap the ratings dataframe in a surprise Dataset, passing the user, item and
# rating columns in that order.
data = Dataset.load_from_df(ratings.loc[:, ['user_id', 'movie_id', 'rating']], reader)




In [None]:
#=================================================-
#### Slide 41: Model evaluation: compute RMSE  ####

# Instantiate surprise's SVD algorithm with its default settings.
svd = SVD()
# Evaluate it with 5-fold cross-validation, measuring RMSE; verbose=True
# prints the results while it runs.
evaluate_model = cross_validate(svd, data, cv = 5, measures = ['RMSE'], verbose = True)




In [None]:
#=================================================-
#### Slide 42: Model evaluation: fitting the model  ####

# Build a training set from the whole dataset (no hold-out split) and fit the
# SVD model on it, so the model can be used for predictions afterwards.
trainset = data.build_full_trainset()
svd.fit(trainset)




In [None]:
#=================================================-
#### Slide 43: Model evaluation: prediction  ####

# First rows of user 1310's prior ratings. Printed explicitly: a notebook cell
# only displays the value of its last expression, so a bare `.head()` on this
# line would be computed and then silently discarded.
print(ratings[ratings['user_id'] == 1310].head())
# Predicted rating of user 1310 for movie ID 1994 (shown as the cell output).
svd.predict(1310, 1994)


