In [1]:
import pandas as pd
import numpy as np

In [3]:
# Use the read_csv function to store the dataset into a dataframe called ratings
ratings = pd.read_csv('D:\\owd1\\Documents\\GitHub-REPO\\RecommenderSystemLabs\\docs\\data\\ccai422_lab03_part1_data.csv')

# Keep only the columns that we need in this task: "user_id", "movie_id" and "rating"
ratings = ratings[["user_id", "movie_id", "rating"]]


In [4]:
# The total number of data points
print('The number of data points in this dataset: ' + str(len(ratings)))

# The number of items (i.e. movies) in the dataset
print('The number of items (i.e. movies) in the dataset: ' + str(ratings['movie_id'].nunique()))

# The number of users in the dataset
print('The number of users in the dataset: ' + str(ratings['user_id'].nunique()))

# The average ratings per user
ratings_per_users = ratings.groupby('user_id').count()
print('The average ratings per user: '+ str(round(ratings_per_users.mean()[0],2)))

# The number of ratings/user
print('The below table shows the number of ratings per user\n')
print(ratings_per_users)


The number of data points in this dataset: 100000
The number of items (i.e. movies) in the dataset: 1682
The number of users in the dataset: 943
The average ratings per user: 106.04
The below table shows the number of ratings per user

         movie_id  rating
user_id                  
1             272     272
2              62      62
3              54      54
4              24      24
5             175     175
...           ...     ...
939            49      49
940           107     107
941            22      22
942            79      79
943           168     168

[943 rows x 2 columns]


  print('The average ratings per user: '+ str(round(ratings_per_users.mean()[0],2)))


In [5]:
#Build the ratings matrix using pivot_table function
r_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')

#Create a dummy ratings matrix which will have all null values imputed to 0
r_matrix_dummy = r_matrix.copy()

# rename the axis of the new matrix
r_matrix_dummy = r_matrix_dummy.rename_axis('user_id', axis=1).rename_axis(None, axis=0)

# Impute all the NaN values to 0
r_matrix_dummy = r_matrix_dummy.fillna(0)
r_matrix_dummy.head()

r_matrix_dummy = r_matrix_dummy.fillna(0)
r_matrix_dummy.head()


user_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#Get the transpose of rating matrix to compute the pearson correlation between the users not the items
users_rating_matrix = r_matrix_dummy.T

# Use the corr function of pandas to compute the pearson correlation on the users_ratings_matrix
pearson_sim = users_rating_matrix.corr()


In [8]:
# Randomly select a user to rate an item for him/her
userX= 5

# Get the rating data of the specified user and compute the mean value
rXmean=r_matrix[userX].mean()

# Specify the n neighbors to be used in the prediction
n = 2

# Retrieve the top n based on the pearson sim (ignore the first one since it is the item with itself)
topn = pearson_sim[[userX]].nlargest(n+1,userX).index.tolist()[1:]

# Retrieve the similarity values to be used 
neighbors_sim = pearson_sim[[userX]].nlargest(n+1,userX)[1:]

# Get the rating data for the top n neighbors
r_matrix_topn = r_matrix[topn]

# Compute the mean rating's value per neighbor
neighbors_means = r_matrix_topn.mean()

# Compute the differences between the mean rating's value per user and his/her actual ratings
averaged_neighbors_ratings = r_matrix_topn.sub(neighbors_means,axis=1)


In [9]:
# Select all unrated items for the target user
unrated_target = r_matrix[r_matrix[userX].isna()][topn]

# rename the axis of the unrated item matrix
unrated_target = unrated_target.rename_axis('movie_id', axis=1).rename_axis(None, axis=0)

# Remove items that are not rated by all top n neighbors
unrated_target.dropna(axis = 0, how = 'all', inplace = True)


unrated_target.head()


movie_id,307,22
2,3.0,
3,3.0,
6,,3.0
7,5.0,5.0
8,,5.0


In [10]:
# Randomly select the item to be rated
itemX = 7

# Predict the rating value for the unrated item
predicted_value = rXmean + ((neighbors_sim.T.dot(averaged_neighbors_ratings.loc[itemX].T).values[0]) / neighbors_sim.sum())
