In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import scipy
from scipy import spatial

In [2]:
# Dataset dimensions shared by the cells below:
#   no_of_users  - rows of the user x movie and user x user matrices
#   no_of_movies - columns of the user x movie rating matrix
#   no_of_genres - length of each movie's genre indicator vector
# NOTE(review): the figures match MovieLens 100K - confirm the data source.
no_of_users, no_of_movies, no_of_genres = 943, 1682, 19

In [3]:
# Load the raw rating records; the cells below read column 0 as the user id,
# column 1 as the movie id and column 2 as the rating.
rating_data = pd.read_csv("./dataset/data.csv")
all_records = rating_data.values

# Rows 20000..39999 are held out for evaluation ...
test_dataset = all_records[20000:40000]

# ... and everything before and after that slice forms the training set.
rating_1 = all_records[:20000]
rating_2 = all_records[40000:]
rating = np.concatenate((rating_1, rating_2), axis=0)


In [4]:
# Calculating Sr

# User x movie matrix: entry [u-1, m-1] is the rating user u gave movie m,
# and stays 0 where the training data has no such rating.
ratings_matrix = np.zeros((no_of_users, no_of_movies), dtype=float)
for record in rating:
    ratings_matrix[record[0] - 1, record[1] - 1] = record[2]

# Sr[a, b] is scipy's Jaccard value for the rating rows of users a+1 and b+1.
# NOTE(review): scipy documents jaccard() as a dissimilarity (the diagonal of
# the printed matrix is 0), so smaller values mean more alike users.
Sr = np.zeros((no_of_users, no_of_users), dtype=float)
for row_idx, user_1 in enumerate(ratings_matrix):
    for col_idx, user_2 in enumerate(ratings_matrix):
        Sr[row_idx, col_idx] = scipy.spatial.distance.jaccard(user_1, user_2)

print(Sr)

[[0.         0.98804781 0.99604743 ... 0.99145299 0.98550725 0.93987342]
 [0.98804781 0.         0.98666667 ... 0.96666667 0.97297297 0.98019802]
 [0.99604743 0.98666667 0.         ... 1.         0.99099099 1.        ]
 ...
 [0.99145299 0.96666667 1.         ... 0.         0.98969072 1.        ]
 [0.98550725 0.97297297 0.99099099 ... 0.98969072 0.         0.96929825]
 [0.93987342 0.98019802 1.         ... 1.         0.96929825 0.        ]]


In [5]:
# Calculating Sb

movies_data = pd.read_csv("./dataset/item.csv")
movies_data = movies_data.drop('movie_id', axis=1)
movies = movies_data.values
# movies[i-1] is the genre vector of movie i;
# movies[i-1][j-1] is the 1/0 flag telling whether movie i belongs to genre j.

# behaviour_matrix[u-1] accumulates the genre vectors of every movie that
# user u rated in the training data, i.e. a per-genre count for that user.
behaviour_matrix = np.zeros((no_of_users, no_of_genres), dtype=float)
for record in rating:
    behaviour_matrix[record[0] - 1] += movies[record[1] - 1]

# Scale every genre column by the number of users with a non-zero count in it.
# A genre that no user has touched keeps an all-zero column instead of
# becoming 0/0 = NaN, which would distort every pairwise value computed below.
# The result is a new array, so behaviour_matrix keeps the raw counts.
users_per_genre = np.count_nonzero(behaviour_matrix, axis=0)
probability_matrix = np.divide(
    behaviour_matrix,
    users_per_genre,
    out=np.zeros_like(behaviour_matrix),
    where=users_per_genre != 0,
)

# Genre x user view of the normalised matrix, kept under its existing name.
intermediate_matrix = probability_matrix.T

# Sb[a, b] is scipy's Jaccard value for the genre profiles of users a+1, b+1.
# NOTE(review): scipy documents jaccard() as a dissimilarity, so smaller
# values mean more alike users.
Sb = np.zeros((no_of_users, no_of_users), dtype=float)
for row_idx, user_1 in enumerate(probability_matrix):
    for col_idx, user_2 in enumerate(probability_matrix):
        Sb[row_idx, col_idx] = scipy.spatial.distance.jaccard(user_1, user_2)

print(Sb)

[[0.         0.94736842 1.         ... 1.         1.         0.94444444]
 [0.94736842 0.         1.         ... 0.9375     1.         0.94117647]
 [1.         1.         0.         ... 1.         0.94117647 1.        ]
 ...
 [1.         0.9375     1.         ... 0.         1.         1.        ]
 [1.         1.         0.94117647 ... 1.         0.         0.94117647]
 [0.94444444 0.94117647 1.         ... 1.         0.94117647 0.        ]]


In [6]:
# Create user profile vectors
# Each user becomes the numeric vector [age, gender code, occupation code, zip].
# If zip code is not a plain decimal number we consider it to be 0

# Numeric code for each gender label found in user.csv
gender = {
    'M': 0,
    'F': 1
}

# Numeric code for each occupation label found in user.csv
occupation = {
    'administrator': 1,
    'artist' : 2,
    'doctor' : 3,
    'educator' : 4,
    'engineer' : 5,
    'entertainment' : 6,
    'executive' : 7,
    'healthcare' : 8,
    'homemaker' : 9,
    'lawyer' : 10,
    'librarian' : 11,
    'marketing' : 12,
    'none' : 13,
    'other' : 14,
    'programmer' : 15,
    'retired' : 16,
    'salesman' : 17,
    'scientist' : 18,
    'student' : 19,
    'technician' : 20,
    'writer' : 21
}

user_data = pd.read_csv("./dataset/user.csv")
user_data = user_data.drop('user_id', axis=1)
user_profile_data = user_data.values

# One row per record actually present in the file.
user_profile = np.zeros((len(user_profile_data), 4), dtype=float)

for index, profile in enumerate(user_profile_data):
    user_profile[index][0] = profile[0]
    user_profile[index][1] = gender[profile[1]]
    user_profile[index][2] = occupation[profile[2]]

    # str() lets an already-numeric or missing (NaN) zip value through without
    # an AttributeError. isdecimal() accepts exactly the digit strings int()
    # can parse, whereas isnumeric() also accepts characters such as fractions
    # or CJK numerals that make int() raise ValueError.
    zip_text = str(profile[3])
    user_profile[index][3] = int(zip_text) if zip_text.isdecimal() else 0

print(user_profile)

[[2.4000e+01 0.0000e+00 2.0000e+01 8.5711e+04]
 [5.3000e+01 1.0000e+00 1.4000e+01 9.4043e+04]
 [2.3000e+01 0.0000e+00 2.1000e+01 3.2067e+04]
 ...
 [2.0000e+01 0.0000e+00 1.9000e+01 9.7229e+04]
 [4.8000e+01 1.0000e+00 1.1000e+01 7.8209e+04]
 [2.2000e+01 0.0000e+00 1.9000e+01 7.7841e+04]]


In [7]:
# Calculate alpha and beta
import statsmodels.api as sm
from statsmodels.formula.api import ols

profile_columns = ["age", "gender", "occupation", "location"]
user_rows = range(len(user_profile))

# Per-user mean over the non-zero entries of the rating matrix and of the
# genre probability matrix; these are the regression targets.
mean_rating = np.array([
    np.sum(ratings_matrix[u]) / np.count_nonzero(ratings_matrix[u])
    for u in user_rows
])
mean_behaviour = np.array([
    np.sum(probability_matrix[u]) / np.count_nonzero(probability_matrix[u])
    for u in user_rows
])

# The four profile features followed by the target in the fifth column.
user_profile_rating = np.column_stack((user_profile, mean_rating))
user_profile_behaviour = np.column_stack((user_profile, mean_behaviour))

# alpha: square root of R^2 when regressing the mean rating on the profile.
data_user_rating = pd.DataFrame(user_profile_rating, columns=profile_columns + ["rating"])
model_1 = ols("rating ~ age + gender + occupation + location", data=data_user_rating).fit()
alpha = model_1.rsquared ** 0.5

# beta: square root of R^2 when regressing the mean behaviour value on the profile.
data_user_behaviour = pd.DataFrame(user_profile_behaviour, columns=profile_columns + ["behaviour"])
model_2 = ols("behaviour ~ age + gender + occupation + location", data=data_user_behaviour).fit()
beta = model_2.rsquared ** 0.5

print("Alpha value : ", alpha, "\nBeta value : ", beta)

Alpha value :  0.10599478207448824 
Beta value :  0.16251755972256937


In [8]:
# Calculate S
# Weighted combination of the two user x user matrices: alpha scales the
# rating-based matrix Sr and beta scales the behaviour-based matrix Sb.
# No pre-allocation is needed; the sum creates the result array.
term_1 = alpha * Sr
term_2 = beta * Sb
S = term_1 + term_2
print(S)

[[0.         0.25869192 0.26809339 ... 0.2676064  0.26697619 0.25311048]
 [0.25869192 0.         0.26709908 ... 0.25482183 0.26564762 0.25685358]
 [0.26809339 0.26709908 0.         ... 0.26851234 0.25799758 0.26851234]
 ...
 [0.2676064  0.25482183 0.26851234 ... 0.         0.26741961 0.26851234]
 [0.26697619 0.26564762 0.25799758 ... 0.26741961 0.         0.25569826]
 [0.25311048 0.25685358 0.26851234 ... 0.26851234 0.25569826 0.        ]]


In [9]:
import math

def make_prediction_upcsim(user_for_prediction, movie_for_prediction, k):
  """Predict the rating a user would give a movie from up to k neighbours.

  Neighbours are the users with the smallest values in the target user's row
  of the global matrix ``S`` who have a non-zero entry for the movie in the
  global ``ratings_matrix``; the target user is never their own neighbour.
  The prediction is the target user's mean rating plus the S-weighted average
  of each neighbour's deviation from that neighbour's own mean rating.

  Args:
    user_for_prediction: 1-based user id (row ``id - 1`` of both matrices).
    movie_for_prediction: 1-based movie id (column ``id - 1``).
    k: maximum number of neighbours to use; fewer are used when fewer users
      have rated the movie.

  Returns:
    The predicted rating. When no neighbour is available (or all weights are
    zero) this is the user's mean rating; a user with no ratings at all
    contributes a mean of 0 rather than NaN.

  NOTE(review): elsewhere in this notebook S is assembled from
  scipy.spatial.distance.jaccard values, which scipy documents as
  dissimilarities. Picking the smallest values therefore picks the closest
  users, but the same values are used as weights here - confirm intended.
  """
  user_index = int(user_for_prediction) - 1
  movie_index = int(movie_for_prediction) - 1

  similarities = S[user_index]
  movie_ratings = ratings_matrix[:, movie_index]

  # Walk the users in ascending order of S and keep the first k that actually
  # rated the movie. Only real neighbours are collected, so no placeholder
  # entries are processed when fewer than k exist.
  neighbour_indices = []
  for candidate in np.argsort(similarities):
    if len(neighbour_indices) >= k:
      break
    if candidate != user_index and movie_ratings[candidate] != 0:
      neighbour_indices.append(candidate)

  # Mean of the target user's non-zero ratings; 0 if the user rated nothing,
  # which avoids a 0/0 = NaN prediction.
  user_ratings = ratings_matrix[user_index]
  rated_count = np.count_nonzero(user_ratings)
  user_average_rating = np.sum(user_ratings) / rated_count if rated_count else 0.0

  summation = 0
  total_abs_similarity = 0
  for neighbour_index in neighbour_indices:
    neighbour_ratings = ratings_matrix[neighbour_index]
    neighbour_similarity = similarities[neighbour_index]

    # A neighbour has rated this movie, so the count below is at least 1.
    mean_rating = np.sum(neighbour_ratings) / np.count_nonzero(neighbour_ratings)

    summation = summation + neighbour_similarity * (neighbour_ratings[movie_index] - mean_rating)
    total_abs_similarity = total_abs_similarity + np.abs(neighbour_similarity)

  # With no usable weight the deviation term is zero: fall back to the mean.
  if total_abs_similarity == 0:
    return user_average_rating

  return user_average_rating + summation / total_abs_similarity

In [10]:
# MAE Calculation
def MAE_Calculation(k):
  """Print the mean absolute error of the predictor over ``test_dataset``.

  Every row of the global ``test_dataset`` supplies a user id (column 0), a
  movie id (column 1) and the true rating (column 2). The prediction comes
  from ``make_prediction_upcsim`` with at most ``k`` neighbours. The held-out
  rows are already in memory, so no file is read here.

  Args:
    k: maximum number of neighbours used for each prediction.
  """
  no = len(test_dataset)
  print(no, "test values")

  total_mae_error = 0
  for test in test_dataset:
    # prediction by UPCSim versus the rating recorded in the test row
    prediction = make_prediction_upcsim(test[0], test[1], k)
    total_mae_error = total_mae_error + abs(prediction - test[2])

  # divide by the number of test values
  mae_error = total_mae_error / no
  print("MAE", mae_error)

In [11]:
# Mean absolute error on the held-out ratings with at most 20 neighbours per prediction
MAE_Calculation(20)

20000 test values
MAE 0.7606065592796457


In [12]:
# Mean absolute error on the held-out ratings with at most 40 neighbours per prediction
MAE_Calculation(40)

20000 test values
MAE 0.7577952507387333


In [13]:
# Mean absolute error on the held-out ratings with at most 60 neighbours per prediction
MAE_Calculation(60)

20000 test values
MAE 0.7570877830932414


In [14]:
# Mean absolute error on the held-out ratings with at most 80 neighbours per prediction
MAE_Calculation(80)

20000 test values
MAE 0.7573447159282203


In [15]:
# Mean absolute error on the held-out ratings with at most 100 neighbours per prediction
MAE_Calculation(100)

20000 test values
MAE 0.7577596800487795


In [16]:
# RMSE Calculation
def RMSE_Calculation(k):
  """Print the root-mean-square error of the predictor over ``test_dataset``.

  Each row of the global ``test_dataset`` provides a user id, a movie id and
  the true rating in columns 0, 1 and 2; predictions come from
  ``make_prediction_upcsim`` using at most ``k`` neighbours.

  Args:
    k: maximum number of neighbours used for each prediction.
  """
  sample_count = len(test_dataset)
  print(sample_count, "test values")

  squared_error_sum = 0
  for row in test_dataset:
    user_id, movie_id, actual_rating = row[0], row[1], row[2]

    # Squared gap between the UPCSim prediction and the recorded rating
    residual = make_prediction_upcsim(user_id, movie_id, k) - actual_rating
    squared_error_sum += residual ** 2

  # Root of the mean squared error across all test rows
  print("RMSE", math.sqrt(squared_error_sum / sample_count))

In [17]:
# Root-mean-square error on the held-out ratings with at most 20 neighbours per prediction
RMSE_Calculation(20)

20000 test values
RMSE 0.9675390663738045


In [18]:
# Root-mean-square error on the held-out ratings with at most 40 neighbours per prediction
RMSE_Calculation(40)

20000 test values
RMSE 0.9623833491562901


In [19]:
# Root-mean-square error on the held-out ratings with at most 60 neighbours per prediction
RMSE_Calculation(60)

20000 test values
RMSE 0.9618045321232445


In [20]:
# Root-mean-square error on the held-out ratings with at most 80 neighbours per prediction
RMSE_Calculation(80)

20000 test values
RMSE 0.9620900758142734


In [21]:
# Root-mean-square error on the held-out ratings with at most 100 neighbours per prediction
RMSE_Calculation(100)

20000 test values
RMSE 0.9630365092957567
