In [1]:
import os
import re
import numpy as np
import pandas as pd
import unicodedata
import math
import copy
from zipfile import ZipFile
# from zipfile import ZipFile as Zip

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
os.mkdir("/content/rhugaved_data")
os.chdir("/content/rhugaved_data")

You need to upload the netflix.zip file to rhugaved_data folder and then run the next cells


In [4]:
for filename in os.listdir("/content/rhugaved_data"):
    if not os.path.isdir(filename):
        with ZipFile(filename, 'r') as zip:
            # extracting all the files
            zip.extractall()

In [7]:
# Reading the Train and Test File
train_data = pd.read_csv("TrainingRatings.txt", names=['movie_id', 'user_id', 'rating'], dtype={0:'int32',1:'int32',2:'int8'})

test_data = pd.read_csv("TestingRatings.txt", names=['movie_id', 'user_id', 'rating'], dtype={0:'int32',1:'int32',2:'int8'})

train_data

Unnamed: 0,movie_id,user_id,rating
0,8,1744889,1
1,8,1395430,2
2,8,1205593,4
3,8,1488844,4
4,8,1447354,1
...,...,...,...
3255347,17742,46222,3
3255348,17742,2534701,1
3255349,17742,208724,3
3255350,17742,483107,2


In [6]:
len(test_data['user_id'].unique())

27555

In [8]:
# Finding the Unique values for all the columns
movie_ids_unique = train_data['movie_id'].unique()
user_ids_unique = train_data['user_id'].unique()
ratings_unique = train_data['rating'].unique()
user_ids_unique.shape

(28978,)

In [9]:
# Making an np array from the training dataframe
train_np = pd.pivot_table(train_data, values="rating", index="movie_id", columns="user_id").fillna(0).to_numpy()
train_np



array([[5., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Initializing the weights to 0
weights = np.zeros((train_np.shape[1], train_np.shape[1]))
print(weights.shape)

(28978, 28978)


In [11]:
# Function to calculate mean ratings
def calculate_mean(row):
    s = sum(row)
    return s/np.count_nonzero(row)

In [13]:
# Finding and storing means of all the users for future use
users_means = np.zeros(train_np.shape[1])

for i, users in enumerate(train_np.transpose()):
    users_means[i] = calculate_mean(users)
users_means.shape

(28978,)

In [14]:
train_np.shape

(1821, 28978)

In [15]:
# We calculate v(i,j) - mean(v(i))
train_np = np.where(train_np, train_np - users_means, train_np)
train_np

array([[1.09615385, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.09615385, 0.        , 0.        , ..., 0.        , 0.        ,
        1.01923077],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [16]:
# We calculate (v(i,j) - mean(v(i)))^2 and store it for future use in weight calculation
users_squares = np.einsum('ij,ji->i', train_np.transpose(), train_np)
users_squares.shape

(28978,)

In [17]:
# CALCULATING THE WEIGHTS for USING THE TRAINGIn DATA
for i, users in enumerate(train_np.transpose()):
  
    deno_first_term = np.matmul(users.transpose(), users)
    weights[i] = np.divide(np.matmul(users, train_np), np.sqrt(np.multiply(users_squares[i], deno_first_term)))


In [18]:
train_np.transpose().shape

(28978, 1821)

In [19]:
weights.shape

(28978, 28978)

In [20]:
# Value of K
users_k = 1 / np.sum(weights, axis=1)
users_k.shape

(28978,)

In [21]:
def predict():
    output = np.matmul(weights[:28978, :], train_np.transpose())
    output *= users_k[:28978].reshape(28978, 1)
    output += users_means[:28978].reshape(28978, 1)
    return output
output = predict()

In [22]:
output.shape

(28978, 1821)

In [25]:
temp = pd.pivot_table(train_data, values="rating", index="movie_id", columns="user_id").fillna(0)

In [26]:
# Storing a list of users and movies in order that they are present in temp array
users_list = list(temp.columns)
movies_list = list(temp.index)

u = list(np.arange(len(users_list)))
m = list(np.arange(len(movies_list)))

# Storing a dictionary of users and movies in order that they are present in temp array and their index
# users_dict = dict(zip(users_list, u))
users_dict = dict(map(lambda i,j : (i,j) , users_list,u))
# movies_dict = dict(zip(movies_list, m))
movies_dict = dict(map(lambda i,j : (i,j) , movies_list,u))



In [27]:
test_temp = pd.pivot_table(test_data, values="rating", index="movie_id", columns="user_id").fillna(0)


In [28]:
test_temp

user_id,7,79,199,481,769,906,1310,1333,1427,1442,...,2648572,2648589,2648730,2648734,2648853,2648869,2648885,2649120,2649267,2649285
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
print(sum(test_temp.iloc[8]))

24.0


In [41]:
# TRINAING ABSOLUTE ERROR FOR 10K

ae = 0
mse = 0
total = 0
for i in list(users_dict.keys())[:10000]:
    for j in movies_dict:
        t = temp[i][j]
        if t:
            p = output[users_dict[i]][movies_dict[j]]
            if math.isnan(p):
                continue
            ae += abs(t - p)
            mse += ((t-p) * (t-p))
            total += 1

            

In [43]:
# TRAINING ERROR:
print(ae/total)
print(math.sqrt(mse/total))

0.7311922948753752
2.0225304484670485


In [44]:
temp[7][8]

5.0

0.7311922948753752

In [33]:
#TESTING ABSOLUTE ERROR FOR 10K
ae = 0
mse = 0
total = 0
for i in list(users_dict.keys())[:28900]:
    for j in movies_dict:
        try:
            t = test_temp[i][j]
        except:
            continue
        if t:
            p = output[users_dict[i]][movies_dict[j]]
            if math.isnan(p):
                continue
            ae += abs(t - p)
            mse += ((t-p) * (t-p))
            total += 1

In [34]:
# TESTING ERROR:
print(ae/total)
print(math.sqrt(mse/total))

0.7836825201643344
3.729596269279923
