In [1]:
import pandas as pd
import torch
import numpy as np


## Data Preprocessing

In [2]:
# Load the three pre-split rating files (train / validation / test) in one
# pass; the tuple order below matches the order of the target names.
train, valid, test = map(
    pd.read_csv,
    (
        '../Data/UserRec/rating_train.csv',
        '../Data/UserRec/rating_validation.csv',
        '../Data/UserRec/rating_test.csv',
    ),
)

In [3]:
# Reshape the long-format training ratings into a user x movie matrix:
# rows are userId, columns are movieId, cells hold the rating. Pairs with no
# training rating come out as NaN (pivot_table averages duplicates by default).
train_table = pd.pivot_table(
    train,
    values='rating',
    index='userId',
    columns='movieId',
)
train_table

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,,,2.5,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,,,,,,4.0,,,...,,,,,,,,,,
609,3.0,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Column-wise mean of the rating matrix, i.e. one average per movieId.
# NaN cells (unrated pairs) are skipped by pandas' default skipna behaviour.
movie_mean = train_table.mean(axis=0)
movie_mean

movieId
1         3.974286
2         3.427778
3         3.255556
5         3.024390
6         3.929412
            ...   
174055    3.423077
176371    3.750000
177765    3.416667
179819    3.500000
187593    3.800000
Length: 2121, dtype: float64

In [5]:
# Impute every missing rating with the mean of its movie: passing a Series to
# fillna matches the Series index (movieId) against the column labels.
train_avg_fill = train_table.fillna(value=movie_mean)
train_avg_fill


movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.000000,3.427778,4.000000,3.02439,4.000000,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
2,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
3,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
4,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
5,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.500000,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,2.500000,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
607,4.000000,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
608,2.500000,2.000000,3.255556,3.02439,3.929412,3.177778,3.107143,4.000000,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
609,3.000000,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8


In [6]:
# Reduced ("economy") SVD of the mean-filled rating matrix.
# NOTE: numpy returns the right singular vectors already transposed, so the
# third factor `V` is V^T (its rows are the singular vectors).
svd_factors = np.linalg.svd(train_avg_fill, full_matrices=False)
U, S, V = svd_factors
tuple(factor.shape for factor in (U, S, V))


((608, 608), (608,), (608, 2121))

In [7]:
# Number of leading singular values/vectors used to rebuild the rating matrix.
# The default keeps all of them, which reproduces `train_avg_fill` (up to
# floating-point error) -- i.e. every unrated cell is predicted as the movie
# mean. Set this to a smaller integer to obtain a genuinely low-rank
# approximation.
svd_rank = len(S)

# Scaling the columns of U by S via broadcasting is equivalent to
# U.dot(np.diag(S)) without materialising the dense diagonal matrix.
# `V` is already V^T as returned by np.linalg.svd, so no transpose is needed.
rating_hat_svd = pd.DataFrame(
    (U[:, :svd_rank] * S[:svd_rank]) @ V[:svd_rank, :],
    index=train_table.index,
    columns=train_table.columns,
)
rating_hat_svd

movieId,1,2,3,5,6,7,9,10,11,12,...,159093,164179,166528,168250,168252,174055,176371,177765,179819,187593
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.000000,3.427778,4.000000,3.02439,4.000000,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
2,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
3,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
4,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
5,3.974286,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.500000,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,2.500000,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
607,4.000000,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
608,2.500000,2.000000,3.255556,3.02439,3.929412,3.177778,3.107143,4.000000,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8
609,3.000000,3.427778,3.255556,3.02439,3.929412,3.177778,3.107143,3.466667,3.652542,2.5625,...,3.428571,4.022727,3.95,3.607143,4.26087,3.423077,3.75,3.416667,3.5,3.8


### RMSE

In [9]:
# Preview the validation ratings (long format: userId, movieId, rating).
valid

Unnamed: 0,userId,movieId,rating
0,1,500,3.0
1,1,552,4.0
2,1,661,5.0
3,1,1029,5.0
4,1,1032,5.0
...,...,...,...
7957,610,122886,4.5
7958,610,122892,4.0
7959,610,122900,3.5
7960,610,135536,2.5


In [10]:
# Pivot the validation ratings into the same user x movie layout as the
# prediction table so the two can be compared cell by cell; unrated pairs
# are NaN.
valid_table = pd.pivot_table(
    valid,
    values='rating',
    index='userId',
    columns='movieId',
)

In [11]:
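# Explicit column intersection, left disabled: the DataFrame subtraction in
# rmse() aligns on row/column labels, so movies missing from either table
# become NaN and are excluded from the error automatically.
# movie_inter = valid_table.columns.intersection(rating_hat_svd.columns)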

In [12]:
def rmse(test, pred):
    """Root-mean-square error between observed and predicted ratings.

    ``test - pred`` is label-aligned by pandas, so a cell contributes only if
    its row and column labels exist in both inputs and both values are
    non-NaN; every other cell becomes NaN and is ignored.

    Parameters
    ----------
    test : pandas.DataFrame
        Observed ratings, NaN where no rating is known.
    pred : pandas.DataFrame
        Predicted ratings.

    Returns
    -------
    numpy.float64
        Square root of the mean squared error over the compared cells, or
        NaN when the inputs share no comparable cell.
    """
    error_df = test - pred
    count = np.sum(error_df.count())
    if count == 0:
        # Nothing to compare: return NaN explicitly rather than dividing 0/0,
        # which would emit a RuntimeWarning.
        return np.float64(np.nan)
    # Reduce per column with an explicit axis first; np.sum(DataFrame) relies
    # on the axis=None behaviour that pandas has deprecated.
    squared_total = np.sum((error_df ** 2).sum(axis=0))
    return np.sqrt(squared_total / count)

# Validation RMSE of the SVD reconstruction. Only (user, movie) cells present
# in both tables contribute: the subtraction inside rmse aligns on labels and
# the resulting NaN cells are left out of the average.
rmse(test=valid_table, pred=rating_hat_svd)

0.9316328744216835

### Save prediction table

In [14]:
# Persist the full user x movie prediction table (index and header included)
# so it can be reloaded without recomputing the SVD.
prediction_path = '../Data/UserRec/Prediction/rating_hat_svd.csv'
rating_hat_svd.to_csv(prediction_path)

## NN