In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [14]:
# A separator longer than one character ("::") is parsed as a regular
# expression by pandas, which requires the Python parser engine.
ratings = pd.read_csv(
    "ratings.dat",
    sep="::",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    engine="python",
)

In [15]:
# Shuffled 5-fold split; random_state pins the shuffle so that the same
# train/test partition is produced on every run of the notebook.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Only the first of the five folds is used as the hold-out set.
_train, _test = next(kf.split(ratings))

# kf.split yields *positional* row indices, so rows are selected with .iloc
# (label-based .loc only agrees with it while the index is 0..n-1).
rating_columns = ["UserID", "MovieID", "Rating"]
train = ratings[rating_columns].iloc[_train]
test = ratings[rating_columns].iloc[_test]

train

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
1000200,6040,2028,5
1000203,6040,1090,3
1000204,6040,1091,1
1000206,6040,562,5


In [16]:
def update_U(index, M, U, V):
    """
    Re-estimate a single entry U[r, s] with all other entries held fixed.

    The new value minimises the squared error between row r of M and row r
    of U @ V, counting only the columns of M that are not NaN:

        U[r, s] = sum_j V[s, j] * (M[r, j] - sum_{k != s} U[r, k] * V[k, j])
                  / sum_j V[s, j] ** 2

    index: (r, s) tuple, row of U and factor to update
    M: 2d numpy array of ratings, NaN where no rating is known
    U: 2d numpy array (rows of M x factors), modified in place
    V: 2d numpy array (factors x columns of M)

    Returns U (the same array object that was passed in). If row r has no
    observed rating, or V[s, j] is zero for every observed column, the
    denominator is zero and U is returned unchanged instead of dividing.
    """
    r, s = index

    # Columns for which row r actually has a rating.
    observed = ~np.isnan(M[r, :])
    v_s = V[s, observed]

    denom = np.dot(v_s, v_s)
    if denom == 0:
        # Nothing to fit against: keep the current value.
        return U

    # Prediction for the observed columns with factor s taken out.
    partial = np.matmul(U[r, :], V[:, observed]) - U[r, s] * v_s
    U[r, s] = np.dot(v_s, M[r, observed] - partial) / denom

    return U


def update_V(index, M, U, V):
    """
    Re-estimate a single entry V[r, s] with all other entries held fixed.

    The new value minimises the squared error between column s of M and
    column s of U @ V, counting only the rows of M that are not NaN:

        V[r, s] = sum_i U[i, r] * (M[i, s] - sum_{k != r} U[i, k] * V[k, s])
                  / sum_i U[i, r] ** 2

    index: (r, s) tuple, factor and column of V to update
    M: 2d numpy array of ratings, NaN where no rating is known
    U: 2d numpy array (rows of M x factors)
    V: 2d numpy array (factors x columns of M), modified in place

    Returns V (the same array object that was passed in). If column s has no
    observed rating, or U[i, r] is zero for every observed row, the
    denominator is zero and V is returned unchanged instead of dividing.
    """
    r, s = index

    # Rows for which column s actually has a rating.
    observed = ~np.isnan(M[:, s])
    u_r = U[observed, r]

    denom = np.dot(u_r, u_r)
    if denom == 0:
        # Nothing to fit against: keep the current value.
        return V

    # Prediction for the observed rows with factor r taken out.
    partial = np.matmul(U[observed, :], V[:, s]) - u_r * V[r, s]
    V[r, s] = np.dot(u_r, M[observed, s] - partial) / denom

    return V

In [17]:
def train_iteration(M, U, V):
    """
    Run one sweep that updates every entry of U and every entry of V once.

    The entries of each matrix are visited in a random order (shuffled with
    NumPy's global RNG) and the sweep alternates between one U update and one
    V update until both work lists are exhausted.

    M: 2d numpy array of ratings, NaN where no rating is known
    U: 2d numpy array (rows of M x factors)
    V: 2d numpy array (factors x columns of M)

    Returns the updated (U, V) pair.
    """
    u_index = list(np.ndindex(U.shape))
    v_index = list(np.ndindex(V.shape))
    np.random.shuffle(u_index)
    np.random.shuffle(v_index)

    while u_index or v_index:
        if u_index:
            try:
                U = update_U(u_index.pop(), M, U, V)
            except ZeroDivisionError:
                # The update has nothing to fit (e.g. a row without any
                # observed rating); that entry keeps its current value.
                pass
        if v_index:
            try:
                V = update_V(v_index.pop(), M, U, V)
            except ZeroDivisionError:
                # Same situation for a column without any observed rating.
                pass

    return U, V

In [18]:
def error(truth, pred):
    """
    Compute the RMSE and MAE of predictions against known ratings.

    truth: pandas DataFrame with columns "UserID", "MovieID" and "Rating";
           the IDs are 1-based and are shifted by one to index into pred
    pred: 2d numpy array, pred[user - 1, movie - 1] is the predicted rating

    Returns a (rmse, mae) tuple.
    """
    # Index arrays are built per column, so the ID columns stay usable as
    # indices even when "Rating" holds floats (row-wise iteration would
    # upcast the whole row to float and break the lookup).
    rows = truth["UserID"].to_numpy().astype(int) - 1
    cols = truth["MovieID"].to_numpy().astype(int) - 1
    err = truth["Rating"].to_numpy() - pred[rows, cols]

    rmse = np.sqrt(np.mean(np.square(err)))
    mae = np.mean(np.abs(err))

    return rmse, mae

In [20]:
# Matrix dimensions: IDs are used as 1-based positions further down, so the
# largest ID seen in the data gives the number of rows / columns.
num_movies = ratings["MovieID"].max()
num_users = ratings["UserID"].max()
def df_to_arr(df, n_users=None, n_movies=None):
    """
    Turn a long-format ratings table into a dense user x movie matrix.

    df: pandas DataFrame with columns "UserID", "MovieID" and "Rating";
        the IDs are 1-based and are shifted by one to get array positions
    n_users: number of rows; defaults to the notebook-level num_users
    n_movies: number of columns; defaults to the notebook-level num_movies

    Returns a 2d float numpy array in which arr[user - 1, movie - 1] is the
    rating and every cell without a rating is NaN. Missing cells are
    deliberately left as NaN so the update functions' np.isnan checks skip
    them. If df lists the same (user, movie) pair more than once, which
    rating ends up in the cell is not defined.
    """
    if n_users is None:
        n_users = num_users
    if n_movies is None:
        n_movies = num_movies

    arr = np.full([n_users, n_movies], np.nan)

    # One vectorised assignment instead of a Python loop over rows; taking
    # the columns separately keeps the IDs integral even for float ratings.
    rows = df["UserID"].to_numpy().astype(int) - 1
    cols = df["MovieID"].to_numpy().astype(int) - 1
    arr[rows, cols] = df["Rating"].to_numpy()

    return arr

# Dense user x movie matrix of the training fold; cells without a rating
# stay NaN.
train_table = df_to_arr(df=train)

In [21]:
num_factors = 5
num_iterations = 20

# train_iteration shuffles the update order with NumPy's global RNG; seeding
# it here makes the training run repeatable.
np.random.seed(0)

# Both factor matrices start out as all ones.
U = np.ones((num_users, num_factors))
V = np.ones((num_factors, num_movies))

for iteration in range(1, num_iterations + 1):
    U, V = train_iteration(train_table, U, V)

    # One prediction matrix serves both the train and the test evaluation.
    pred = np.matmul(U, V)
    print(iteration, error(train, pred), error(test, pred))

1 (1.0104654000565192, 0.787926380742584) (1.0421330202513654, 0.8114810454357232)
2 (0.9279877134740774, 0.7311627867098542) (0.9631884707222148, 0.7572132047300657)
3 (0.9041795667695903, 0.7139258469911866) (0.9454977848744442, 0.7420091416499481)
4 (0.8946704303355777, 0.7066947954302284) (0.9384750199969268, 0.7367951176892252)
5 (0.8894705920959131, 0.7026497552522759) (0.9352485181573462, 0.7343675027008939)
6 (0.8859505637519851, 0.69986664328855) (0.9324790723459099, 0.7332891586815078)
7 (0.8831773493390158, 0.6976258281600012) (0.931919129629897, 0.732732411015362)
8 (0.8806908356107092, 0.6955576708785715) (0.9323574986242212, 0.7324750833529172)
9 (0.8782998204322278, 0.6935650456702459) (0.9324045511165462, 0.7321859623564785)
10 (0.8758782342704721, 0.6914382774915089) (0.9322716679588289, 0.7316793055382874)
11 (0.8733198390534729, 0.6891714812586446) (0.9301701789800138, 0.7308357678830049)
12 (0.8705955337915536, 0.6869018627467482) (0.9295708350686724, 0.730018883518

In [22]:
# Continue training the current U and V for 20 more sweeps; the printed
# counter carries on from 21. Re-running this cell trains further still.
for iteration in range(21, 41):
    U, V = train_iteration(train_table, U, V)

    # One prediction matrix serves both the train and the test evaluation.
    pred = np.matmul(U, V)
    print(iteration, error(train, pred), error(test, pred))

21 (0.844085333267103, 0.6639285583696057) (0.9151768498750912, 0.7153339720648282)
