In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# Load the raw ratings. Fields are separated by the two-character token "::"
# and the column names are supplied here rather than read from the file.
# engine="python" is requested explicitly alongside the multi-character
# separator.
ratings = pd.read_table(
    "ratings.dat",
    sep="::",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    engine="python",
)

In [None]:
# Hold out one fold of a shuffled 5-fold split as the test set.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# train/test partition.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Only the first of the five folds is used.
_train, _test = next(iter(kf.split(ratings)))

# KFold.split yields *positional* row indices, so rows are selected with
# .iloc; label-based .loc would only agree while `ratings` keeps the default
# 0..n-1 index.
train = ratings.iloc[_train][["UserID", "MovieID", "Rating"]]
test = ratings.iloc[_test][["UserID", "MovieID", "Rating"]]

train

Unnamed: 0,UserID,MovieID,Rating
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
5,1,1197,3
...,...,...,...
1000204,6040,1091,1
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4


In [None]:
def update_U(index, M, U, V):
    """Re-fit a single entry of U by least squares, everything else held fixed.

    For ``index = (r, s)`` the new value is

        sum_j V[s, j] * (M[r, j] - sum_{k != s} U[r, k] * V[k, j])
        ---------------------------------------------------------
                          sum_j V[s, j] ** 2

    where ``j`` runs over the columns in which ``M[r, j]`` is not NaN.

    Parameters
    ----------
    index : tuple of int
        ``(r, s)``: row and column of the entry of ``U`` to update.
    M : 2-D float ndarray
        Target matrix; NaN marks entries that are ignored.
    U : 2-D ndarray
        Left factor. Modified in place.
    V : 2-D ndarray
        Right factor. Read only.

    Returns
    -------
    ndarray
        ``U`` itself (the same object that was passed in).

    Notes
    -----
    If the denominator is zero (row ``r`` has no observed entry, or every
    relevant ``V[s, j]`` is 0) the entry is left unchanged instead of raising
    ``ZeroDivisionError`` or writing NaN/inf into ``U``.
    """
    r, s = index

    # Columns of row r that actually carry a value.
    observed = ~np.isnan(M[r, :])
    v_s = V[s, observed]

    denom = np.matmul(v_s, v_s)
    if denom == 0:
        # Nothing to fit against; keep the current value.
        return U

    # Prediction for each observed column with the (r, s) term taken out.
    others = np.matmul(U[r, :], V[:, observed]) - U[r, s] * v_s
    U[r, s] = np.matmul(v_s, M[r, observed] - others) / denom

    return U


def update_V(index, M, U, V):
    """Re-fit a single entry of V by least squares, everything else held fixed.

    For ``index = (r, s)`` the new value is

        sum_i U[i, r] * (M[i, s] - sum_{k != r} U[i, k] * V[k, s])
        ---------------------------------------------------------
                          sum_i U[i, r] ** 2

    where ``i`` runs over the rows in which ``M[i, s]`` is not NaN.

    Parameters
    ----------
    index : tuple of int
        ``(r, s)``: row and column of the entry of ``V`` to update.
    M : 2-D float ndarray
        Target matrix; NaN marks entries that are ignored.
    U : 2-D ndarray
        Left factor. Read only.
    V : 2-D ndarray
        Right factor. Modified in place.

    Returns
    -------
    ndarray
        ``V`` itself (the same object that was passed in).

    Notes
    -----
    If the denominator is zero (column ``s`` has no observed entry, or every
    relevant ``U[i, r]`` is 0) the entry is left unchanged instead of raising
    ``ZeroDivisionError`` or writing NaN/inf into ``V``.
    """
    r, s = index

    # Rows of column s that actually carry a value.
    observed = ~np.isnan(M[:, s])
    u_r = U[observed, r]

    denom = np.matmul(u_r, u_r)
    if denom == 0:
        # Nothing to fit against; keep the current value.
        return V

    # Prediction for each observed row with the (r, s) term taken out.
    others = np.matmul(U[observed, :], V[:, s]) - u_r * V[r, s]
    V[r, s] = np.matmul(u_r, M[observed, s] - others) / denom

    return V

In [None]:
def train_iteration(M, U, V):
    """Run one full sweep that updates every entry of U and of V once.

    The entries of each factor are visited in a random order (shuffled with
    NumPy's global RNG). Updates alternate: one entry of U, then one entry of
    V, until both work lists are empty; when one list runs out first the
    remaining entries of the other are processed on their own.

    Parameters
    ----------
    M : 2-D ndarray
        Target matrix, passed through to ``update_U`` / ``update_V``.
    U, V : 2-D ndarray
        Current factors, passed through to ``update_U`` / ``update_V``.

    Returns
    -------
    tuple
        ``(U, V)`` as returned by the last update calls.
    """
    u_index = list(np.ndindex(U.shape))
    v_index = list(np.ndindex(V.shape))
    np.random.shuffle(u_index)
    np.random.shuffle(v_index)

    while u_index or v_index:
        if u_index:
            try:
                U = update_U(u_index.pop(), M, U, V)
            except ZeroDivisionError:
                # An entry with nothing to fit against (zero denominator) is
                # skipped. Any other exception is a real bug and propagates.
                pass
        if v_index:
            try:
                V = update_V(v_index.pop(), M, U, V)
            except ZeroDivisionError:
                pass

    return U, V

In [None]:
def error(truth, pred):
    """
    Compute the RMSE and MAE of predictions against known ratings.

    truth: pandas DataFrame with columns "UserID", "MovieID" and "Rating".
        Each row is compared with pred[UserID - 1, MovieID - 1].
    pred: 2d numpy array of predicted ratings.

    Returns a tuple (rmse, mae).
    """
    # Build the index arrays column-wise. Iterating with iterrows() casts each
    # row to one common dtype, so a float "Rating" column would turn the IDs
    # into floats and break the array indexing.
    rows = truth["UserID"].to_numpy().astype(np.intp) - 1
    cols = truth["MovieID"].to_numpy().astype(np.intp) - 1
    err = truth["Rating"].to_numpy() - pred[rows, cols]

    rmse = np.sqrt(np.mean(np.square(err)))
    mae = np.mean(np.abs(err))

    return rmse, mae

In [None]:
# Matrix dimensions: the largest UserID / MovieID found in the full ratings
# table (IDs are used as 1-based positions when the matrix is filled).
num_users, num_movies = ratings[["UserID", "MovieID"]].max().to_numpy()
def df_to_arr(df, n_users=None, n_movies=None):
    """Turn a long table of ratings into a dense user x movie matrix.

    Parameters
    ----------
    df : pandas DataFrame
        Must have columns "UserID", "MovieID" and "Rating". A row is written
        to ``arr[UserID - 1, MovieID - 1]``.
    n_users, n_movies : int, optional
        Shape of the result. Default to the module-level ``num_users`` and
        ``num_movies``.

    Returns
    -------
    ndarray of float
        Shape ``(n_users, n_movies)``. Cells without a rating are NaN.
    """
    if n_users is None:
        n_users = num_users
    if n_movies is None:
        n_movies = num_movies

    arr = np.full([n_users, n_movies], np.nan)

    # If a (user, movie) pair occurs more than once the last row wins, as it
    # did with the original row-by-row fill; array assignment with repeated
    # indices does not promise that on its own.
    df = df.drop_duplicates(subset=["UserID", "MovieID"], keep="last")

    # Column-wise indices: iterating with iterrows() casts each row to one
    # common dtype, so float ratings would turn the IDs into floats and break
    # the array indexing.
    rows = df["UserID"].to_numpy().astype(np.intp) - 1
    cols = df["MovieID"].to_numpy().astype(np.intp) - 1
    arr[rows, cols] = df["Rating"].to_numpy()

    # The original ended with `arr[np.isnan(arr)] = arr.mean()`. Whenever a
    # cell is missing, arr.mean() is itself NaN, so that line never changed
    # anything and is dropped. Missing cells stay NaN, which is what the
    # np.isnan checks in update_U / update_V rely on.
    # NOTE(review): if mean imputation was really intended, np.nanmean(arr)
    # would be needed, but then those NaN checks would never skip anything.
    return arr

# Dense user x movie matrix built from the training fold only.
train_table = df_to_arr(train)

In [None]:
num_factors = 10
num_epochs = 50

# train_iteration shuffles the update order with NumPy's global RNG; seeding
# it makes the printed error trajectory reproducible on a fresh kernel.
np.random.seed(0)

# Both factors start as all ones.
U = np.ones((num_users, num_factors))
V = np.ones((num_factors, num_movies))

# A named loop variable is used instead of `_`, which IPython also uses for
# the last cell output.
for epoch in range(1, num_epochs + 1):
    U, V = train_iteration(train_table, U, V)
    # (RMSE, MAE) on the training ratings after each full sweep.
    print(epoch, error(train, np.matmul(U, V)))

(3.1407328159574774, 2.1575741742855787)
