In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time

In [2]:
!gdown --id 1PnTfrm4c5VSLih-dKtV9j5tVyVeDhi5b
!head -n 2000000 /content/combined_data_1.txt > data.txt

Downloading...
From: https://drive.google.com/uc?id=1PnTfrm4c5VSLih-dKtV9j5tVyVeDhi5b
To: /content/combined_data_1.txt
100% 495M/495M [00:04<00:00, 113MB/s]


In [3]:
df = pd.read_csv('data.txt', sep=',', header=0, names=["CustomerID", "Rating", "Date"])
df_null = pd.DataFrame(pd.isnull(df['Date']))
df_movies = df
movies_ids = df_movies['CustomerID'].copy().to_numpy()
movie_loc = df_null[df_null['Date'] == True]
movie_loc = np.append(movie_loc, len(df_null))
for i in range(len(movie_loc) - 1):
    movies_ids[movie_loc[i] + 1: movie_loc[i + 1]] = i + 1
df_movies.insert(0, 'MovieId', movies_ids)
df_movies = df_movies[~pd.isnull(df_movies['Date'])]

In [4]:
pd_null = pd.isna(df["Date"]).to_numpy()
df_with_movies = df
movie_ids = df["CustomerID"].copy().to_numpy()
movie_pos = np.where(pd_null == True)[0]
movie_pos = np.append(movie_pos, len(pd_null))
for i in range(len(movie_pos) - 1):
  movie_ids[movie_pos[i] + 1 : movie_pos[i + 1]] = i + 1
df_with_movies.insert(0, "ID", movie_ids)
df_with_movies = df_with_movies[~pd.isna(df_with_movies["Date"])]
df_with_movies = df_with_movies.sample(frac=1).reset_index(drop=True)

In [5]:
customer_TID = {int(customer_id):i for i, customer_id in enumerate(df_with_movies["CustomerID"].unique())}
movie_TID = {movie_id:(i + len(customer_TID)) for i, movie_id in enumerate(df_with_movies["ID"].unique())}

customers = df_with_movies["CustomerID"].apply(lambda x: customer_TID[int(x)]).to_numpy().astype(int)
movies = df_with_movies["ID"].apply(lambda x: movie_TID[x]).to_numpy()

customer_cnt = df_with_movies["CustomerID"].nunique()
movie_cnt = df_with_movies["ID"].nunique()
rating_cnt = df_with_movies.shape[0]

col_id = np.dstack((customers, movies)).flatten()
row_id = np.repeat(np.arange(rating_cnt), 2)

df = np.ones(rating_cnt * 2, dtype=int)
hot_matrix = sp.csr_matrix((df, (row_id, col_id)), shape=(rating_cnt, customer_cnt + movie_cnt))

In [6]:
def rmse(gt, prediction):
  return np.sqrt(sum((gt - prediction) ** 2) / len(gt))

In [7]:
def generate_cross_validation_ids(x, y, sample_cnt=5):
    ids = np.arange(x.shape[0])
    np.random.shuffle(ids)

    sample_size = x.shape[0] // sample_cnt
    for i in range(sample_cnt):
        test = np.zeros(x.shape[0], dtype=bool)
        train = np.zeros(x.shape[0], dtype=bool)
        test[ids[i * sample_size:(i + 1) * sample_size]] = True
        train[ids[:i * sample_size]] = True
        train[ids[(i + 1) * sample_size:]] = True
        yield x[train], y[train], x[test], y[test]

In [8]:
def generate_batches(x, y, batch_size):
        ids = np.arange(x.shape[0])
        np.random.shuffle(ids)
        for i in range(0, x.shape[0], batch_size):
            batch_ids = ids[i:min(start + batch_size, x.shape[0])]
            yield x[batch_ids], y[batch_ids]

In [9]:
class factorization_machine(object):
    def __init__(self):
      self.k = 3
      self.W0 = 0
      self.W = None
      self.V = None
      self.XV = None
      self.X_squared = None
      self.start_lr = 0.1
      self.lr = self.start_lr
    
    def predict(self, X):
      self.XV = X @ self.V
      self.X_squared = X.power(2)
      return self.W0 + X @ self.W + np.sum(self.XV ** 2 - (self.X_squared @ self.V ** 2), axis=1, keepdims=True) / 2

    def get_batches(self, dataset, batch_size):
        X, Y = dataset
        n_samples = X.shape[0]

        ids = np.arange(n_samples)
        np.random.shuffle(ids)

        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_id = ids[start:end]

            yield X[batch_id], Y[batch_id]

    def train(self, X, y, epoch_count=1, batch_size=512):
      self.W = np.random.rand(X.shape[1], 1)
      self.V = np.random.rand(X.shape[1], self.k)
      for epoch_num in range(epoch_count):
        for x_batch, y_batch in self.get_batches((X, y), batch_size):
            self.lr = self.start_lr / np.sqrt(epoch_num + 1)
            predictions = self.predict(x_batch)
            dLoss = 2 * np.subtract(predictions, y_batch.reshape(-1, 1)) / len(y_batch)
            self.W0 -= np.multiply(self.lr, np.sum(dLoss))
            self.W -= np.multiply(self.lr, x_batch.transpose() @ dLoss)
            np.multiply(self.lr, dLoss, out=dLoss)
            
            for f in range(self.k):
                dV = x_batch.multiply(self.XV[:, f].reshape(-1, 1))
                dV -= self.X_squared.multiply(self.V[:, f])
                self.V[:, f] -= dLoss.reshape(-1) @ dV

In [10]:
train_time = []
train_rmse = []
test_rmse = []
fm = factorization_machine()
for x_train, y_train, x_test, y_test in generate_cross_validation_ids(hot_matrix, df_with_movies["Rating"].to_numpy(), 5):
    start = time.time()
    fm.train(x_train, y_train)
    stop = time.time()
    train_time.append((stop - start) / 60)
    predictions_train = fm.predict(x_train).reshape(-1)
    predictions_test = fm.predict(x_test).reshape(-1)
    train_rmse.append(rmse(y_train, predictions_train))
    test_rmse.append(rmse(y_test, predictions_test))

In [11]:
result = pd.DataFrame({"train_RMSE": train_rmse, "test_RMSE": test_rmse, "Time": train_time}).T
result

Unnamed: 0,0,1,2,3,4
train_RMSE,1.078224,1.077928,1.077464,1.079928,1.078557
test_RMSE,1.079226,1.078848,1.079831,1.079467,1.079136
Time,0.658628,0.659226,0.626631,0.64458,0.661871
