In [1]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
import tensorflow as tf
# Seed every random number generator imported above, not just NumPy:
# the model weights are drawn with tf.random.normal, which is not affected
# by np.random.seed, so without the TensorFlow seed each run starts from
# different weights.
np.random.seed(0)
random.seed(0)
tf.random.set_seed(0)

In [2]:
def readJSON(path):
  """Yield one parsed record per line of a gzip-compressed text file.

  Args:
    path: Path of the gzip file; it is opened in text mode ('rt').

  Yields:
    Whatever Python object each line evaluates to.

  The file is opened in a ``with`` block so the handle is closed as soon as
  the generator is exhausted or closed, instead of being left open until
  garbage collection.
  """
  with gzip.open(path, 'rt') as f:
    for l in f:
      # SECURITY NOTE(review): eval() executes arbitrary Python found in the
      # file. Only use this on trusted data; if every line is a plain
      # literal, ast.literal_eval would be the safe replacement.
      d = eval(l)
      yield d

In [3]:
# Materialise every record of the training file in memory, then shuffle the
# list in place with NumPy's global RNG.
raw_data = list(readJSON("train.json.gz"))
np.random.shuffle(raw_data)

In [4]:
# Map raw user / game IDs to dense integer indices, numbered in order of
# first appearance in raw_data.
umap, gmap = {}, {}

for d in raw_data:
    user_id, game_id = d['userID'], d['gameID']

    # setdefault only inserts when the ID is new; len(...) is evaluated before
    # the insert, so a new ID receives the next unused index.
    d['user'] = umap.setdefault(user_id, len(umap))
    d['game'] = gmap.setdefault(game_id, len(gmap))

In [5]:
# (user index, game index) pairs read from pairs_Hours.csv. IDs that were not
# seen in raw_data are given fresh indices, so umap / gmap grow here.
tasks = []
header = ""

# Use a context manager so the file handle is closed once reading finishes.
with open("pairs_Hours.csv") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"):
            # header row: remember it and move on
            header = l
            continue
        if not l.strip():
            # tolerate blank lines instead of failing on the tuple unpack below
            continue
        u, g = l.strip().split(',')
        if u not in umap:
            umap[u] = len(umap)
        if g not in gmap:
            gmap[g] = len(gmap)
        u = umap[u]
        g = gmap[g]
        tasks.append((u, g))


In [6]:
def intersection(lst1, lst2):
    """Return the elements common to two lists using a single merge-style pass.

    Walks lst1 once while a cursor advances through lst2, so both inputs are
    expected to be sorted in ascending order. Each matching element of lst2 is
    consumed at most once.

    Args:
        lst1: First ascending sequence.
        lst2: Second ascending sequence.

    Returns:
        A new list with the shared elements, in the order they occur in lst1.
    """
    common = []
    pos = 0
    limit = len(lst2)
    for value in lst1:
        # Skip everything in lst2 that is smaller than the current value.
        while pos < limit and lst2[pos] < value:
            pos += 1
        if pos == limit:
            # lst2 is exhausted, so no further matches are possible.
            break
        if lst2[pos] == value:
            common.append(value)
            pos += 1
    return common

In [7]:
# One (user index, game index, transformed hours) triple per record.
dataset = []
for d in raw_data:
    dataset.append((d['user'], d['game'], d['hours_transformed']))

# Hold out 20% of the triples for validation; the fixed random_state keeps
# the split the same across runs.
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=0)

In [8]:
# training=True  -> fit on the 80% split and keep validset held out.
# training=False -> fit on every triple; validset is then a subset of the
#                   fitting data, so validation numbers are no longer held-out.
training = True
trainset = trainset if training else dataset

In [9]:
# Mean of the target value over the fitting triples, stored as float32.
mean_hours = np.float32(np.mean([hours for _user, _game, hours in trainset]))

In [10]:
# Dense game-by-user matrix of target values. Cells without a fitting triple
# stay 0. If a (game, user) pair occurs more than once the last one wins.
Xiu = np.zeros((len(gmap), len(umap)), dtype=np.float32)

for user, game, hours in trainset:
    Xiu[game, user] = hours

In [36]:
class Autoencoder(tf.keras.Model):
    """Autoencoder whose inputs and outputs are rows with len(umap) columns.

    Two architectures are supported, selected by the length of `layers`:

    * ``len(layers) == 1``: a linear autoencoder with tied weights. A single
      matrix ``ff`` of shape (len(umap), layers[0]) encodes the input and its
      transpose decodes it; there is no bias and no non-linearity.
    * otherwise: a feed-forward stack with sizes
      ``[len(umap)] + layers + [len(umap)]``. Every layer (the output layer
      included) prepends a constant 1 column as bias and applies ReLU.
    """

    def __init__(self, layers):
        """Create the weight variables.

        Args:
            layers: Sequence of hidden-layer widths.
        """
        super(Autoencoder, self).__init__()

        n_users = len(umap)
        if len(layers) == 1:
            # Empty list is how predict() recognises the tied-weight model.
            self.ffs = []
            self.ff = tf.Variable(tf.random.normal([n_users, layers[0]], stddev=0.001))
            return

        sizes = [n_users] + list(layers) + [n_users]
        # Build the complete list first and assign it to the attribute once.
        # NOTE(review): Keras registers the variables it finds in a value at
        # attribute-assignment time; variables appended to an already-assigned
        # list may be missing from `trainable_variables` (version dependent),
        # which would leave the multi-layer model untrained.
        self.ffs = [
            # +1 input row per layer for the bias column added in predict().
            tf.Variable(tf.random.normal([1 + n_in, n_out], stddev=0.001))
            for n_in, n_out in zip(sizes[:-1], sizes[1:])
        ]

    def predict(self, X):
        """Reconstruct every row of X.

        Note: this replaces ``tf.keras.Model.predict`` with a different
        signature; it returns a tensor, not a NumPy array.

        Args:
            X: 2-D float input with len(umap) columns.

        Returns:
            Tensor of reconstructions with the same shape as X.
        """
        if len(self.ffs) == 0:
            # Tied weights: encode with ff, decode with ff transposed.
            hidden = tf.matmul(X, self.ff)
            return tf.matmul(hidden, self.ff, transpose_b=True)

        for layer in self.ffs:
            # Prepend a column of ones so the first weight row acts as bias.
            X = tf.concat([tf.ones([tf.shape(X)[0], 1]), X], 1)
            X = tf.nn.relu(tf.matmul(X, layer))

        return X

    def reg(self):
        """Regularisation penalty added to the loss; currently always 0."""
        return 0.

    def call(self, X):
        """Return the training loss for X (not the reconstruction).

        Entries of X equal to 0 are treated as unobserved and masked out, so
        genuine zero values do not contribute either. Because
        ``tf.nn.l2_loss`` computes ``sum(t ** 2) / 2``, the value returned is
        HALF the mean squared error over the non-zero entries, plus reg().
        If X has no non-zero entry the division is by zero.
        """
        pred = self.predict(X)

        observed = tf.cast(X != 0, tf.float32)
        loss = tf.nn.l2_loss(X - (pred * observed)) / tf.reduce_sum(observed)

        return loss + self.reg()

In [49]:
# Hidden-layer widths of the autoencoder. Only the last assignment ever took
# effect; other configurations that were tried here:
# [32, 32, 16], [16, 16], [64, 64]
layers = [2]

optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.01)
modelLFM = Autoencoder(layers)
modelLFM.compile(optimizer=optimizer)

In [50]:
def trainingStep(model):
    """Run one full-batch gradient step and return the loss as a NumPy value.

    The loss is `model(Xiu)` on the module-level matrix, and the update is
    applied with the module-level `optimizer`. Variables whose gradient is
    None are skipped.
    """
    with tf.GradientTape() as tape:
        loss = model(Xiu)

    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    updates = [(grad, var) for grad, var in zip(grads, variables) if grad is not None]
    optimizer.apply_gradients(updates)
    return loss.numpy()

In [51]:
def MSE(model):
    """Mean squared error of `model` on the module-level validset.

    The model reconstructs the whole module-level Xiu matrix once; the
    prediction for a (user, game, hours) triple is the reconstructed cell at
    [game, user].
    """
    reconstruction = model.predict(Xiu).numpy()
    actual = [hours for _, _, hours in validset]
    predicted = [reconstruction[game, user] for user, game, _ in validset]
    return mean_squared_error(actual, predicted)

In [52]:
# Validation MSE of the model before any training step.
prevMSE = MSE(modelLFM)
for i in range(100):
    obj = trainingStep(modelLFM)
    if (i % 10 == 9):
        # Validation MSE is only needed for the report, so evaluate it here
        # rather than on every iteration.
        valid_MSE = MSE(modelLFM)
        # `obj` is built on tf.nn.l2_loss (sum(t ** 2) / 2), i.e. HALF the
        # training MSE, so it is labelled as such and must be doubled before
        # comparing it with the validation MSE.
        print("iteration", i+1, ", train loss (0.5*MSE) =", obj, ", valid MSE =", valid_MSE)
    # Early stopping (break once i > 20 and valid_MSE > prevMSE, then
    # prevMSE = valid_MSE) was disabled here; re-enabling it requires
    # evaluating valid_MSE on every iteration again.

iteration 10 , train MSE = 4.352994 , valid MSE = 10.531914397248247
iteration 20 , train MSE = 4.095663 , valid MSE = 13.472367154267033
iteration 30 , train MSE = 3.9397752 , valid MSE = 11.085717292695414
iteration 40 , train MSE = 3.8344035 , valid MSE = 11.229758544269535
iteration 50 , train MSE = 3.7653432 , valid MSE = 11.086253238848942
iteration 60 , train MSE = 3.7163222 , valid MSE = 11.357400602424363
iteration 70 , train MSE = 3.6687326 , valid MSE = 11.537555382938413
iteration 80 , train MSE = 3.6243973 , valid MSE = 11.752064674355855
iteration 90 , train MSE = 3.5853388 , valid MSE = 11.965024330787609
iteration 100 , train MSE = 3.5531325 , valid MSE = 12.179719481723051
