In [1]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
import tensorflow as tf
np.random.seed(0)

In [2]:
def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    yield d

In [3]:
raw_data = [d for d in readJSON("train.json.gz")]
np.random.shuffle(raw_data)

In [4]:
umap = {}
gmap = {}

for d in raw_data:
    u, g = d['userID'], d['gameID']
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)

    d['user'] = umap[u]
    d['game'] = gmap[g]

In [5]:
tasks = []
header = ""

for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        #header
        header = l
        continue
    u,g = l.strip().split(',')
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)
    u = umap[u]
    g = gmap[g]
    tasks.append((u,g))


In [6]:
def intersection(lst1, lst2):
    j = 0
    ans = []
    for i in range(len(lst1)):
        while (j < len(lst2) and lst2[j] < lst1[i]):
            j += 1
        if j >= len(lst2):
            break
        if lst2[j] == lst1[i]:
            ans.append(lst1[i])
            j += 1
    return ans

In [7]:
dataset = [(d['user'], d['game'], d['hours_transformed']) for d in raw_data]
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=0)

In [8]:
training = True
if not training:
    trainset = dataset

In [9]:
mean_hours = np.mean([d[2] for d in dataset]).astype(np.float32)

In [10]:
mat = np.zeros([len(umap), len(gmap)], dtype=np.float32)

for d in trainset:
    hours = d[2]
    mat[d[0],d[1]] = hours

In [11]:
class Autoencoder(tf.keras.Model):
    def __init__(self, layers):
        super(Autoencoder, self).__init__()

        layers = [len(umap) * len(gmap)] + layers + [len(umap) * len(gmap)]
        self.ffs = []
        for i in range(len(layers)-1):
            self.ffs.append(tf.Variable(tf.random.normal([1+layers[i],layers[i+1]],stddev=0.001)))

    # Prediction for a single instance (useful for evaluation)
    def predict(self, mat):
        x = tf.reshape(mat, [-1])
        for layer in self.ffs:
            x = tf.concat([tf.ones([1]), x], 0)
            x = tf.nn.relu(tf.tensordot(x, layer, 1))
        x = tf.reshape(x, [len(umap), len(gmap)])

        return x

    # Regularizer
    def reg(self, pred):
        return 0.
    
    # Loss
    def call(self, mat):
        pred = self.predict(mat)

        loss = tf.nn.l2_loss(mat - pred * tf.cast(mat > 0., tf.float32))

        return tf.reduce_sum(loss) / (len(umap) * len(gmap)) + self.reg(pred)

In [12]:
optimizer = tf.keras.optimizers.legacy.Adam(0.01)
layers = [32, 32, 16]
layers = [16, 16]
layers = [64, 64]
layers = [16, 256, 128]
modelLFM = Autoencoder(layers)
modelLFM.compile(optimizer=optimizer)

In [13]:
def trainingStep(model):
    with tf.GradientTape() as tape:
        loss = model(mat)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [14]:
validU, validI, validR = [], [], []
for u,g,r in validset:
    validU.append(u)
    validI.append(g)
    validR.append(r)

def MSE(model):
    pred_mat = model.predict(mat).numpy()
    predictions = [pred_mat[u,g] for u,g in zip(validU, validI)]
    MSE = mean_squared_error(validR, predictions)
    return MSE

In [15]:
prevMSE = MSE(modelLFM)
for i in range(100):
    obj = trainingStep(modelLFM)
    valid_MSE = MSE(modelLFM)
    if (i % 1 == 0): 
        print("iteration " + str(i+1) + ", valid MSE = ", valid_MSE)
    if valid_MSE > prevMSE:
        break

iteration 1, valid MSE =  19.187896841044203
iteration 2, valid MSE =  19.187896847481454
iteration 3, valid MSE =  19.18789684170866
iteration 4, valid MSE = 

KeyboardInterrupt: 