In [2]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
import tensorflow as tf
np.random.seed(0)

In [3]:
def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    yield d

In [4]:
raw_data = [d for d in readJSON("train.json.gz")]
np.random.shuffle(raw_data)

In [5]:
umap = {}
gmap = {}

for d in raw_data:
    u, g = d['userID'], d['gameID']
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)

    d['user'] = umap[u]
    d['game'] = gmap[g]

In [6]:
tasks = []
header = ""

for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        #header
        header = l
        continue
    u,g = l.strip().split(',')
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)
    u = umap[u]
    g = gmap[g]
    tasks.append((u,g))


In [7]:
def intersection(lst1, lst2):
    j = 0
    ans = []
    for i in range(len(lst1)):
        while (j < len(lst2) and lst2[j] < lst1[i]):
            j += 1
        if j >= len(lst2):
            break
        if lst2[j] == lst1[i]:
            ans.append(lst1[i])
            j += 1
    return ans

In [8]:
dataset = [(d['user'], d['game'], d['hours_transformed']) for d in raw_data]
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=0)

In [9]:
training = True
if not training:
    trainset = dataset

In [10]:
mean_hours = np.mean([d[2] for d in dataset]).astype(np.float32)

In [11]:
mat = np.zeros([len(umap), len(gmap)], dtype=np.float32)

for d in trainset:
    hours = d[2]
    mat[d[0],d[1]] = hours

In [38]:
class NN(tf.keras.Model):
    def __init__(self, layers):
        super(NN, self).__init__()

        layers = [len(gmap) + len(umap)] + layers + [1]
        self.ffs = []
        for i in range(len(layers)-1):
            self.ffs.append(tf.Variable(tf.random.normal([1+layers[i],layers[i+1]],stddev=0.001)))

    # Prediction for a many instances
    def predict(self, X):
        for layer in self.ffs:
            X = tf.concat([tf.ones([tf.shape(X)[0], 1], dtype=tf.float32), X], 1)
            X = tf.nn.relu(tf.matmul(X, layer))

        return X

    # Regularizer
    def reg(self):
        return 0.
    
    # Loss
    def call(self, X, y):
        y = tf.cast(y, tf.float32)
        pred = self.predict(X)

        return tf.reduce_sum(tf.nn.l2_loss(y - pred)) / tf.cast(tf.shape(y)[0], tf.float32) + self.reg()

In [106]:
optimizer = tf.keras.optimizers.legacy.Adam(0.0001)
layers = [16, 16, 16, 16, 16]
modelLFM = NN(layers)
modelLFM.compile(optimizer=optimizer)

In [107]:
def trainingStep(model, trainset):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        X = np.zeros([Nsamples, len(umap) + len(gmap)], dtype=np.float32)
        y = np.zeros([Nsamples, 1], dtype=np.float32)
        for i in range(Nsamples):
            u,g,h = random.choice(trainset)
            X[i] = np.concatenate([mat[u], mat[:,g]])
            y[i] = h
        loss = model(X, y)
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [108]:
valid_X = np.zeros([len(validset), len(umap) + len(gmap)], dtype=np.float32)
valid_y = np.zeros([len(validset), 1], dtype=np.float32)
for i in range(len(validset)):
    u,g,h = validset[i]
    valid_X[i] = np.concatenate([mat[u], mat[:,g]])
    valid_y[i] = h

def MSE(model):
    predictions = model.predict(valid_X).numpy()
    MSE = mean_squared_error(valid_y, predictions)
    return MSE

In [109]:
prevMSE = MSE(modelLFM)
for i in range(100):
    obj = trainingStep(modelLFM, trainset)
    valid_MSE = MSE(modelLFM)
    if (i % 1 == 0): 
        print("iteration " + str(i+1) + ", valid MSE = ", valid_MSE)
    # if valid_MSE > prevMSE:
    #     break

iteration 1, valid MSE =  19.190874
iteration 2, valid MSE =  19.190874


KeyboardInterrupt: 