In [1]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
import tensorflow as tf
np.random.seed(0)

In [2]:
def readJSON(path):
  """Yield one record per line of a gzip-compressed text file.

  Args:
    path: location of a gzip file opened in text mode ('rt'); every line
      must be a Python expression (here: a dict literal).

  Yields:
    The object obtained by evaluating each line in turn.
  """
  # The context manager guarantees the handle is closed when the generator
  # is exhausted, closed early, or garbage-collected.
  with gzip.open(path, 'rt') as f:
    for l in f:
      # SECURITY NOTE(review): eval() executes arbitrary code contained in
      # the file. Only use this on trusted data; ast.literal_eval or
      # json.loads would be safer if the file format allows it.
      d = eval(l)
      yield d

In [3]:
# Materialise every training record in memory, then shuffle the list in
# place (np.random was seeded in the import cell).
raw_data = list(readJSON("train.json.gz"))
np.random.shuffle(raw_data)

In [4]:
# Raw user / game identifiers -> dense integer ids, assigned in order of
# first appearance in raw_data.
umap = {}
gmap = {}

for record in raw_data:
    user_key = record['userID']
    game_key = record['gameID']

    # len(...) is evaluated before insertion, so an unseen key receives the
    # next free id while a known key keeps the id it already has.
    record['user'] = umap.setdefault(user_key, len(umap))
    record['game'] = gmap.setdefault(game_key, len(gmap))

In [5]:
# (user id, game id) pairs read from pairs_Hours.csv, plus the CSV header
# line kept verbatim in `header`.
tasks = []
header = ""

# Open the file in a context manager so the handle is closed afterwards.
with open("pairs_Hours.csv") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"):
            #header
            header = l
            continue
        if not l.strip():
            # A blank line (e.g. trailing newline) carries no pair.
            continue
        u,g = l.strip().split(',')
        # Users / games that never occurred in the training data still get
        # an id here; this enlarges umap / gmap.
        if u not in umap:
            umap[u] = len(umap)
        if g not in gmap:
            gmap[g] = len(gmap)
        u = umap[u]
        g = gmap[g]
        tasks.append((u,g))


In [6]:
def intersection(lst1, lst2):
    """Return the elements shared by two ascending-sorted lists.

    A single cursor walks lst2 while lst1 is scanned once. Each element of
    lst2 can be matched at most once, so duplicates are kept only as often
    as they occur in both lists.
    """
    common = []
    cursor = 0
    size2 = len(lst2)
    for item in lst1:
        # Skip everything in lst2 that is smaller than the current item.
        while cursor < size2 and lst2[cursor] < item:
            cursor += 1
        if cursor >= size2:
            # lst2 is exhausted: no further matches are possible.
            break
        if lst2[cursor] == item:
            common.append(item)
            cursor += 1
    return common

In [7]:
# (user id, game id, transformed hours) triples, followed by an 80/20
# train/validation split with a fixed random_state.
dataset = [
    (record['user'], record['game'], record['hours_transformed'])
    for record in raw_data
]
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=0)

In [8]:
# Mean of the third tuple field over the whole dataset, as a float32 scalar.
mean_hours = np.float32(np.mean([hours for _, _, hours in dataset]))

In [9]:
class LatentFactorModel(tf.keras.Model):
    """Latent-factor model whose user/item interaction term is a small MLP.

    A prediction has the form
    ``alpha + betaU[u] + betaI[i] + f(gammaU[u], gammaI[i])`` where ``f``
    feeds ``[dot product, distance, gammaU[u], gammaI[i]]`` through ReLU
    feed-forward layers ending in a single unit. The number of users and
    items is taken from the module-level ``umap`` and ``gmap``.
    """

    def __init__(self, mu, Ku, Ki, lamb, layers, gammaU=None, gammaI=None):
        """Create the model parameters.

        Args:
            mu: initial value of the global offset ``alpha``.
            Ku: user latent dimension; only used when ``gammaU`` is None.
            Ki: item latent dimension; only used when ``gammaI`` is None.
            lamb: regularisation strength used by ``reg``.
            layers: sequence of hidden-layer widths of the feed-forward part.
            gammaU: optional initial user factors, one row per user.
            gammaI: optional initial item factors, one row per item.
        """
        super().__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(umap)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(gmap)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(umap),Ku],stddev=0.001) if gammaU is None else gammaU)
        self.gammaI = tf.Variable(tf.random.normal([len(gmap),Ki],stddev=0.001) if gammaI is None else gammaI)
        # Read the latent sizes from the variables themselves so this also
        # works when gammaU / gammaI were not supplied (they are None then).
        self.Ku = self.gammaU.shape[1]
        self.Ki = self.gammaI.shape[1]
        self.lamb = lamb

        # Input width = dot product + distance + both latent vectors; the
        # network always ends in a single output unit.
        layers = [2+self.Ku+self.Ki] + list(layers) + [1]
        self.ffs = []
        for i in range(len(layers)-1):
            # One extra input row per layer holds the bias weights (a constant
            # 1 is prepended to the layer input in sim / predictSample).
            self.ffs.append(tf.Variable(tf.random.normal([1+layers[i],layers[i+1]],stddev=0.001)))

    # Similarity between user and item latent factors
    def sim(self, u, i):
        """Interaction term for one (user, item) pair, as a scalar tensor.

        Mirrors the per-row computation of ``predictSample``.
        """
        gamma_u = self.gammaU[u]
        gamma_i = self.gammaI[i]
        if self.Ku == self.Ki:
            dp = tf.reshape(tf.tensordot(gamma_u, gamma_i, 1), [1])
            dst = tf.reshape(tf.norm(gamma_u - gamma_i), [1])
        else:
            # Dot product and distance are undefined for factors of different
            # length; use rank-1 zero placeholders so the concat stays valid.
            dp = tf.zeros([1])
            dst = tf.zeros([1])
        x = tf.concat([dp, dst, gamma_u, gamma_i], 0)
        for layer in self.ffs:
            x = tf.concat([tf.ones([1]), x], 0)
            # x has length 1+in and layer has shape [1+in, out]: contract the
            # input axis (the vector must be the left operand).
            x = tf.nn.relu(tf.tensordot(x, layer, 1))
        # The last layer has one unit; drop that axis to return a scalar.
        return tf.reshape(x, [])

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        """Return the predicted value for user index ``u`` and item index ``i``."""
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            self.sim(u, i)
        return p

    # Regularizer
    def reg(self):
        """Return ``lamb`` times the sum of squared biases and latent factors.

        The feed-forward weights in ``self.ffs`` are not part of this penalty.
        """
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))

    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        """Return one prediction per (user, item) pair of the two index lists.

        Args:
            sampleU: sequence of user indices.
            sampleI: sequence of item indices, same length as ``sampleU``.

        Returns:
            A rank-1 tensor with ``len(sampleU)`` predictions.
        """
        n = len(sampleU)
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        if self.Ku == self.Ki:
            dp = tf.reduce_sum(gamma_u * gamma_i, 1, keepdims=True)
            dst = tf.norm(gamma_u - gamma_i, axis=1, keepdims=True)
        else:
            # Zero placeholders keep the feature layout fixed when the two
            # factor sizes differ.
            dp = tf.zeros([n, 1])
            dst = tf.zeros([n, 1])
        x = tf.concat([dp, dst, gamma_u, gamma_i], 1)
        for layer in self.ffs:
            # Prepend a column of ones so the first row of `layer` is the bias.
            x = tf.concat([tf.ones([n, 1]), x], 1)
            x = tf.nn.relu(tf.matmul(x, layer))
        pred = self.alpha + beta_u + beta_i +\
               tf.reshape(x, [-1])
        return pred

    # Loss
    def call(self, sampleU, sampleI, sampleR):
        """Return ``tf.nn.l2_loss`` of the residuals divided by the sample size.

        Args:
            sampleU: sequence of user indices.
            sampleI: sequence of item indices.
            sampleR: sequence of target values, one per pair.
        """
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [10]:
# Dense interaction matrices filled from the dataset triples:
# gammaU[user, game] and gammaI[game, user] both hold the third tuple field,
# and every pair absent from the dataset stays 0.
gammaU = np.zeros((len(umap), len(gmap)), dtype=np.float32)
gammaI = np.zeros((len(gmap), len(umap)), dtype=np.float32)
for user, game, hours in dataset:
    gammaU[user, game] = hours
    gammaI[game, user] = hours

In [11]:
optimizer = tf.keras.optimizers.legacy.Adam(0.01)
# Hidden-layer widths of the feed-forward part of the model.
layers = [256, 256, 128]
# The two positional 32s (Ku, Ki) only size randomly initialised factors;
# explicit gammaU / gammaI matrices are passed here, so their own widths apply.
modelLFM = LatentFactorModel(mean_hours, 32, 32, 0.0001, layers, gammaU=gammaU, gammaI=gammaI)
modelLFM.compile(optimizer=optimizer)

In [12]:
def trainingStep(model, trainset, Nsamples=50000):
    """Run one optimisation step on a random mini-batch and return its loss.

    Args:
        model: callable as ``model(users, items, targets)`` returning a loss
            tensor, and exposing ``reg()`` and ``trainable_variables``.
        trainset: sequence of ``(user, item, target)`` triples.
        Nsamples: number of triples drawn (with replacement) for the batch.

    Returns:
        The batch loss including the regulariser, as a NumPy value.

    Raises:
        ValueError: if ``Nsamples`` is not positive.

    Uses the module-level ``optimizer`` to apply the gradients.
    """
    if Nsamples <= 0:
        raise ValueError("Nsamples must be positive")

    # Draw the batch before opening the tape: sampling is plain Python and
    # does not need to be recorded for differentiation.
    batch = [random.choice(trainset) for _ in range(Nsamples)]
    sampleU = [u for u, _, _ in batch]
    sampleI = [g for _, g, _ in batch]
    sampleR = [r for _, _, r in batch]

    with tf.GradientTape() as tape:
        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    # Variables that did not contribute to the loss have a None gradient.
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, variables)
                              if grad is not None)
    return loss.numpy()

In [13]:
# Split the validation triples into three parallel lists
# (user ids, game ids, targets).
validU = [user for user, _, _ in validset]
validI = [game for _, game, _ in validset]
validR = [target for _, _, target in validset]

def MSE(model):
    """Return the mean squared error of `model` on the validation lists.

    Predictions come from model.predictSample(validU, validI) and are
    compared against validR.
    """
    predicted = model.predictSample(validU, validI).numpy()
    return mean_squared_error(validR, predicted)

In [14]:
# Train for at most 1000 steps, stopping as soon as the validation MSE is
# worse than it was after the previous step.
prevMSE = MSE(modelLFM)
for i in range(1000):
    obj = trainingStep(modelLFM, trainset)
    valid_MSE = MSE(modelLFM)
    if (i % 10 == 9):
        print("iteration " + str(i+1) + ", valid MSE = ", valid_MSE)
    if valid_MSE > prevMSE:
        break
    # Remember this step's score; without this update the check above would
    # only ever compare against the MSE measured before training started.
    prevMSE = valid_MSE

iteration 10, valid MSE =  5.0794759559495635
iteration 20, valid MSE =  4.845770702433857
iteration 30, valid MSE =  4.646609819891311
iteration 40, valid MSE =  4.477126168145637
iteration 50, valid MSE =  4.334492923516461
iteration 60, valid MSE =  4.214443509126016
iteration 70, valid MSE =  4.111781991581383
iteration 80, valid MSE =  4.022071921523742
iteration 90, valid MSE =  3.9443659764013512
iteration 100, valid MSE =  3.877528145284004
iteration 110, valid MSE =  3.81853238340689
iteration 120, valid MSE =  3.765785383595601
iteration 130, valid MSE =  3.7194293886652945
iteration 140, valid MSE =  3.6778717861417833
iteration 150, valid MSE =  3.6409208857623683
iteration 160, valid MSE =  3.6080005768859698
iteration 170, valid MSE =  3.5780493116200236
iteration 180, valid MSE =  3.550144788190334
iteration 190, valid MSE =  3.524779457328636
iteration 200, valid MSE =  3.5024265364917797
iteration 210, valid MSE =  3.482304660639027
iteration 220, valid MSE =  3.464849

KeyboardInterrupt: 