In [44]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
np.random.seed(0)

In [45]:
def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    yield d

In [41]:
raw_data = [d for d in readJSON("train.json.gz")]
np.random.shuffle(raw_data)

In [42]:
umap = {}
gmap = {}

for d in raw_data:
    u, g = d['userID'], d['gameID']
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)

    d['user'] = umap[u]
    d['game'] = gmap[g]

In [43]:
def intersection(lst1, lst2):
    j = 0
    ans = []
    for i in range(len(lst1)):
        while (j < len(lst2) and lst2[j] < lst1[i]):
            j += 1
        if j >= len(lst2):
            break
        if lst2[j] == lst1[i]:
            ans.append(lst1[i])
            j += 1
    return ans

In [87]:
dataset = [(d['user'], d['game'], d['hours_transformed']) for d in raw_data]

interactions = {}
gamesPerUser = defaultdict(set)
usersPerGame = defaultdict(set)
usersPerUser = defaultdict(list)
fits = {}

def train(dataset) -> {int:Ridge}:
    global interactions, gamesPerUser, usersPerGame, fits, usersPerUser
    
    interactions = defaultdict(int)
    gamesPerUser = defaultdict(list)
    usersPerGame = defaultdict(list)
    usersPerUser = defaultdict(list)
    fits = {}
    
    for d in dataset:
        u, g, h = d
        interactions[(u,g)] = h
        gamesPerUser[u].append(g)
        usersPerGame[g].append(u)

    for u1 in gamesPerUser:
        for u2 in gamesPerUser:
            if u1 == u2: continue
            gamesPerUser1 = gamesPerUser[u1]
            gamesPerUser2 = gamesPerUser[u2]
            sim = len(intersection(gamesPerUser1, gamesPerUser2))
            if sim > 5:
                usersPerUser[u1].append(u2)

    models = {}
    for u1 in usersPerUser:
        models[u1] = Ridge(alpha=10)
        X = [[interactions[(u2,g)] for u2 in usersPerUser[u1]] + [interactions[(u2,g)]**2 for u2 in usersPerUser[u1]] for g in gamesPerUser[u]]
        y = [interactions[(u1,g)] for g in gamesPerUser[u]]
        models[u1].fit(X, y)

    return models

In [88]:
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=0)

models = train(trainset)

In [89]:
mean_hours = np.mean([d[2] for d in dataset])
hoursPerGame = defaultdict(lambda: mean_hours)
for g in usersPerGame:
    hoursPerGame[g] = np.mean([interactions[(u,g)] for u in usersPerGame[g]])
    if (hoursPerGame[g] == 0):
        hoursPerGame[g] = mean_hours

In [90]:
predictions = []
for (u,g,h) in validset:
    if u in models:
        predictions.append(models[u].predict([[interactions[(u2,g)] for u2 in usersPerUser[u]] + [interactions[(u2,g)]**2 for u2 in usersPerUser[u]]])[0])
    else:
        predictions.append(hoursPerGame[g])

MSE = mean_squared_error(predictions, [h for (u,g,h) in validset])
print(MSE)

3.2893757638192076
