In [41]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
np.random.seed(0)

In [42]:
def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    yield d

In [43]:
raw_data = [d for d in readJSON("train.json.gz")]
np.random.shuffle(raw_data)

In [44]:
umap = {}
gmap = {}

for d in raw_data:
    u, g = d['userID'], d['gameID']
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)

    d['user'] = umap[u]
    d['game'] = gmap[g]

In [70]:
dataset = [(d['user'], d['game'], d['hours_transformed']) for d in raw_data]

interactions = {}
gamesPerUser = defaultdict(list)
usersPerGame = defaultdict(list)
fits = {}

def train(dataset):
    global interactions, gamesPerUser, usersPerGame, fits
    
    interactions = {}
    gamesPerUser = defaultdict(list)
    usersPerGame = defaultdict(list)
    fits = {}
    
    for d in dataset:
        u, g, h = d
        interactions[(u,g)] = h
        gamesPerUser[u].append(g)
        usersPerGame[g].append(u)

    for u in gamesPerUser:
        gamesPerUser[u].sort()

    for g in usersPerGame:
        usersPerGame[g].sort()

In [71]:
def intersection(lst1, lst2):
    j = 0
    ans = []
    for i in range(len(lst1)):
        while (j < len(lst2) and lst2[j] < lst1[i]):
            j += 1
        if j >= len(lst2):
            break
        if lst2[j] == lst1[i]:
            ans.append(lst1[i])
            j += 1
    return ans

In [72]:
def Custom(u1, u2, same):
    sim = 0
    for g in same:
        sim += (interactions[(u1,g)] + interactions[(u2,g)]) / (0.5+np.abs(interactions[(u1,g)] - interactions[(u2,g)]))
    return sim

In [79]:
def fit(u1, u2):
    games1 = gamesPerUser[u1]
    games2 = gamesPerUser[u2]
    games = intersection(games1, games2)
    
    hours1 = [interactions[(u1, g)]*1.0 for g in games]
    hours2 = [interactions[(u2, g)]*1.0 for g in games]

    if len(games) < 3  or hours1[0] == 0:
        return 1, 0, 0

    m, b = np.polyfit(hours1, hours2, 1)
    # r = (len(games) ** 0.5) / (1 + mean_squared_error([m * x + b for x in hours1], hours2))
    # r = Custom(u1, u2, games)
    r = np.corrcoef(hours1, hours2)[0][1]

    if math.isnan(r):
        r = 0

    return m, b, abs(r)


In [80]:
mean_hours = np.mean([d[2] for d in dataset])
hoursPerGame = defaultdict(lambda: mean_hours)
for g in usersPerGame:
    hoursPerGame[g] = np.mean([interactions[(u,g)] for u in usersPerGame[g]])
    if (hoursPerGame[g] == 0):
        hoursPerGame[g] = mean_hours

In [81]:
# y = mx + b
# y - b = mx
# (y - b) / m = x
# x = y/m - b / m

fits = {}

def predict(u, g, refit=False):
    global fits

    wsum, wtot = 0, 0
    for v in usersPerGame[g]:
        if refit or (u,v) not in fits:
            m, b, r = fit(u,v)
            fits[(u,v)] = (m, b, r)
            if m == 0:
                fits[(u,v)] = (1, 0, 0)
            else:
                fits[(v,u)] = (1/m, b/m, r)

        m, b, r = fits[(v,u)]
        if (math.isnan(m) or math.isnan(b) or math.isnan(r)):
            print(m, b, r, u, v, g)
        pred = max(0, min(15, m * interactions[(v,g)] - b))

        w = r

        wsum += w * pred
        wtot += w

    if wtot == 0:
        return hoursPerGame[g]

    return wsum / wtot

In [82]:
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=0)

train(trainset)
predictions = [predict(u,g,refit=True) for (u,g,h) in validset]
MSE = mean_squared_error(predictions, [h for (u,g,h) in validset])
print(MSE)

  m, b, r = fit(u,v)
  c /= stddev[:, None]
  c /= stddev[None, :]
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)
  m, b, r = fit(u,v)


7.495923337210688


  m, b, r = fit(u,v)
