In [12]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
np.random.seed(0)

In [13]:
def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    yield d

In [43]:
raw_data = [d for d in readJSON("train.json.gz")]
np.random.shuffle(raw_data)

KeyboardInterrupt: 

In [15]:
umap = {}
gmap = {}

for d in raw_data:
    u, g = d['userID'], d['gameID']
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)

    d['user'] = umap[u]
    d['game'] = gmap[g]

In [16]:
def intersection(lst1, lst2):
    j = 0
    ans = []
    for i in range(len(lst1)):
        while (j < len(lst2) and lst2[j] < lst1[i]):
            j += 1
        if j >= len(lst2):
            break
        if lst2[j] == lst1[i]:
            ans.append(lst1[i])
            j += 1
    return ans

In [37]:
def Cosine(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / (math.sqrt(len(lst1)) * math.sqrt(len(lst2)))

def Jaccard(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / len(lst1.union(lst2))

def Pearson(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / (len(lst1) * len(lst2))

def ModJaccard(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / len(lst1.union(lst2).difference(lst1.intersection(lst2)))

def LopJaccard(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / len(lst1)

def Custom(i, j, lst1, lst2):
    sim = 0
    for u in intersection(lst1, lst2):
        sim += (interactions[(u,i)] + interactions[(u,j)]) / (1+np.abs(interactions[(u,i)] - interactions[(u,j)])) / 28
    return sim

In [38]:
dataset = [(d['user'], d['game'], d['hours_transformed']) for d in raw_data]

interactions = {}
gamesPerUser = defaultdict(set)
usersPerGame = defaultdict(set)
sim_ij = defaultdict(int)

def sim(i, j, func=Custom):
    if (i, j) in sim_ij:
        return sim_ij[(i, j)]
    
    users_i = usersPerGame[i]
    users_j = usersPerGame[j]
    if func == Custom:
        sim_ij[(i, j)] = func(i, j, users_i, users_j)
        sim_ij[(j, i)] = func(i, j, users_j, users_i)
    else:
        sim_ij[(i, j)] = func(users_i, users_j)
        sim_ij[(j, i)] = func(users_j, users_i)
    return sim_ij[(i, j)]

def train(dataset):
    global interactions, gamesPerUser, usersPerGame
    
    interactions = {}
    gamesPerUser = defaultdict(list)
    usersPerGame = defaultdict(list)
    
    for d in dataset:
        u, g, h = d
        interactions[(u,g)] = h
        gamesPerUser[u].append(g)
        usersPerGame[g].append(u)

In [39]:
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=0)

models = train(trainset)

In [40]:
mean_hours = np.mean([d[2] for d in dataset])
hoursPerGame = defaultdict(lambda: mean_hours)
for g in usersPerGame:
    hoursPerGame[g] = np.mean([interactions[(u,g)] for u in usersPerGame[g]])
    if (hoursPerGame[g] == 0):
        hoursPerGame[g] = mean_hours

hoursPerUser = defaultdict(lambda: mean_hours)
for u in gamesPerUser:
    hoursPerUser[u] = np.mean([interactions[(u,g)] for g in gamesPerUser[u]])
    if (hoursPerUser[u] == 0):
        hoursPerUser[u] = mean_hours

In [41]:
def predict(u, i, func=Custom):
    if len(gamesPerUser[u]) == 0:
        return hoursPerGame[i]
    return hoursPerGame[i] + 1 / len(gamesPerUser[u]) * sum([sim(i, j, func) * (interactions[(u, j)] - hoursPerGame[j]) for j in gamesPerUser[u] if j != i])

In [42]:
for func in [Cosine, Jaccard, Pearson, ModJaccard, LopJaccard, Custom]:
    sim_ij = defaultdict(int)
    predictions = [predict(u, i, func) for (u, i, h) in validset]
    MSE = mean_squared_error(predictions, [h for (u,g,h) in validset])
    print(func.__name__, MSE)

Cosine 3.2753024090022755
Jaccard 3.282977362628762
Pearson 3.2892170157684495
ModJaccard 3.2828148047791395
LopJaccard 3.2719677254862054
Custom 3.2883176368758478
