# Basic Similarity Models

In [1]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
np.random.seed(0)

In [2]:
def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    yield d

In [3]:
raw_data = [d for d in readJSON("train.json.gz")]
np.random.shuffle(raw_data)

In [4]:
umap = {}
gmap = {}

for d in raw_data:
    u, g = d['userID'], d['gameID']
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)

    d['user'] = umap[u]
    d['game'] = gmap[g]

In [5]:
def intersection(lst1, lst2):
    j = 0
    ans = []
    for i in range(len(lst1)):
        while (j < len(lst2) and lst2[j] < lst1[i]):
            j += 1
        if j >= len(lst2):
            break
        if lst2[j] == lst1[i]:
            ans.append(lst1[i])
            j += 1
    return ans

In [6]:
def Cosine(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / (math.sqrt(len(lst1)) * math.sqrt(len(lst2)))

def Jaccard(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / len(lst1.union(lst2))

def Pearson(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / (len(lst1) * len(lst2))

def ModJaccard(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / len(lst1.union(lst2).difference(lst1.intersection(lst2)))

def LopJaccard(lst1, lst2):
    lst1 = set(lst1)
    lst2 = set(lst2)
    return len(lst1.intersection(lst2)) / len(lst1)

def Custom(i, j, lst1, lst2):
    sim = 0
    for u in intersection(lst1, lst2):
        sim += (interactions[(u,i)] + interactions[(u,j)]) / (1+np.abs(interactions[(u,i)] - interactions[(u,j)])) / 28
    return sim

In [7]:
dataset = [(d['user'], d['game'], d['hours_transformed']) for d in raw_data]

interactions = {}
gamesPerUser = defaultdict(set)
usersPerGame = defaultdict(set)
sim_ij = defaultdict(int)

def sim(i, j, func=Custom):
    if (i, j) in sim_ij:
        return sim_ij[(i, j)]
    
    users_i = usersPerGame[i]
    users_j = usersPerGame[j]
    if func == Custom:
        sim_ij[(i, j)] = func(i, j, users_i, users_j)
        sim_ij[(j, i)] = func(i, j, users_j, users_i)
    else:
        sim_ij[(i, j)] = func(users_i, users_j)
        sim_ij[(j, i)] = func(users_j, users_i)
    return sim_ij[(i, j)]

def train(dataset):
    global interactions, gamesPerUser, usersPerGame
    
    interactions = {}
    gamesPerUser = defaultdict(list)
    usersPerGame = defaultdict(list)
    
    for d in dataset:
        u, g, h = d
        interactions[(u,g)] = h
        gamesPerUser[u].append(g)
        usersPerGame[g].append(u)

In [8]:
trainset, validset = train_test_split(dataset, test_size=0.1, shuffle=False)

models = train(trainset)

In [9]:
mean_hours = np.mean([d[2] for d in dataset])
hoursPerGame = defaultdict(lambda: mean_hours)
for g in usersPerGame:
    hoursPerGame[g] = np.mean([interactions[(u,g)] for u in usersPerGame[g]])
    if (hoursPerGame[g] == 0):
        hoursPerGame[g] = mean_hours

hoursPerUser = defaultdict(lambda: mean_hours)
for u in gamesPerUser:
    hoursPerUser[u] = np.mean([interactions[(u,g)] for g in gamesPerUser[u]])
    if (hoursPerUser[u] == 0):
        hoursPerUser[u] = mean_hours

In [10]:
def predict(u, i, func=Custom):
    if len(gamesPerUser[u]) == 0:
        return hoursPerGame[i]
    return hoursPerGame[i] + 1 / len(gamesPerUser[u]) * sum([sim(i, j, func) * (interactions[(u, j)] - hoursPerGame[j]) for j in gamesPerUser[u] if j != i])

In [11]:
predictions = []

for func in [Cosine, Jaccard, Pearson, ModJaccard, LopJaccard, Custom]:
    sim_ij = defaultdict(int)
    prediction = [predict(u, i, func) for (u, i, h) in validset]
    MSE = mean_squared_error(prediction, [h for (u,g,h) in validset])
    print(func.__name__, MSE)

    predictions.append(prediction)

Cosine 3.18850048322312
Jaccard 3.1968137372184953
Pearson 3.2035744804366897
ModJaccard 3.1966175110417834
LopJaccard 3.1854065150428927
Custom 3.202601772422115


# Other models

In [12]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, CoClustering
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error
import pandas as pd
import math
np.random.seed(0)

In [13]:
def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    yield d

In [14]:
tasks = []
otasks = []
header = ""

for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        #header
        header = l
        continue
    u,g = l.strip().split(',')
    if u not in umap:
        umap[u] = len(umap)
    if g not in gmap:
        gmap[g] = len(gmap)
    otasks.append((u,g))
    u = umap[u]
    g = gmap[g]
    tasks.append((u,g))


In [15]:
dataset = [(d['user'], d['game'], d['hours_transformed']) for d in raw_data]
df = pd.DataFrame(dataset, columns =['user', 'game', 'hours'])

In [16]:
print(df['hours'].min(), df['hours'].max())

0.0 14.013750114071462


In [17]:
reader = Reader(rating_scale=(df['hours'].min(), df['hours'].max()))
data = Dataset.load_from_df(df[["user", "game", "hours"]], reader)

train, valid = train_test_split(data, test_size=0.1, shuffle=False)

In [18]:
# factors = [2, 3, 5, 10, 15, 20, 25]
# for factor in factors:
#     param_grid = {"n_epochs": [5, 10, 20, 50], "reg_all": [0.01, 0.05, 0.1, 0.2], "n_factors": [factor]}
#     gs = GridSearchCV(SVD, param_grid, measures=["mse"], cv=5)

#     gs.fit(data)

#     print(factor, gs.best_score["mse"], gs.best_params["mse"])

# 2 3.0879222970713043 {'n_epochs': 20, 'reg_all': 0.1, 'n_factors': 2}
# 3 3.086121620599191 {'n_epochs': 20, 'reg_all': 0.1, 'n_factors': 3}
# 5 3.087488467139108 {'n_epochs': 10, 'reg_all': 0.01, 'n_factors': 5}
# 10 3.100085170101152 {'n_epochs': 10, 'reg_all': 0.05, 'n_factors': 10}
# 15 3.101539947261756 {'n_epochs': 10, 'reg_all': 0.05, 'n_factors': 15}
# 20 3.106343392649928 {'n_epochs': 10, 'reg_all': 0.05, 'n_factors': 20}
# 25 3.1076779148589075 {'n_epochs': 10, 'reg_all': 0.05, 'n_factors': 25}

In [19]:
params = [(20, 0.1, 2), (20, 0.1, 3), (10, 0.01, 5), (10, 0.05, 10), (10, 0.05, 15), (10, 0.05, 20), (10, 0.05, 25)]
copies = 3

models = []

for i in range(copies):
    for param in params:
        model = SVD(n_epochs=param[0], reg_all=param[1], n_factors=param[2])
        model.fit(train)
        models.append(model)

        model = SVDpp(n_epochs=param[0], reg_all=param[1], n_factors=param[2])
        model.fit(train)
        models.append(model)

        model = KNNBasic(k=i*10)
        model.fit(train)
        models.append(model)

        model = KNNWithMeans(k=i*10)
        model.fit(train)
        models.append(model)

        model = KNNWithZScore(k=i*10)
        model.fit(train)
        models.append(model)

        model = KNNBaseline(k=i*10)
        model.fit(train)
        models.append(model)

        model = BaselineOnly()
    model.fit(train)
    models.append(model)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity ma

In [20]:
for model in models:
    output = model.test(valid)
    predictions.append([o.est for o in output])

In [21]:
mean_pred = np.mean(predictions, axis=0)

mean_MSE = mean_squared_error([o.r_ui for o in output], mean_pred)
print("Mean MSE: ", mean_MSE)

Mean MSE:  3.1211926658236915


In [22]:
X = np.array(predictions).T
y = [o.r_ui for o in output]

mixer = Ridge()
mixer.fit(X, y)

valid_pred = mixer.predict(X)
valid_MSE = mean_squared_error(y, valid_pred)
print("Mixed MSE: ", valid_MSE)

Mixed MSE:  2.993156192014606


In [23]:
# misses = []
# for i, pred in enumerate(valid_pred):
#     if abs(valid[i][2] - pred) > 3:
#         misses.append((*valid[i], pred))
# misses.sort(key=lambda x: -abs(x[2] - x[3]))
# print(misses[:10])

In [25]:
test_predictions = []

for func in [Cosine, Jaccard, Pearson, ModJaccard, LopJaccard, Custom]:
    sim_ij = defaultdict(int)
    prediction = [predict(u, i, func) for (u, i) in tasks]

    test_predictions.append(prediction)

for model in models:
    test_prediction = []
    for task in tasks:
        u, g = task
        test_prediction.append(model.predict(u, g).est)
    test_predictions.append(test_prediction)

test_predictions = np.array(test_predictions).T
test_pred = mixer.predict(test_predictions)

In [26]:
predictions = open("predictions_Hours.csv", 'w')
predictions.write(header)
for i in range(len(tasks)):
    u,g = otasks[i]
    predictions.write(u + ',' + g + "," + str(test_pred[i]) + "\n")

predictions.close()

: 