In [31]:
import gzip
from collections import defaultdict
import numpy as np
import random
import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from scipy.sparse import csr_matrix
import surprise
from surprise import SVD, Reader, Dataset, accuracy, SVDpp
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error
import pandas as pd
# Seed NumPy's global RNG so the np.random.shuffle call on the loaded records is repeatable.
# NOTE(review): the recorded execution counts show this cell ran (In [31]) after the
# shuffle cell (In [14]); the seed only takes effect if this cell runs first — re-run top to bottom.
np.random.seed(0)

In [13]:
def readJSON(path):
    """Lazily yield one parsed record per non-blank line of a gzip-compressed text file.

    Args:
        path: Path of the gzip file, opened in text mode.

    Yields:
        The object obtained by evaluating each line as a Python expression.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        SyntaxError: If a non-blank line is not a valid Python expression.
    """
    # The context manager closes the file when the generator is exhausted,
    # closed early, or a line fails to parse (the original never closed it).
    with gzip.open(path, 'rt') as f:
        for l in f:
            # Blank / whitespace-only lines would make eval raise SyntaxError.
            if not l.strip():
                continue
            # SECURITY: eval executes arbitrary code found in the file. Only use
            # this on trusted data. NOTE(review): if every line is a plain literal,
            # ast.literal_eval would be a safer drop-in — confirm against the data.
            d = eval(l)
            yield d

In [14]:
# Materialise every record from the training file, then shuffle the list in place
# using NumPy's global RNG.
raw_data = list(readJSON("train.json.gz"))
np.random.shuffle(raw_data)

In [15]:
# Map each distinct userID / gameID to a dense integer index, numbered in order
# of first appearance, and store the indices back on every record.
umap, gmap = {}, {}

for d in raw_data:
    # setdefault evaluates len(...) before inserting, so an unseen ID receives
    # the next free index while a known ID keeps the one it already has.
    d['user'] = umap.setdefault(d['userID'], len(umap))
    d['game'] = gmap.setdefault(d['gameID'], len(gmap))

In [18]:
# One (user index, game index, transformed hours) triple per record,
# then the same triples as a three-column DataFrame.
dataset = [
    (rec['user'], rec['game'], rec['hours_transformed'])
    for rec in raw_data
]
df = pd.DataFrame(data=dataset, columns=['user', 'game', 'hours'])

In [21]:
# Show the smallest and largest value in the 'hours' column.
hours_col = df['hours']
print(hours_col.min(), hours_col.max())

0.0 14.013750114071462


In [24]:
# Use the observed range of 'hours' as the rating scale for Surprise.
hours_min, hours_max = df['hours'].min(), df['hours'].max()
reader = Reader(rating_scale=(hours_min, hours_max))
data = Dataset.load_from_df(df[["user", "game", "hours"]], reader)

# Hold out 10% of the interactions; the split itself is seeded with random_state=0.
train, valid = train_test_split(data, test_size=0.1, random_state=0)

In [34]:
# Grid search over SVD hyperparameters, scored by MSE with cv=5 on the full `data`.
param_grid = {
    "n_epochs": [5, 10],
    "reg_all": [0.05, 0.1, 0.2],
    "n_factors": list(range(2, 11)),  # 2 through 10 inclusive
}
gs = GridSearchCV(SVD, param_grid, measures=["mse"], cv=5)
gs.fit(data)

print(gs.best_score["mse"])

# Keep the estimator for the best-MSE parameter combination.
# NOTE(review): presumably this instance still needs .fit(...) on a trainset
# before it can predict — confirm against the Surprise docs.
model = gs.best_estimator["mse"]

3.128099386787955


In [35]:
# Show the parameter combination that achieved the best MSE in the grid search.
best_params = gs.best_params["mse"]
print(best_params)

{'n_epochs': 10, 'reg_all': 0.05, 'n_factors': 3}
