In [4]:
import gzip
import json
import pickle
import scipy.sparse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy import spatial

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.data import Dataset

In [5]:
# Thread count handed to LightFM as `num_threads` when fitting the model.
THREADS = 8

## Preparing dataset

In [None]:
# Load the slim ratings table and label its four columns explicitly.
# NOTE(review): with `names=` and no `header=`, pandas treats the first line
# of the file as data -- confirm the CSV has no header row.
df = pd.read_csv(
    '/pio/scratch/1/i313924/slim_ratings.csv',
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
)

In [None]:
# Display the loaded ratings table (pandas renders a truncated preview).
df

In [None]:
# Number of rows loaded from the CSV, before any filtering.
len(df)

In [13]:
# Register every reviewer ID (users) and ASIN (items) found in the ratings
# table with a LightFM Dataset, then pickle the fitted Dataset to disk using
# the highest available pickle protocol.
dataset = Dataset()
dataset.fit(df['reviewerID'], df['asin'])
with open('/pio/scratch/1/i313924/lightfm_data/slim_dataset.pkl', 'wb') as dataset_file:
    pickle.dump(dataset, dataset_file, pickle.HIGHEST_PROTOCOL)

In [14]:
# Display the ratings table again (pandas renders a truncated preview).
df

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,0871167042,A22ZX01TPWQY4G,2.0,1409702400
1,1519588135,A2S5EPJ451WL3I,3.0,1494633600
2,1519588135,A5UM17P1E7DUQ,5.0,1494115200
3,3979050432,A3GOUQIIW470Q,5.0,1455926400
4,3979050432,A1HIISJWW6007Z,4.0,1454457600
...,...,...,...,...
2384657,B01HJDVCJI,AQMUXDDCLJSSV,4.0,1535932800
2384658,B01HJDVCJI,AGFW667QNHDOY,3.0,1535155200
2384659,B01HJDVCJI,A3JTBJC5WSEZ7Q,3.0,1535155200
2384660,B01HJDVCJI,A7B48AJT6IC0A,4.0,1534118400


## Preparing interactions

In [15]:
# Keep only the rows whose rating ("overall") is at least 4.0; the filtered
# frame replaces `df` for the cells below.
df = df[df['overall'].ge(4.0)]

In [16]:
# Number of rows remaining after keeping only ratings >= 4.0.
len(df)

1939517

In [17]:
# Build the user-item matrix from (reviewerID, asin) pairs and save it as .npz.
#
# The pairs are produced by zipping the two columns instead of df.iterrows():
# iterrows() materialises a pandas Series for every row, which is needlessly
# slow on a frame of this size, while zip() yields the same (user, item)
# tuples in the same order.
#
# build_interactions returns a 2-tuple; index [1] keeps its second element
# (the weights matrix, per the LightFM documentation).
interactions = dataset.build_interactions(
    zip(df['reviewerID'], df['asin'])
)[1]
scipy.sparse.save_npz('/pio/scratch/1/i313924/lightfm_data/slim_interactions.npz', interactions)

## Training model

In [7]:
# Reload the saved interaction matrix so training can start from a fresh kernel.
# NOTE(review): this path has an extra `data/` segment compared with the
# save_npz call in the "Preparing interactions" section
# (.../i313924/lightfm_data/...) -- confirm which location is correct.
interactions = scipy.sparse.load_npz('/pio/scratch/1/i313924/data/lightfm_data/slim_interactions.npz')

In [9]:
# LightFM model trained with the WARP loss: 30 latent components and the
# adadelta learning schedule.
model = LightFM(
    loss='warp',
    learning_schedule='adadelta',
    no_components=30,
)

In [10]:
# Train for 300 epochs on the interaction matrix using THREADS worker threads;
# verbose=True shows the per-epoch progress bar.
model.fit(
    interactions,
    epochs=300,
    num_threads=THREADS,
    verbose=True,
)

Epoch: 100%|██████████| 300/300 [09:43<00:00,  1.95s/it]


<lightfm.lightfm.LightFM at 0x7f971e4014e0>

In [11]:
# Persist the trained model with pickle protocol 4.
# Original author's note (relevance unconfirmed): https://bugs.python.org/issue25465 ?
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('/pio/scratch/1/i313924/data/lightfm_data/warp_model_300_epochs_30_el_slim.pkl', 'wb') as f:
    pickle.dump(model, f, protocol=4)

In [7]:
# Disabled on purpose: evaluating precision@k between training rounds is very time-consuming.
# precision = []
# for i in tqdm(range(15)):
#     model.fit_partial(interactions, verbose=False, epochs=10, num_threads=THREADS)
#     current_precision = precision_at_k(model, interactions, k=5, num_threads=THREADS)
#     print(current_precision)
#     precision.append(current_precision)
    
# plt.plot(precision)
# pickle.dump(model, open('/pio/scratch/1/i313924/lightfm_data/warp_model.pkl', 'wb'))

Epoch: 100%|██████████████████████████████████████████████████████████████████████| 100/100 [1:22:22<00:00, 49.42s/it]
