In [1]:
import gzip
import json
import pickle
import scipy.sparse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy import spatial

from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.data import Dataset

In [2]:
# Number of worker threads passed as `num_threads` to the LightFM training call.
THREADS = 8

## Preparing dataset

In [3]:
# Load the raw Amazon "Clothing, Shoes and Jewelry" ratings; the column names
# are supplied explicitly via `names`.
# Both ID columns are forced to `str`: ASINs can be purely numeric with leading
# zeros (e.g. 0871167042), and letting pandas infer the dtype chunk by chunk
# could turn some of them into integers, silently changing the item IDs.
df = pd.read_csv('/pio/scratch/1/recommender_systems/raw/Amazon/Clothing_Shoes_and_Jewelry.csv',
                 names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
                 dtype={'asin': str, 'reviewerID': str})

In [4]:
# Preview the loaded ratings table (the notebook shows a truncated view).
df

Unnamed: 0,asin,reviewerID,overall,unixReviewTime
0,0871167042,A2IC3NZN488KWK,5.0,1399161600
1,0871167042,A3OT9BYASFGU2X,4.0,1398470400
2,0871167042,A28GK1G2KDXHRP,5.0,1397692800
3,0871167042,A3NFXFEKW8OK0E,5.0,1397606400
4,0871167042,A3I6G5TKBVJEK9,5.0,1397520000
...,...,...,...,...
32292094,B01HJHMQW6,A2ATY4H4A3RK05,5.0,1482710400
32292095,B01HJHA4W0,ACNWGCHFOPZ3N,3.0,1497830400
32292096,B01HJI0G5Y,A1R7H90HCU8WJK,3.0,1510704000
32292097,B01HJI0G5Y,A3VZ29X7SM3L09,5.0,1500768000


In [5]:
# Total number of rating rows loaded from the CSV.
len(df)

32292099

In [6]:
# Register every reviewer ID and every product ID (ASIN) found in `df` with a
# LightFM Dataset, then persist the fitted Dataset to disk. At this point `df`
# has not been filtered by rating yet, so all IDs in the table are registered.
dataset = Dataset()
dataset.fit(df['reviewerID'], df['asin'])

with open('/pio/scratch/1/i313924/lightfm_data/dataset.pkl', 'wb') as dataset_file:
    # Use the newest pickle protocol available to the running interpreter.
    pickle.dump(dataset, dataset_file, protocol=pickle.HIGHEST_PROTOCOL)

## Preparing interactions

In [7]:
# Keep only reviews rated 4.0 or higher; the interaction matrices built further
# down are derived from this filtered frame.
is_high_rating = df['overall'].ge(4.0)
df = df.loc[is_high_rating]

In [8]:
# Number of rows remaining after the rating filter.
len(df)

25233044

In [9]:
# Build the user x item matrix from the (reviewerID, asin) pairs in `df`.
# The pairs are produced by zipping the two columns rather than with
# `DataFrame.iterrows()`, which creates a Series object for every row and is
# very slow on a frame with tens of millions of rows; the pairs are identical.
# `build_interactions` returns an (interactions, weights) tuple; as before, the
# second element is the one kept and saved.
interactions = dataset.build_interactions(
    zip(df['reviewerID'], df['asin'])
)[1]
scipy.sparse.save_npz('/pio/scratch/1/i313924/lightfm_data/interactions.npz', interactions)

In [11]:
# Draw a 10% random subsample of the filtered reviews; the fixed seed makes the
# draw repeatable.
sample_df = df.sample(
    frac=0.1,
    random_state=1,
)

In [12]:
# Number of rows in the 10% subsample.
len(sample_df)

2523304

In [13]:
# Build the user x item matrix for the subsample from its (reviewerID, asin)
# pairs. Zipping the two columns yields the same pairs as iterating with
# `DataFrame.iterrows()` but avoids constructing a Series per row, which is
# very slow on millions of rows.
# `build_interactions` returns an (interactions, weights) tuple; as before, the
# second element is the one kept and saved.
interactions_sample = dataset.build_interactions(
    zip(sample_df['reviewerID'], sample_df['asin'])
)[1]
scipy.sparse.save_npz('/pio/scratch/1/i313924/lightfm_data/interactions_sample.npz', interactions_sample)

## Training model

In [3]:
# Load the full interaction matrix from the .npz file written in the
# "Preparing interactions" section, so training does not need to rebuild it.
# NOTE(review): the execution counter restarts in this section, so it was
# presumably run in a fresh kernel after the import and THREADS cells — confirm.
interactions = scipy.sparse.load_npz('/pio/scratch/1/i313924/lightfm_data/interactions.npz')

In [4]:
# LightFM model using the WARP loss, 100 latent components and the adadelta
# learning schedule.
model = LightFM(
    loss='warp',
    no_components=100,
    learning_schedule='adadelta',
)

In [5]:
# Train on the full interaction matrix using THREADS worker threads.
# NOTE(review): this call requests 1000 epochs, but the progress bar recorded
# below reports 300/300 — the saved output looks stale; confirm which run
# produced the pickled model.
model.fit(interactions, verbose=True, epochs=1000, num_threads=THREADS)

Epoch: 100%|██████████| 300/300 [1:58:55<00:00, 23.79s/it]


<lightfm.lightfm.LightFM at 0x7f5e3847dba8>

In [6]:
# https://bugs.python.org/issue25465 ?
# Persist the trained model. The file is opened in a `with` block so the handle
# is flushed and closed as soon as the dump finishes, instead of being left
# open until it is garbage-collected. Pickle protocol 4 is requested explicitly,
# as before.
with open('/pio/scratch/1/i313924/lightfm_data/warp_model_1000_epochs.pkl', 'wb') as model_file:
    pickle.dump(model, model_file, protocol=4)

In [7]:
# Disabled: evaluating precision@k between training rounds is extremely time-consuming.
# precision = []
# for i in tqdm(range(15)):
#     model.fit_partial(interactions, verbose=False, epochs=10, num_threads=THREADS)
#     current_precision = precision_at_k(model, interactions, k=5, num_threads=THREADS)
#     print(current_precision)
#     precision.append(current_precision)
    
# plt.plot(precision)
# pickle.dump(model, open('/pio/scratch/1/i313924/lightfm_data/warp_model.pkl', 'wb'))

Epoch: 100%|██████████████████████████████████████████████████████████████████████| 100/100 [1:22:22<00:00, 49.42s/it]
