In [73]:
import pickle

import pandas as pd
import scipy.sparse
from lightfm import LightFM
from lightfm.data import Dataset

In [74]:
# Number of worker threads handed to LightFM training (used as `num_threads` in model.fit).
THREADS = 8

In [75]:
# Load the raw ratings table. Column names are supplied explicitly, so pandas
# treats every line of the file as data (header=None spells out that default).
df = pd.read_csv(
    '/pio/scratch/1/i313924/data/train_data/slim_ratings.csv',
    header=None,
    names=['asin', 'reviewerID', 'overall', 'unixReviewTime'],
)

## Choosing item(s)

In [76]:
# Clean the ratings in a single chain:
#   1. remove fully duplicated rows,
#   2. keep only ratings of 4.0 and above,
#   3. add a `date` column decoded from the unix timestamp (seconds).
df = (
    df.drop_duplicates()
    .loc[lambda frame: frame['overall'] >= 4.0]
    .assign(date=lambda frame: pd.to_datetime(frame['unixReviewTime'], unit='s'))
)

In [77]:
# Number of remaining ratings per item, most-reviewed items first.
df.asin.value_counts()

B00SU5244M    6119
B00SU52460    6119
B00X8AMIDG    6096
B00WRGPHY4    6096
B00WBHSMOQ    6092
              ... 
B01AJWWXNA       1
B00J3QHF2I       1
B01A2JGYRQ       1
B015VLL15A       1
B00J02CG5G       1
Name: asin, Length: 200255, dtype: int64

In [78]:
# Print how many ratings item B00SU52460 has in each calendar year 2010-2019.
# The item filter does not depend on the year, so it is computed once up front.
item_dates = df.loc[df.asin == "B00SU52460", 'date']
for current_year in range(2010, 2020):
    next_year = current_year + 1
    print(f'{current_year}-01-01 - {next_year}-01-01')
    # Half-open window [Jan 1, next Jan 1): a review stamped exactly at
    # midnight on Jan 1 is counted in the year it belongs to. With a strict
    # `>` on the lower bound it would be counted in no year at all.
    in_window = (item_dates >= f'{current_year}-01-01') & (item_dates < f'{next_year}-01-01')
    print(int(in_window.sum()))

2010-01-01 - 2011-01-01
5
2011-01-01 - 2012-01-01
15
2012-01-01 - 2013-01-01
33
2013-01-01 - 2014-01-01
173
2014-01-01 - 2015-01-01
558
2015-01-01 - 2016-01-01
1304
2016-01-01 - 2017-01-01
1949
2017-01-01 - 2018-01-01
1522
2018-01-01 - 2019-01-01
542
2019-01-01 - 2020-01-01
0


In [79]:
# B00SU52460: use reviews dated between 2013-01-01 and 2019-01-01 (calendar years 2013-2018)

In [80]:
# Split item B00SU52460 into one pseudo-item per review year (2013..2018) by
# rewriting its `asin` to 'B00SU52460_<year>'.
# The item mask is taken once before any relabeling; the yearly windows are
# disjoint, so this selects the same rows as re-evaluating it on every pass.
is_target_item = df.asin == "B00SU52460"
for year in range(2013, 2019):
    # Half-open window [Jan 1, next Jan 1): reviews stamped exactly at midnight
    # on Jan 1 are relabeled too. With a strict `>` lower bound they would keep
    # the original asin and be lost to any later removal of that asin.
    in_year = is_target_item & (df.date >= f'{year}-01-01') & (df.date < f'{year + 1}-01-01')
    df.loc[in_year, 'asin'] = f'B00SU52460_{year}'

In [81]:
# Discard every row that still carries the original item ID, i.e. the reviews
# of B00SU52460 that were not moved to a per-year pseudo-item.
df = df[df.asin != "B00SU52460"]

## Preparing dataset

In [82]:
# Fit the LightFM Dataset on all reviewer IDs (users) and item IDs (items),
# then persist it so the same ID mappings can be reloaded later.
dataset = Dataset()
dataset.fit(df['reviewerID'], df['asin'])
with open('/pio/scratch/1/i313924/data/lightfm_data/B00SU52460_dataset.pkl', 'wb') as dataset_file:
    # HIGHEST_PROTOCOL is what a negative protocol number selects.
    pickle.dump(dataset, dataset_file, protocol=pickle.HIGHEST_PROTOCOL)

## Preparing interactions

In [83]:
# Build the sparse user x item matrix from (reviewerID, asin) pairs and save it.
# The pairs come from zipping the two columns: this yields the same tuples as
# iterating with DataFrame.iterrows(), without constructing a Series per row.
# build_interactions returns an (interactions, weights) pair; index 1 keeps the
# weights matrix, as before.
# NOTE(review): confirm the weights matrix (not index 0, the interactions
# matrix) is the one intended for training.
interactions = dataset.build_interactions(
    zip(df['reviewerID'], df['asin'])
)[1]
scipy.sparse.save_npz('/pio/scratch/1/i313924/data/lightfm_data/B00SU52460_interactions.npz', interactions)

## Training model

In [84]:
# LightFM model: WARP loss, adadelta learning schedule, 100 latent components.
model = LightFM(
    loss='warp',
    learning_schedule='adadelta',
    no_components=100,
)

In [85]:
# Train for 1000 epochs on THREADS threads, showing a progress bar.
# fit() is left as the cell's last expression, so the returned model is displayed.
model.fit(
    interactions,
    epochs=1000,
    num_threads=THREADS,
    verbose=True,
)

Epoch: 100%|██████████| 1000/1000 [29:38<00:00,  1.78s/it]


<lightfm.lightfm.LightFM at 0x7f3de84c7358>

In [86]:
# Persist the trained model with pickle protocol 4. The file is opened in a
# `with` block so the handle is flushed and closed even if pickling fails;
# a bare open() passed to pickle.dump never closes it.
with open('/pio/scratch/1/i313924/data/lightfm_data/B00SU52460_model_1000_epochs.pkl', 'wb') as model_file:
    pickle.dump(model, model_file, protocol=4)