### Basic EDA

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the three competition CSVs from the working directory, in the same
# order as the names they are bound to. low_memory=False makes pandas parse
# each file in one pass rather than in internal chunks.
train_data, test_data, sample_submission = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Show the first three training rows to inspect the available columns.
train_data.head(n=3)

Unnamed: 0,overall,verified,reviewTime,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image,userid,itemid,rating
0,5.0,True,"10 4, 2016",B01CPNIEQG,Heather,These are my FAVORITE spices in my collection....,Must Add to your Spice kitchen!,1475539200,,,,102179,37138,1.0
1,5.0,True,"03 1, 2016",B006F63M8U,Linda Odom,Add A package to my Coffee and it makes a good...,Milk Chocolate Swiis MIss Hot Cocoa Mix,1456790400,,{'Size:': ' 60-0.73 oz Envelopes'},,3625,17322,1.0
2,5.0,True,"06 26, 2016",B00112O8NG,DesertBlossom,"I love the Torani syrups, but not the prices o...","Love these sugar free syrups, but didn't love ...",1466899200,28.0,,,39495,5600,1.0


In [4]:
# Show the first three test rows to inspect the available columns.
test_data.head(n=3)

Unnamed: 0,verified,reviewTime,asin,reviewerName,unixReviewTime,vote,style,image,userid,itemid,Id
0,True,"10 1, 2016",B001E5E3X0,Rudys Mom,1475280000,,,,68877,7506,0
1,True,"06 29, 2014",B005BYXX5E,pharg,1404000000,2.0,,,50442,15530,1
2,True,"05 19, 2015",B0052OK6OO,dhalex,1431993600,,,,64349,15049,2


In [5]:
# Show the first three rows of the sample submission to see the expected layout.
sample_submission.head(n=3)

Unnamed: 0,Id,rating
0,0,0.282813
1,1,0.629758
2,2,0.916899


### Preprocessing

In [6]:
# Store the `rating` column as 32-bit integers.
train_data['rating'] = train_data['rating'].astype(np.int32)

In [7]:
# `overall` values below this threshold are replaced with 0; values at or
# above it keep their original value.
MIN_POSITIVE_OVERALL = 4

# Vectorised thresholding: `where` keeps entries for which the condition is
# True and writes 0 elsewhere. Missing values fail the comparison and
# therefore also become 0.
train_data['overall'] = train_data['overall'].where(
    train_data['overall'] >= MIN_POSITIVE_OVERALL, 0
)

### Building and training a model

In [8]:
from lightfm import LightFM
from lightfm.data import Dataset
import scipy.sparse

# Register every user id and item id that occurs in either split, so ids that
# only appear in the test set still get an entry in the dataset's mappings.
dataset = Dataset()
dataset.fit(train_data['userid'].values, train_data['itemid'].values)
dataset.fit_partial(test_data['userid'].values, test_data['itemid'].values)

# One plain (userid, itemid, overall) tuple per training row; the third
# element is passed through as that interaction's weight.
interaction_rows = train_data[['userid', 'itemid', 'overall']].itertuples(index=False, name=None)
(interactions, weights) = dataset.build_interactions(interaction_rows)

print(f'Got weights matrix of {interactions.shape[0]} x {interactions.shape[1]} shape. Nice!')



Got weights matrix of 127496 x 41320 shape. Nice!


In [18]:
# Training hyper-parameters, gathered here so they can be tuned in one place.
NUM_THREADS = 4
NUM_COMPONENTS = 250
NUM_EPOCHS = 15
LEARNING_RATE = 0.05
# Seed for the model's random number generator, so that a rerun starts from
# the same initial state.
# NOTE(review): with NUM_THREADS > 1 the parallel updates may still cause
# small run-to-run differences -- confirm if exact reproducibility matters.
RANDOM_SEED = 42

model = LightFM(learning_rate=LEARNING_RATE,
                loss='logistic',
                no_components=NUM_COMPONENTS,
                random_state=RANDOM_SEED)

# `weights` carries the thresholded `overall` value of every interaction and
# is applied as the per-interaction sample weight during fitting.
model = model.fit(interactions,
                  sample_weight=weights,
                  epochs=NUM_EPOCHS,
                  num_threads=NUM_THREADS,
                  verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14


In [15]:
from sklearn.metrics import roc_auc_score

# Translate the raw training ids into the dataset's internal indices.
user_id_map, _, item_id_map, _ = dataset.mapping()
train_user_ids = np.array([user_id_map.get(raw_id) for raw_id in train_data['userid'].values])
train_item_ids = np.array([item_id_map.get(raw_id) for raw_id in train_data['itemid'].values])

# Score every (user, item) pair the model was trained on.
train_preds = model.predict(train_user_ids, train_item_ids)

# ROC AUC against the `rating` labels of those same training rows, so this
# measures fit to the training data rather than performance on unseen data.
roc_auc_score(train_data['rating'], train_preds)

0.8654589360056157

### Generate Submission

In [16]:
# Translate the raw test ids into the dataset's internal indices.
user_id_map, _, item_id_map, _ = dataset.mapping()
test_user_ids = np.array([user_id_map.get(raw_id) for raw_id in test_data['userid'].values])
test_item_ids = np.array([item_id_map.get(raw_id) for raw_id in test_data['itemid'].values])

preds = model.predict(test_user_ids,
                      test_item_ids)

# Min-max scale the raw scores into [0, 1]. The scaling is monotonic, so the
# ranking of the predictions is unchanged.
pred_min = preds.min()
pred_range = preds.max() - pred_min
if pred_range == 0:
    # Every score is identical: dividing by the zero range would turn the
    # whole column into NaN, so emit a constant mid-range value instead.
    normalized_preds = np.full_like(preds, 0.5)
else:
    normalized_preds = (preds - pred_min) / pred_range

# Predictions are written positionally.
# NOTE(review): this assumes the rows of test.csv are in the same order as
# the rows of sample_submission.csv -- confirm against the `Id` columns.
submission = sample_submission.copy()
submission['rating'] = normalized_preds
submission.to_csv('submission_log.csv', index=False)