<a href="https://colab.research.google.com/github/ViMan21/DAITA/blob/main/DAITA_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install lightfm



In [None]:
import csv
import pandas as pd

# Input ratings CSV read by the ratings-cleaning cell.
RATING_FILE = 'ratings.csv'
# Output of the ratings-cleaning cell: duplicate (user_id, product_id) rows removed.
CLEAN_RATING_FILE = 'ratings_nodup.csv'
# Input product-feature CSV read by the feature-cleaning cell.
FEATURE_FILE = 'features.csv'
# Output of the feature-cleaning cell: placeholders replaced, numeric columns rounded.
CLEAN_FEATURE_FILE = 'features_noNA.csv'

In [None]:
# De-duplicate the ratings: keep one row per (user_id, product_id) pair and
# write the result without an index column, formatting floats with no decimals.
ratings_raw = pd.read_csv(RATING_FILE)
ratings_dedup = ratings_raw.drop_duplicates(subset=['user_id', 'product_id'])
ratings_dedup.to_csv(CLEAN_RATING_FILE, float_format='%.0f', index=False)

In [None]:
# Clean the product feature table and write it to CLEAN_FEATURE_FILE.
features_raw = pd.read_csv(FEATURE_FILE)

# Spreadsheet error markers that survive parsing become an explicit 'unknown' value.
features_clean = (features_raw
                  .replace('#DIV/0!', 'unknown')
                  .replace('#N/A', 'unknown'))

# pd.read_csv parses '#N/A' (and empty cells) as NaN by default, so the replace
# above never sees them and they would be written out as empty strings.
# Fill the non-numeric columns explicitly so every missing text value shares
# the same 'unknown' token. Numeric columns are left untouched.
text_columns = features_clean.select_dtypes(exclude='number').columns
features_clean = features_clean.fillna({column: 'unknown' for column in text_columns})

# Bucket the numeric columns so they yield a limited set of categorical values:
#   avg_ratings -> nearest 0.5
#   price       -> nearest whole number
#   num_ratings -> ndigits = 1 - len(str(x)), i.e. one significant digit for
#                  non-negative integers
# NOTE(review): len(str(x)) counts characters, so a float such as 1234.0 is
# rounded more coarsely than the int 1234 -- confirm num_ratings is parsed as int.
features_clean['avg_ratings'] = features_clean['avg_ratings'].apply(lambda x: round(x*2)/2)
features_clean['price'] = features_clean['price'].apply(lambda x: round(x))
features_clean['num_ratings'] = features_clean['num_ratings'].apply(lambda x: round(x, -len(str(x))+1))

features_clean.to_csv(CLEAN_FEATURE_FILE, index=False)

In [None]:
def getRatings(path=None):
  """Yield the rows of the cleaned ratings CSV, one dict per row.

  Args:
    path: Optional CSV path. When omitted (or falsy) CLEAN_RATING_FILE is read.

  Yields:
    One dict per data row, keyed by the CSV header, with string values.

  The file is opened when iteration starts and is closed as soon as the
  iterator is exhausted or discarded, so repeated calls do not leak handles.
  """
  # newline='' is what the csv module expects from the files it reads.
  with open(path or CLEAN_RATING_FILE, newline='') as ratings_file:
    yield from csv.DictReader(ratings_file, delimiter=",")

def getFeatures(path=None):
  """Yield the rows of the cleaned feature CSV, one dict per row.

  Args:
    path: Optional CSV path. When omitted (or falsy) CLEAN_FEATURE_FILE is read.

  Yields:
    One dict per data row, keyed by the CSV header, with string values.

  The file is opened when iteration starts and is closed as soon as the
  iterator is exhausted or discarded, so repeated calls do not leak handles.
  """
  # newline='' is what the csv module expects from the files it reads.
  with open(path or CLEAN_FEATURE_FILE, newline='') as features_file:
    yield from csv.DictReader(features_file, delimiter=",")

In [None]:
# Row iterators over the cleaned files; `ratings` is previewed in the next cell.
features = getFeatures()
ratings = getRatings()


In [None]:
import json
from itertools import islice

# Pretty-print the first two rating rows as a quick look at the cleaned file.
first_two_ratings = islice(ratings, 2)
for rating_row in first_two_ratings:
    print(json.dumps(rating_row, indent=4))


{
    "user_id": "8765713110",
    "product_id": "1623200116759",
    "rating": "10"
}
{
    "user_id": "8765713110",
    "product_id": "353472373254",
    "rating": "9"
}


In [None]:
from lightfm.data import Dataset

# Register every user id and product id that occurs in the cleaned ratings.
# Each id stream comes from its own pass over the ratings file.
dataset = Dataset()
user_ids = (row['user_id'] for row in getRatings())
product_ids = (row['product_id'] for row in getRatings())
dataset.fit(user_ids, product_ids)

In [None]:
# Report how many users and items the dataset currently knows about.
shape_message = 'Num users: {}, num_items {}.'
num_users, num_items = dataset.interactions_shape()
print(shape_message.format(num_users, num_items))

Num users: 16838, num_items 3914.


In [None]:
# Turn every rating row into a (user, item, weight) triple, with the rating
# cast to int as the weight, then build the interaction and weight matrices.
rating_triples = [
    (row['user_id'], row['product_id'], int(row['rating']))
    for row in getRatings()
]
(interactions, weights) = dataset.build_interactions(rating_triples)

print(repr(interactions))

<16838x3914 sparse matrix of type '<class 'numpy.int32'>'
	with 88436 stored elements in COOrdinate format>


In [None]:
def get_feature_byID(name):
  """Return a generator of "<name>:<value>" strings for one feature column.

  One string is produced per row of the cleaned feature file, where <value>
  is that row's entry in column `name`.
  """
  prefix = name + ":"
  return (prefix + row[name] for row in getFeatures())

In [None]:
def get_feature_ID(name):
  """Return a generator of "<stem>:<value>" strings for one feature column.

  <stem> is `name` with every digit character removed (e.g. 'note12' becomes
  'note'), and <value> is each row's entry in column `name` of the cleaned
  feature file.
  """
  stem = ''.join(ch for ch in name if not ch.isdigit())
  return (stem + ":" + row[name] for row in getFeatures())

In [None]:
# Register every product id listed in the feature file with the dataset.
dataset.fit_partial(items=(x['product_id'] for x in getFeatures()))

# Columns of the cleaned feature file that are turned into "column:value"
# item features. The order matches the original sequence of fit_partial calls,
# so features are registered in the same order as before.
ITEM_FEATURE_COLUMNS = (
    ['product_year', 'country_name']
    + ['feature%d' % i for i in range(1, 6)]     # feature1 .. feature5
    + ['num_ratings', 'avg_ratings', 'region_name']
    + ['food%d' % i for i in range(1, 8)]        # food1 .. food7
    + ['note%d' % i for i in range(1, 14)]       # note1 .. note13
    + ['class_name', 'price']
)

# One pass over the feature file per column, registering its distinct values.
for column in ITEM_FEATURE_COLUMNS:
    dataset.fit_partial(item_features=get_feature_byID(column))

In [None]:
# Columns whose "column:value" tokens describe each product. Listed in the
# same order as the original hand-written list; deriving the tokens from this
# list keeps the prefixes identical to the column names by construction.
feature_columns = (
    ['country_name', 'product_year']
    + ['feature%d' % i for i in range(1, 6)]     # feature1 .. feature5
    + ['num_ratings', 'avg_ratings', 'region_name']
    + ['food%d' % i for i in range(1, 8)]        # food1 .. food7
    + ['note%d' % i for i in range(1, 14)]       # note1 .. note13
    + ['class_name', 'price']
)

# One (product_id, [tokens]) pair per row of the cleaned feature file.
item_features = dataset.build_item_features(
    (row['product_id'], [column + ':' + row[column] for column in feature_columns])
    for row in getFeatures()
)
print(repr(item_features))

<4397x37433 sparse matrix of type '<class 'numpy.float32'>'
	with 145069 stored elements in Compressed Sparse Row format>


In [None]:
from lightfm.cross_validation import random_train_test_split
import numpy as np

# Hold out 10% of the interactions for testing. The split is seeded so that a
# fresh Restart & Run All reproduces the same train/test partition.
SPLIT_SEED = 42
train, test = random_train_test_split(interactions,
                                      test_percentage=0.1,
                                      random_state=np.random.RandomState(SPLIT_SEED))

print('The dataset has %s users and %s items, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

The dataset has 16838 users and 3914 items, with 8844 interactions in the test and 79592 interactions in the training set.


In [None]:
# Import the model
from lightfm import LightFM

# Set hyperparameters
# Passed as num_threads to the fit and evaluation calls.
NUM_THREADS = 4
# Passed as no_components to both LightFM models.
NUM_COMPONENTS = 30
# Passed as epochs to both model fits.
NUM_EPOCHS = 15
# Passed as item_alpha to both LightFM models.
ITEM_ALPHA = 1e-7
# Cut-off passed as k to precision_at_k / recall_at_k.
K = 10


In [None]:
# Initialize the model
cf_model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
               no_components=NUM_COMPONENTS)

# Fit the model
%time cf_model = cf_model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

CPU times: user 3.26 s, sys: 9.99 ms, total: 3.27 s
Wall time: 1.78 s


In [None]:
# Import the evaluation routines
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k

# Training-set metrics for the collaborative-filtering model. Each evaluation
# routine returns an array of scores; the mean of that array is reported.
cf_train_auc_scores = auc_score(cf_model, train, num_threads=NUM_THREADS)
cf_train_pak_scores = precision_at_k(cf_model, train, k=K, num_threads=NUM_THREADS)
cf_train_rak_scores = recall_at_k(cf_model, train, k=K, num_threads=NUM_THREADS)

cf_train_auc = cf_train_auc_scores.mean()
cf_train_pak = cf_train_pak_scores.mean()
cf_train_rak = cf_train_rak_scores.mean()

print('Collaborative filtering train AUC: %s' % cf_train_auc)
print('Collaborative filtering train precision @ K: %s' % cf_train_pak)
print('Collaborative filtering train recall @ K: %s' % cf_train_rak)

Collaborative filtering train AUC: 0.99024796
Collaborative filtering train precision @ K: 0.16416733
Collaborative filtering train recall @ K: 0.601338781923656


In [None]:
# Test-set metrics for the collaborative-filtering model. The training
# interactions are passed as train_interactions to every evaluation call.
cf_test_auc_scores = auc_score(cf_model, test,
                               train_interactions=train, num_threads=NUM_THREADS)
cf_test_pak_scores = precision_at_k(cf_model, test,
                                    train_interactions=train, k=K, num_threads=NUM_THREADS)
cf_test_rak_scores = recall_at_k(cf_model, test,
                                 train_interactions=train, k=K, num_threads=NUM_THREADS)

cf_test_auc = cf_test_auc_scores.mean()
cf_test_pak = cf_test_pak_scores.mean()
cf_test_rak = cf_test_rak_scores.mean()

print('Collaborative filtering test AUC: %s' % cf_test_auc)
print('Collaborative filtering test precision @ K: %s' % cf_test_pak)
print('Collaborative filtering test recall @ K: %s' % cf_test_rak)

Collaborative filtering test AUC: 0.83710164
Collaborative filtering test precision @ K: 0.013795635
Collaborative filtering test recall @ K: 0.0920092438391833


In [None]:
# The column count of the item-feature matrix is reported as the number of tags.
n_item_feature_columns = item_features.shape[1]
print('There are %s distinct tags' % n_item_feature_columns)

There are 37433 distinct tags


In [None]:
# Hybrid model: same loss and hyperparameters as the collaborative-filtering
# model, but fitted with the item-feature matrix as well.
hy_model = LightFM(no_components=NUM_COMPONENTS,
                   item_alpha=ITEM_ALPHA,
                   loss='warp')

# Train on the training split together with the item features.
hy_model = hy_model.fit(train,
                        num_threads=NUM_THREADS,
                        epochs=NUM_EPOCHS,
                        item_features=item_features)

In [None]:
# Training-set metrics for the hybrid model (mean of the returned score arrays).
# precision_at_k / recall_at_k now receive k=K explicitly, like every other
# metric call in this notebook, instead of relying on the library default.
hy_train_auc = auc_score(hy_model, train, item_features=item_features, num_threads=NUM_THREADS).mean()
hy_train_pak = precision_at_k(hy_model, train, item_features=item_features, num_threads=NUM_THREADS, k=K).mean()
hy_train_rak = recall_at_k(hy_model, train, item_features=item_features, num_threads=NUM_THREADS, k=K).mean()
print('Hybrid Model train AUC: %s' % hy_train_auc)
print('Hybrid Model train precision @ K: %s' % hy_train_pak)
print('Hybrid Model train recall @ K: %s' % hy_train_rak)

Hybrid Model train AUC: 0.9452752
Hybrid Model train precision @ K: 0.059354167
Hybrid Model train recall @ K: 0.18685855654603342


In [None]:
# Test-set metrics for the hybrid model. Every evaluation call receives the
# item features and the training interactions (as train_interactions).
hy_test_auc_scores = auc_score(hy_model, test, train_interactions=train,
                               item_features=item_features, num_threads=NUM_THREADS)
hy_test_pak_scores = precision_at_k(hy_model, test, train_interactions=train,
                                    item_features=item_features, k=K, num_threads=NUM_THREADS)
hy_test_rak_scores = recall_at_k(hy_model, test, train_interactions=train,
                                 item_features=item_features, k=K, num_threads=NUM_THREADS)

hy_test_auc = hy_test_auc_scores.mean()
hy_test_pak = hy_test_pak_scores.mean()
hy_test_rak = hy_test_rak_scores.mean()

print('Hybrid Model test AUC: %s' % hy_test_auc)
print('Hybrid Model test precision @ K: %s' % hy_test_pak)
print('Hybrid Model test recall @ K: %s' % hy_test_rak)

Hybrid Model test AUC: 0.8477068
Hybrid Model test precision @ K: 0.014015777
Hybrid Model test recall @ K: 0.09111055052673683


In [25]:
# Side-by-side summary of the training metrics for both models.
# Each entry pairs the pre-padded row label with its (AUC, precision, recall).
train_rows = [
    ('| CF MODEL  |', (cf_train_auc, cf_train_pak, cf_train_rak)),
    ('|  Hybrid   |', (hy_train_auc, hy_train_pak, hy_train_rak)),  # label typo "Hybird" fixed
]

print('                               TRAIN                                   ')
print('_______________________________________________________________________')
print('|___________|      AUC      |    Precision at K    |    Recall at K   |')
for row_label, row_metrics in train_rows:
    print(row_label + '   {:.8f}  |      {:.9f}     |  {:.12f}  |'.format(*row_metrics))
    print('|___________|_______________|______________________|__________________|')

                               TRAIN                                   
_______________________________________________________________________
|___________|      AUC      |    Precision at K    |    Recall at K   |
| CF MODEL  |   0.99024796  |      0.164167330     |  0.601338781924  |
|___________|_______________|______________________|__________________|
|  Hybird   |   0.94527519  |      0.059354167     |  0.186858556546  |
|___________|_______________|______________________|__________________|


In [26]:
# Side-by-side summary of the test metrics for both models.
# Each entry pairs the pre-padded row label with its (AUC, precision, recall).
test_rows = [
    ('| CF MODEL  |', (cf_test_auc, cf_test_pak, cf_test_rak)),
    ('|  Hybrid   |', (hy_test_auc, hy_test_pak, hy_test_rak)),  # label typo "Hybird" fixed
]

print('                                TEST                                   ')
print('_______________________________________________________________________')
print('|___________|      AUC      |    Precision at K    |    Recall at K   |')
for row_label, row_metrics in test_rows:
    print(row_label + '   {:.8f}  |      {:.9f}     |  {:.12f}  |'.format(*row_metrics))
    print('|___________|_______________|______________________|__________________|')

                                TEST                                   
_______________________________________________________________________
|___________|      AUC      |    Precision at K    |    Recall at K   |
| CF MODEL  |   0.83710164  |      0.013795635     |  0.092009243839  |
|___________|_______________|______________________|__________________|
|  Hybird   |   0.84770679  |      0.014015777     |  0.091110550527  |
|___________|_______________|______________________|__________________|
