In [6]:
import numpy as np
import pandas as pd
#import fastai

The objective of this notebook is to compute a user-item-interaction (UII) matrix to be used for Collaborative Filtering.
The dataset used is the events.csv downloaded from Kaggle. https://www.kaggle.com/retailrocket/ecommerce-dataset

In [8]:
events = pd.read_csv('events.csv')

In [9]:
events.head(5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [10]:
events.shape

(2756101, 5)

In [11]:
events = events[['visitorid', 'event', 'itemid']]

In [12]:
events.head(5)

Unnamed: 0,visitorid,event,itemid
0,257597,view,355908
1,992329,view,248676
2,111016,view,318965
3,483717,view,253185
4,951259,view,367447


In [13]:
events.dropna(inplace = True)

In [14]:
events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2756101 entries, 0 to 2756100
Data columns (total 3 columns):
visitorid    int64
event        object
itemid       int64
dtypes: int64(2), object(1)
memory usage: 84.1+ MB


In [15]:
# unique visitorid
visitorid = events['visitorid'].unique()
# unique itemid
itemid = events['itemid'].unique()

print('There are {} visitors and {} items'.format(len(visitorid), len(itemid)))

There are 1407580 visitors and 235061 items


In [16]:
event = events['event'].unique()
print(event)
event_count = events['event'].value_counts()
print(event_count)

['view' 'addtocart' 'transaction']
view           2664312
addtocart        69332
transaction      22457
Name: event, dtype: int64


In [17]:
# 'view' --> score 1
# 'addtocart' --> score 2
# 'transaction' --> score 3

events['event'].replace(['view', 'addtocart', 'transaction'], [1, 2, 3], inplace = True)


In [18]:
events.rename(columns = {'event':'rating'}, inplace = True)
events.head()

Unnamed: 0,visitorid,rating,itemid
0,257597,1,355908
1,992329,1,248676
2,111016,1,318965
3,483717,1,253185
4,951259,1,367447


In [20]:
# Uncomment to export events dataframe as rating.csv
events.to_csv('rating.csv')


In [13]:
events=events[:50000] # we only use 50000 rows because the data is too large to run
events.astype({'visitorid':'object', 'itemid':'object'})

Unnamed: 0,visitorid,rating,itemid
0,257597,1,355908
1,992329,1,248676
2,111016,1,318965
3,483717,1,253185
4,951259,1,367447
...,...,...,...
49995,604,1,87869
49996,350728,1,232379
49997,1381600,1,123688
49998,397480,1,277276


In [14]:
events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   visitorid  50000 non-null  int64
 1   rating     50000 non-null  int64
 2   itemid     50000 non-null  int64
dtypes: int64(3)
memory usage: 1.5 MB


In [15]:
#creating the user-item interaction matrix
retail_UII = pd.pivot_table(events, index='visitorid', columns='itemid', values='rating')

In [16]:
retail_UII.shape

(28860, 24716)

In [18]:
retail_UII.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28860 entries, 17 to 1407576
Columns: 24716 entries, 6 to 466867
dtypes: float64(24716)
memory usage: 5.3 GB


Inside the retail_UII matrix, there are a lot of NaN values because only a few users bought a particular product

In [20]:
retail_UII.head()

itemid,6,15,55,66,92,147,168,190,195,217,...,466760,466772,466785,466789,466795,466828,466858,466861,466864,466867
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,,,,,,,,,,,...,,,,,,,,,,
52,,,,,,,,,,,...,,,,,,,,,,
74,,,,,,,,,,,...,,,,,,,,,,
109,,,,,,,,,,,...,,,,,,,,,,
122,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Uncomment to export the user-item matrix to csv
#retail_UII.to_csv('dataset/retail_UII.csv')