In [1]:
import numpy as np
import pandas as pd
import scipy
print(f"SciPy version: {scipy.__version__}")
import scipy.sparse as sp
import time
import random
# Seed Python's `random` module once, up front; the train/validation/test
# split further down draws from random.random(), so this makes it repeatable.
SEED = 74
random.seed(SEED)
class T:
    """ANSI escape sequences used to colour/format text printed by this notebook."""
    # Foreground colours
    B = '\033[94m'  # blue
    G = '\033[92m'  # green
    Y = '\033[93m'  # yellow
    R = '\033[91m'  # red
    # Text attributes
    b = '\033[1m'   # bold
    E = '\033[0m'   # reset (ends any formatting started above)

SciPy version: 1.4.1


In [2]:
# Input locations: both monthly event logs live directly under DATA_DIR.
DATA_DIR = './data/'
DATA_OCT = f'{DATA_DIR}2019-Oct.csv'
DATA_NOV = f'{DATA_DIR}2019-Nov.csv'
# Only these columns are read from the event logs.
USECOLS = ['event_type', 'product_id', 'user_id']

*Minimum number of events a user needs to have before being included in the dataset*

In [3]:
# Users whose total event count is strictly below this value are dropped.
EVENT_THRESHOLD = 5

In [4]:
# Per-event-type weights, loaded from a headerless CSV whose first column is the
# weight name (expected layout: one name column, one value column).
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the
# single data column is squeezed into a Series explicitly after loading instead.
ux_constants = pd.Series(
    pd.read_csv(DATA_DIR+r'ux_constants.csv', index_col=0, header=None).squeeze("columns"),
    dtype='float32',
)
VIEW     = ux_constants['view_to_purchase']
CART     = ux_constants['cart_to_purchase']
REMOVE   = ux_constants['remove_to_purchase']
PURCHASE = ux_constants['purchase_to_purchase']

def event_to_ux(event):
    """Return the weight configured for an event type.

    Parameters
    ----------
    event : hashable
        Event name; 'view', 'cart', 'remove_from_cart' and 'purchase' are
        mapped to the module-level VIEW, CART, REMOVE and PURCHASE values.

    Returns
    -------
    The matching weight, or 0 for any other event name.
    """
    weight_by_event = dict(
        view=VIEW,
        cart=CART,
        remove_from_cart=REMOVE,
        purchase=PURCHASE,
    )
    return weight_by_event[event] if event in weight_by_event else 0

In [5]:
# Read both monthly logs (restricted to USECOLS) and stack them, October first.
df = pd.concat([
    pd.read_csv(csv_path, engine='c', sep=',', usecols=USECOLS)
    for csv_path in (DATA_OCT, DATA_NOV)
])

In [6]:
# Store event_type with pandas' categorical dtype instead of plain strings.
df["event_type"] = df["event_type"].astype("category")

In [7]:
# Show column dtypes and the frame's memory usage after the conversion above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8738120 entries, 0 to 4635836
Data columns (total 3 columns):
 #   Column      Dtype   
---  ------      -----   
 0   event_type  category
 1   product_id  int64   
 2   user_id     int64   
dtypes: category(1), int64(2)
memory usage: 208.3 MB


In [8]:
# Remember the unfiltered (rows, columns) shape; the row count is used later
# to report how many events the user filter removed.
start_dim = df.shape
start_dim

(8738120, 3)

In [9]:
# Number of distinct user_id values before any filtering.
print(f"We start with {len(df.user_id.unique()):,} unique users.")

We start with 713,100 unique users.


# Data Reduction

In [10]:
# Count events per user once and reuse the result: the original recomputed
# value_counts() twice and unique() a third time over the full event log.
# len(events_per_user) is the number of distinct users (user_id is int64, so
# there are no missing values for value_counts() to exclude).
events_per_user = df.user_id.value_counts()
drop_visitors = set(events_per_user[events_per_user<EVENT_THRESHOLD].index)
print(f"We will {T.R}drop {len(drop_visitors):,} ({len(drop_visitors)*100/len(events_per_user):.2f}%) users,{T.E} for not meeting the minimum event number requirements.")

We will [91mdrop 535,508 (75.10%) users,[0m for not meeting the minimum event number requirements.


In [11]:
# Keep only events from users that met the threshold, renumbering rows from 0.
df = df.loc[~df.user_id.isin(drop_visitors)].reset_index(drop=True)

In [12]:
# Percentage of events removed, relative to the row count stored in start_dim.
print(f"This way we have reduced the number of total events by {T.G}{100-len(df)*100/start_dim[0]:.2f}%{T.E}.")

This way we have reduced the number of total events by [92m10.95%[0m.


In [13]:
# Assign every remaining user a dense 0-based id (order of first appearance),
# persist the mapping, and keep a user_id -> dense id lookup Series.
new_user_id = pd.DataFrame({'user_id': df.user_id.unique()})
print(f"We will have {T.B}{len(new_user_id):,} unique users.{T.E}")
new_user_id.to_csv(DATA_DIR+r'new_user_id.csv', index=True, header=True)
uid_lookup = pd.Series(new_user_id.index, index=new_user_id.user_id)
del new_user_id

We will have [94m177,592 unique users.[0m


In [14]:
# Assign every remaining product a dense 0-based id (order of first appearance),
# persist the mapping, and keep a product_id -> dense id lookup Series.
new_product_id = pd.DataFrame({'product_id': df.product_id.unique()})
print(f"We will have {T.B}{len(new_product_id):,} unique features{T.E} (products for e-commerce).")
new_product_id.to_csv(DATA_DIR+r'new_product_id.csv', index=True, header=True)
pid_lookup = pd.Series(new_product_id.index, index=new_product_id.product_id)
del new_product_id

We will have [94m44,780 unique features[0m (products for e-commerce).


# Feature engineering

In [15]:
# Matrix dimensions: one row per distinct user, one column per distinct product.
number_of_users = len(df['user_id'].unique())
number_of_features = len(df['product_id'].unique())

In [16]:
def user_experience_matrix(df):
    """Accumulate a sparse user x product "user experience" matrix from events.

    Rows of `df` are processed in iteration order. For each event the cell
    addressed by (uid_lookup[user_id], pid_lookup[product_id]) is replaced with
    tanh(previous value + event_to_ux(event_type)).

    Relies on module-level names: number_of_users, number_of_features,
    uid_lookup, pid_lookup, event_to_ux and T.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs `user_id`, `product_id` and `event_type` columns. The progress
        output is keyed on the frame's index values.

    Returns
    -------
    scipy.sparse.dok_matrix
        float32 matrix of shape (number_of_users, number_of_features).
    """
    final_index = len(df) - 1
    # float32 pairs with torch.cuda.FloatTensor (np.float16 would pair with
    # torch.cuda.HalfTensor); float64 is not recommended.
    uxm = sp.dok_matrix((number_of_users, number_of_features), dtype=np.float32)
    print(f"   Event |   User | Product | Event | Previous |   {T.b}New UX{T.E}")

    for event in df.itertuples():
        user_idx = uid_lookup[event.user_id]
        product_idx = pid_lookup[event.product_id]
        previous = uxm[user_idx, product_idx]
        # Debug alternative: `updated = previous + 1` counts the events seen
        # for each user-product pair instead.
        updated = np.tanh(previous + event_to_ux(event.event_type))
        uxm[user_idx, product_idx] = updated

        # Progress row every 500,000th index and for the very last row.
        at_checkpoint = event.Index % 500000 == 0
        if at_checkpoint or event.Index == final_index:
            print(f"{event.Index:8} | {user_idx:6} | {product_idx:7} |  "
                  f"{event.event_type[:4]} | {previous:8.5f} | "
                  f"{T.b}{updated:8.5f}{T.E}")
    return uxm


In [17]:
# Build the matrix and report the wall-clock time it took.
start_time = time.time()
uxm = user_experience_matrix(df)
elapsed_seconds = time.time()-start_time
print(f"Elapsed time: {elapsed_seconds:.2f} seconds")

   Event |   User | Product | Event | Previous |   [1mNew UX[0m
       0 |      0 |       0 |  cart |  0.00000 | [1m 0.19295[0m
  500000 |    946 |   22949 |  view |  0.00000 | [1m 0.05298[0m
 1000000 |   1287 |   33175 |  remo |  0.23090 | [1m 0.26649[0m
 1500000 |  55881 |    4161 |  cart |  0.00000 | [1m 0.19295[0m
 2000000 |  56372 |    4570 |  cart |  0.72553 | [1m 0.72634[0m
 2500000 |  80486 |   18263 |  view |  0.47220 | [1m 0.48172[0m
 3000000 |  92749 |   11986 |  view |  0.00000 | [1m 0.05298[0m
 3500000 |  18163 |   12778 |  purc |  0.36994 | [1m 0.87868[0m
 4000000 | 114521 |   28527 |  view |  0.05298 | [1m 0.10561[0m
 4500000 | 122760 |   17009 |  remo |  0.00000 | [1m 0.04216[0m
 5000000 |   2867 |   26409 |  view |  0.00000 | [1m 0.05298[0m
 5500000 |  91468 |    3903 |  cart |  0.00000 | [1m 0.19295[0m
 6000000 |  84718 |     564 |  remo |  0.24114 | [1m 0.27598[0m
 6500000 | 156464 |   16120 |  remo |  0.51193 | [1m 0.50360[0m
 7000000 |

In [26]:
# Durations (seconds) recorded by hand from earlier runs of the cell above.
recorded_durations = np.array([290.73,290.71,289.59,290.45,288.57,288.90,
                               291.77,286.21,288.36,288.63,283.52])
print(f"Mean feature engineering duration: {recorded_durations.mean():.2f} seconds")

Mean feature engineering duration: 288.86 seconds


# Train - test - validation split

In [19]:
def save_to_npz(X,filename):
    """Write a sparse matrix to `<DATA_DIR><filename>.npz` in COO format.

    Parameters
    ----------
    X : scipy sparse matrix
        Converted with `.tocoo()` before saving.
    filename : str
        File stem; the `.npz` extension is appended here.

    Prints a confirmation line once the file has been written.
    """
    destination = DATA_DIR + filename + '.npz'
    sp.save_npz(destination, X.tocoo())
    print(f"{T.G}Sparse matrix saved as {filename}.npz{T.E}")

In [20]:
# Cut points applied to a random.random() draw when splitting the matrix:
#   draw >  TEST_THRESHOLD                  -> test
#   VAL_THRESHOLD < draw <= TEST_THRESHOLD  -> validation
#   draw <= VAL_THRESHOLD                   -> train
# The hold-out part (1 - VAL_THRESHOLD) is divided evenly between validation and test.
VAL_THRESHOLD = 0.7
TEST_THRESHOLD = VAL_THRESHOLD+(1-VAL_THRESHOLD)/2
# Derive each share from its own interval. The original printed the validation
# share with the test formula, which is only right while the hold-out split is even.
TRAIN_SHARE = VAL_THRESHOLD
VAL_SHARE = TEST_THRESHOLD-VAL_THRESHOLD
TEST_SHARE = 1-TEST_THRESHOLD
print(f"Train: {TRAIN_SHARE*100:.2f}% \nValidation: {VAL_SHARE*100:.2f}% \nTest: {TEST_SHARE*100:.2f}%")

Train: 70.00% 
Validation: 15.00% 
Test: 15.00%


In [21]:
# Number of stored entries in the full matrix; used after the split as the
# denominator for the per-split percentages and in the partition check.
NNZ = uxm.nnz
print(f"Number of stored values: {NNZ:,}")

Number of stored values: 3,687,560


In [22]:
# Train starts as a full copy of uxm; validation and test start empty with the
# same shape and dtype and receive the entries moved out of train.
split_shape = (number_of_users, number_of_features)
uxm_train = uxm.copy()
uxm_val = sp.dok_matrix(split_shape, dtype=np.float32)
uxm_test = sp.dok_matrix(split_shape, dtype=np.float32)

In [23]:
# One random draw per stored entry decides where it ends up. Entries that leave
# train are copied to the chosen hold-out matrix and then zeroed in train.
rows,cols = uxm_train.nonzero()
for row,col in zip(rows,cols):
    rnd = random.random()
    if rnd > TEST_THRESHOLD:
        held_out = uxm_test
    elif rnd > VAL_THRESHOLD:
        held_out = uxm_val
    else:
        continue  # entry stays in the training matrix
    held_out[row,col] = uxm_train[row,col]
    uxm_train[row,col] = 0

In [24]:
# Report how many stored entries each split holds, as a count and as a share of NNZ.
for split_name, split_matrix in (("train", uxm_train), ("validation", uxm_val), ("test", uxm_test)):
    print(f"Number of {split_name} data values: {split_matrix.nnz:,} ({split_matrix.nnz*100/NNZ:.2f}%)")
# The three splits must partition the stored entries of the full matrix. This
# relies on entries zeroed in uxm_train no longer being counted by .nnz.
errormessage = '''All datapoints should be in either the train, the test or the validation datasets. 
The reason might be a change in how .nnz of a DOK matrix (scipy.sparse.dok_matrix) is calculated. 
In version 1.4.1 SciPy setting the value to zero explicitly (X[i,j]=0) is not counted by .nnz'''
assert uxm_train.nnz + uxm_val.nnz + uxm_test.nnz == NNZ, errormessage

Number of train data values: 2,581,863 (70.02%)
Number of validation data values: 553,442 (15.01%)
Number of test data values: 552,255 (14.98%)


In [25]:
# Persist the full matrix and the three splits, in that order.
for file_stem, matrix in (('uxm', uxm),
                          ('uxm_train', uxm_train),
                          ('uxm_val', uxm_val),
                          ('uxm_test', uxm_test)):
    save_to_npz(matrix, file_stem)

[92mSparse matrix saved as uxm.npz[0m
[92mSparse matrix saved as uxm_train.npz[0m
[92mSparse matrix saved as uxm_val.npz[0m
[92mSparse matrix saved as uxm_test.npz[0m
