In [1]:
import numpy as np
import scipy.sparse as sp
from scipy.sparse import csc_matrix as csc
import pandas as pd

In [2]:
# Users with fewer events than this are dropped from the raw event log before evaluation.
EVENT_THRESHOLD = 5

# Name of the evaluated model; it is part of the prediction file name and the log sub-folder.
NAME = r'BasicMatrixFactorization'

# NOTE(review): absolute, Windows-only location -- adjust when running on another machine.
LOG_DIR = r'C:\TensorLogs'
DATA_DIR = r'./data/'

# DATA_DIR already ends with a path separator, so no extra '/' is inserted here
# (the previous form produced './data//<NAME>-y_hat.npz').
Y_HAT_PATH = DATA_DIR+NAME+r'-y_hat.npz'
TEST_DATA_PATH = DATA_DIR+r'uxm_test.npz'
TEST_RESULTS_PATH = LOG_DIR+'\\'+NAME+'\\test-results.csv'

# Raw event logs and the only columns read from them.
DATA_OCT = DATA_DIR+r'2019-Oct.csv'
DATA_NOV = DATA_DIR+r'2019-Nov.csv'
USECOLS = ["event_type","product_id","user_id"]

In [3]:
# Load the event-weight constants as a float32 Series keyed by the first CSV column.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the
# single remaining data column is squeezed into a Series explicitly instead.
ux_constants = pd.Series(
    pd.read_csv(DATA_DIR+r'ux_constants.csv', index_col=0, header=None).squeeze("columns"),
    dtype='float32',
)
VIEW     = ux_constants['view_to_purchase']
CART     = ux_constants['cart_to_purchase']
REMOVE   = ux_constants['remove_to_purchase']
PURCHASE = ux_constants['purchase_to_purchase']

In [4]:
# Empty float Series that collects every metric computed further down.
log = pd.Series(dtype='float64')

# Predictions first, then the held-out test matrix they are compared against.
y_hat, y = sp.load_npz(Y_HAT_PATH), sp.load_npz(TEST_DATA_PATH)
assert y_hat.shape == y.shape, 'The shape of Y and Y_hat must match, otherwise they are not comparable.'

# One print call, one line per argument -- the emitted text is unchanged.
print(
    f"Shape of the matrices: {y.shape}",
    "Number of non-zero values:",
    f"Y: {y.nnz:8,}",
    f"Ŷ: {y_hat.nnz:8,}",
    sep="\n",
)

Shape of the matrices: (177592, 44780)
Number of non-zero values:
Y:  552,255
Ŷ:  552,255


In [5]:
# Convert both matrices to CSC. Rule of thumb used here: CSC when there are more
# rows than columns, CSR when there are more columns than rows.
y_hat, y = y_hat.tocsc(), y.tocsc()

In [6]:
# Sum of squared differences, averaged over the number of stored entries in y.
squared_error = (y_hat - y).power(2)
MSE = squared_error.sum() / y.nnz
log["MSE"] = MSE
print(f"Mean Square Error: {MSE}")

Mean Square Error: 1.3106135299816208


In [7]:
# RMSE is the square root of the MSE computed in the previous cell.
RMSE = np.sqrt(MSE)
log["RMSE"] = RMSE
print(f"Root Mean Square Error: {RMSE}")

Root Mean Square Error: 1.14482030466865


In [8]:
# Sum of absolute differences, averaged over the number of stored entries in y.
absolute_error = abs(y_hat - y)
MAE = absolute_error.sum() / y.nnz
log["MAE"] = MAE
print(f"Mean Absolute Error: {MAE}")

Mean Absolute Error: 0.58932699794479


In [9]:
# log.to_csv(TEST_RESULTS_PATH, index = True, header=False)
# log

# Did we predict purchase behaviour correctly?
Can we know if a user will buy a product before their first event related to the product?

We need to be aware that some users might have made a purchase without dispatching at least EVENT_THRESHOLD (5) events!
For example, user_id 533444013 had only 2 events, one of which was a purchase.

In [10]:
# Read both monthly event logs (only the columns in USECOLS) into one frame.
df = pd.concat([pd.read_csv(DATA_OCT, engine='c', sep=',',usecols=USECOLS)
                ,pd.read_csv(DATA_NOV, engine='c', sep=',',usecols=USECOLS)])

# Count the events per user once (the counts were previously computed twice on the
# full frame) and collect the users that stay below EVENT_THRESHOLD.
events_per_user = df.user_id.value_counts()
drop_visitors = set(events_per_user[events_per_user < EVENT_THRESHOLD].index)

# Keep only the remaining users and renumber the rows without mutating in place.
df = df[~df.user_id.isin(drop_visitors)].reset_index(drop=True)

In [11]:
# Keep only the purchase events; once filtered, the event_type column carries no
# information and is removed.
is_purchase = df.event_type == 'purchase'
df = df.loc[is_purchase].drop(columns=['event_type'])

In [12]:
# Lookup tables from the original ids (the index, taken from CSV column 1) to the
# int32 values stored in the remaining column.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the
# single remaining data column is squeezed into a Series explicitly instead.
new_user_id = pd.Series(
    pd.read_csv(DATA_DIR+r'new_user_id.csv', index_col=1).squeeze("columns"),
    dtype='int32',
)
new_product_id = pd.Series(
    pd.read_csv(DATA_DIR+r'new_product_id.csv', index_col=1).squeeze("columns"),
    dtype='int32',
)

In [13]:
# Set of (new_user_id, new_product_id) pairs, one per distinct purchased
# user/product combination in df. Each raw id is translated through the lookup
# Series exactly as before; the unused counter `i` has been removed.
purchases = {
    (new_user_id[user_id], new_product_id[product_id])
    for user_id, product_id in zip(df.user_id, df.product_id)
}

In [14]:
# Sanity check of the set: new_user_id 102890 is known to have purchased the product
# with new_product_id 21957, so this membership test is expected to evaluate to True.
(102890, 21957) in purchases 

True

In [15]:
# Show the four event-weight constants with five decimals, one per line.
print(
    f"VIEW     = {VIEW:.5f}",
    f"CART     = {CART:.5f}",
    f"REMOVE   = {REMOVE:.5f}",
    f"PURCHASE = {PURCHASE:.5f}",
    sep="\n",
)

VIEW     = 0.05302
CART     = 0.19540
REMOVE   = 0.04219
PURCHASE = 1.00000


In [16]:
def confusion_matrix(A, log, threshold=0.5, purchase_pairs=None):
    """Count, print and log the confusion-matrix cells of the predictions in ``A``.

    Only the non-zero entries of ``A`` are evaluated. An entry counts as a
    predicted purchase when its value is ``>= threshold`` and as an actual
    purchase when its ``(row, col)`` pair is contained in the purchase set.

    Parameters
    ----------
    A : scipy sparse matrix (or anything ``scipy.sparse.coo_matrix`` accepts)
        Matrix of predicted values.
    log : mapping supporting item assignment (e.g. ``pd.Series`` or ``dict``)
        Receives the keys ``true_positives``, ``true_negatives``,
        ``false_positives`` and ``false_negatives``. It is modified in place.
    threshold : float, default 0.5
        Smallest value that is treated as a predicted purchase.
    purchase_pairs : container of ``(row, col)`` tuples, optional
        Known purchases. When omitted, the module-level ``purchases`` set is used.

    Returns
    -------
    The same ``log`` object that was passed in.
    """
    pairs = purchases if purchase_pairs is None else purchase_pairs

    # Read coordinates and values in one pass instead of indexing the sparse
    # matrix element by element, which is very slow inside a Python loop.
    coo = sp.coo_matrix(A, copy=True)
    coo.sum_duplicates()          # one value per coordinate, as A[row, col] would give
    stored = coo.data != 0        # same selection rule as A.nonzero()
    rows = coo.row[stored].tolist()
    cols = coo.col[stored].tolist()
    predictions = (coo.data[stored] >= threshold).tolist()

    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    for row, col, predicted_positive in zip(rows, cols, predictions):
        expected_positive = (row, col) in pairs
        if expected_positive and predicted_positive:
            true_positives += 1
        elif expected_positive:
            false_negatives += 1
        elif predicted_positive:
            false_positives += 1
        else:
            true_negatives += 1

    ### Print and log the results
    print(f"* True positives: {true_positives}")
    log["true_positives"]=true_positives
    print(f"* True negatives: {true_negatives}")
    log["true_negatives"]=true_negatives
    print(f"* False positives: {false_positives}")
    log["false_positives"]=false_positives
    print(f"* False negatives (worst case): {false_negatives}")
    log["false_negatives"]=false_negatives
    print(f"* False total: {false_negatives+false_positives}")
    return log

## Example output for y
### Sanity check of the data preparation: evaluating y itself (where MSE = 0) should give near-zero false negatives
* True positives: 82194
* True negatives: 463701
* False positives: 6357
* False negatives (worst case): 3
* False total: 6360

In [17]:
# Score the predicted matrix against the known purchases and keep the counts in `log`.
log = confusion_matrix(A=y_hat, log=log)

* True positives: 6762
* True negatives: 427498
* False positives: 42560
* False negatives (worst case): 75435
* False total: 117995


In [18]:
# Display every metric collected in `log` so far.
log

MSE                     1.310614
RMSE                    1.144820
MAE                     0.589327
true_positives       6762.000000
true_negatives     427498.000000
false_positives     42560.000000
false_negatives     75435.000000
dtype: float64