In [1]:
import os
import time

import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csc_matrix as csc
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    explained_variance_score,
    roc_auc_score,
    log_loss,
)
from sklearn.preprocessing import minmax_scale

from constants import (SEED, EPSILON, EVENT_THRESHOLD, DEFAULT_K, DEFAULT_THRESHOLD,  UX_CONSTANTS, LOG_DIR,
                       DATA_DIR, TEST_DATA_PATH, DATA_OCT, DATA_NOV, USECOLS, USER, ITEM, RATING, PREDICTION)
from utilities.ms_evaluation import (rmse, auc, logloss, precision_at_k, recall_at_k, ndcg_at_k, map_at_k, mae, rsquared, exp_var)

pd.options.display.float_format = '{:,.6f}'.format

In [2]:
# NAME is the prefix of the prediction file and the name of the log sub-directory.
NAME = r'BasicMatrixFactorization'
# Build both paths with os.path.join so the separator matches the host OS.
# The previous hard-coded '\\' only produced a nested path on Windows, and it
# was inconsistent with the '/' used for Y_HAT_PATH.
Y_HAT_PATH = os.path.join(DATA_DIR, NAME + r'-y_hat.npz')
TEST_RESULTS_PATH = os.path.join(LOG_DIR, NAME, 'test-results.csv')
# Provenance suffixes appended to the printed metric lines.
SKL = "(calculated using sklearn.metrics on non-zero values of sparse matrices)"
SPA = "(calculated using CSC sparse matrix operations)"
# NOTE: "MSE" here labels the Microsoft Evaluation method, not mean squared error.
MSE = "(calculated using the Microsoft Evaluation method)"

In [3]:
# Start the wall-clock timer that the final "Elapsed time" cell reads.
start_time = time.time()

# Empty float series that the metric cells fill in, one entry per metric.
log = pd.Series(dtype='float64')

# Load ground truth first, then the predictions (both stored as .npz sparse matrices).
y = sp.load_npz(TEST_DATA_PATH)
y_hat = sp.load_npz(Y_HAT_PATH)

# Element-wise comparison below is only meaningful for equally shaped matrices.
assert y_hat.shape == y.shape, 'The shape of Y and Y_hat must match, otherwise they are not comparable.'

# Report the shape and the number of stored values of each matrix.
print(
    f"Shape of the matrices: {y.shape}",
    "Number of non-zero values:",
    f"Y: {y.nnz:8,}",
    f"Ŷ: {y_hat.nnz:8,}",
    sep="\n",
)

Shape of the matrices: (177592, 44780)
Number of non-zero values:
Y:  552,255
Ŷ:  552,255


In [4]:
# Convert both matrices to CSC so they share one storage format for the sparse
# arithmetic in the metric cells.
# (Original rationale: CSC when there are more rows than columns, CSR otherwise.)
y_hat = y_hat.tocsc()
y = y.tocsc()

# Gather ground truth and predictions at the SAME coordinates, the non-zero
# cells of y, so y_nz[i] and y_hat_nz[i] always refer to the same (row, col).
# Calling nonzero() on each matrix independently pairs values by position only,
# which misaligns the vectors (or gives them different lengths) whenever y_hat
# holds a zero prediction or has a different sparsity pattern than y.
obs_rows, obs_cols = y.nonzero()
y_nz = np.array(y[obs_rows, obs_cols]).reshape(-1)
y_hat_nz = np.array(y_hat[obs_rows, obs_cols]).reshape(-1)

# Standard metrics

### Mean Square Error

In [5]:
# Mean squared error: sum of the squared entries of (y_hat - y), divided by
# the number of stored entries of y (y.nnz).
mse_spa = (y_hat - y).power(2).sum() / y.nnz
log["mse"] = mse_spa
print(f"Mean Square Error: {mse_spa} {SPA}")
print('Note: The smaller the better.')

Mean Square Error: 1.310757711564404 (calculated using CSC sparse matrix operations)
Note: The smaller the better.


### Root Mean Square Error

In [6]:
# Root mean squared error: square root of the already computed mse_spa.
rmse_spa = np.sqrt(mse_spa)
log["rmse"] = rmse_spa
print(f"Root Mean Square Error: {rmse_spa} {SPA}")
print('Note: The smaller the better.')

Root Mean Square Error: 1.144883274209386 (calculated using CSC sparse matrix operations)
Note: The smaller the better.


### Mean Absolute Error

In [7]:
# Mean absolute error: sum of the absolute entries of (y_hat - y), divided by
# the number of stored entries of y (y.nnz).
mae_spa = abs(y_hat - y).sum() / y.nnz
log["mae"] = mae_spa
print(f"Mean Absolute Error: {mae_spa} {SPA}")
print('Note: The smaller the better.')

Mean Absolute Error: 0.5893737381282198 (calculated using CSC sparse matrix operations)
Note: The smaller the better.


### R²

In [8]:
# Coefficient of determination from sklearn, computed on the dense vectors
# y_nz (ground truth) and y_hat_nz (predictions).
r2_skl = r2_score(y_true=y_nz, y_pred=y_hat_nz)
log["r-squared"] = r2_skl
print(f"Coefficient of determination (R\u00B2): {r2_skl} {SKL}")
print("Note: The closer to 1 the better.")

Coefficient of determination (R²): -17.556097329188866 (calculated using sklearn.metrics on non-zero values of sparse matrices)
Note: The closer to 1 the better.


### Explained variance

In [9]:
# Explained variance from sklearn, computed on the dense vectors y_nz
# (ground truth) and y_hat_nz (predictions).
exp_var_skl = explained_variance_score(y_nz, y_hat_nz)
# Append the SKL provenance suffix, as the R² cell does, so every printed
# metric states how it was calculated.
print(f"Explained variance: {exp_var_skl} {SKL}")
print("Note: The closer to 1 the better.")
log["exp_var"] = exp_var_skl

Explained variance: -16.636564254760742
Note: The closer to 1 the better.


In [10]:
# Show every metric recorded in `log` so far as this cell's output.
log

mse           1.310758
rmse          1.144883
mae           0.589374
r-squared   -17.556097
exp_var     -16.636564
dtype: float64

In [11]:
# Wall-clock seconds since start_time was captured in the loading cell.
# When cells are run interactively this includes idle time between runs.
elapsed_seconds = time.time() - start_time
print(f"Elapsed time: {elapsed_seconds:.2f} seconds")

Elapsed time: 0.30 seconds


In [12]:
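# Persisting the metrics is currently disabled. Uncomment the next line to
# write `log` to TEST_RESULTS_PATH as a header-less CSV (metric name, value).
# log.to_csv(TEST_RESULTS_PATH, index = True, header=False)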