In [1]:
import numpy as np
import matplotlib.pyplot as plt
FONT_SIZE = 16
plt.rcParams['figure.figsize'] = (20,8)
plt.rcParams['text.usetex'] = True
plt.rcParams['font.size'] = FONT_SIZE
plt.rcParams['legend.fontsize'] = FONT_SIZE
plt.rcParams['xtick.labelsize'] = FONT_SIZE
plt.rcParams['ytick.labelsize'] = FONT_SIZE
%config InlineBackend.figure_format ='retina'
import scipy.sparse as sp
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
from constants import (TEST_DATA_PATH, UX_CONSTANTS,DATA_DIR, SEED,
                        TEST_DATA_PATH, TRAIN_DATA_PATH,
                       DATA_OCT, DATA_NOV,USECOLS,EVENT_THRESHOLD)
from collections import OrderedDict
from tqdm.notebook import tqdm

In [2]:
# Location of the saved prediction matrix (sparse .npz).
Y_HAT_PATH = DATA_DIR+r'/AdamUXML-y_hat-500epochs.npz'

# The constants CSV has no header row; its first column holds the constant names.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the single data column is squeezed into a Series explicitly instead.
ux_constants = pd.Series(
    pd.read_csv(UX_CONSTANTS, index_col=0, header=None).squeeze("columns"),
    dtype='float32',
)

# Predictions strictly above this value are treated as predicted conversions below.
POSITIVE_ABOVE = ux_constants['positive_above']

In [3]:
# Load both monthly event exports and stack them into one frame.
input_df = pd.concat([pd.read_csv(path, engine='c', sep=',')
                      for path in (DATA_OCT, DATA_NOV)])

# Count events per user ONCE (the frame has millions of rows) and collect the
# users that have fewer than EVENT_THRESHOLD events.
events_per_user = input_df.user_id.value_counts()
drop_visitors = set(events_per_user[events_per_user < EVENT_THRESHOLD].index)

# Keep only the remaining users and renumber the rows 0..n-1. Chaining
# reset_index (instead of inplace=True on a filtered slice) yields a fresh frame.
input_df = input_df[~input_df.user_id.isin(drop_visitors)].reset_index(drop=True)

In [4]:
### Convert new_user_id and new_product_id to database user_id and product_id
def _load_id_lookup(csv_name):
    """Read an id-mapping CSV from DATA_DIR and return it as an OrderedDict.

    The first CSV column is used as the key and the single remaining column
    (cast to int32) as the value.

    `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
    2.0, so the one-column frame is squeezed into a Series explicitly.
    """
    id_series = pd.read_csv(DATA_DIR + csv_name, index_col=0).squeeze("columns")
    return pd.Series(id_series, dtype='int32').to_dict(OrderedDict)


uid_lookup = _load_id_lookup(r'new_user_id.csv')
pid_lookup = _load_id_lookup(r'new_product_id.csv')

In [5]:
# To convert a new_*_id back to the database *_id, index the matching lookup:
# uid_lookup[new_user_id] or pid_lookup[new_product_id].
i = 4
print(f"For example, new_user_id:{i} is user_id:{uid_lookup[i]} in the database and the CSV export of it.")

For example, new_user_id:4 is user_id:467916806 in the database and the CSV export of it.


In [6]:
# Load the three sparse matrices in one pass: predictions, test set, training set.
y_hat, y, y_train = (
    sp.load_npz(npz_path)
    for npz_path in (Y_HAT_PATH, TEST_DATA_PATH, TRAIN_DATA_PATH)
)
# nnz is the number of stored entries in each sparse matrix.
print(f"MLUX made {y_hat.nnz:,} pedictions, being trained on {y_train.nnz:,} datapoints for 500 epochs.")

MLUX made 552,255 pedictions, being trained on 2,581,863 datapoints for 500 epochs.


# All predictions as a dataframe

In [7]:
# Pull the (row, col, value) triplets straight from the COO form instead of
# indexing a DOK matrix once per entry in a Python loop.
# `nonzero()` is the COO coordinates whose stored value != 0, so applying the
# same mask keeps exactly the same entries in exactly the same order.
# NOTE(review): assumes the matrix has no duplicate coordinates — confirm.
y_hat_coo = y_hat.tocoo()
stored_nonzero = y_hat_coo.data != 0
rows = y_hat_coo.row[stored_nonzero]
cols = y_hat_coo.col[stored_nonzero]
p_conversion = y_hat_coo.data[stored_nonzero]
del y_hat_coo

# Translate matrix indices to database ids; a missing index still raises KeyError.
user_ids = [uid_lookup[row] for row in rows.tolist()]
product_ids = [pid_lookup[col] for col in cols.tolist()]

# Every (user_id, product_id) pair that received a prediction.
test_set = set(zip(user_ids, product_ids))

preds = pd.DataFrame(
    {'user_id': user_ids,
     'product_id': product_ids,
     'p_conversion': p_conversion.tolist()},
    columns=['user_id','product_id','p_conversion'],
)
del user_ids, product_ids, p_conversion

# NOTE(review): the original cell left y_hat in DOK format; the conversion is
# kept so later cells see the same type. Drop it if nothing downstream needs DOK.
y_hat = y_hat.todok()

In [8]:
# Display the predictions frame: one row per (user_id, product_id) with its p_conversion.
preds

Unnamed: 0,user_id,product_id,p_conversion
0,559469332,5877495,0.0328
1,536041479,5711162,0.0136
2,551025996,5900647,-0.1729
3,529061768,5649461,0.3923
4,552392310,5875392,0.2250
...,...,...,...
552250,565813192,5843949,0.1340
552251,494135107,5875513,0.0861
552252,563238967,5815732,0.7068
552253,476481009,31612,0.5522


In [9]:
tqdm.pandas()  # kept so `progress_apply` stays registered for any other cell that uses it
def func(user_id,product_id):
    """Return True if the (user_id, product_id) pair is in `test_set` (scalar form of the check below)."""
    return tuple([user_id,product_id]) in test_set

# Flag every event whose (user_id, product_id) pair is in test_set.
# A left merge with indicator=True does this in one vectorised pass instead of
# calling a Python function once per row of the multi-million-row frame.
pair_cols = ['user_id', 'product_id']
if test_set:
    test_pairs = pd.DataFrame(list(test_set), columns=pair_cols)
    matched = input_df[pair_cols].merge(test_pairs, on=pair_cols, how='left', indicator=True)
    # test_set holds unique pairs, so the left join keeps one output row per input row.
    assert len(matched) == len(input_df), "left join must not add or drop rows"
    # '_merge' is 'both' where the pair was found in test_pairs, 'left_only' otherwise.
    input_df['in_test'] = (matched['_merge'] == 'both').values
else:
    # No predicted pairs at all: nothing can be in the test set.
    input_df['in_test'] = False

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=7780864.0), HTML(value='')))




In [10]:
# Keep only the events flagged as belonging to the test set.
test_df = input_df[input_df['in_test'].eq(True)]

# A. Ordered recommendations for a user from the test set

In [11]:
# Number of predictions per user, most-predicted users first.
predictions_for_each_user = preds['user_id'].value_counts()
predictions_for_each_user

557616099    380
557956487    256
352394658    250
510369366    191
541975884    181
            ... 
536232573      1
563891981      1
562280433      1
578019071      1
465577979      1
Name: user_id, Length: 128624, dtype: int64

In [12]:
# Take one sample user; random_state=0 makes the pick reproducible.
# The result is a one-element Series: index = user_id, value = number of predictions.
sample = predictions_for_each_user.sample(n=1, random_state=0)
sample

463654656    15
Name: user_id, dtype: int64

In [13]:
# Rank the sampled user's predictions by p_conversion and keep the ten highest.
sample_user_id = sample.index[0]
user_preds = preds[preds['user_id'] == sample_user_id]
ranked_user_preds = user_preds.sort_values(by=['p_conversion'], ascending=False)
recommendations = ranked_user_preds.head(10)['product_id'].values
print(f"We should recommend the following products to the user:\n{recommendations}")

We should recommend the following products to the user:
[5653177 5820720   59973 5867073 5683376 5879134 5820717 5723471 5879120
 5775814]


# B. Minimalistic prediction of the number of items needed to be in stock for the near future

In [14]:
# Predictions strictly above POSITIVE_ABOVE count as predicted conversions.
is_predicted_conversion = preds['p_conversion'] > POSITIVE_ABOVE
item_conversion_predictions = preds[is_predicted_conversion]
# Per product: how many predicted conversions, i.e. how many items we expect to need.
stock_needed = item_conversion_predictions['product_id'].value_counts()
stock_needed

5854897    202
5802432    196
5809910    189
5815662    151
5700037    144
          ... 
5876996      1
5676290      1
5600447      1
5864638      1
5900289      1
Name: product_id, Length: 14498, dtype: int64

### What we predict

In [15]:
# Pick one product from stock_needed reproducibly (random_state=0).
# The result is a one-element Series: index = product_id, value = predicted stock need.
samples = stock_needed.sample(n=1, random_state=0)
samples

5900594    2
Name: product_id, dtype: int64

In [16]:
# Show the predicted conversions for the sampled product.
sampled_product_mask = item_conversion_predictions['product_id'] == samples.index[0]
item_conversion_predictions[sampled_product_mask]

Unnamed: 0,user_id,product_id,p_conversion
102071,564697246,5900594,0.6464
336594,470892422,5900594,0.6929


### What is the ground truth from the CSV?

In [17]:
# Ground truth: the purchase events among the test-set rows of the CSV export.
is_purchase = test_df['event_type'] == 'purchase'
gt_puchases = test_df[is_purchase]
# Actual purchases of the sampled product, to compare with the predictions above.
gt_puchases[gt_puchases['product_id'] == samples.index[0]]

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,in_test
3790298,2019-11-03 12:48:56 UTC,purchase,5900594,1487580007675986893,,freedecor,0.79,564697246,44d48704-24c6-4ab1-da23-65acddeaacfb,True
7592608,2019-11-29 18:54:02 UTC,purchase,5900594,1487580007675986893,,freedecor,0.67,470892422,6010c118-6483-41da-8ed5-df805022182d,True


## For n samples

In [18]:
def stock_prediction(samples,minimalism=0):
    """Compare predicted and actual purchase counts for the sampled products.

    Reads the notebook globals `preds`, `gt_puchases` and `POSITIVE_ABOVE`.

    Parameters
    ----------
    samples : pandas.Series
        Only its index is used: the product_ids to evaluate.
    minimalism : float, default 0
        Must lie strictly inside (-1, 1). The conversion threshold becomes
        POSITIVE_ABOVE + (1 - POSITIVE_ABOVE) * minimalism, so 0 keeps
        POSITIVE_ABOVE unchanged and larger values require a higher
        p_conversion before a prediction counts as a needed item.

    Returns
    -------
    numpy.ndarray
        Predicted stock need per product, in the order of `samples.index`.

    Raises
    ------
    AssertionError
        If `minimalism` is outside (-1, 1).

    Also prints the predicted and ground-truth totals, and how many products
    were predicted exactly, under-predicted and over-predicted.
    """
    # Both bounds are exclusive, and both messages now state the range actually enforced.
    assert minimalism < 1, "The minimalism must be in (-1,1) range."
    assert minimalism > -1, "The minimalism must be in (-1,1) range."
    adjusted_p = POSITIVE_ABOVE + (1-POSITIVE_ABOVE)*minimalism

    # Count rows per product once, instead of filtering the whole frame per sample.
    confident_preds = preds[preds.p_conversion>adjusted_p]
    predicted_counts = confident_preds.product_id.value_counts().to_dict()
    purchase_counts = gt_puchases.product_id.value_counts().to_dict()

    product_ids = list(samples.index)
    # Products absent from a count table have zero matching rows.
    predicted_stock_need = np.array(
        [int(predicted_counts.get(product_id, 0)) for product_id in product_ids], dtype=int)
    ground_truth_stock_need = np.array(
        [int(purchase_counts.get(product_id, 0)) for product_id in product_ids], dtype=int)

    print(f"We predicted the need for {predicted_stock_need.sum()} items")
    print(f"The ground truth is {ground_truth_stock_need.sum()} items")

    # Classify each product by comparing ground truth against the prediction.
    perfect_match = int((ground_truth_stock_need == predicted_stock_need).sum())
    under_predicted = int((ground_truth_stock_need > predicted_stock_need).sum())
    over_predicted = int((ground_truth_stock_need < predicted_stock_need).sum())
    print(f"We had {perfect_match} perfect predictions, {under_predicted} were under predicted, and {over_predicted} were over predicted")
    return predicted_stock_need

In [19]:
# Evaluate on 10 products drawn from stock_needed (random_state=0),
# using the default minimalism of 0.
n = 10
samples = stock_needed.sample(n=n, random_state=0)
stock_to_get = stock_prediction(samples)

We predicted the need for 20 items
The ground truth is 25 items
We had 4 perfect predictions, 4 were under predicted, and 2 were over predicted


In [20]:
# Same evaluation on 1000 products, sampled with the project SEED,
# using the default minimalism of 0.
n = 1000
samples = stock_needed.sample(n=n, random_state=SEED)
stock_to_get = stock_prediction(samples)

We predicted the need for 2853 items
The ground truth is 4840 items
We had 243 perfect predictions, 535 were under predicted, and 222 were over predicted


## With minimalism

In [21]:
# With minimalism of 0.25: same 10-product sample as before (random_state=0),
# but with the conversion threshold raised by stock_prediction.
n = 10
samples = stock_needed.sample(n=n, random_state=0)
stock_to_get = stock_prediction(samples,0.25)

We predicted the need for 11 items
The ground truth is 25 items
We had 3 perfect predictions, 7 were under predicted, and 0 were over predicted


In [22]:
# With minimalism of 0.5: same 1000-product sample as before (random_state=SEED).
n = 1000
samples = stock_needed.sample(n=n, random_state=SEED)
stock_to_get = stock_prediction(samples,0.5)

We predicted the need for 890 items
The ground truth is 4840 items
We had 182 perfect predictions, 770 were under predicted, and 48 were over predicted
