# Local Evaluations

Sources: https://www.kaggle.com/frankherfert/local-validation-with-detailed-product-comparison

In [3]:
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 200) # use this to display more content for each column

## Load the Data

In [4]:
#prior dataset
op_prior = pd.read_csv('../input/order_products__prior.csv', engine='c', 
                       dtype={'order_id': np.int32, 
                              'product_id': np.int32, 
                              'add_to_cart_order': np.int16, 
                              'reordered': np.int8})

print('Total ordered products(prior): {}'.format(op_prior.shape[0]))
op_prior.head()

Total ordered products(prior): 32434489


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [5]:
# train
op_train = pd.read_csv('../input/order_products__train.csv', engine='c', 
                       dtype={'order_id': np.int32, 
                              'product_id': np.int32, 
                              'add_to_cart_order': np.int16, 
                              'reordered': np.int8})

print('Total ordered products(train): {}'.format(op_train.shape[0]))
op_train.head()

Total ordered products(train): 1384617


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [6]:
# orders
orders = pd.read_csv('../input/orders.csv', engine='c', dtype={'order_id': np.int32, 
                                                           'user_id': np.int32, 
                                                           'order_number': np.int32, 
                                                           'order_dow': np.int8, 
                                                           'order_hour_of_day': np.int8, 
                                                           'days_since_prior_order': np.float16})
print('Total orders: {}'.format(orders.shape[0]))
print(orders.info())
orders.head()

Total orders: 3421083
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
order_id                  int32
user_id                   int32
eval_set                  object
order_number              int32
order_dow                 int8
order_hour_of_day         int8
days_since_prior_order    float16
dtypes: float16(1), int32(3), int8(2), object(1)
memory usage: 78.3+ MB
None


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [7]:
# combine order details for prior

prior_order_details = pd.merge(op_prior, orders, on = 'order_id', how = 'left')
print(prior_order_details.head())
print(prior_order_details.dtypes)

   order_id  product_id  add_to_cart_order  reordered  user_id eval_set  \
0         2       33120                  1          1   202279    prior   
1         2       28985                  2          1   202279    prior   
2         2        9327                  3          0   202279    prior   
3         2       45918                  4          1   202279    prior   
4         2       30035                  5          0   202279    prior   

   order_number  order_dow  order_hour_of_day  days_since_prior_order  
0             3          5                  9                     8.0  
1             3          5                  9                     8.0  
2             3          5                  9                     8.0  
3             3          5                  9                     8.0  
4             3          5                  9                     8.0  
order_id                    int32
product_id                  int32
add_to_cart_order           int16
reordered      

In [20]:
def calculate_f1_score(row):
    """
    pred: list of product ids
    true: list of product ids
    """
    pred = row['products_pred'].split()
    true = row['products_true'].split()
    
    pred = set(pred)
    true = set(true)
    correct_predictions = pred.intersection(true)
    try:
        precision = len(correct_predictions)/len(pred)
        recall = len(correct_predictions)/len(true)

        f_score = 2*precision*recall/(precision + recall)
        return f_score
    except ZeroDivisionError:  
        return 0
    

In [31]:
# transform the training set to the output format

train = pd.DataFrame(op_train.groupby('order_id')["product_id"].apply(lambda x: ' '.join([str(e) for e in set(x)])).reset_index().rename(columns={'product_id':'products_true'}))

train.head()

Unnamed: 0,order_id,products_true
0,1,11109 10246 47209 43633 49683 22035 49302 13176
1,36,34497 46979 48679 19660 43086 49235 39612 46620
2,38,42625 23622 11913 28842 4461 39693 18159 21616 32433
3,96,40706 24489 25610 39275 30391 20574 27966
4,98,46720 24964 4357 43654 18441 36364 34065 19731 1939 45204 790 22935 40986 8859 9373 48287 37664 27683 7461 43560 9896 20520 3880 41387 30776 25659 27966 44479 18117 329 40396 46413 34126 45007 273...


In [32]:
from sklearn.model_selection import train_test_split
df_train, df_validate = train_test_split(train, test_size = 0.2, random_state=42)

## Do something on the 

In [36]:
y_pred = df_validate['products_true'] #fake it now

df_validate = df_validate.assign(products_pred = y_pred)
df_validate.head()

Unnamed: 0,order_id,products_true,products_pred
115356,3007878,3682 18531 9092 27845 1862 17862 41004 23565 23029 47766 19895 9366 21461 43999 8988 2078 4799,3682 18531 9092 27845 1862 17862 41004 23565 23029 47766 19895 9366 21461 43999 8988 2078 4799
66774,1727889,42058 5450 9389 32047 21137 19348 11351 12218 927,42058 5450 9389 32047 21137 19348 11351 12218 927
108561,2829462,11777 23586 36835 49347 41349 18658 37959 13380 5449 21894 46990 39408 4853 22008 40826 47546 13339 16797 46654 12223,11777 23586 36835 49347 41349 18658 37959 13380 5449 21894 46990 39408 4853 22008 40826 47546 13339 16797 46654 12223
125040,3261021,10017 1090 23815 8138 41290 39180 39470 21903 49520 4920 8467 24852 47766 25272,10017 1090 23815 8138 41290 39180 39470 21903 49520 4920 8467 24852 47766 25272
66173,1711491,12409 37508 25194 5451 40749 38159 16978 31990 30391 39001 6459 11069,12409 37508 25194 5451 40749 38159 16978 31990 30391 39001 6459 11069


In [35]:
np.average(df_validate.apply(calculate_f1_score, axis=1, raw=True))

KeyError: ('products_pred', 'occurred at index 115356')