<h3>Loading data</h3>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics.scorer import make_scorer

import warnings
warnings.filterwarnings("ignore")

In [2]:
FULFILLEMENT_CENTER = 'input/fulfilment_center_info.csv'
MEAL_INFO = 'input/meal_info.csv'
TRAIN = 'input/train.csv'

In [3]:
TEST = 'input/test.csv'
SAMPLE_SUBMISSION = 'input/sample_submission.csv'

In [4]:
df = pd.read_csv(TRAIN)
meal_df = pd.read_csv(MEAL_INFO)
fulfillement_center_df = pd.read_csv(FULFILLEMENT_CENTER)

In [5]:
test_df = pd.read_csv(TEST)

In [6]:
# Merging 

tmp_df = pd.merge(df, meal_df, how='left', on='meal_id')
df = pd.merge(tmp_df, fulfillement_center_df, how='left', on='center_id')

test_df = pd.merge(test_df, meal_df, how='left', on="meal_id")
test_df = pd.merge(test_df, fulfillement_center_df, how='left', on='center_id')

del tmp_df, fulfillement_center_df, meal_df

<h3>Feature engineering</h3>

In [8]:
def interaction_features(df):
    df["base_over_checkout"] = df['base_price'] / df['checkout_price']
    df['center_meal_id'] = df['center_id'].astype(str) + '_' + df['meal_id'].astype(str)
    df['prom+homepage'] = df['emailer_for_promotion'].astype(str) + '_' + df['homepage_featured'].astype(str)
    
    return df

In [9]:
def lagging_features(df):
    
    windows = [2, 4, 15, 30, 52]
    
    for w in windows:  
        print(f'Generating features with {w}-week time frame')
        
        df[f'avg_past_checkout_price_{w}'] = df.groupby(['center_id', 'meal_id'])['checkout_price'].shift(w-1).rolling(w).mean()
        df[f'avg_past_base_price_{w}'] = df.groupby(['center_id', 'meal_id'])['base_price'].shift(w-1).rolling(w).mean()

        df[f'std_past_checkout_price_{w}'] = df.groupby(['center_id', 'meal_id'])['checkout_price'].shift(w-1).rolling(w).std()
        df[f'std_past_base_price_{w}'] = df.groupby(['center_id', 'meal_id'])['base_price'].shift(w-1).rolling(w).std()
        
        df[f'min_past_checkout_price_{w}'] = df.groupby(['center_id', 'meal_id'])['checkout_price'].shift(w-1).rolling(w).min()
        df[f'min_past_base_price_{w}'] = df.groupby(['center_id', 'meal_id'])['base_price'].shift(w-1).rolling(w).min()
        
        df[f'max_past_checkout_price_{w}'] = df.groupby(['center_id', 'meal_id'])['checkout_price'].shift(w-1).rolling(w).max()
        df[f'max_past_base_price_{w}'] = df.groupby(['center_id', 'meal_id'])['base_price'].shift(w-1).rolling(w).max()
        
        df[f'has_been_promoted_{w}'] = df.groupby(['meal_id'])['emailer_for_promotion'].shift(w-1).rolling(w).sum()
        df[f'has_been_featured_{w}'] = df.groupby(['meal_id'])['homepage_featured'].shift(w-1).rolling(w).sum()
        
        df[f'avg_num_orders_lag_{w}'] = df.groupby(['center_id', 'meal_id'])['num_orders'].shift(w-1).rolling(w).mean()
        df[f'std_num_orders_lag_{w}'] = df.groupby(['center_id', 'meal_id'])['num_orders'].shift(w-1).rolling(w).std()
        df[f'min_num_orders_lag_{w}'] = df.groupby(['center_id', 'meal_id'])['num_orders'].shift(w-1).rolling(w).min()
        df[f'max_num_orders_lag_{w}'] = df.groupby(['center_id', 'meal_id'])['num_orders'].shift(w-1).rolling(w).max()
        
        df[f'num_orders_lag_{w}'] = df.groupby(['center_id', 'meal_id'])['num_orders'].shift(w)
        
    return df

In [10]:
df = lagging_features(df)
test_df = lagging_features(df)

Generating features with 2-week time frame
Generating features with 4-week time frame
Generating features with 15-week time frame
Generating features with 30-week time frame
Generating features with 52-week time frame
Generating features with 2-week time frame
Generating features with 4-week time frame
Generating features with 15-week time frame
Generating features with 30-week time frame
Generating features with 52-week time frame


In [11]:
df = interaction_features(df)
test_df = interaction_features(test_df)

<h3>Preprocessing</h3>

In [12]:
def encode_categorical_features(df):
    cat_var = ['category', 'cuisine', 'center_type', 'center_meal_id', 'prom+homepage']

    le = LabelEncoder()

    for c in cat_var:
        df[c] = le.fit_transform(df[c])
        
    return df

In [13]:
df = encode_categorical_features(df)
test_df = encode_categorical_features(df)

<h3>Modelling</h3>

In [14]:
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))) * 100, False

In [15]:
tscv = TimeSeriesSplit(n_splits=5)

In [16]:
y = df['num_orders']
y = np.log1p(y)
X = df.drop(['id', 'num_orders'], axis=1)

X_test = test_df.drop(['id', 'num_orders'], axis=1)

In [17]:
# Quich check

assert((X.columns == X_test.columns).all()), 'columns are not the same'

In [18]:
X_test = X_test.values

In [19]:
metrics = []
test_preds = []

for train_index, valid_index in tscv.split(X, y):
    X_train = X.loc[train_index].values
    X_valid = X.loc[valid_index].values
    
    y_train = y.loc[train_index].values
    y_valid = y.loc[valid_index].values
        
    gbm = xgb.XGBRegressor()
    
    gbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='mae', early_stopping_rounds=5, verbose=20)
    
    print("\n")
    print("Evaluating model...")
    y_pred = gbm.predict(X_valid)
    y_pred = np.expm1(y_pred)
    y_pred[y_pred < 0] = 0
    
    y_valid = np.expm1(y_valid)
    metric = rmsle(y_valid, y_pred)[1]
    metrics.append(metric)
    print('The rmsle of prediction is:', metric)
    print('\n')
    
    print('Predicting test set...')
    y_pred = gbm.predict(X_test)
    y_pred = np.expm1(y_pred)
    y_pred[y_pred < 0] = 0
    test_preds.append(y_pred)
    print('\n')

print('Evaluation RMLSE: {}'.format(np.mean(np.array(metrics))))

[0]	validation_0-mae:3.97436
Will train until validation_0-mae hasn't improved in 5 rounds.
[20]	validation_0-mae:0.670372
[40]	validation_0-mae:0.45175
[60]	validation_0-mae:0.433287
[80]	validation_0-mae:0.428621
[99]	validation_0-mae:0.425338


Evaluating model...
The rmsle of prediction is: 55.43975597802756


Predicting test set...


[0]	validation_0-mae:3.95847
Will train until validation_0-mae hasn't improved in 5 rounds.
[20]	validation_0-mae:0.676971
[40]	validation_0-mae:0.459891
[60]	validation_0-mae:0.444933
Stopping. Best iteration:
[67]	validation_0-mae:0.443019



Evaluating model...
The rmsle of prediction is: 57.116165372941786


Predicting test set...


[0]	validation_0-mae:3.93523
Will train until validation_0-mae hasn't improved in 5 rounds.
[20]	validation_0-mae:0.637794
[40]	validation_0-mae:0.425568
[60]	validation_0-mae:0.414021
[80]	validation_0-mae:0.409688
[99]	validation_0-mae:0.406908


Evaluating model...
The rmsle of prediction is: 52.79292723186712


Pre

<h3>Experiment summary</h3>

No feature engineering / baseline: **98**<br/>
Slight feature engineering: **96.8**<br/>
Slight feature engineering + log target: **68.2**<br/>
Lag feature engineering + log target: **67.1**<br/>
Enhanced feature engineering + log target: **60.14**<br/>
Final feature engineering + log target: **54.6**<br/>

In [20]:
test_preds = np.array(test_preds)

In [21]:
final_preds = np.mean(test_preds, axis=0)

<h3>Inference</h3>

In [37]:
test_df['num_orders']  = final_preds

In [38]:
sub_df = pd.read_csv(SAMPLE_SUBMISSION)

In [39]:
sub_df.head()

Unnamed: 0,id,num_orders
0,1028232,0
1,1127204,0
2,1212707,0
3,1082698,0
4,1400926,0


In [43]:
test_df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,...,has_been_promoted_52,has_been_featured_52,avg_num_orders_lag_52,std_num_orders_lag_52,min_num_orders_lag_52,max_num_orders_lag_52,num_orders_lag_52,base_over_checkout,center_meal_id,prom+homepage
0,1379560,1,55,1885,136.83,152.29,0,0,170.510544,0,...,,,,,,,,1.112987,2335,0
1,1466964,1,55,1993,136.83,135.83,0,0,170.693207,0,...,,,,,,,,0.992692,2339,0
2,1346989,1,55,2539,134.86,135.86,0,0,168.402191,0,...,,,,,,,,1.007415,2349,0
3,1338232,1,55,2139,339.5,437.53,0,0,52.458019,0,...,,,,,,,,1.288748,2341,0
4,1448490,1,55,2631,243.5,242.5,0,0,90.82869,0,...,,,,,,,,0.995893,2353,0
