<h3>Loading data</h3>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Relative paths of the training-side input CSV files.
# NOTE(review): the constant name is spelt "FULFILLEMENT" while the file name
# uses "fulfilment"; the name is kept because other cells reference it.
FULFILLEMENT_CENTER = 'input/fulfilment_center_info.csv'
MEAL_INFO = 'input/meal_info.csv'
TRAIN = 'input/train.csv'

In [None]:
# Relative paths of the test rows and of the sample submission file.
TEST = 'input/test.csv'
SAMPLE_SUBMISSION = 'input/sample_submission.csv'

In [None]:
# Read the training rows and the two lookup tables, in that order.
df, meal_df, fulfillement_center_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN, MEAL_INFO, FULFILLEMENT_CENTER)
)

In [None]:
# Read the test rows from the CSV file named by TEST.
test_df = pd.read_csv(TEST)

In [None]:
# Left-join the meal and fulfilment-center lookup tables onto both frames,
# keyed on 'meal_id' and 'center_id' respectively.
df = (
    df
    .merge(meal_df, how='left', on='meal_id')
    .merge(fulfillement_center_df, how='left', on='center_id')
)

test_df = (
    test_df
    .merge(meal_df, how='left', on="meal_id")
    .merge(fulfillement_center_df, how='left', on='center_id')
)

# The lookup tables are not needed once they have been merged in.
del fulfillement_center_df, meal_df

<h3>Feature engineering</h3>

In [None]:
def interaction_features(df):
    """Add three interaction columns to ``df`` in place and return the same frame.

    Columns written:
      * ``base_over_checkout`` - ``base_price`` divided by ``checkout_price``.
      * ``center_meal_id``     - ``center_id`` and ``meal_id`` joined as "<center>_<meal>".
      * ``prom+homepage``      - ``emailer_for_promotion`` and ``homepage_featured``
                                 joined the same way.
    """
    def _joined(left, right):
        # String form of two columns glued together with an underscore.
        return df[left].astype(str) + '_' + df[right].astype(str)

    df["base_over_checkout"] = df['base_price'].div(df['checkout_price'])
    df['center_meal_id'] = _joined('center_id', 'meal_id')
    df['prom+homepage'] = _joined('emailer_for_promotion', 'homepage_featured')

    return df

In [None]:
def _past_window_stat(df, keys, column, w, stat):
    """Return a per-group rolling statistic of ``column``, aligned to ``df``'s index.

    Inside every ``keys`` group the column is shifted by ``w - 1`` rows and the
    rolling statistic named by ``stat`` ('mean', 'std', 'min', 'max' or 'sum') is
    taken over ``w`` rows of the shifted values. Both steps run inside the group,
    so a window never contains rows that belong to a different group.
    """
    return df.groupby(keys)[column].transform(
        lambda series: getattr(series.shift(w - 1).rolling(w), stat)()
    )


def lagging_features(df):
    """Add lagged / rolling-window feature columns to ``df`` in place and return it.

    For each window size ``w`` in ``[2, 4, 15, 30, 52]`` the following columns are
    written (``{w}`` is the window size):

      * ``{avg,std,min,max}_past_{checkout_price,base_price}_{w}`` - statistics
        per (center_id, meal_id) group.
      * ``has_been_promoted_{w}`` / ``has_been_featured_{w}`` - rolling sums of
        ``emailer_for_promotion`` / ``homepage_featured`` per ``meal_id`` group.
      * ``{avg,std,min,max}_num_orders_lag_{w}`` - statistics of ``num_orders``
        per (center_id, meal_id) group.
      * ``num_orders_lag_{w}`` - ``num_orders`` shifted by ``w`` rows per
        (center_id, meal_id) group.

    Every rolling statistic uses a window of ``w`` rows that ends ``w - 1`` rows
    before the current row of the same group. Positions without a full window
    are NaN.

    The rolling step is evaluated inside each group. Rolling over the whole
    shifted column instead would mix neighbouring rows of unrelated groups into
    every window.

    Requires the columns ``center_id``, ``meal_id``, ``checkout_price``,
    ``base_price``, ``emailer_for_promotion``, ``homepage_featured`` and
    ``num_orders``.

    NOTE(review): shifts count rows, not calendar weeks, so the rows of each
    group are assumed to be in chronological order already - confirm upstream.
    NOTE(review): the promotion / featured sums are grouped by ``meal_id`` only,
    so their windows span rows of several centers - confirm this is intended.
    """
    windows = [2, 4, 15, 30, 52]
    pair_keys = ['center_id', 'meal_id']
    # (column-name prefix, rolling method name); order fixes the output column order.
    stats = [('avg', 'mean'), ('std', 'std'), ('min', 'min'), ('max', 'max')]

    for w in windows:
        print(f'Generating features with {w}-week time frame')

        for prefix, stat in stats:
            for price in ('checkout_price', 'base_price'):
                df[f'{prefix}_past_{price}_{w}'] = _past_window_stat(df, pair_keys, price, w, stat)

        df[f'has_been_promoted_{w}'] = _past_window_stat(df, ['meal_id'], 'emailer_for_promotion', w, 'sum')
        df[f'has_been_featured_{w}'] = _past_window_stat(df, ['meal_id'], 'homepage_featured', w, 'sum')

        for prefix, stat in stats:
            df[f'{prefix}_num_orders_lag_{w}'] = _past_window_stat(df, pair_keys, 'num_orders', w, stat)

        # Plain lag: the target value w rows earlier within the same group.
        df[f'num_orders_lag_{w}'] = df.groupby(pair_keys)['num_orders'].shift(w)

    return df

In [None]:
# Lag features for the test rows have to look back into the training history,
# so both frames are stacked (training rows first), the features are built
# once, and the result is split back into its two parts. Columns that exist in
# only one frame (e.g. a target missing from the test file) are filled with NaN
# by the concatenation.
n_train_rows = len(df)
combined_df = pd.concat([df, test_df], ignore_index=True, sort=False)
combined_df = lagging_features(combined_df)

# reset_index keeps both frames on a 0..n-1 index and returns independent copies.
df = combined_df.iloc[:n_train_rows].reset_index(drop=True)
test_df = combined_df.iloc[n_train_rows:].reset_index(drop=True)
del combined_df

In [None]:
# Add the interaction columns to both frames (interaction_features writes
# into the frame it is given and returns it).
df = interaction_features(df)
test_df = interaction_features(test_df)

<h3>Preprocessing</h3>

In [None]:
def encode_categorical_features(df):
    """Replace the categorical columns of ``df`` with integer codes, in place.

    Each of ``category``, ``cuisine``, ``center_type``, ``center_meal_id`` and
    ``prom+homepage`` is overwritten with the output of a ``LabelEncoder`` that
    is fitted on that column of this frame only. The same frame is returned.

    NOTE(review): because the encoders are fitted per call, two frames encoded
    separately only share a code mapping if they contain the same set of values.
    """
    for column in ('category', 'cuisine', 'center_type', 'center_meal_id', 'prom+homepage'):
        # A fresh encoder per column; nothing is carried over between columns.
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [None]:
# Encode train and test rows together so that both frames share one
# value -> code mapping, then split the result back into its two parts
# (training rows first).
n_train_rows = len(df)
combined_df = encode_categorical_features(
    pd.concat([df, test_df], ignore_index=True, sort=False)
)

# reset_index keeps both frames on a 0..n-1 index and returns independent copies.
df = combined_df.iloc[:n_train_rows].reset_index(drop=True)
test_df = combined_df.iloc[n_train_rows:].reset_index(drop=True)
del combined_df

<h3>Feature selection</h3>

<h3>Modelling</h3>

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error, scaled by 100.

    Returns the tuple ``('RMSLE', value, False)`` where ``value`` is
    ``100 * sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    NOTE(review): the (name, value, bool) shape looks like a custom-eval
    callback convention with a "higher is better" flag - confirm; the visible
    training cell only reads element ``[1]``.
    """
    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    mean_squared_gap = np.mean(np.power(log_gap, 2))
    score = np.sqrt(mean_squared_gap) * 100
    return 'RMSLE', score, False

In [None]:
# Five-fold time-series splitter used by the cross-validation loop.
# NOTE(review): the folds are only chronological if the rows are ordered by time - confirm.
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# Target: order counts on a log1p scale.
y = np.log1p(df['num_orders'])

# Features: every column except the row identifier and the target.
X = df.drop(columns=['id', 'num_orders'])
X_test = test_df.drop(columns=['id', 'num_orders'])

In [None]:
# Quick check: train and test feature frames must expose the same columns in
# the same order. Plain lists are compared so that a differing number of
# columns fails with the message below; an element-wise Index comparison
# raises on a length mismatch before the assertion is evaluated.
assert list(X.columns) == list(X_test.columns), 'columns are not the same'

In [None]:
# Convert the test features to a NumPy array. np.asarray accepts both a
# DataFrame and an array, so re-running this cell does not fail.
X_test = np.asarray(X_test)

In [None]:
def _predict_counts(model, features):
    """Predict with ``model`` and map the log1p-scale output back to counts >= 0."""
    return np.clip(np.expm1(model.predict(features)), 0, None)


metrics = []      # one validation score per fold
test_preds = []   # one array of test-set predictions per fold

for train_index, valid_index in tscv.split(X, y):
    # The splitter yields positional indices, so rows are selected by position
    # rather than by index label.
    X_train = X.iloc[train_index].values
    X_valid = X.iloc[valid_index].values

    y_train = y.iloc[train_index].values
    y_valid = y.iloc[valid_index].values

    gbm = xgb.XGBRegressor()

    # NOTE(review): newer xgboost releases expect eval_metric / early_stopping_rounds
    # on the estimator constructor instead of fit() - confirm against the installed version.
    gbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='mae', early_stopping_rounds=5, verbose=20)

    print("\n")
    print("Evaluating model...")
    y_pred = _predict_counts(gbm, X_valid)

    # The target was trained on a log1p scale; score on the original scale.
    y_valid = np.expm1(y_valid)
    metric = rmsle(y_valid, y_pred)[1]
    metrics.append(metric)
    print('The rmsle of prediction is:', metric)
    print('\n')

    print('Predicting test set...')
    test_preds.append(_predict_counts(gbm, X_test))
    print('\n')

print('Evaluation RMSLE: {}'.format(np.mean(metrics)))

<h3>Experiment summary</h3>

No feature engineering / baseline: **98**<br/>
Slight feature engineering: **96.8**<br/>
Slight feature engineering + log target: **68.2**<br/>
Lag feature engineering + log target: **67.1**<br/>
Enhanced feature engineering + log target: **60.14**<br/>
Final feature engineering + log target: **54.6**<br/>

In [None]:
# Stack the per-fold test predictions into one array (first axis = fold).
test_preds = np.array(test_preds)

In [None]:
# Average the predictions over the first axis, i.e. across folds.
final_preds = np.mean(test_preds, axis=0)

<h3>Inference</h3>

In [None]:
# Store the averaged predictions in the test frame's 'num_orders' column.
test_df['num_orders']  = final_preds

In [None]:
# Load the sample submission file.
# NOTE(review): no visible cell writes predictions into sub_df or saves it -
# confirm whether the export step lives elsewhere.
sub_df = pd.read_csv(SAMPLE_SUBMISSION)

In [None]:
# Show the first rows of the sample submission.
sub_df.head()

In [None]:
# Show the first rows of the test frame, including the predicted 'num_orders'.
test_df.head()