In [25]:
import numpy as np 
import pandas as pd 
import sklearn
import scipy.sparse
import lightgbm as lgb
import gc 
import matplotlib.pyplot as plt 
import warnings
from pandas import HDFStore
from sklearn.metrics import mean_squared_error
import math
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from pandas import HDFStore

warnings.filterwarnings('ignore')

from xgboost import XGBRegressor
from xgboost import plot_importance

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)



pd.set_option('display.max_rows',600)
pd.set_option('display.max_columns',50)


from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

from itertools import product 

In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df


### Loading all datasets  

In [3]:
data_path = 'C:/Code/Data Science Code Base/Kaggle Data/Predict_Future_Sales/'
sales = pd.read_csv(data_path+'sales_train_v2.csv')
shops = pd.read_csv(data_path+'shops-translated.csv')
items = pd.read_csv(data_path+'items.csv')
item_cats = pd.read_csv(data_path+'item_categories-translated.csv')
test = pd.read_csv(data_path+'test.csv')
calendar = pd.read_csv(data_path+'calendar.csv')
usd_rub = pd.read_csv(data_path+'usd-rub.csv')

In [4]:
sales.drop_duplicates(inplace=True)
shops.drop_duplicates(inplace=True)
items.drop_duplicates(inplace=True)
item_cats.drop_duplicates(inplace=True)

In [5]:
sales = sales[sales.item_cnt_day<1000]
sales = sales[sales.item_price<100000]
sales = sales[sales.item_price>0]

# Loading the encoded_train_test.dataset

In [10]:
encoded_train_test = HDFStore('encoded_train_test.h5')
all_data1 = encoded_train_test['encoded_train_test']

In [11]:
all_data2 = all_data1.copy()

### Defining function for lag features 

In [8]:
def lag_features_func(lag_features,time_range,df,non_lag_features):
    for month_shift in tqdm_notebook(time_range):
        shift_df = df[non_lag_features+lag_features].copy()
        shift_df['date_block_num'] = shift_df['date_block_num'] + month_shift
        foo = lambda x: '{}_lag_{}'.format(x,month_shift) if x in lag_features else x
        shift_df = shift_df.rename(columns=foo)
        df = pd.merge(df,shift_df,how='left',on=non_lag_features).fillna(0)
    del shift_df
    gc.collect();
    
    return df
    

##### Creating lag features 

In [12]:
lag_features = [col for col in all_data1.columns if (('target' in col.split('_'))|('encoded' in col.split('_')))]
non_lag_features = list(all_data1.columns.difference(lag_features))
time_range = [1,2,3,6,12]

In [13]:
all_data2 = lag_features_func(lag_features,time_range,all_data2,non_lag_features)
all_data2 = all_data2[all_data2.date_block_num>=12]

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




#### Adding supplementary information
* Add Month (done)
* Add number of holidays (national holidays)(done)
* Add season (optional)


In [16]:
calendar['month'] = calendar.date.apply(lambda x: int(x.split('-')[1]))
gp = calendar.groupby('month').agg({'holiday':{'holidays_in_month':'sum'}})
gp.columns = [col[0] if col[-1]=='' else col[-1] for col in gp.columns.values]
all_data2['month'] = all_data2.date_block_num.apply(lambda block: (block%12) +1)
all_data2 = pd.merge(all_data2,gp,how='left',on=['month'])
del gp
gc.collect();

### Saving to disk 

In [18]:
store = HDFStore('encoded_train_test_lag.h5')
store.put('encoded_train_test_lag',all_data2,format='table',data_columns=True)

# Modelling 
* Using Lightgbm to train models 
* validation set is the last month


### Stuff to do 
* Use mean_encodings of train and map to validation (done)
* Prepare test set(done) 
* tune model 
* Clip predictions and figure out when and how to do it 

In [21]:
dates = all_data2['date_block_num']
test_block = dates.max()
val_block = test_block -1
dates_train = dates[dates<val_block]
dates_val = dates[dates==val_block]
present_encoded = [col for col in all_data2.columns if col.split('_')[-1]=='encoded']

X_train = all_data2.loc[dates<val_block].drop(['target','target_shop','target_item']+present_encoded,axis=1)
X_val = all_data2.loc[dates== val_block].drop(['target','target_shop','target_item']+present_encoded,axis=1)
X_test = all_data2.loc[dates==test_block].drop(['target','target_shop','target_item']+present_encoded,axis=1)

y_train = all_data2.loc[dates<val_block,'target'].values
y_val = all_data2.loc[dates==val_block,'target'].values

### Trying out XGBRegressor 

In [29]:
model = XGBRegressor(
    max_depth=15,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    verbose=True, 
    early_stopping_rounds = 10)


[0]	validation_0-rmse:3.38582	validation_1-rmse:2.59711
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:3.30173	validation_1-rmse:2.51783
[2]	validation_0-rmse:3.21919	validation_1-rmse:2.44098
[3]	validation_0-rmse:3.14943	validation_1-rmse:2.38248
[4]	validation_0-rmse:3.08873	validation_1-rmse:2.32854
[5]	validation_0-rmse:3.0324	validation_1-rmse:2.27514
[6]	validation_0-rmse:2.99192	validation_1-rmse:2.23911
[7]	validation_0-rmse:2.94907	validation_1-rmse:2.20544
[8]	validation_0-rmse:2.91293	validation_1-rmse:2.17792
[9]	validation_0-rmse:2.88197	validation_1-rmse:2.1528
[10]	validation_0-rmse:2.85533	validation_1-rmse:2.13528
[11]	validation_0-rmse:2.83353	validation_1-rmse:2.12034
[12]	validation_0-rmse:2.81472	validation_1-rmse:2.11189
[13]	validation_0-rmse:2.79896	validation_1-rmse:2.10459
[14]	validation_0-rmse:2.78192	validation_1-rmse:2.0964

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=15, min_child_weight=300, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=None,
       subsample=0.8, verbosity=1)