In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import time

In [2]:
df = pd.read_csv('dataset/train/1_train.csv', parse_dates=['date'])
df.drop(df.columns[[2, 3, 5]], axis=1, inplace=True)

In [3]:
df.head(5)

Unnamed: 0,date,Balance.Cost,Withdrawals,uang_idle
0,2018-01-01,32367,78700000,45304
1,2018-01-02,65392,112900000,32367
2,2018-01-03,47523,108700000,65392
3,2018-01-04,30970,100700000,47523
4,2018-01-05,62901,127750000,30970


In [4]:
test_df = pd.read_csv('dataset_backup/data_input/atm_test.csv')
test_df.head(5)

Unnamed: 0,no. ATM;date
0,K1;25/03/2018
1,K1;26/03/2018
2,K1;27/03/2018
3,K1;28/03/2018
4,K1;29/03/2018


# Add Features

In [25]:
df['dayofmonth'] = df.date.dt.day
df['dayofyear'] = df.date.dt.dayofyear
df['dayofweek'] = df.date.dt.dayofweek
df['month'] = df.date.dt.month
df['year'] = df.date.dt.year
df['weekofyear'] = df.date.dt.weekofyear
df['is_month_start'] = (df.date.dt.is_month_start).astype(int)
df['is_month_end'] = (df.date.dt.is_month_end).astype(int)
df.head(60)

Unnamed: 0,date,Balance.Cost,Withdrawals,uang_idle,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2018-01-01,32367,18.181154,45304,1,1,0,1,2018,1,1,0
1,2018-01-02,65392,18.542013,32367,2,2,1,1,2018,1,0,0
2,2018-01-03,47523,18.504102,65392,3,3,2,1,2018,1,0,0
3,2018-01-04,30970,18.427656,47523,4,4,3,1,2018,1,0,0
4,2018-01-05,62901,18.665586,30970,5,5,4,1,2018,1,0,0
5,2018-01-06,45805,18.459901,62901,6,6,5,1,2018,1,0,0
6,2018-01-07,34455,18.050341,45805,7,7,6,1,2018,1,0,0
7,2018-01-08,62671,18.720415,34455,8,8,0,1,2018,2,0,0
8,2018-01-09,45732,18.450725,62671,9,9,1,1,2018,2,0,0
9,2018-01-10,27625,18.517354,45732,10,10,2,1,2018,2,0,0


## One Hot Encoder

In [6]:
def one_hot_encoder(df, ohe_cols=['store','item','dayofmonth','dayofweek','month','weekofyear']):
    '''
    One-Hot Encoder function
    '''
    print('Creating OHE features..\nOld df shape:{}'.format(df.shape))
    df = pd.get_dummies(df, columns=ohe_cols)
    print('New df shape:{}'.format(df.shape))
    return df

# Normalization

In [7]:
df.head(2)

Unnamed: 0,date,Balance.Cost,Withdrawals,uang_idle,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2018-01-01,32367,78700000,45304,1,1,0,1,2018,1,1,0
1,2018-01-02,65392,112900000,32367,2,2,1,1,2018,1,0,0


In [8]:
df['Withdrawals'] = np.log1p(df.Withdrawals.values)
df.head(2)

Unnamed: 0,date,Balance.Cost,Withdrawals,uang_idle,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2018-01-01,32367,18.181154,45304,1,1,0,1,2018,1,1,0
1,2018-01-02,65392,18.542013,32367,2,2,1,1,2018,1,0,0


In [9]:
df.head(2)

Unnamed: 0,date,Balance.Cost,Withdrawals,uang_idle,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2018-01-01,32367,18.181154,45304,1,1,0,1,2018,1,1,0
1,2018-01-02,65392,18.542013,32367,2,2,1,1,2018,1,0,0


In [10]:
# X = df.iloc[:, [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
X = df.iloc[:, [4, 5, 6, 7, 8, 9, 10, 11]]
y = df.iloc[:, 2]

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1
)

In [12]:
X = one_hot_encoder(X, ohe_cols=['dayofweek','month'])
X.head(5)

Creating OHE features..
Old df shape:(83, 8)
New df shape:(83, 16)


Unnamed: 0,dayofmonth,dayofyear,year,weekofyear,is_month_start,is_month_end,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,month_1,month_2,month_3
0,1,1,2018,1,1,0,1,0,0,0,0,0,0,1,0,0
1,2,2,2018,1,0,0,0,1,0,0,0,0,0,1,0,0
2,3,3,2018,1,0,0,0,0,1,0,0,0,0,1,0,0
3,4,4,2018,1,0,0,0,0,0,1,0,0,0,1,0,0
4,5,5,2018,1,0,0,0,0,0,0,1,0,0,1,0,0


In [13]:
avoid_cols = ['date', 'Withdrawals', 'year']
cols = [col for col in X.columns if col not in avoid_cols]
print('No of training features: {} \nAnd they are:{}'.format(len(cols), cols))

No of training features: 15 
And they are:['dayofmonth', 'dayofyear', 'weekofyear', 'is_month_start', 'is_month_end', 'dayofweek_0', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'month_1', 'month_2', 'month_3']


In [14]:
def smape(preds, target):
    '''
    Function to calculate SMAPE
    '''
    n = len(preds)
    masked_arr = ~((preds==0)&(target==0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds-target)
    denom = np.abs(preds)+np.abs(target)
    smape_val = (200*np.sum(num/denom))/n
    return smape_val

def lgbm_smape(preds, train_data):
    '''
    Custom Evaluation Function for LGBM
    '''
    labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(labels))
    return 'SMAPE', smape_val, False

In [15]:
# LightGBM parameters
lgb_params = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'mae'}, 'num_leaves': 10, 'learning_rate': 0.02, 
              'feature_fraction': 0.8, 'max_depth': 5, 'verbose': 0, 
              'num_boost_round':15000, 'early_stopping_rounds':200, 'nthread':-1}

In [16]:
lgbtrain = lgb.Dataset(data=X_train.loc[:,cols].values, label=y_train, 
                       feature_name=cols)
lgbval = lgb.Dataset(data=X_val.loc[:,cols].values, label=y_val, 
                     reference=lgbtrain, feature_name=cols)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [17]:
def lgb_validation(params, lgbtrain, lgbval, X_val, Y_val, verbose_eval):
    t0 = time.time()
    evals_result = {}
    model = lgb.train(params, lgbtrain, num_boost_round=params['num_boost_round'], 
                      valid_sets=[lgbtrain, lgbval], feval=lgbm_smape, 
                      early_stopping_rounds=params['early_stopping_rounds'], 
                      evals_result=evals_result, verbose_eval=verbose_eval)
    print(model.best_iteration)
    print('Total time taken to build the model: ', (time.time()-t0)/60, 'minutes!!')
    pred_Y_val = model.predict(X_val, num_iteration=model.best_iteration)
    pred_Y_val = np.expm1(pred_Y_val)
    Y_val = np.expm1(Y_val)
    val_df = pd.DataFrame(columns=['true_Y_val','pred_Y_val'])
    val_df['pred_Y_val'] = pred_Y_val
    val_df['true_Y_val'] = Y_val
    print(val_df.shape)
    print(val_df.sample(5))
    print('SMAPE for validation data is:{}'.format(smape(pred_Y_val, Y_val)))
    return model, val_df

In [18]:
# Training lightgbm model and validating
model, val_df = lgb_validation(lgb_params, lgbtrain, lgbval, X_val.loc[:,cols].values, 
                               y_val, verbose_eval=500)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[15]	training's l1: 0.211217	training's SMAPE: 20.7837	valid_1's l1: 0.222603	valid_1's SMAPE: 22.1019
15
Total time taken to build the model:  0.011211645603179932 minutes!!
(9, 2)
   true_Y_val    pred_Y_val
8         NaN  1.072436e+08
0         NaN  1.072436e+08
2         NaN  1.123721e+08
3         NaN  1.118764e+08
6         NaN  1.100921e+08
SMAPE for validation data is:22.101927493407512


In [19]:
# Let's see top 25 features as identified by the lightgbm model.
print("Features importance...")
gain = model.feature_importance('gain')
feat_imp = pd.DataFrame({'feature':model.feature_name(), 
                         'split':model.feature_importance('split'), 
                         'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print('Top 25 features:\n', feat_imp.head(25))

Features importance...
Top 25 features:
            feature  split       gain
0       dayofmonth     18  70.390024
1        dayofyear     12  29.609976
2       weekofyear      0   0.000000
3   is_month_start      0   0.000000
4     is_month_end      0   0.000000
5      dayofweek_0      0   0.000000
6      dayofweek_1      0   0.000000
7      dayofweek_2      0   0.000000
8      dayofweek_3      0   0.000000
9      dayofweek_4      0   0.000000
10     dayofweek_5      0   0.000000
11     dayofweek_6      0   0.000000
12         month_1      0   0.000000
13         month_2      0   0.000000
14         month_3      0   0.000000


In [20]:
X_train

Unnamed: 0,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
7,8,8,0,1,2018,2,0,0
18,19,19,4,1,2018,3,0,0
45,15,46,3,2,2018,7,0,0
10,11,11,3,1,2018,2,0,0
21,22,22,0,1,2018,4,0,0
39,9,40,4,2,2018,6,0,0
47,17,48,5,2,2018,7,0,0
79,21,80,2,3,2018,12,0,0
61,3,62,5,3,2018,9,0,0
6,7,7,6,1,2018,1,0,0


In [21]:
def lgb_train(params, lgbtrain_all, X_test, num_round):
    t0 = time.time()
    model = lgb.train(params, lgbtrain_all, num_boost_round=num_round, feval=lgbm_smape)
    test_preds = model.predict(X_test, num_iteration=num_round)
    print('Total time taken in model training: ', (time.time()-t0)/60, 'minutes!')
    return model, test_preds

In [22]:
# # Training lgb model on whole data(train+val)
# lgb_model, test_preds = lgb_train(lgb_params, lgbtrain_all, test.loc[:,cols].values, model.best_iteration)
# print('test_preds shape:{}'.format(test_preds.shape))

NameError: name 'lgbtrain_all' is not defined