# Feature Engineering and train, dev, val split

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split 
from scipy.stats import mode
from sklearn.metrics import mean_squared_error 

%matplotlib inline
import gc #garbage collector interface

import warnings 
warnings.filterwarnings('ignore')

## Feature Engineering on the Historical and New Merchants Datasets

read Historical Transactions dataset

In [None]:
hist_df = pd.read_csv("data/historical_transactions.csv")
hist_df.head()

In [None]:
hist_df.columns

read new merchants dataset

In [None]:
new_merchant_df = pd.read_csv("data/new_merchant_transactions.csv")
new_merchant_df.head()

In [None]:
new_merchant_df.columns

define simple function to compute the [mode](https://en.wikipedia.org/wiki/Mode_(statistics)) of an array-like structure. The default scipy function returns both the value and the occurance absolute frequence.

In [None]:
def mod(arr):
    return mode(arr)[0][0]

We want to aggregate different function to different column of the dataframe. We construct three dictionaries with the following structure:
```python
{col_to_be_applied: {result_col_name: function}}
```


In [None]:
aggregationDictHist = {'card_id':{'hist_transactions_count':'count'},
        'merchant_category_id':{'hist_most_frequent_merchant_cat':mod},
                       'subsector_id':{'hist_most_frequent_subsector':mod},
        'city_id':{'hist_most_frequent_city':mod}, 'state_id':{'hist_most_frequent_state':mod},
        'month_lag':{'hist_min_month_lag':'min'},'purchase_amount':{'hist_max_purchase_amount':'max'}}

In [None]:
aggregationDictNew = {'card_id':{'new_transactions_count':'count'},
        'merchant_category_id':{'new_most_frequent_merchant_cat':mod},
                       'subsector_id':{'new_most_frequent_subsector':mod},
        'city_id':{'new_most_frequent_city':mod}, 'state_id':{'new_most_frequent_state':mod},
        'month_lag':{'new_min_month_lag':'min'},'purchase_amount':{'new_max_purchase_amount':'max'}}

apply aggregated functions to historical df

In [None]:
hist_grouped = hist_df[['card_id','merchant_category_id','subsector_id','city_id',
                'state_id','month_lag','purchase_amount']].groupby('card_id').agg(aggregationDictHist)
hist_grouped.columns = hist_grouped.columns.droplevel(0)

In [None]:
hist_grouped.head()

In [None]:
#in hist dataset
totalH = hist_grouped.isnull().sum().sort_values(ascending = False)
percentH = (hist_grouped.isnull().sum()/hist_grouped.isnull().count()*100).sort_values(ascending = False)
missing_data_h  = pd.concat([totalH, percentH], axis=1, keys=['Total', 'Percent']) # axis = 1 because it concatenates 2 columns, not rows
missing_data_h.head(2)

Apply aggregated functions in the new merchants dataset:

In [None]:
new_merchant_grouped = new_merchant_df[['card_id','merchant_category_id','subsector_id','city_id',
                'state_id','month_lag','purchase_amount']].groupby('card_id').agg(aggregationDictNew)
new_merchant_grouped.columns = new_merchant_grouped.columns.droplevel(0)

In [None]:
new_merchant_grouped.head()

In [None]:
totalN = new_merchant_grouped.isnull().sum().sort_values(ascending = False)
percentN = (new_merchant_grouped.isnull().sum()/new_merchant_grouped.isnull().count()*100).sort_values(ascending = False)
missing_data_n  = pd.concat([totalN, percentN], axis=1, keys=['Total', 'Percent']) # axis = 1 because it concatenates 2 columns, not rows
missing_data_n.head(2)

## Train-val-dev split

We want to split the training set into train, dev, val, following a 60-20-20 splitting

In [None]:
train_df = pd.read_csv(os.path.join("data","train.csv"), 
                       parse_dates = ["first_active_month"]) # parse_dates param in order to have a correct date format

In [None]:
train_df.head(3)

In [None]:
test_df = pd.read_csv("data/test.csv", parse_dates = ["first_active_month"])

In [None]:
len(train_df)

merge the hist and new merchants features to the train_df

In [None]:
train_df_feats = hist_grouped.reset_index().merge(new_merchant_grouped.reset_index(),
                                                 on='card_id').merge(train_df,on='card_id',how='right')

In [None]:
train_df_feats.head()

In [None]:
len(train_df_feats)

In [None]:
train_df_feats.columns

In [None]:
rest_df, val_df_feats = train_test_split(train_df_feats, test_size=0.2, random_state=23)

train_df_feats_final, dev_df_feats = train_test_split(rest_df, test_size=0.2, random_state=23)

In [None]:
test_df_feats.head()

In [None]:
print('length of training set: ', len(train_df_feats_final))
print('length of validation set: ', len(val_df_feats))
print('length of development set: ', len(dev_df_feats))
print('sum of the above: ',len(train_df_feats_final) + len(val_df_feats) + len(dev_df_feats))
print('length of previous trainingset: ',len(train_df))

In [None]:
train_df_feats_final.to_csv(os.path.join('data','train_feats.csv'),index=False)
val_df_feats.to_csv(os.path.join('data','val_feats.csv'),index=False)
dev_df_feats.to_csv(os.path.join('data','dev_feats.csv'),index=False)

In [None]:
test_df.head()

In [None]:
len(test_df)

merge the hist and new merchants features to the train_df

In [None]:
test_df_feats = hist_grouped.reset_index().merge(new_merchant_grouped.reset_index(),
                                                 on='card_id').merge(test_df,on='card_id',how='right')

In [None]:
test_df_feats.head()

In [None]:
len(test_df_feats)

In [None]:
test_df_feats.to_csv(os.path.join('data','test_feats.csv'),index=False)

<b> Applying LGBM for the newly created datasets </b>

In [2]:
import lightgbm as lgb

train_df = pd.read_csv('data/train_feats.csv', parse_dates = ["first_active_month"])
#train_df.head()

dev_df = pd.read_csv('data/dev_feats.csv', parse_dates = ["first_active_month"])
#dev_df.head()

test_df = pd.read_csv('data/val_feats.csv', parse_dates = ["first_active_month"])
#test_df.head()


used_cols = [col for col in train_df.columns if col not in ['card_id'] ]

# we'll remove for the moment the card_id column
train_df = train_df[used_cols]
dev_df = dev_df[used_cols]
test_df = test_df[used_cols]

train_df.head(10)

Unnamed: 0,hist_transactions_count,hist_most_frequent_merchant_cat,hist_most_frequent_subsector,hist_most_frequent_city,hist_most_frequent_state,hist_min_month_lag,hist_max_purchase_amount,new_transactions_count,new_most_frequent_merchant_cat,new_most_frequent_subsector,new_most_frequent_city,new_most_frequent_state,new_min_month_lag,new_max_purchase_amount,first_active_month,feature_1,feature_2,feature_3,target
0,15.0,705.0,33.0,322.0,11.0,-8.0,3.400409,3.0,879.0,29.0,-1.0,-1.0,1.0,-0.173481,2017-06-01,2,3,0,-1.433714
1,12.0,307.0,1.0,69.0,9.0,-4.0,6.060808,8.0,307.0,19.0,19.0,9.0,1.0,11.259275,2014-10-01,5,1,1,-4.474933
2,77.0,705.0,33.0,302.0,7.0,-13.0,5.963316,4.0,307.0,19.0,38.0,7.0,1.0,-0.660565,2015-12-01,5,1,1,2.156978
3,131.0,307.0,19.0,173.0,9.0,-13.0,0.754692,4.0,278.0,37.0,140.0,9.0,1.0,11.559805,2016-11-01,5,1,1,-3.134993
4,131.0,705.0,29.0,213.0,9.0,-5.0,2.645327,12.0,367.0,16.0,213.0,9.0,1.0,-0.326316,2017-09-01,5,1,1,-2.339241
5,62.0,705.0,33.0,286.0,3.0,-7.0,-0.256442,3.0,705.0,33.0,286.0,3.0,1.0,-0.624036,2017-05-01,2,1,0,-0.044707
6,,,,,,,,,,,,,,,2016-09-01,5,1,1,-0.337872
7,31.0,528.0,37.0,69.0,9.0,-3.0,-0.641722,6.0,705.0,37.0,69.0,9.0,1.0,-0.701979,2016-03-01,3,1,1,2.066695
8,16.0,409.0,29.0,-1.0,-1.0,-13.0,-0.473365,1.0,690.0,1.0,179.0,-1.0,2.0,-0.724383,2016-08-01,2,2,0,-0.236893
9,53.0,705.0,33.0,123.0,19.0,-6.0,0.19823,8.0,222.0,21.0,123.0,19.0,1.0,-0.662278,2017-07-01,3,1,1,-2.05982


In [3]:
# transform the first_activ_month feature in : number of months until 01.01.2019
import datetime as dt

reference_date = dt.datetime.strptime('Jan 1 2019', '%b %d %Y').date()
def months_between_dates(date2):
    return reference_date.month - date2.month + 12*(reference_date.year - date2.year)


train_df['first_active_month'] = train_df['first_active_month'].apply(months_between_dates)
dev_df['first_active_month'] = dev_df['first_active_month'].apply(months_between_dates)
test_df['first_active_month'] = test_df['first_active_month'].apply(months_between_dates)



In [4]:
# converting our training data into LightGBM dataset format
d_train = lgb.Dataset(train_df, label = train_df['target'])

param_used = {"max_depth": [25,50,75,100],
              "learning_rate" : [0.005,0.003,0.001,0.05,0.03,0.01,0.1],
              "num_leaves": [100,300,900,1200],
             }

params = {}
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'mse'
params['num_leaves'] = 100
params['max_depth'] = 75
params['learnin_rate'] = 0.001

# test_params(param_used, d_train)

model = lgb.train(params, d_train, 1000)

#prediction

target_prediction = model.predict(dev_df) 

#results

RMSE = np.sqrt(mean_squared_error(target_prediction, dev_df['target']))

RMSE

0.17752901161101534

In [5]:
#prediction

target_prediction = model.predict(test_df) 

#results

RMSE = np.sqrt(mean_squared_error(target_prediction, test_df['target']))

RMSE

0.175378621192175

In [14]:
# created a loop to see which one is the best combinations of param
num_iterations = 1000
rmse_list = []
Max_depth = 25
Learning_rate = 0.005
Num_leaves = 100

RMSE = 0.1724823085666873
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.005
Num_leaves = 300

RMSE = 0.17291522313013324
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.005
Num_leaves = 900

RMSE = 0.17286935946569895
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.005
Num_leaves = 1200

RMSE = 0.17330056722600648
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.003
Num_leaves = 100

RMSE = 0.2544207508484014
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.003
Num_leaves = 300

RMSE = 0.2544285208612549
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.003
Num_leaves = 900

RMSE = 0.25446421757176785
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.003
Num_leaves = 1200

RMSE = 0.2544684983213409
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.001
Num_leaves = 100

RMSE = 1.398686341957468
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.001
Num_leaves = 300

RMSE = 1.398609835004237
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.001
Num_leaves = 900

RMSE = 1.3985994658784973
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.001
Num_leaves = 1200

RMSE = 1.3985982007968796
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.05
Num_leaves = 100

RMSE = 0.17402145757005041
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.05
Num_leaves = 300

RMSE = 0.1765721688619503
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.05
Num_leaves = 900

RMSE = 0.17577398836227243
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.05
Num_leaves = 1200

RMSE = 0.17675849931089524
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.03
Num_leaves = 100

RMSE = 0.1741929702757772
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.03
Num_leaves = 300

RMSE = 0.17446823309214998
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.03
Num_leaves = 900

RMSE = 0.1748736935794383
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.03
Num_leaves = 1200

RMSE = 0.17506776709047142
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.01
Num_leaves = 100

RMSE = 0.17128728161947973
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.01
Num_leaves = 300

RMSE = 0.1720175641999096
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.01
Num_leaves = 900

RMSE = 0.17277173778173205
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.01
Num_leaves = 1200

RMSE = 0.1718237228562855
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.1
Num_leaves = 100

RMSE = 0.17759328556660633
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.1
Num_leaves = 300

RMSE = 0.17898436705623041
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.1
Num_leaves = 900

RMSE = 0.17908507961643214
--------------------------------------------------
Max_depth = 25
Learning_rate = 0.1
Num_leaves = 1200

RMSE = 0.17827208081289256
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.005
Num_leaves = 100

RMSE = 0.17246809803081617
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.005
Num_leaves = 300

RMSE = 0.17294495523837033
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.005
Num_leaves = 900

RMSE = 0.17292688997027916
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.005
Num_leaves = 1200

RMSE = 0.17325098210391573
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.003
Num_leaves = 100

RMSE = 0.2544207508484014
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.003
Num_leaves = 300

RMSE = 0.2544285208612549
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.003
Num_leaves = 900

RMSE = 0.25446421757176785
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.003
Num_leaves = 1200

RMSE = 0.2544684983213409
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.001
Num_leaves = 100

RMSE = 1.398686341957468
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.001
Num_leaves = 300

RMSE = 1.398609835004237
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.001
Num_leaves = 900

RMSE = 1.3985994658784973
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.001
Num_leaves = 1200

RMSE = 1.3985982007968796
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.05
Num_leaves = 100

RMSE = 0.17536940889611735
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.05
Num_leaves = 300

RMSE = 0.1765080565964099
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.05
Num_leaves = 900

RMSE = 0.17624219848736036
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.05
Num_leaves = 1200

RMSE = 0.1760555594074027
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.03
Num_leaves = 100

RMSE = 0.1736242712044347
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.03
Num_leaves = 300

RMSE = 0.17521864552176689
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.03
Num_leaves = 900

RMSE = 0.17566412211348004
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.03
Num_leaves = 1200

RMSE = 0.17482009739264592
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.01
Num_leaves = 100

RMSE = 0.17125971654535216
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.01
Num_leaves = 300

RMSE = 0.17207751924921694
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.01
Num_leaves = 900

RMSE = 0.17251703873241672
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.01
Num_leaves = 1200

RMSE = 0.17196618678105502
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.1
Num_leaves = 100

RMSE = 0.17752901161101534
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.1
Num_leaves = 300

RMSE = 0.1789027263015239
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.1
Num_leaves = 900

RMSE = 0.1803601423103399
--------------------------------------------------
Max_depth = 50
Learning_rate = 0.1
Num_leaves = 1200

RMSE = 0.17992684675378923
--------------------------------------------------
Max_depth = 75
Learning_rate = 0.005
Num_leaves = 100

RMSE = 0.17246809803081617
--------------------------------------------------
Max_depth = 75
Learning_rate = 0.005
Num_leaves = 300

RMSE = 0.17294495523837033
--------------------------------------------------
Max_depth = 75
Learning_rate = 0.005
Num_leaves = 900

RMSE = 0.17292688997027916
--------------------------------------------------
Max_depth = 75
Learning_rate = 0.005
Num_leaves = 1200


def test_params(params, dataset): 
    for depth in params['max_depth']:
        for learning_rate in params['learning_rate']:
            for num_leaves in params['num_leaves']:
                temp_params = {'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mse', 'max_depth': depth,
                              'learning_rate': learning_rate, 'num_leaves': num_leaves }
                print("Max_depth = " + str(depth) + "\n" +
                     "Learning_rate = " + str(learning_rate) + "\n" +
                     "Num_leaves = " + str(num_leaves) + "\n")
                
                #training
                model = lgb.train(temp_params, d_train, num_iterations)
                
                #prediction
                target_prediction = model.predict(dev_df)
                
                #result
                RMSE = np.sqrt(mean_squared_error(target_prediction, dev_df['target']))
                rmse_list.append(RMSE)
                
                print("RMSE = " + str(RMSE) + "\n" +
                    "-" * 50)
                
            
    

In [33]:
print(rmse_list)
print(len(rmse_list))

rmse_list.sort()
rmse_list

[0.17125971654535216, 0.17125971654535216, 0.17125971654535216, 0.17128728161947973, 0.17152304365994905, 0.17171564436640366, 0.1718237228562855, 0.17196618678105502, 0.1720175641999096, 0.17207751924921694, 0.17227765522918062, 0.17227765522918062, 0.17229647293898118, 0.17241512450987045, 0.17246809803081617, 0.17246809803081617, 0.17246809803081617, 0.1724823085666873, 0.17251703873241672, 0.17277173778173205, 0.17286935946569895, 0.17291522313013324, 0.17292688997027916, 0.17292688997027916, 0.17292688997027916, 0.17294495523837033, 0.17294495523837033, 0.17294495523837033, 0.17325098210391573, 0.17325098210391573, 0.17325098210391573, 0.17330056722600648, 0.1736242712044347, 0.1736242712044347, 0.1736242712044347, 0.17402145757005041, 0.1741929702757772, 0.17446823309214998, 0.17452746310046258, 0.17482009739264592, 0.17482336702992465, 0.17485211568240877, 0.17485211568240877, 0.1748736935794383, 0.17506776709047142, 0.1751251255365857, 0.17513247254899253, 0.17521864552176689, 

[0.17125971654535216,
 0.17125971654535216,
 0.17125971654535216,
 0.17128728161947973,
 0.17152304365994905,
 0.17171564436640366,
 0.1718237228562855,
 0.17196618678105502,
 0.1720175641999096,
 0.17207751924921694,
 0.17227765522918062,
 0.17227765522918062,
 0.17229647293898118,
 0.17241512450987045,
 0.17246809803081617,
 0.17246809803081617,
 0.17246809803081617,
 0.1724823085666873,
 0.17251703873241672,
 0.17277173778173205,
 0.17286935946569895,
 0.17291522313013324,
 0.17292688997027916,
 0.17292688997027916,
 0.17292688997027916,
 0.17294495523837033,
 0.17294495523837033,
 0.17294495523837033,
 0.17325098210391573,
 0.17325098210391573,
 0.17325098210391573,
 0.17330056722600648,
 0.1736242712044347,
 0.1736242712044347,
 0.1736242712044347,
 0.17402145757005041,
 0.1741929702757772,
 0.17446823309214998,
 0.17452746310046258,
 0.17482009739264592,
 0.17482336702992465,
 0.17485211568240877,
 0.17485211568240877,
 0.1748736935794383,
 0.17506776709047142,
 0.175125125536585