### The following modeling is done after removing the null values from the dataset, to check whether the model performs differently from the model that is trained with the null values included.

#### Datasets such as the historical transactions and the new and old merchant datasets contain 20% null values; the rows containing those null values are removed before fitting this model.

In [20]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import time
from sklearn.model_selection import train_test_split
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import tensorflow.contrib.layers as layers
from tensorflow.python.summary.writer.writer import FileWriter

In [21]:
# Data import

# Both CSV files are read the same way, with first_active_month parsed as a datetime column.
train_df, test_df = (
    pd.read_csv(csv_name, parse_dates=["first_active_month"])
    for csv_name in ("train.csv", "test.csv")
)

In [22]:
# Cast feature_1, feature_2 and feature_3 to the pandas categorical dtype,
# first in the train frame and then in the test frame.

for frame in (train_df, test_df):
    for feature_col in ('feature_1', 'feature_2', 'feature_3'):
        frame[feature_col] = frame[feature_col].astype('category')

In [23]:
# Missing-value summary for the test set: null count and null percentage per column,
# largest first. Only one value (in first_active_month) is expected to be null.

test_null_mask = test_df.isnull()
total = test_null_mask.sum().sort_values(ascending=False)
percent = (test_null_mask.sum() / test_null_mask.count() * 100).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

Unnamed: 0,Total,Percent
first_active_month,1,0.000809
target,0,0.0
feature_3,0,0.0
feature_2,0,0.0
feature_1,0,0.0
card_id,0,0.0


In [24]:
# Fill the missing first_active_month in test_df with the earliest month seen among
# rows where feature_1 == 5, feature_2 == 2 and feature_3 == 1.

same_profile = (
    (test_df['feature_1'] == 5)
    & (test_df['feature_2'] == 2)
    & (test_df['feature_3'] == 1)
)
earliest_month = test_df.loc[same_profile, 'first_active_month'].min()
missing_month = test_df['first_active_month'].isna()
test_df.loc[missing_month, 'first_active_month'] = earliest_month

In [25]:
# Created a elapsed time column for each id in train set:

import datetime
def read_data(input_file, reference_date='2018-02-01'):
    """Load a card table and derive calendar features from ``first_active_month``.

    Parameters
    ----------
    input_file : str or path-like
        CSV file that contains a ``first_active_month`` column.
    reference_date : str or datetime-like, default '2018-02-01'
        Date from which ``elapsed_time`` is measured. The default keeps the
        previously hard-coded reference of 1 February 2018.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``first_active_month`` converted to datetime and
        three added columns: ``year``, ``month`` and ``elapsed_time`` (whole
        days from ``first_active_month`` to ``reference_date``). Rows whose
        ``first_active_month`` is missing get NaN in the derived columns.
    """
    df = pd.read_csv(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['year'] = df['first_active_month'].dt.year
    df['month'] = df['first_active_month'].dt.month
    # Subtract in datetime64 space rather than between Python ``date`` objects:
    # the result is a proper timedelta64 Series, so ``.dt.days`` is always
    # available and missing dates propagate as NaN. ``normalize()`` drops any
    # time-of-day so the difference is counted in calendar days.
    reference = pd.Timestamp(reference_date).normalize()
    df['elapsed_time'] = (reference - df['first_active_month'].dt.normalize()).dt.days
    return df
# NOTE(review): train_df and test_df are re-read from disk here, which replaces the
# frames built in the earlier cells (the categorical casts and the filled-in
# first_active_month in test_df are not carried over) - confirm this is intended.
train_df = read_data('train.csv')
test_df = read_data('test.csv')

# Split the regression label off the training frame.
target = train_df.pop('target')

In [26]:
# Load the historical transactions table and preview its first rows.
hist_df = pd.read_csv("historical_transactions.csv")
hist_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [27]:
# The shape (rows, columns) of the historical data

hist_df.shape

(29112361, 14)

In [28]:
# Encode authorized_flag as an integer: 'Y' -> 1, 'N' -> 0.
# Any value that is neither 'Y' nor 'N' becomes NaN.

flag_to_int = {'Y': 1, 'N': 0}
hist_df['authorized_flag'] = hist_df['authorized_flag'].map(flag_to_int)

In [29]:
# Missing-value summary for the historical transactions: null count and null
# percentage per column, largest first.

hist_null_mask = hist_df.isnull()
total = hist_null_mask.sum().sort_values(ascending=False)
percent = (hist_null_mask.sum() / hist_null_mask.count() * 100).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

Unnamed: 0,Total,Percent
category_2,2652864,9.1125
category_3,178159,0.61197
merchant_id,138481,0.475678
subsector_id,0,0.0
state_id,0,0.0
purchase_date,0,0.0
purchase_amount,0,0.0
month_lag,0,0.0
merchant_category_id,0,0.0
installments,0,0.0


In [30]:
# Drop every transaction row that has at least one null value (in place),
# then confirm that no nulls remain in any column.

hist_df.dropna(axis=0, how='any', inplace=True)
hist_df.isnull().sum()

authorized_flag         0
card_id                 0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
merchant_id             0
month_lag               0
purchase_amount         0
purchase_date           0
category_2              0
state_id                0
subsector_id            0
dtype: int64

In [31]:
# Calculating the aggregate of historical dataset variables:

def aggregate_historical_transactions(history):
    """Aggregate transaction rows into one feature row per ``card_id``.

    Parameters
    ----------
    history : pandas.DataFrame
        Transaction table with the columns ``card_id``, ``authorized_flag``,
        ``merchant_id``, ``city_id``, ``purchase_amount``, ``installments``,
        ``purchase_date`` and ``month_lag``. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per card, sorted by ``card_id``, with the columns ``card_id``,
        ``hist_transactions_count`` and one ``hist_<column>_<statistic>``
        column per aggregate; ``hist_purchase_date_ptp`` is the span in
        seconds between a card's first and last purchase.
    """
    agg_func = {
        'authorized_flag': ['sum', 'mean'],
        'merchant_id': ['nunique'],
        'city_id': ['nunique'],
        'purchase_amount': ['sum', 'median', 'max', 'min', 'std'],
        'installments': ['sum', 'median', 'max', 'min', 'std'],
        'month_lag': ['min', 'max'],
    }

    grouped = history.groupby('card_id')
    agg_history = grouped.agg(agg_func)
    agg_history.columns = ['hist_' + '_'.join(col).strip()
                           for col in agg_history.columns.values]

    # Purchase timestamps as seconds since the Unix epoch, computed on the side
    # so the caller's frame keeps its original purchase_date column (overwriting
    # it would make a second call reinterpret float seconds as nanoseconds).
    # Dividing by a Timedelta yields exact seconds whatever the datetime resolution.
    purchase_ts = pd.to_datetime(history['purchase_date'])
    purchase_seconds = (purchase_ts - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
    seconds_by_card = pd.DataFrame({
        'card_id': history['card_id'].values,
        'seconds': purchase_seconds.values,
    }).groupby('card_id')['seconds']
    # Peak-to-peak = max - min, using the built-in vectorised reductions.
    purchase_span = seconds_by_card.max() - seconds_by_card.min()

    # Keep the historical column order: the span sits just before the month_lag
    # statistics, and the transaction count is the first feature column.
    agg_history.insert(agg_history.columns.get_loc('hist_month_lag_min'),
                       'hist_purchase_date_ptp', purchase_span)
    agg_history.insert(0, 'hist_transactions_count', grouped.size())

    return agg_history.reset_index()

# Build the per-card aggregate table from the null-free historical transactions.
history = aggregate_historical_transactions(hist_df)

In [32]:
# Left-join the per-card history aggregates onto both the train and the test frame;
# cards without any remaining history rows get NaN in the hist_* columns.

train_df = train_df.merge(history, how='left', on='card_id')
test_df = test_df.merge(history, how='left', on='card_id')
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,year,month,elapsed_time,hist_transactions_count,hist_authorized_flag_sum,...,hist_purchase_amount_min,hist_purchase_amount_std,hist_installments_sum,hist_installments_median,hist_installments_max,hist_installments_min,hist_installments_std,hist_purchase_date_ptp,hist_month_lag_min,hist_month_lag_max
0,2017-06-01,C_ID_92a2005557,5,2,1,2017,6,245,254.0,242.0,...,-0.739395,0.214439,4.0,0.0,1.0,0.0,0.124745,20977987.0,-8.0,0.0
1,2017-01-01,C_ID_3d0044924f,4,1,0,2017,1,396,317.0,308.0,...,-0.740897,0.246037,476.0,1.0,10.0,1.0,1.388526,33717687.0,-12.0,0.0
2,2016-08-01,C_ID_d639edf6cd,2,2,0,2016,8,549,43.0,41.0,...,-0.730138,0.08738,0.0,0.0,0.0,0.0,0.0,35635623.0,-13.0,0.0
3,2017-09-01,C_ID_186d6a6901,4,3,0,2017,9,153,63.0,63.0,...,-0.73941,0.043748,64.0,1.0,2.0,1.0,0.125988,13375339.0,-5.0,0.0
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,2017,11,92,117.0,115.0,...,-0.737892,0.159654,117.0,1.0,1.0,1.0,0.0,9405641.0,-3.0,0.0


In [33]:
# Import the merchants table (the "old" merchant data) and report its shape.
# NOTE(review): merchant_df is cleaned below but does not appear to be merged into
# the training features in the cells shown here - confirm whether it is used later.

merchant_df = pd.read_csv("merchants.csv")
print("shape of merchant : ",merchant_df.shape)

shape of merchant :  (334696, 22)


In [34]:
# Missing-value summary for the merchants table: null count and null percentage
# per column, largest first.

merchant_null_mask = merchant_df.isnull()
total = merchant_null_mask.sum().sort_values(ascending=False)
percent = (merchant_null_mask.sum() / merchant_null_mask.count() * 100).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

Unnamed: 0,Total,Percent
category_2,11887,3.551581
avg_sales_lag3,13,0.003884
avg_sales_lag12,13,0.003884
avg_sales_lag6,13,0.003884
merchant_group_id,0,0.0
merchant_category_id,0,0.0
subsector_id,0,0.0
numerical_1,0,0.0
numerical_2,0,0.0
category_1,0,0.0


In [35]:
# Drop every merchant row that has at least one null value (in place),
# then confirm that no nulls remain in any column.

merchant_df.dropna(axis=0, how='any', inplace=True)
merchant_df.isnull().sum()

merchant_id                    0
merchant_group_id              0
merchant_category_id           0
subsector_id                   0
numerical_1                    0
numerical_2                    0
category_1                     0
most_recent_sales_range        0
most_recent_purchases_range    0
avg_sales_lag3                 0
avg_purchases_lag3             0
active_months_lag3             0
avg_sales_lag6                 0
avg_purchases_lag6             0
active_months_lag6             0
avg_sales_lag12                0
avg_purchases_lag12            0
active_months_lag12            0
category_4                     0
city_id                        0
state_id                       0
category_2                     0
dtype: int64

In [36]:
# Import the new-merchant transactions file and report its shape.

new_merchant_df = pd.read_csv("new_merchant_transactions.csv")
print("shape of new_merchant_transactions : ",new_merchant_df.shape)

shape of new_merchant_transactions :  (1048575, 14)


In [37]:
# Encode authorized_flag as an integer: 'Y' -> 1, 'N' -> 0.
# Any value that is neither 'Y' nor 'N' becomes NaN.

flag_to_int = {'Y': 1, 'N': 0}
new_merchant_df['authorized_flag'] = new_merchant_df['authorized_flag'].map(flag_to_int)

In [38]:
# Missing-value summary for the new-merchant transactions: null count and null
# percentage per column, largest first.

new_null_mask = new_merchant_df.isnull()
total = new_null_mask.sum().sort_values(ascending=False)
percent = (new_null_mask.sum() / new_null_mask.count() * 100).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(20)

Unnamed: 0,Total,Percent
category_2,52936,5.048375
category_3,26054,2.484705
merchant_id,12044,1.148606
subsector_id,0,0.0
state_id,0,0.0
purchase_date,0,0.0
purchase_amount,0,0.0
month_lag,0,0.0
merchant_category_id,0,0.0
installments,0,0.0


In [39]:
# Drop every new-merchant transaction row that has at least one null value (in place),
# then confirm that no nulls remain in any column.

new_merchant_df.dropna(axis=0, how='any', inplace=True)
new_merchant_df.isnull().sum()

authorized_flag         0
card_id                 0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
merchant_id             0
month_lag               0
purchase_amount         0
purchase_date           0
category_2              0
state_id                0
subsector_id            0
dtype: int64

In [40]:
# Calculating the aggregate of Merchant dataset variables:

def aggregate_new_transactions(new_trans):
    """Collapse new-merchant transactions into one feature row per ``card_id``.

    Parameters
    ----------
    new_trans : pandas.DataFrame
        Transaction table with the columns ``card_id``, ``authorized_flag``,
        ``merchant_id``, ``city_id``, ``purchase_amount``, ``installments``
        and ``month_lag``.

    Returns
    -------
    pandas.DataFrame
        One row per card with ``card_id``, ``new_transactions_count`` and one
        ``new_<column>_<statistic>`` column per requested aggregate.
    """
    five_number_stats = ['sum', 'median', 'max', 'min', 'std']
    requested = {
        'authorized_flag': ['sum', 'mean'],
        'merchant_id': ['nunique'],
        'city_id': ['nunique'],
        'purchase_amount': list(five_number_stats),
        'installments': list(five_number_stats),
        'month_lag': ['min', 'max'],
    }

    # Per-card statistics, with the (column, statistic) pairs flattened to names.
    stats = new_trans.groupby(['card_id']).agg(requested)
    flat_names = []
    for col in stats.columns.values:
        flat_names.append('new_' + '_'.join(col).strip())
    stats.columns = flat_names
    stats = stats.reset_index()

    # Number of transaction rows per card, kept as the first feature column.
    counts = new_trans.groupby('card_id').size().reset_index(name='new_transactions_count')

    return pd.merge(counts, stats, on='card_id', how='left')

# Replace the raw new-merchant transactions with their per-card aggregates.
# NOTE(review): the name new_merchant_df is rebound from the raw table to the
# aggregated one, so this cell cannot be re-run without reloading the raw data first.
new_merchant_df = aggregate_new_transactions(new_merchant_df)

In [41]:
# Preview the first rows of the aggregated new-merchant data (one row per card_id)

new_merchant_df.head()

Unnamed: 0,card_id,new_transactions_count,new_authorized_flag_sum,new_authorized_flag_mean,new_merchant_id_nunique,new_city_id_nunique,new_purchase_amount_sum,new_purchase_amount_median,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_median,new_installments_max,new_installments_min,new_installments_std,new_month_lag_min,new_month_lag_max
0,C_ID_0001238066,22,22,1,22,7,-12.697057,-0.649235,-0.078318,-0.732783,0.15885,35,1.0,10,1,1.967815,1,2
1,C_ID_0001793786,29,29,1,29,6,0.916332,-0.365234,3.129932,-0.726622,0.96693,0,0.0,0,0,0.0,1,2
2,C_ID_000183fdda,10,10,1,10,2,-6.483098,-0.672602,-0.459601,-0.732332,0.087387,17,1.0,4,1,1.05935,1,2
3,C_ID_00032df08f,7,7,1,7,1,-2.151015,-0.433395,0.449203,-0.624291,0.374766,7,1.0,1,1,0.0,2,2
4,C_ID_00057b99fe,1,1,1,1,1,-0.701828,-0.701828,-0.701828,-0.701828,,0,0.0,0,0,,1,1


In [42]:
# Left-join the per-card new-merchant aggregates onto both the train and the test frame;
# cards without new-merchant rows get NaN in the new_* columns.

train_df = train_df.merge(new_merchant_df, how='left', on='card_id')
test_df = test_df.merge(new_merchant_df, how='left', on='card_id')
train_df.shape

(201917, 43)

In [43]:
# Keep every column except the card identifier and the raw activation date.
excluded_cols = ('card_id', 'first_active_month')
use_cols = [col for col in train_df.columns if col not in excluded_cols]

train_df = train_df[use_cols]
test_df = test_df[use_cols]

# Columns whose name contains 'feature_' are treated as categorical.
features = list(train_df[use_cols].columns)
categorical_feats = [col for col in features if 'feature_' in col]

for col in categorical_feats:
    n_categories = train_df[col].value_counts().shape[0]
    print(col, 'have', n_categories, 'categories.')

feature_1 have 5 categories.
feature_2 have 3 categories.
feature_3 have 2 categories.


In [44]:
from sklearn.preprocessing import LabelEncoder

# Label-encode each categorical column; the encoder is fitted on the train and test
# values together so both frames share one mapping.
for col in categorical_feats:
    print(col)
    train_labels = list(train_df[col].values.astype('str'))
    test_labels = list(test_df[col].values.astype('str'))
    lbl = LabelEncoder()
    lbl.fit(train_labels + test_labels)
    train_df[col] = lbl.transform(train_labels)
    test_df[col] = lbl.transform(test_labels)

feature_1
feature_2
feature_3


In [45]:
# Stack train on top of test so that one-hot encoding produces the same dummy
# columns for both, then split the result back at the original train length.

len_train_df = train_df.shape[0]

df_all = pd.get_dummies(pd.concat([train_df, test_df]), columns=categorical_feats)

train = df_all.iloc[:len_train_df]
test = df_all.iloc[len_train_df:]
train.shape

(201917, 48)

## LIGHT Gradient Boosting (Light GBM)

### ENSEMBLE LEARNING USING BOOSTING TECHNIQUES (LightGBM and XGBoost)

For LGB and XGB, we use k-fold cross-validation on the dataset. K-fold is a method commonly used in applied machine
learning. Here the data is split into 5 folds: in each round, 1 fold is held out for validation
and the remaining 4 folds are used for training, so that every fold serves as the validation set exactly once.

LGB Parameters

1. max_depth: It describes the maximum depth of tree. This parameter is used to handle model overfitting. Any time you feel that your model is overfitted, my first advice will be to lower max_depth.

2. objective: What do you want your output to be? In this case we want a regression output, so "regression" is chosen.

3. metric: rmse

4. min_data_in_leaf (passed below under its alias min_child_samples): the minimum number of records a leaf may have. The default value is 20, which is also the value used here. It is also used to deal with overfitting.

5. reg_alpha and reg_lambda: the L1 and L2 regularization parameters, respectively.

In [46]:
# NOTE(review): lightgbm is already imported in the first cell, so this install runs
# after the import it is meant to enable; it is also not pinned to a version.
! pip install lightgbm

[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [47]:
# LightGBM hyper-parameters: RMSE-evaluated regression with L1/L2 regularisation
# and row/column subsampling.
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "max_depth": 7,
    "min_child_samples": 20,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "num_leaves": 64,
    "learning_rate": 0.001,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "verbosity": -1,
}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
FOLDs = KFold(n_splits=5, shuffle=True, random_state=1989)

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

feature_importance_df_lgb = pd.DataFrame()
features_lgb = list(train.columns)

In [48]:
# Train one LightGBM model per fold. Each fold runs for at most 2000 boosting rounds and
# reports train/validation RMSE every 200 rounds.
# NOTE(review): early_stopping_rounds equals num_round (2000), so early stopping cannot
# trigger and every fold trains for the full 2000 rounds - confirm this is intended.

start = time.time()

# (Re)create the accumulators inside this cell so that re-running it on its own does not
# add to the predictions, or duplicate the importance rows, of a previous run.
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
fold_importances_lgb = []

for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(train)):
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx])

    print("LGB " + str(fold_) + "-" * 50)
    num_round = 2000
    clf = lgb.train(lgb_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 2000)

    # Out-of-fold predictions: each training row is scored by the model that did not see it.
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

    # Feature importances of this fold, tagged with the 1-based fold number.
    fold_importance_df_lgb = pd.DataFrame()
    fold_importance_df_lgb["feature"] = features_lgb
    fold_importance_df_lgb["importance"] = clf.feature_importance()
    fold_importance_df_lgb["fold"] = fold_ + 1
    fold_importances_lgb.append(fold_importance_df_lgb)

    # The test prediction is the average over the fold models.
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / FOLDs.n_splits

# Concatenate once after the loop instead of growing a DataFrame inside it.
feature_importance_df_lgb = pd.concat(fold_importances_lgb, axis=0)

print(np.sqrt(mean_squared_error(oof_lgb, target)))
end = time.time()
print(end - start)

LGB 0--------------------------------------------------
Training until validation scores don't improve for 2000 rounds.
[200]	training's rmse: 3.80839	valid_1's rmse: 3.90724
[400]	training's rmse: 3.79039	valid_1's rmse: 3.89401
[600]	training's rmse: 3.7764	valid_1's rmse: 3.88414
[800]	training's rmse: 3.76509	valid_1's rmse: 3.87683
[1000]	training's rmse: 3.75545	valid_1's rmse: 3.87103
[1200]	training's rmse: 3.74744	valid_1's rmse: 3.86663
[1400]	training's rmse: 3.7405	valid_1's rmse: 3.86324
[1600]	training's rmse: 3.73437	valid_1's rmse: 3.86085
[1800]	training's rmse: 3.72912	valid_1's rmse: 3.85907
[2000]	training's rmse: 3.72456	valid_1's rmse: 3.85779
Did not meet early stopping. Best iteration is:
[2000]	training's rmse: 3.72456	valid_1's rmse: 3.85779
LGB 1--------------------------------------------------
Training until validation scores don't improve for 2000 rounds.
[200]	training's rmse: 3.84036	valid_1's rmse: 3.77523
[400]	training's rmse: 3.82166	valid_1's rmse: 

In [11]:
# Final cross-validation score of LightGBM: RMSE of the out-of-fold predictions.

cv_rmse_lgb = np.sqrt(mean_squared_error(oof_lgb, target))
print('CV score lgb', cv_rmse_lgb)

CV score lgb 3.78386


In [49]:
# Train one XGBoost model per fold with the same 5-fold split used for LightGBM.
# Each fold runs for at most 2000 rounds and stops early once the validation RMSE
# has not improved for 100 rounds.

start = time.time()

xgb_params = {'eta': 0.001, 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.8,
          'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True}

FOLDs = KFold(n_splits=5, shuffle=True, random_state=1989)

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(train)):
    trn_data = xgb.DMatrix(data=train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(data=train.iloc[val_idx], label=target.iloc[val_idx])
    # The last entry of the watchlist ('valid') is the one used for early stopping.
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    print("xgb " + str(fold_) + "-" * 50)
    num_round = 2000

    xgb_model = xgb.train(xgb_params, trn_data, num_round, watchlist, early_stopping_rounds=100, verbose_eval=200)

    # NOTE(review): predictions use best_ntree_limit + 50 trees, i.e. 50 more than the
    # early-stopping optimum - confirm this offset is intended.
    oof_xgb[val_idx] = xgb_model.predict(xgb.DMatrix(train.iloc[val_idx]), ntree_limit=xgb_model.best_ntree_limit+50)

    predictions_xgb += xgb_model.predict(xgb.DMatrix(test), ntree_limit=xgb_model.best_ntree_limit+50) / FOLDs.n_splits

# Print the out-of-fold RMSE: a bare expression in the middle of a cell is never displayed.
print(np.sqrt(mean_squared_error(oof_xgb, target)))
end = time.time()
print(end - start)

xgb 0--------------------------------------------------
[0]	train-rmse:3.93301	valid-rmse:4.02943
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[200]	train-rmse:3.87392	valid-rmse:3.9789
[400]	train-rmse:3.8305	valid-rmse:3.94369
[600]	train-rmse:3.79836	valid-rmse:3.91939
[800]	train-rmse:3.77352	valid-rmse:3.90179
[1000]	train-rmse:3.75404	valid-rmse:3.88962
[1200]	train-rmse:3.73827	valid-rmse:3.88093
[1400]	train-rmse:3.72498	valid-rmse:3.87447
[1600]	train-rmse:3.71385	valid-rmse:3.86972
[1800]	train-rmse:3.70402	valid-rmse:3.86639
[1999]	train-rmse:3.69521	valid-rmse:3.86408
xgb 1--------------------------------------------------
[0]	train-rmse:3.96778	valid-rmse:3.89069
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[200]	train-rmse:3.90728	valid-rmse:3.83979
[400]	train-rmse:3.86275

In [12]:
# Final cross-validation score of XGBoost: RMSE of the out-of-fold predictions.
cv_rmse_xgb = np.sqrt(mean_squared_error(oof_xgb, target))
print('CV score xgb', cv_rmse_xgb)

CV score xgb 3.78752


In [57]:
# Equal-weight blend of the LightGBM and XGBoost out-of-fold predictions,
# scored with RMSE against the training target.
total_sum = 0.5 * oof_lgb + 0.5 * oof_xgb
blend_rmse = mean_squared_error(total_sum, target) ** 0.5
print("CV score: {:<8.5f}".format(blend_rmse))

CV score: 3.78483 
