### Importing Dataset

In [279]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
import pickle
from sklearn.metrics import roc_auc_score
pd. set_option('display.max_columns', 500)
pd. set_option('display.max_rows', 500)
from itertools import combinations

### EDA on Train and Test Datasets

In [314]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [315]:
train.head(5)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [316]:
train.describe()

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Is_Lead
count,245725.0,245725.0,245725.0,245725.0
mean,43.856307,46.959141,1128403.0,0.237208
std,14.828672,32.353136,852936.4,0.425372
min,23.0,7.0,20790.0,0.0
25%,30.0,20.0,604310.0,0.0
50%,43.0,32.0,894601.0,0.0
75%,54.0,73.0,1366666.0,0.0
max,85.0,135.0,10352010.0,1.0


In [317]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


In [318]:
train.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [319]:
test.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         12522
Avg_Account_Balance        0
Is_Active                  0
dtype: int64

In [320]:
train['Credit_Product'].unique()

array(['No', nan, 'Yes'], dtype=object)

In [321]:
train['Credit_Product'].fillna('Unknown', inplace=True)
test['Credit_Product'].fillna('Unknown', inplace=True)

In [322]:
train.isna().sum()

ID                     0
Gender                 0
Age                    0
Region_Code            0
Occupation             0
Channel_Code           0
Vintage                0
Credit_Product         0
Avg_Account_Balance    0
Is_Active              0
Is_Lead                0
dtype: int64

In [323]:
test.isna().sum()

ID                     0
Gender                 0
Age                    0
Region_Code            0
Occupation             0
Channel_Code           0
Vintage                0
Credit_Product         0
Avg_Account_Balance    0
Is_Active              0
dtype: int64

In [324]:
train.shape, test.shape

((245725, 11), (105312, 10))

### Combining Train and Test Data for Preprocessing

In [325]:
combine=train.append(test)
combine.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0.0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0.0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0.0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0.0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0.0


In [326]:
combine.shape

(351037, 11)

In [327]:
combine.isna().sum()

ID                          0
Gender                      0
Age                         0
Region_Code                 0
Occupation                  0
Channel_Code                0
Vintage                     0
Credit_Product              0
Avg_Account_Balance         0
Is_Active                   0
Is_Lead                105312
dtype: int64

In [328]:
combine.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0.0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0.0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0.0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0.0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0.0


In [329]:
combine['Region_Code'].unique()

array(['RG268', 'RG277', 'RG270', 'RG282', 'RG261', 'RG265', 'RG283',
       'RG254', 'RG269', 'RG257', 'RG279', 'RG280', 'RG252', 'RG284',
       'RG259', 'RG281', 'RG258', 'RG266', 'RG260', 'RG274', 'RG256',
       'RG275', 'RG273', 'RG267', 'RG272', 'RG251', 'RG262', 'RG264',
       'RG278', 'RG276', 'RG263', 'RG250', 'RG255', 'RG253', 'RG271'],
      dtype=object)

In [330]:
combine['Channel_Code'].unique()

array(['X3', 'X1', 'X2', 'X4'], dtype=object)

In [331]:
combine['Occupation'].unique()

array(['Other', 'Salaried', 'Self_Employed', 'Entrepreneur'], dtype=object)

In [332]:
combine['Gender'].unique()

array(['Female', 'Male'], dtype=object)

### Dummy Encoding for Categorical Columns

In [333]:
df = pd.get_dummies(data=combine, columns=['Region_Code', 'Channel_Code', 'Gender', 'Occupation', 'Credit_Product', 'Is_Active'])

In [334]:
df.head()

Unnamed: 0,ID,Age,Vintage,Avg_Account_Balance,Is_Lead,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,Region_Code_RG255,Region_Code_RG256,Region_Code_RG257,Region_Code_RG258,Region_Code_RG259,Region_Code_RG260,Region_Code_RG261,Region_Code_RG262,Region_Code_RG263,Region_Code_RG264,Region_Code_RG265,Region_Code_RG266,Region_Code_RG267,Region_Code_RG268,Region_Code_RG269,Region_Code_RG270,Region_Code_RG271,Region_Code_RG272,Region_Code_RG273,Region_Code_RG274,Region_Code_RG275,Region_Code_RG276,Region_Code_RG277,Region_Code_RG278,Region_Code_RG279,Region_Code_RG280,Region_Code_RG281,Region_Code_RG282,Region_Code_RG283,Region_Code_RG284,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Gender_Female,Gender_Male,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Unknown,Credit_Product_Yes,Is_Active_No,Is_Active_Yes
0,NNVBBKZB,73,43,1045696,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0
1,IDD62UNG,30,32,581988,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0
2,HD3DSEMC,56,26,1484315,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,BF3NC7KV,34,19,470454,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0
4,TEASRWXV,30,33,886787,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0


In [335]:
df.shape

(351037, 55)

In [336]:
df = df.drop(columns=['Region_Code_RG268', 'Channel_Code_X3', 'Gender_Female', 'Occupation_Other', 'Credit_Product_Unknown', 'Is_Active_No'],axis=1)

In [337]:
df.shape

(351037, 49)

In [338]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
df['Avg_Account_Balance']=sc.fit_transform(np.array(df['Avg_Account_Balance']).reshape(-1,1))
df['Age']=sc.fit_transform(np.array(df['Age']).reshape(-1,1))
df['Vintage']=sc.fit_transform(np.array(df['Vintage']).reshape(-1,1))

In [339]:
df.head()

Unnamed: 0,ID,Age,Vintage,Avg_Account_Balance,Is_Lead,Region_Code_RG250,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,Region_Code_RG255,Region_Code_RG256,Region_Code_RG257,Region_Code_RG258,Region_Code_RG259,Region_Code_RG260,Region_Code_RG261,Region_Code_RG262,Region_Code_RG263,Region_Code_RG264,Region_Code_RG265,Region_Code_RG266,Region_Code_RG267,Region_Code_RG269,Region_Code_RG270,Region_Code_RG271,Region_Code_RG272,Region_Code_RG273,Region_Code_RG274,Region_Code_RG275,Region_Code_RG276,Region_Code_RG277,Region_Code_RG278,Region_Code_RG279,Region_Code_RG280,Region_Code_RG281,Region_Code_RG282,Region_Code_RG283,Region_Code_RG284,Channel_Code_X1,Channel_Code_X2,Channel_Code_X4,Gender_Male,Occupation_Entrepreneur,Occupation_Salaried,Occupation_Self_Employed,Credit_Product_No,Credit_Product_Yes,Is_Active_Yes
0,NNVBBKZB,1.963311,-0.121384,-0.098541,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,IDD62UNG,-0.93389,-0.461633,-0.639654,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0
2,HD3DSEMC,0.817906,-0.647223,0.413296,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
3,BF3NC7KV,-0.664383,-0.863745,-0.769806,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0
4,TEASRWXV,-0.93389,-0.430701,-0.283976,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0


In [340]:
df.shape

(351037, 49)

In [341]:
df.isna().sum()

ID                               0
Age                              0
Vintage                          0
Avg_Account_Balance              0
Is_Lead                     105312
Region_Code_RG250                0
Region_Code_RG251                0
Region_Code_RG252                0
Region_Code_RG253                0
Region_Code_RG254                0
Region_Code_RG255                0
Region_Code_RG256                0
Region_Code_RG257                0
Region_Code_RG258                0
Region_Code_RG259                0
Region_Code_RG260                0
Region_Code_RG261                0
Region_Code_RG262                0
Region_Code_RG263                0
Region_Code_RG264                0
Region_Code_RG265                0
Region_Code_RG266                0
Region_Code_RG267                0
Region_Code_RG269                0
Region_Code_RG270                0
Region_Code_RG271                0
Region_Code_RG272                0
Region_Code_RG273                0
Region_Code_RG274   

### Seperating the combined data into Train and Test (after preprocessing is performed)

In [342]:
X = df[df['Is_Lead'].isnull()!=True].drop(['ID','Is_Lead'], axis=1)
y = df[df['Is_Lead'].isnull()!=True]['Is_Lead']

test = df[df['Is_Lead'].isnull()==True].drop(['ID','Is_Lead'], axis=1)

X.shape, y.shape, test.shape

((245725, 47), (245725,), (105312, 47))

In [343]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [344]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((221152, 47), (221152,), (24573, 47), (24573,))

In [345]:
X_train.dtypes

Age                         float64
Vintage                     float64
Avg_Account_Balance         float64
Region_Code_RG250             uint8
Region_Code_RG251             uint8
Region_Code_RG252             uint8
Region_Code_RG253             uint8
Region_Code_RG254             uint8
Region_Code_RG255             uint8
Region_Code_RG256             uint8
Region_Code_RG257             uint8
Region_Code_RG258             uint8
Region_Code_RG259             uint8
Region_Code_RG260             uint8
Region_Code_RG261             uint8
Region_Code_RG262             uint8
Region_Code_RG263             uint8
Region_Code_RG264             uint8
Region_Code_RG265             uint8
Region_Code_RG266             uint8
Region_Code_RG267             uint8
Region_Code_RG269             uint8
Region_Code_RG270             uint8
Region_Code_RG271             uint8
Region_Code_RG272             uint8
Region_Code_RG273             uint8
Region_Code_RG274             uint8
Region_Code_RG275           

## XGBoost with KFold Cross Validation 

In [34]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

errxgb = []
y_pred_tot_xgb = []

from sklearn.model_selection import KFold,StratifiedKFold

fold = StratifiedKFold(n_splits=15)
i = 1
for train_index, test_index in fold.split(X,y):
    x_train, x_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y[train_index], y[test_index]
    m = XGBClassifier(boosting_type='gbdt',
                      max_depth=5,
                      learning_rate=0.07,
                      n_estimators=5000,
                      random_state=1994)
    m.fit(x_train, y_train,
          eval_set=[(x_train,y_train),(x_val, y_val)],
          early_stopping_rounds=200,
          eval_metric='auc',
          verbose=200)
    pred_y = m.predict_proba(x_val)[:,-1]
    print("err_xgb: ",roc_auc_score(y_val,pred_y))
    fpr, tpr, thresholds = roc_curve(y_val, pred_y)
    errxgb.append(roc_auc_score(y_val, pred_y))
    pred_test = m.predict_proba(test)[:,-1]
    i = i + 1
    y_pred_tot_xgb.append(pred_test)

Parameters: { boosting_type } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-auc:0.86709	validation_1-auc:0.86549
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.88025	validation_1-auc:0.87305
[400]	validation_0-auc:0.88453	validation_1-auc:0.87327
Stopping. Best iteration:
[329]	validation_0-auc:0.88318	validation_1-auc:0.87342

err_xgb:  0.8734223567084441
Parameters: { boosting_type } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find a

[0]	validation_0-auc:0.86705	validation_1-auc:0.86408
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.88061	validation_1-auc:0.87172
Stopping. Best iteration:
[190]	validation_0-auc:0.88035	validation_1-auc:0.87182

err_xgb:  0.8718228803616127
Parameters: { boosting_type } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-auc:0.86682	validation_1-auc:0.86889
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.88016	validation_1-auc:0.87612
Stopping. Best iteration:
[103]	validation_0-auc:0.87726	valida

In [35]:
errxgb

[0.8734223567084441,
 0.8730017074326368,
 0.8730717555814465,
 0.8723463355932158,
 0.8754613931424602,
 0.8754994804719394,
 0.8703276453508869,
 0.8747705287307995,
 0.8761047796746324,
 0.8744657580183766,
 0.8718228803616127,
 0.8767842620835579,
 0.8788813518201929,
 0.8771307905560578,
 0.8710525898470557]

In [36]:
y_pred_tot_xgb

[array([0.04723882, 0.8797175 , 0.09780419, ..., 0.06832457, 0.21979232,
        0.04558076], dtype=float32),
 array([0.0573528 , 0.86809224, 0.05535294, ..., 0.066852  , 0.21570942,
        0.05400757], dtype=float32),
 array([0.05379895, 0.8764502 , 0.07069585, ..., 0.06939562, 0.22922099,
        0.05246773], dtype=float32),
 array([0.05891298, 0.873611  , 0.05439588, ..., 0.07828564, 0.21493192,
        0.05289472], dtype=float32),
 array([0.05046985, 0.8719527 , 0.08216296, ..., 0.06075313, 0.22269185,
        0.05189189], dtype=float32),
 array([0.04825287, 0.8755949 , 0.07021528, ..., 0.06789476, 0.21955298,
        0.05324034], dtype=float32),
 array([0.05479074, 0.8717453 , 0.05529359, ..., 0.07569923, 0.21849713,
        0.05357978], dtype=float32),
 array([0.04420976, 0.8730412 , 0.10235523, ..., 0.06678578, 0.20788234,
        0.04863653], dtype=float32),
 array([0.0577918 , 0.8637155 , 0.05413707, ..., 0.07393185, 0.20600437,
        0.05484215], dtype=float32),
 array([0.

In [38]:
filename = 'jobathon.sav'
pickle.dump(m, open(filename, 'wb'))

In [39]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8596020021975339


In [44]:
y_pred = loaded_model.predict_proba(X_test)[:,-1]

In [49]:
roc_auc_score(y_test,y_pred)

0.8778819568267398

In [89]:
tp = pd.DataFrame(np.mean(y_pred_tot_xgb,0))

In [90]:
tp.head()

Unnamed: 0,0
0,0.05324
1,0.872685
2,0.067966
3,0.02575
4,0.025543


## LGBM with KFold Cross Validation 

In [93]:
from lightgbm import LGBMClassifier

err = []
y_pred_tot_lgm = []

from sklearn.model_selection import StratifiedKFold

fold = StratifiedKFold(n_splits=15)
i = 1
for train_index, test_index in fold.split(X, y):
    x_train, x_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y[train_index], y[test_index]
    m = LGBMClassifier(boosting_type='gbdt',
                       max_depth=5,
                       learning_rate=0.05,
                       n_estimators=5000,
                       min_child_weight=0.01,
                       colsample_bytree=0.5,
                       random_state=1994)
    m.fit(x_train, y_train,
          eval_set=[(x_train,y_train),(x_val, y_val)],
          early_stopping_rounds=200,
          eval_metric='auc',
          verbose=200)
    pred_y = m.predict_proba(x_val)[:,1]
    print("err_lgm: ",roc_auc_score(y_val,pred_y))
    fpr, tpr, thresholds = roc_curve(y_val, pred_y)
    err.append(roc_auc_score(y_val, pred_y))
    pred_test = m.predict_proba(test)[:,1]
    i = i + 1
    y_pred_tot_lgm.append(pred_test)

Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.877777	training's binary_logloss: 0.341273	valid_1's auc: 0.872639	valid_1's binary_logloss: 0.345969
[400]	training's auc: 0.881399	training's binary_logloss: 0.337757	valid_1's auc: 0.87306	valid_1's binary_logloss: 0.344992
[600]	training's auc: 0.883973	training's binary_logloss: 0.33525	valid_1's auc: 0.872819	valid_1's binary_logloss: 0.344993
Early stopping, best iteration is:
[441]	training's auc: 0.881965	training's binary_logloss: 0.337162	valid_1's auc: 0.873102	valid_1's binary_logloss: 0.344895
err_lgm:  0.8731016879596015
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.877888	training's binary_logloss: 0.341267	valid_1's auc: 0.871466	valid_1's binary_logloss: 0.345703
Early stopping, best iteration is:
[174]	training's auc: 0.877203	training's binary_logloss: 0.342116	valid_1's auc: 0.871609	valid_1's binary_logloss: 0.346115
err_lgm:  0.87160892000

Early stopping, best iteration is:
[165]	training's auc: 0.876483	training's binary_logloss: 0.34323	valid_1's auc: 0.879759	valid_1's binary_logloss: 0.337287
err_lgm:  0.8797588927490708
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.877546	training's binary_logloss: 0.341648	valid_1's auc: 0.876055	valid_1's binary_logloss: 0.341191
[400]	training's auc: 0.881197	training's binary_logloss: 0.338044	valid_1's auc: 0.876578	valid_1's binary_logloss: 0.340316
[600]	training's auc: 0.883788	training's binary_logloss: 0.335454	valid_1's auc: 0.876428	valid_1's binary_logloss: 0.340562
Early stopping, best iteration is:
[412]	training's auc: 0.881367	training's binary_logloss: 0.337865	valid_1's auc: 0.876656	valid_1's binary_logloss: 0.340275
err_lgm:  0.8766556957317152
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.877928	training's binary_logloss: 0.341093	valid_1's auc: 0.870538	valid_1's binary_logloss: 0.

In [94]:
err

[0.8731016879596015,
 0.8716089200010807,
 0.8722720967055314,
 0.8729126310640711,
 0.8754799889026763,
 0.8754464423983662,
 0.8698120300194467,
 0.8742405598612968,
 0.8766245033717017,
 0.8738809697538622,
 0.8721706261318938,
 0.8773559147678865,
 0.8797588927490708,
 0.8766556957317152,
 0.8713341538365217]

In [95]:
test_lgbm = pd.DataFrame(np.mean(y_pred_tot_lgm,0))

In [96]:
test_lgbm.head()

Unnamed: 0,0
0,0.053064
1,0.818869
2,0.070106
3,0.027072
4,0.024831


### Ensembling of XGBoost and LightGBM with appropriate weights

In [None]:
def findbestweight(df1,df2,target):
    max_roc = -1
    max_weight = 0
    max_ensemble_oof  = 0
    weights_list = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
    for weight in weights_list:
        ensemble_oof = weight*df1 + (1-weight)*df2
        roc_score = roc_auc_score(target,ensemble_oof)
        if roc_score > max_roc:
            max_ensemble_oof = ensemble_oof
            max_roc = roc_score
            max_weight = weight
            print(weight)
    print("The best weights for blending is {0} with AUC {1}".format(max_weight, max_roc))
    return max_weight

In [None]:
findbestweight(pred_y_xgb_ensemble,pred_y_lgbm_ensemble,y_test_ensemble)

In [99]:
ensemble = 0.45*np.mean(y_pred_tot_xgb,0)+0.55*np.mean(y_pred_tot_lgm,0)

In [102]:
sub = pd.read_csv('sample_submission.csv')
sub['Is_Lead'] = ensemble
sub.head()
sub.to_csv('final_submission.csv',index=False)