In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder,StandardScaler,KBinsDiscretizer
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm ,tnrange

pd. set_option('display.max_columns', None)

In [37]:
train_data = pd.read_csv(r'../input/train.csv')
test_data = pd.read_csv(r'../input/test.csv')

In [38]:
train_data.shape

(245725, 11)

In [39]:
# Drop Duplicate based on ID
data=train_data.drop_duplicates(subset='ID', keep="first")

## Feature Creation

1. Sales variable: Combine the Region code and Channel code 
2. Customer Demographics : Combine the Occupation and gender
3. Extract Vintage year and Month from Vintage column

In [40]:
# Sales channel : based on Channel code & Region code
train_data['Region_channel'] = train_data['Channel_Code'].astype(str)+'_'+train_data['Region_Code'].astype(str)
test_data['Region_channel'] = test_data['Channel_Code'].astype(str)+'_'+test_data['Region_Code'].astype(str)

# Demographics: based on Occupation & Gender
train_data['Demographics'] = train_data['Occupation'].astype(str)+'_'+train_data['Gender'].astype(str)
test_data['Demographics'] = test_data['Occupation'].astype(str)+'_'+test_data['Gender'].astype(str)

# Convert vintage into years as float
train_data['Vintage'] = train_data['Vintage']/12
test_data['Vintage'] = test_data['Vintage']/12

train_data['Vintage'] = train_data['Vintage'].astype(str)
test_data['Vintage'] = test_data['Vintage'].astype(str)

# Extract Vintage month and years
train_data['Vintage_year'] = train_data['Vintage'].apply(lambda x: x.split(".")[0])
train_data['Vintage_month'] = train_data['Vintage'].apply(lambda x: x.split(".")[1][0])
test_data['Vintage_year'] = test_data['Vintage'].apply(lambda x: x.split(".")[0])
test_data['Vintage_month'] = test_data['Vintage'].apply(lambda x: x.split(".")[1][0])

# Drop the Vintage
train_data.drop('Vintage',axis=1,inplace=True)
test_data.drop('Vintage',axis=1,inplace=True)

### Label Encoding

In [41]:
# Feature types
cat_features = ['Gender','Region_Code','Occupation','Channel_Code',
                'Vintage_year','Credit_Product','Is_Active','Region_channel','Demographics','Vintage_month']
cont_features = ['Age','Avg_Account_Balance']
target = 'Is_Lead'

def encoder(data,label_features):
    for feat in label_features:
        encoder = LabelEncoder()
        col = data[feat].fillna("UNKNOWN").astype(str).values
        data.loc[:, feat] = encoder.fit_transform(col)    
    return data

 # Label Encoding the variable   
train_data = encoder(train_data,cat_features)
test_data = encoder(test_data,cat_features)

In [42]:
# Combine the data for further Feature Engineering
train_data['Istrain'] = 1
test_data['Istrain'] = 0

# Combine the data
combined_data = pd.concat([train_data,test_data],axis =0).reset_index(drop = True).copy()

### Discretization Transforms

**The discretization transform is available in the scikit-learn Python machine learning library via the KBinsDiscretizer class.**

**Uniform: Each bin has the same width in the span of possible values for the variable.**

**Quantile: Each bin has the same number of values, split based on percentiles.**

**Clustered: Clusters are identified and examples are assigned to each group**

Both Continous variables are skewed use Quantile based approach

In [43]:
# Bining the Average account balance & Age
for cont_feat in cont_features:
    discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    feature_name = cont_feat+"_"+"bins"
    combined_data[feature_name] = discretizer.fit_transform(combined_data[cont_feat].values.reshape(-1,1)).astype(int)

### Feature Counting

In [44]:
combined_data['Gender_Counts'] = combined_data['Gender'].map(combined_data['Gender'].value_counts().to_dict())
combined_data['Region_counts'] = combined_data['Region_Code'].map(combined_data['Region_Code'].value_counts().to_dict())
combined_data['Channel_Code_Counts'] = combined_data['Channel_Code'].map(combined_data['Channel_Code'].value_counts().to_dict())

### Feature Agg based on different level

In [45]:
# Account amount based on region channel
combined_data['mean_balance_sales']=combined_data.groupby(['Region_channel'])['Avg_Account_Balance'].transform('mean')
combined_data['std_balance_sales']=combined_data.groupby(['Region_channel'])['Avg_Account_Balance'].transform('std')
combined_data['min_balance_sales']=combined_data.groupby(['Region_channel'])['Avg_Account_Balance'].transform('min')
combined_data['max_balance_sales']=combined_data.groupby(['Region_channel'])['Avg_Account_Balance'].transform('max')

#Age based on Region channel
combined_data['mean_age_sales']=combined_data.groupby(['Region_channel'])['Age'].transform('mean')
combined_data['std_age_sales']=combined_data.groupby(['Region_channel'])['Age'].transform('std')
combined_data['min_age_sales']=combined_data.groupby(['Region_channel'])['Age'].transform('min')
combined_data['max_age_sales']=combined_data.groupby(['Region_channel'])['Age'].transform('max')

In [46]:
# Account amount based on Customer Demographics
combined_data['mean_balance_demo']=combined_data.groupby(['Demographics'])['Avg_Account_Balance'].transform('mean')
combined_data['std_balance_demo']=combined_data.groupby(['Demographics'])['Avg_Account_Balance'].transform('std')
combined_data['min_balance_demo']=combined_data.groupby(['Demographics'])['Avg_Account_Balance'].transform('min')
combined_data['max_balance_demo']=combined_data.groupby(['Demographics'])['Avg_Account_Balance'].transform('max')

#Age based on Customer Demographics
combined_data['mean_age_demo']=combined_data.groupby(['Demographics'])['Age'].transform('mean')
combined_data['std_age_demo']=combined_data.groupby(['Demographics'])['Age'].transform('std')
combined_data['min_age_demo']=combined_data.groupby(['Demographics'])['Age'].transform('min')
combined_data['max_age_demo']=combined_data.groupby(['Demographics'])['Age'].transform('max')

In [47]:
#feature Based Credit product and occupation
combined_data['mean_balance_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Avg_Account_Balance'].transform('mean')
combined_data['std_balance_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Avg_Account_Balance'].transform('std')
combined_data['min_balance_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Avg_Account_Balance'].transform('min')
combined_data['max_balance_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Avg_Account_Balance'].transform('max')


combined_data['mean_age_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Age'].transform('mean')
combined_data['std_age_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Age'].transform('std')
combined_data['min_age_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Age'].transform('min')
combined_data['max_age_credit_occ']=combined_data.groupby(['Credit_Product','Occupation'])['Age'].transform('max')


In [48]:
# Feature Based on Active account and Occupation

combined_data['mean_balance_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Avg_Account_Balance'].transform('mean')
combined_data['std_balance_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Avg_Account_Balance'].transform('std')
combined_data['min_balance_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Avg_Account_Balance'].transform('min')
combined_data['max_balance_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Avg_Account_Balance'].transform('max')

combined_data['mean_age_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Age'].transform('mean')
combined_data['std_age_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Age'].transform('std')
combined_data['min_age_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Age'].transform('min')
combined_data['max_age_credit_active']=combined_data.groupby(['Is_Active','Occupation'])['Age'].transform('max')

In [49]:
# Feature based on Credit product and customer Demographics

combined_data['mean_balance_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Avg_Account_Balance'].transform('mean')
combined_data['std_balance_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Avg_Account_Balance'].transform('std')
combined_data['min_balance_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Avg_Account_Balance'].transform('min')
combined_data['max_balance_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Avg_Account_Balance'].transform('max')

combined_data['mean_balance_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Avg_Account_Balance'].transform('mean')
combined_data['std_balance_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Avg_Account_Balance'].transform('std')
combined_data['min_balance_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Avg_Account_Balance'].transform('min')
combined_data['max_balance_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Avg_Account_Balance'].transform('max')

In [50]:
#feature based on Credit product and customer demographics

combined_data['mean_age_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Age'].transform('mean')
combined_data['std_age_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Age'].transform('std')
combined_data['min_age_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Age'].transform('min')
combined_data['max_age_credit_demo']=combined_data.groupby(['Credit_Product','Demographics'])['Age'].transform('max')

combined_data['mean_age_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Age'].transform('mean')
combined_data['std_age_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Age'].transform('std')
combined_data['min_age_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Age'].transform('min')
combined_data['max_age_active_demo']=combined_data.groupby(['Is_Active','Demographics'])['Age'].transform('max')

In [51]:
# Feature based on both sales channel demographics based
combined_data['mean_credit_Region_channel_demo'] = combined_data.groupby(['Region_channel','Demographics'])['Avg_Account_Balance'].transform('mean')
combined_data['max_credit_Region_channel_demo'] = combined_data.groupby(['Region_channel','Demographics'])['Avg_Account_Balance'].transform('max')
combined_data['min_credit_Region_channel_demo'] = combined_data.groupby(['Region_channel','Demographics'])['Avg_Account_Balance'].transform('min')

combined_data['mean_age_Region_channel_demo'] = combined_data.groupby(['Region_channel','Demographics'])['Age'].transform('mean')
combined_data['max_age_Region_channel_demo'] = combined_data.groupby(['Region_channel','Demographics'])['Age'].transform('max')
combined_data['min_age_Region_channel_demo'] = combined_data.groupby(['Region_channel','Demographics'])['Age'].transform('min')

### Standard Scaling

In [52]:
label_feature = ['ID','Gender','Region_Code','Occupation','Channel_Code',
                'Credit_Product','Is_Active','Is_Lead','Region_channel',
                'Demographics','Vintage_year','Vintage_month','Istrain','Avg_Account_Balance_bins','Age_Bins']

cont_features = [x for x in combined_data.columns if x not in label_feature]

def std_encoder(data,std_feat):
    for feat in std_feat:
        std = StandardScaler()
        temp_col = data[feat].fillna(data[feat].mean()).astype(int).values
        data.loc[:, feat] = std.fit_transform(temp_col.reshape(-1,1))        
    return data

final_data = std_encoder(combined_data,cont_features)

#### Modelling Data Preparation

In [53]:
train_df = final_data[final_data['Istrain']==1]
test_df = final_data[final_data['Istrain']==0]

target = train_df['Is_Lead']
train_df = train_df.drop(columns =['Istrain','ID','Is_Lead'],axis=1)
test_df = test_df.drop(columns=['Istrain','ID','Is_Lead'],axis=1)
X_train, y_train, X_test = train_df.copy(), target.copy(),test_df.copy()



##### Model Approch
1. Start with Randomforest base line ---> 0.74 ROC score (Not included in this work)
2. LGBM -- > 0.785 (not inlcude in this work)

*Final Model :weighted voted classifier of LGBM & XGBoost and catboost*

In [54]:
## Model Final Params

## Hyperparameter is mix of both previous work of people and run & trails

cat_features = [x for x in train_df.columns if x not in cont_features]

model_dispatcher = {"lgbm" : lgb.LGBMClassifier(boosting_type='gbdt',
                                 n_estimators=15000,
                                 max_depth=12,
                                 learning_rate=0.02,
                                 subsample=0.9,
                                 colsample_bytree=0.4,
                                 objective ='binary',
                                 random_state = 27,
                                 importance_type='gain',
                                 reg_alpha=2,
                                 reg_lambda=2),
                    
                'xgboost' :   xgb.XGBClassifier(n_estimators=1200,
                                max_depth=8,
                                learning_rate=0.04,
                                subsample=0.9,
                                colsample_bytree=0.4,
                                objective = 'binary:logistic',
                                random_state = 27
                               ),
                    
                "Catboost" : CatBoostClassifier(iterations=15000,
                                learning_rate=0.02,
                                random_strength=0.1,
                                depth=12,
                                loss_function='Logloss',
                                eval_metric='Logloss',
                                leaf_estimation_method='Newton',
                                random_state = 27,
                                cat_features =cat_features,
                                subsample = 0.9,
                                rsm = 0.8
                                )}

#### Modelling

#### 1. LGBM

In [57]:
%%time
probs_score_lgb = np.zeros(shape=(test_df.shape[0],))
scores,avg_loss = [],[]
sskf = StratifiedShuffleSplit(n_splits=5, test_size = 0.34 ,random_state=27)

for fold_, (train_idx, val_idx) in enumerate(sskf.split(X_train,y_train)):
    print("\n")
    print('Training Fold {}:'.format(fold_))
    
    xtrain = X_train.iloc[train_idx]
    ytrain = y_train.iloc[train_idx]
    
    xval = X_train.iloc[val_idx]
    yval = y_train.iloc[val_idx]
    
    model = model_dispatcher['lgbm']
    
    
    classsifier = model.fit(xtrain, ytrain,
                           eval_set = [(xval, yval)],
                           verbose = 100,
                           eval_metric = ['binary_logloss','auc'],
                           early_stopping_rounds=100)
    
    # predicting 
    print("Predicting ....")
    preds = classsifier.predict_proba(xval)[:,1]
    probs_score_lgb += classsifier.predict_proba(X_test)[:,1]
    
    print("Scoring the model ....")
    roc_score = roc_auc_score(yval,preds)
    
    # appending the metrics
    scores.append(roc_score)
    avg_loss.append(classsifier.best_score_['valid_0']['binary_logloss'])
    
    print ('\n\n validation ROC is {}:'.format(roc_score))
    print('*'*80)
    
print("Evalution metrics statatics\n")   
print("Log Loss : {0:.10f},{1:.10f}".format(np.array(avg_loss).mean(), np.array(avg_loss).std()))
print('%.10f (%.10f)' % (np.array(scores).mean(), np.array(scores).std()))        



Training Fold 0:
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.35499	valid_0's auc: 0.871979
[200]	valid_0's binary_logloss: 0.345139	valid_0's auc: 0.87358
[300]	valid_0's binary_logloss: 0.343979	valid_0's auc: 0.87416
[400]	valid_0's binary_logloss: 0.343631	valid_0's auc: 0.874328
[500]	valid_0's binary_logloss: 0.343523	valid_0's auc: 0.8744
Early stopping, best iteration is:
[499]	valid_0's binary_logloss: 0.343523	valid_0's auc: 0.8744
Predicting ....
Scoring the model ....


 validation ROC is 0.874400135428131:
********************************************************************************


Training Fold 1:
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.354819	valid_0's auc: 0.871619
[200]	valid_0's binary_logloss: 0.345242	valid_0's auc: 0.873161
[300]	valid_0's binary_logloss: 0.344114	valid_0's auc: 0.873527
[400]	valid_0's binary_logloss: 0.3438	valid_0's auc: 0.873546
Ea

#### XGboost

In [59]:
%%time
probs_score_xgboost = np.zeros(shape=(test_df.shape[0],))
scores,avg_loss = [],[]
sskf = StratifiedShuffleSplit(n_splits=5, test_size = 0.34 ,random_state=27)

for fold_, (train_idx, val_idx) in enumerate(sskf.split(X_train,y_train)):
    print("\n")
    print('Training Fold {}:'.format(fold_))
    
    xtrain = X_train.iloc[train_idx]
    ytrain = y_train.iloc[train_idx]
    
    xval = X_train.iloc[val_idx]
    yval = y_train.iloc[val_idx]
    
    model = model_dispatcher['xgboost']
    
    
    classsifier = model.fit(xtrain, ytrain,
                           eval_set = [(xval, yval)],
                           verbose = 100,
                           eval_metric = ['logloss','auc'],
                           early_stopping_rounds=100)
    
    # predicting 
    print("Predicting ....")
    preds = classsifier.predict_proba(xval)[:,1]
    probs_score_xgboost += classsifier.predict_proba(X_test)[:,1]
    
    print("Scoring the model ....")
    roc_score = roc_auc_score(yval,preds)
    
    # appending the metrics
    scores.append(roc_score)
    avg_loss.append(classsifier.best_score)
    
    print ('\n\n validation ROC is {}:'.format(roc_score))
    print('*'*80)
    
print("Evalution metrics statatics\n")   
print("Log Loss : {0:.10f},{1:.10f}".format(np.array(avg_loss).mean(), np.array(avg_loss).std()))
print('%.10f (%.10f)' % (np.array(scores).mean(), np.array(scores).std()))



Training Fold 0:
[0]	validation_0-logloss:0.67036	validation_0-auc:0.87064
[100]	validation_0-logloss:0.34543	validation_0-auc:0.87338
[200]	validation_0-logloss:0.34448	validation_0-auc:0.87363
[247]	validation_0-logloss:0.34477	validation_0-auc:0.87339
Predicting ....
Scoring the model ....


 validation ROC is 0.8736265297674767:
********************************************************************************


Training Fold 1:
[0]	validation_0-logloss:0.67035	validation_0-auc:0.86998
[100]	validation_0-logloss:0.34550	validation_0-auc:0.87267
[200]	validation_0-logloss:0.34477	validation_0-auc:0.87268
[223]	validation_0-logloss:0.34490	validation_0-auc:0.87253
Predicting ....
Scoring the model ....


 validation ROC is 0.8728866371944652:
********************************************************************************


Training Fold 2:
[0]	validation_0-logloss:0.67041	validation_0-auc:0.86899
[100]	validation_0-logloss:0.34642	validation_0-auc:0.87223
[200]	validation_0-logloss:

In [61]:
%%time
probs_score_catboost = np.zeros(shape=(test_df.shape[0],))
scores,avg_loss = [],[]
sskf = StratifiedShuffleSplit(n_splits=5, test_size = 0.34 ,random_state=27)

for fold_, (train_idx, val_idx) in enumerate(sskf.split(X_train,y_train)):
    print("\n")
    print('Training Fold {}:'.format(fold_))
    
    xtrain = X_train.iloc[train_idx]
    ytrain = y_train.iloc[train_idx]
    
    xval = X_train.iloc[val_idx]
    yval = y_train.iloc[val_idx]
    
    model = model_dispatcher['Catboost']
    
    
    classsifier = model.fit(xtrain, ytrain,
                           eval_set = [(xval, yval)],
                           verbose = 100,
                           early_stopping_rounds=50)
    
    # predicting 
    print("Predicting ....")
    preds = classsifier.predict_proba(xval)[:,1]
    probs_score_catboost += classsifier.predict_proba(X_test)[:,1]
    
    print("Scoring the model ....")
    roc_score = roc_auc_score(yval,preds)
    
    # appending the metrics
    scores.append(roc_score)
    avg_loss.append(classsifier.best_score_['validation']['Logloss'])
    
    print ('\n\n validation ROC is {}:'.format(roc_score))
    print('*'*80)
    
print("Evalution metrics statatics\n")   
print("Log Loss : {0:.10f},{1:.10f}".format(np.array(avg_loss).mean(), np.array(avg_loss).std()))
print('%.10f (%.10f)' % (np.array(scores).mean(), np.array(scores).std()))



Training Fold 0:
0:	learn: 0.6752997	test: 0.6753023	best: 0.6753023 (0)	total: 816ms	remaining: 3h 24m
100:	learn: 0.3374270	test: 0.3479003	best: 0.3479003 (100)	total: 1m 34s	remaining: 3h 51m 20s
200:	learn: 0.3237355	test: 0.3444605	best: 0.3444565 (195)	total: 3m 15s	remaining: 3h 59m 36s
300:	learn: 0.3177089	test: 0.3443930	best: 0.3443662 (254)	total: 4m 58s	remaining: 4h 2m 42s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3443661566
bestIteration = 254

Shrink model to first 255 iterations.
Predicting ....
Scoring the model ....


 validation ROC is 0.8736806984228702:
********************************************************************************


Training Fold 1:
0:	learn: 0.6753250	test: 0.6753207	best: 0.6753207 (0)	total: 1.02s	remaining: 4h 14m 45s
100:	learn: 0.3373814	test: 0.3482061	best: 0.3482061 (100)	total: 1m 39s	remaining: 4h 3m 58s
200:	learn: 0.3230216	test: 0.3446013	best: 0.3445953 (199)	total: 3m 23s	remaining: 4h 9m 17s
300:	lea

##### weighted Avearge Voting classification

In [65]:
#LGBM
p1 = probs_score_lgb/5

# Catboost
p2 = probs_score_catboost/5

# Xgboost
p3 = probs_score_xgboost/5

submission = pd.read_csv('../input/submission.csv')

submission['Is_Lead'] = 0.15*p1 + 0.7*p2 + 0.15*p3

submission.to_csv('../input/subsmiison.csv',index =False)