In [32]:
!ls


AV contest.ipynb         test_rf.csv              test_xgb_hp_tuned_v2.csv
test_ada.csv             test_xgb.csv             train_s3TEQDk.csv
test_mSzZ8RL.csv         test_xgb_hp_tuned.csv


# importing necessary libraries

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn-pastel")
sns.set_theme(color_codes=True)


# importing the training data and initial insights

In [58]:
df = pd.read_csv("train_s3TEQDk.csv", index_col=0)

In [59]:
df.head(5)

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [60]:
df.isna().any()

Gender                 False
Age                    False
Region_Code            False
Occupation             False
Channel_Code           False
Vintage                False
Credit_Product          True
Avg_Account_Balance    False
Is_Active              False
Is_Lead                False
dtype: bool

In [61]:
df.describe()

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Is_Lead
count,245725.0,245725.0,245725.0,245725.0
mean,43.856307,46.959141,1128403.0,0.237208
std,14.828672,32.353136,852936.4,0.425372
min,23.0,7.0,20790.0,0.0
25%,30.0,20.0,604310.0,0.0
50%,43.0,32.0,894601.0,0.0
75%,54.0,73.0,1366666.0,0.0
max,85.0,135.0,10352010.0,1.0


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245725 entries, NNVBBKZB to BOCZSWLJ
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Gender               245725 non-null  object
 1   Age                  245725 non-null  int64 
 2   Region_Code          245725 non-null  object
 3   Occupation           245725 non-null  object
 4   Channel_Code         245725 non-null  object
 5   Vintage              245725 non-null  int64 
 6   Credit_Product       216400 non-null  object
 7   Avg_Account_Balance  245725 non-null  int64 
 8   Is_Active            245725 non-null  object
 9   Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 20.6+ MB


In [63]:
df.loc[:,"Credit_Product"].isnull().sum()

29325

In [64]:
(29325/245725)*100 #12% of the values are null

11.9340726421813

In [65]:
df[df.loc[:,"Credit_Product"] == "No"].loc[:,"Credit_Product"].count()

144357

In [66]:
df[df.loc[:,"Credit_Product"] == "Yes"].loc[:,"Credit_Product"].count()

72043

In [67]:
72043/(72043+144357) #33% values are yes, rest are no

0.3329158964879852

In [68]:
df.loc[:,"Is_Lead"].value_counts()

0    187437
1     58288
Name: Is_Lead, dtype: int64

In [69]:
58288/(58288+187437) #only about 24% data have 1 as a lead

0.23720826126767727

# data cleaning

In [70]:
def data_clean(df):
    df.drop("Credit_Product", axis=1, inplace=True) #approach 1, trying to get rid of the NaN attribute
    df.drop("Gender", axis=1, inplace=True) #dropping the gender column to avoid any gender bias which shouldn't happen
    return df

In [71]:
y = df.loc[:,"Is_Lead"].copy()

In [72]:
x = df.drop("Is_Lead", axis=1).copy()

In [73]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42, stratify=y)

# data splitted into train and test set with stratification

In [74]:
y_train.value_counts()

0    149950
1     46630
Name: Is_Lead, dtype: int64

In [75]:
46630/(46630+149950)

0.23720622647268289

In [76]:
y_test.value_counts()

0    37487
1    11658
Name: Is_Lead, dtype: int64

In [77]:
11658/(11658+37487) #ratio preserved in train and test dataset

0.2372164004476549

# creating pipelines for numerical and categorical attributes


In [78]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 196580 entries, 2QOUTFDT to NQKSRNEP
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Gender               196580 non-null  object
 1   Age                  196580 non-null  int64 
 2   Region_Code          196580 non-null  object
 3   Occupation           196580 non-null  object
 4   Channel_Code         196580 non-null  object
 5   Vintage              196580 non-null  int64 
 6   Credit_Product       173158 non-null  object
 7   Avg_Account_Balance  196580 non-null  int64 
 8   Is_Active            196580 non-null  object
dtypes: int64(3), object(6)
memory usage: 15.0+ MB


In [79]:
# first we need to seperate out the numerical and categorical attributes

def num_cat(x_train):
    x_train = data_clean(x_train)
    x_train_num = x_train.loc[:,["Age", "Vintage", "Avg_Account_Balance"]]
    x_train_cat = x_train.drop(["Age", "Vintage", "Avg_Account_Balance"], axis=1)
    num_attr = list(x_train_num)
    cat_attr = list(x_train_cat)
    return x_train_num, x_train_cat, num_attr, cat_attr


    

In [80]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

x_train_num, x_train_cat, num_attributes, cat_attributes = num_cat(x_train)

#x_test_num, x_test_cat = num_cat(x_test)

pipe = ColumnTransformer([
    ("num", StandardScaler(), num_attributes),
    ("cat", OneHotEncoder(), cat_attributes)
])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [81]:
x_train_prepared = pipe.fit_transform(x_train)

# RF

In [1]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, )
#rf.fit(x_train_prepared, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict

y_train_score = cross_val_predict(rf, x_train_prepared, y_train, cv=3, method='predict_proba')

In [None]:
from sklearn.metrics import roc_auc_score

y_train_scores_forest = y_train_score[:,1]
roc_auc_score(y_train, y_train_scores_forest)

# xgboost

In [151]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

In [153]:
xgb.fit(x_train_prepared, y_train, eval_set=[(x_train_prepared, y_train)], early_stopping_rounds=3)



[0]	validation_0-logloss:0.58739
[1]	validation_0-logloss:0.53107
[2]	validation_0-logloss:0.49852
[3]	validation_0-logloss:0.47851
[4]	validation_0-logloss:0.46609
[5]	validation_0-logloss:0.45826
[6]	validation_0-logloss:0.45325
[7]	validation_0-logloss:0.45031
[8]	validation_0-logloss:0.44824
[9]	validation_0-logloss:0.44697
[10]	validation_0-logloss:0.44587
[11]	validation_0-logloss:0.44519
[12]	validation_0-logloss:0.44464
[13]	validation_0-logloss:0.44408
[14]	validation_0-logloss:0.44351
[15]	validation_0-logloss:0.44318
[16]	validation_0-logloss:0.44274
[17]	validation_0-logloss:0.44244
[18]	validation_0-logloss:0.44211
[19]	validation_0-logloss:0.44172
[20]	validation_0-logloss:0.44140
[21]	validation_0-logloss:0.44109
[22]	validation_0-logloss:0.44078
[23]	validation_0-logloss:0.44020
[24]	validation_0-logloss:0.43997
[25]	validation_0-logloss:0.43975
[26]	validation_0-logloss:0.43950
[27]	validation_0-logloss:0.43928
[28]	validation_0-logloss:0.43896
[29]	validation_0-loglos

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [154]:
y_proba = xgb.predict_proba(x_train_prepared)
y_scores = y_proba[:,1]

In [155]:
roc_auc_score(y_train, y_scores)

0.8033035860763367

# adaboost

In [165]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=200, learning_rate=0.7, random_state=19)


In [167]:
ada.fit(x_train_prepared, y_train)


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.7, n_estimators=200, random_state=19)

In [171]:
y_proba = ada.predict_proba(x_train_prepared)
y_scores = y_proba[:,1]

In [172]:
roc_auc_score(y_train, y_scores)

0.7804308974390419

# trying on self made test set - RF

In [134]:
x_test_prepared = pipe.transform(x_test)


In [135]:
y_proba = rf.predict_proba(x_test_prepared)

In [138]:
y_test_scores = y_proba[:, 1]
roc_auc_score(y_test, y_test_scores)

0.7584144020959461

# importing test set and predicting the probabilities

In [82]:
df_test = pd.read_csv("test_mSzZ8RL.csv", index_col=0)

In [83]:
df_test

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No
...,...,...,...,...,...,...,...,...,...
DBENJOYI,Male,52,RG268,Salaried,X2,86,Yes,4242558,Yes
CWQ72DWS,Male,55,RG277,Other,X2,86,Yes,1159153,No
HDESC8GU,Male,35,RG254,Salaried,X4,15,No,1703727,No
2PW4SFCA,Male,53,RG254,Other,X3,93,No,737178,Yes


In [84]:
df_test_prepared = pipe.transform(df_test)



In [85]:
df_test_prepared

<105312x48 sparse matrix of type '<class 'numpy.float64'>'
	with 737184 stored elements in Compressed Sparse Row format>

In [86]:
df_test_proba = rf.predict_proba(df_test_prepared)

NameError: name 'rf' is not defined

In [87]:
df_test_proba = df_test_proba[:,1]

NameError: name 'df_test_proba' is not defined

In [145]:
df_test_proba

array([0.07918432, 0.29171071, 0.08702983, ..., 0.25079674, 0.38794865,
       0.08641188])

In [147]:
!ls

AV contest.ipynb  test_mSzZ8RL.csv  test_rf.csv       train_s3TEQDk.csv


In [149]:
test_rf = pd.DataFrame(df_test_proba, index=df_test.index)

In [150]:
test_rf.to_csv("test_rf.csv")

# xgboost test set eval


In [157]:
xgb_test_proba = xgb.predict_proba(df_test_prepared)

xgb_proba = xgb_test_proba[:,1]


In [159]:
test_xgb = pd.DataFrame(xgb_proba, index=df_test.index, columns=["Is_Lead"])

In [161]:
test_xgb.to_csv("test_xgb.csv")

In [177]:
!ls

AV contest.ipynb  test_mSzZ8RL.csv  test_xgb.csv
test_ada.csv      test_rf.csv       train_s3TEQDk.csv


# adaboost test set eval

In [173]:
ada_test_proba = ada.predict_proba(df_test_prepared)

ada_proba = ada_test_proba[:,1]


In [174]:
test_ada = pd.DataFrame(ada_proba, index=df_test.index, columns=["Is_Lead"])

In [176]:
test_ada.to_csv("test_ada.csv")

# HP tuning xgb_rf

In [178]:
import xgboost

In [181]:
xgb_rf = xgboost.XGBRFClassifier()

In [182]:
#parameters

params = {
    "learning_rate" : [0.05, 0.1, 0.3, 0.5, 0.7, 1],
    "max_depth" : [1,2,4,6,8,10],
    "min_child_weight" : [1,3,5,7],
    "gamma" : [0.0, 0.1, 0.2, 0.5, 0.7],
    "colsample_bynode" : [0.3, 0.5, 0.8, 1]
}

In [202]:
from sklearn.model_selection import RandomizedSearchCV

xgb_randomsearch = RandomizedSearchCV(xgb, param_distributions=params, n_iter=15, scoring="roc_auc", n_jobs=-1, cv=5, verbose=3)

In [203]:
xgb_randomsearch.fit(x_train_prepared, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  8.1min finished




RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           gpu_id=-1, importance_type='gain',
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_delta_step=0, max_depth=6,
                                           min_child_weight=1, missing=nan,
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=4,
                                           num_pa...e=0,
                                           reg_alpha=0, reg_lambda=1,
                                           scale_pos_weight=1,

In [206]:
xgb_randomsearch.best_params_ #for xgb, not xgb_rf

{'min_child_weight': 1,
 'max_depth': 6,
 'learning_rate': 0.1,
 'gamma': 0.2,
 'colsample_bynode': 0.3}

In [194]:
xgb_rf = xgboost.XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=0.5, colsample_bytree=1, gamma=0.7, gpu_id=-1,
                importance_type='gain', interaction_constraints='',
                learning_rate=0.05, max_delta_step=0, max_depth=8,
                min_child_weight=7, monotone_constraints='()',
                n_estimators=100, n_jobs=4, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact', validate_parameters=1,
                verbosity=None)

In [195]:
xgb_rf.fit(x_train_prepared, y_train)





XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=0.5, colsample_bytree=1, gamma=0.7, gpu_id=-1,
                importance_type='gain', interaction_constraints='',
                learning_rate=0.05, max_delta_step=0, max_depth=8,
                min_child_weight=7, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=4, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact', validate_parameters=1,
                verbosity=None)

In [198]:
y_proba = xgb_rf.predict_proba(x_test_prepared)
y_scores = y_proba[:,1]


In [200]:
roc_auc_score(y_test, y_scores)

0.7743181941318544

# HP tuning xgb

In [207]:
xgb_randomsearch.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.3, colsample_bytree=1, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [208]:
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.3, colsample_bytree=1, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=np.nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [209]:
xgb.fit(x_train_prepared, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.3, colsample_bytree=1, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [211]:
y_proba = xgb.predict_proba(x_train_prepared)
y_scores = y_proba[:,1]

In [212]:
roc_auc_score(y_train, y_scores)

0.786685017959736

# xgb test set eval, hp tuned

In [213]:
xgb_test_proba = xgb.predict_proba(df_test_prepared)

xgb_proba = xgb_test_proba[:,1]


In [214]:
test_xgb_hp_tuned = pd.DataFrame(xgb_proba, index=df_test.index, columns=["Is_Lead"])

In [215]:
test_xgb_hp_tuned.to_csv("test_xgb_hp_tuned.csv")

In [216]:
!ls

AV contest.ipynb      test_rf.csv           train_s3TEQDk.csv
test_ada.csv          test_xgb.csv
test_mSzZ8RL.csv      test_xgb_hp_tuned.csv


# xgb another try

In [218]:
xgb = XGBClassifier(max_depth=12,
                      n_estimators=250,
                      min_child_weight=8, 
                      subsample=0.8, 
                      learning_rate =0.02,    
                      seed=42)

In [219]:
xgb.fit(x_train_prepared, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=12,
              min_child_weight=8, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=4, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=0.8, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [220]:
y_proba = xgb.predict_proba(x_train_prepared)
y_scores = y_proba[:,1]

In [222]:
roc_auc_score(y_train, y_scores)

0.8166584232774139

# xgb another try - test set eval

In [223]:
xgb_test_proba = xgb.predict_proba(df_test_prepared)

xgb_proba = xgb_test_proba[:,1]

In [224]:
test_xgb_hp_tuned = pd.DataFrame(xgb_proba, index=df_test.index, columns=["Is_Lead"])

In [225]:
test_xgb_hp_tuned.to_csv("test_xgb_hp_tuned_v2.csv")

# voting ensemble

In [108]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

dt = DecisionTreeClassifier()
svm_clf = SVC(probability=True)
   
voting = VotingClassifier(estimators=[('dt', dt), ('svm', svm_clf)], voting='soft')

In [None]:
voting.fit(x_train_prepared, y_train)