# importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("seaborn-pastel")
sns.set_theme(color_codes=True)

# importing training set


In [36]:
df = pd.read_csv("train_s3TEQDk.csv", index_col=0)

In [38]:
# filling the na values

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245725 entries, NNVBBKZB to BOCZSWLJ
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Gender               245725 non-null  object
 1   Age                  245725 non-null  int64 
 2   Region_Code          245725 non-null  object
 3   Occupation           245725 non-null  object
 4   Channel_Code         245725 non-null  object
 5   Vintage              245725 non-null  int64 
 6   Credit_Product       245725 non-null  object
 7   Avg_Account_Balance  245725 non-null  int64 
 8   Is_Active            245725 non-null  object
 9   Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 20.6+ MB


In [68]:
df['Credit_Product'] = df['Credit_Product'].fillna(df['Is_Lead'].map({0:'Yes', 1:'No'}))

#mapping missing values according to the given condition -> yes if are not a lead, no if they are a lead

In [69]:
df.sample(5, random_state=12)

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RAV4LDAM,Female,53,RG284,Self_Employed,X3,122,No,1664798,Yes,0
DQXAJXVH,Female,31,RG261,Salaried,X1,32,No,853243,Yes,0
79YEIEVZ,Female,39,RG268,Self_Employed,X2,37,Yes,1335534,No,0
GLRTCYA5,Female,50,RG268,Self_Employed,X3,93,No,344038,No,1
EZ7DJWVK,Male,26,RG270,Self_Employed,X1,15,No,587524,No,0


In [41]:
x = df.drop("Is_Lead", axis=1).copy()
y = df.loc[:,"Is_Lead"].copy()

In [42]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42, stratify=y)

In [70]:
# first we need to seperate out the numerical and categorical attributes

def num_cat(x_train):
    #x_train = data_clean(x_train)
    x_train_num = x_train.loc[:,["Age", "Vintage", "Avg_Account_Balance"]]
    x_train_cat = x_train.drop(["Age", "Vintage", "Avg_Account_Balance"], axis=1)
    num_attr = list(x_train_num)
    cat_attr = list(x_train_cat)
    return x_train_num, x_train_cat, num_attr, cat_attr



In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

x_train_num, x_train_cat, num_attributes, cat_attributes = num_cat(x_train)


pipe = ColumnTransformer([
    ("num", StandardScaler(), num_attributes),
    ("cat", OneHotEncoder(), cat_attributes)
])

In [72]:
x_train_prepared = pipe.fit_transform(x_train)

In [76]:
x_test_prepared = pipe.transform(x_test)

# xgboost

In [73]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

In [77]:
xgb.fit(x_train_prepared, y_train, eval_set=[(x_test_prepared, y_test)], early_stopping_rounds=3)



[0]	validation_0-logloss:0.58665
[1]	validation_0-logloss:0.52999
[2]	validation_0-logloss:0.49715
[3]	validation_0-logloss:0.47676
[4]	validation_0-logloss:0.46382
[5]	validation_0-logloss:0.45593
[6]	validation_0-logloss:0.45047
[7]	validation_0-logloss:0.44689
[8]	validation_0-logloss:0.44505
[9]	validation_0-logloss:0.44341
[10]	validation_0-logloss:0.44244
[11]	validation_0-logloss:0.44194
[12]	validation_0-logloss:0.44132
[13]	validation_0-logloss:0.44127
[14]	validation_0-logloss:0.44106
[15]	validation_0-logloss:0.44036
[16]	validation_0-logloss:0.44018
[17]	validation_0-logloss:0.44008
[18]	validation_0-logloss:0.43968
[19]	validation_0-logloss:0.43960
[20]	validation_0-logloss:0.43965
[21]	validation_0-logloss:0.43966
[22]	validation_0-logloss:0.43962


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [78]:
y_proba = xgb.predict_proba(x_train_prepared)
y_scores = y_proba[:,1]

from sklearn.metrics import roc_auc_score
print("roc score = ", roc_auc_score(y_train, y_scores))

roc score =  0.794365316139049


# eval on self made test set

In [51]:
x_test_prepared = pipe.transform(x_test)


In [52]:
y_proba = xgb.predict_proba(x_test_prepared)

In [53]:
y_test_scores = y_proba[:, 1]
roc_auc_score(y_test, y_test_scores)

0.7842609947750949

# importing the test set 

In [79]:
df_test = pd.read_csv("test_mSzZ8RL.csv", index_col=0)

In [90]:
df_test['Credit_Product'] = df_test['Credit_Product'].fillna("No")

In [91]:
df_test.isna().any()

Gender                 False
Age                    False
Region_Code            False
Occupation             False
Channel_Code           False
Vintage                False
Credit_Product         False
Avg_Account_Balance    False
Is_Active              False
dtype: bool

In [92]:
df_test_prepared = pipe.transform(df_test)

In [93]:
df_test_proba = xgb.predict_proba(df_test_prepared)

In [94]:
df_test_score = df_test_proba[:,1]

In [97]:
test_xgb_nb2 = pd.DataFrame(df_test_score, index=df_test.index, columns=["Is_Lead"])

In [99]:
test_xgb_nb2.to_csv("test_xgb_nb2.csv")

# rf

In [103]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()


In [104]:
rf.fit(x_train_prepared, y_train)

RandomForestClassifier()

In [105]:
from sklearn.model_selection import cross_val_predict

y_train_score = cross_val_predict(rf, x_train_prepared, y_train, cv=3, method='predict_proba')

y_train_scores_forest = y_train_score[:,1]
roc_auc_score(y_train, y_train_scores_forest)

0.7526642433602679

# gradient boosting

In [107]:
from sklearn.ensemble import GradientBoostingClassifier 

gb = GradientBoostingClassifier()

In [108]:
gb.fit(x_train_prepared, y_train)

GradientBoostingClassifier()

In [111]:
y_proba = gb.predict_proba(x_test_prepared)

In [112]:
y_test_scores = y_proba[:, 1]
roc_auc_score(y_test, y_test_scores)

0.776871394675699

## HP tuning

In [126]:
from sklearn.model_selection import GridSearchCV


param_test1 = {'n_estimators':range(80,150,10),
              'max_depth' : [6, 8, 12],
              'learning_rate' : [0.2, 0.3, 0.4]}

gs_gb = GridSearchCV(estimator = GradientBoostingClassifier(min_samples_split=500,
                                                    min_samples_leaf=50,
                                                    max_features='sqrt',
                                                    subsample=0.8,
                                                    random_state=42),
             param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=3, verbose=True)


In [127]:
gs_gb.fit(x_train_prepared, y_train)

Fitting 3 folds for each of 63 candidates, totalling 189 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 189 out of 189 | elapsed: 47.2min finished


GridSearchCV(cv=3,
             estimator=GradientBoostingClassifier(max_features='sqrt',
                                                  min_samples_leaf=50,
                                                  min_samples_split=500,
                                                  random_state=42,
                                                  subsample=0.8),
             iid=False, n_jobs=-1,
             param_grid={'learning_rate': [0.2, 0.3, 0.4],
                         'max_depth': [6, 8, 12],
                         'n_estimators': range(80, 150, 10)},
             scoring='roc_auc', verbose=True)

In [128]:
gs_gb.best_params_, gs_gb.best_score_


({'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 80},
 0.7869663056727889)

({'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 80},
 0.7863558608643654)

In [129]:
gs_gb.best_estimator_

GradientBoostingClassifier(learning_rate=0.2, max_depth=8, max_features='sqrt',
                           min_samples_leaf=50, min_samples_split=500,
                           n_estimators=80, random_state=42, subsample=0.8)

In [130]:
gs = GradientBoostingClassifier(learning_rate=0.2, max_depth=8, max_features='sqrt',
                           min_samples_leaf=50, min_samples_split=500,
                           n_estimators=80, random_state=42, subsample=0.8)

In [131]:
gs.fit(x_train_prepared, y_train)

GradientBoostingClassifier(learning_rate=0.2, max_depth=8, max_features='sqrt',
                           min_samples_leaf=50, min_samples_split=500,
                           n_estimators=80, random_state=42, subsample=0.8)

In [132]:
y_proba = gb.predict_proba(x_test_prepared)

In [133]:
y_test_scores = y_proba[:, 1]
roc_auc_score(y_test, y_test_scores)

0.776871394675699