In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier,StackingClassifier,VotingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report,roc_auc_score,confusion_matrix

import pickle

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp311-cp311-win_amd64.whl (101.0 MB)
                                              0.0/101.0 MB ? eta -:--:--
                                              0.2/101.0 MB 9.6 MB/s eta 0:00:11
                                              0.4/101.0 MB 4.9 MB/s eta 0:00:21
                                              0.6/101.0 MB 5.1 MB/s eta 0:00:20
                                              0.9/101.0 MB 5.6 MB/s eta 0:00:18
                                              1.1/101.0 MB 4.8 MB/s eta 0:00:21
                                              1.3/101.0 MB 5.0 MB/s eta 0:00:20
                                              1.6/101.0 MB 5.1 MB/s eta 0:00:20
                                              1.7/101.0 MB 4.9 MB/s eta 0:00:21
                                              2.0/101.0 MB 4.9 MB/s eta 0:00:21
                                              2.2/101.0 MB 4.8 MB/s eta 0:00:21
     -                                   

In [27]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [28]:
train.shape

(165034, 14)

In [29]:
test.shape

(110023, 13)

In [30]:
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


# Data Preprocessing

In [31]:
data = train.merge(test,how='outer')

In [32]:
data

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0.0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0.0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0.0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275052,275052,15662091,P'eng,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62,
275053,275053,15774133,Cox,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68,
275054,275054,15728456,Ch'iu,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38,
275055,275055,15687541,Yegorova,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58,


In [33]:
data.isnull().sum()

id                      0
CustomerId              0
Surname                 0
CreditScore             0
Geography               0
Gender                  0
Age                     0
Tenure                  0
Balance                 0
NumOfProducts           0
HasCrCard               0
IsActiveMember          0
EstimatedSalary         0
Exited             110023
dtype: int64

# Lets remove unnecessary columns from the data

In [11]:
data.drop(columns=['id','CustomerId','Surname'],inplace=True)

In [12]:
data.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

# Lets encode catagorical columns

In [13]:
data['Geography'].value_counts()

France     157386
Spain       60126
Germany     57545
Name: Geography, dtype: int64

In [14]:
data['Gender'].value_counts()

Male      155092
Female    119965
Name: Gender, dtype: int64

In [15]:
data = pd.get_dummies(data,drop_first=True,dtype=int)

In [16]:
data

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,668,33.0,3,0.00,2,1.0,0.0,181449.97,0.0,0,0,1
1,627,33.0,1,0.00,2,1.0,1.0,49503.50,0.0,0,0,1
2,678,40.0,10,0.00,2,1.0,0.0,184866.69,0.0,0,0,1
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0,0,0,1
4,716,33.0,5,0.00,2,1.0,1.0,15068.83,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
275052,570,29.0,7,116099.82,1,1.0,1.0,148087.62,,0,1,1
275053,575,36.0,4,178032.53,1,1.0,1.0,42181.68,,0,0,0
275054,712,31.0,2,0.00,2,1.0,0.0,16287.38,,0,0,1
275055,709,32.0,3,0.00,1,1.0,1.0,158816.58,,0,0,0


In [17]:
data.dtypes

CreditScore            int64
Age                  float64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard            float64
IsActiveMember       float64
EstimatedSalary      float64
Exited               float64
Geography_Germany      int32
Geography_Spain        int32
Gender_Male            int32
dtype: object

# Split the data

In [18]:
data_train = data[data['Exited'].notnull()]
pred_data = data[data['Exited'].isnull()]

In [19]:
data_train.shape

(165034, 12)

In [20]:
pred_data.shape

(110023, 12)

In [21]:
x = data_train.drop(columns=['Exited'])
y = data_train['Exited']

x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,stratify=y,random_state=2)

In [18]:
sc = StandardScaler()

In [19]:
data_s = sc.fit_transform(data)

In [20]:
data_s

array([[ 0.14360088, -0.57805598, -0.71658811, ..., -0.51435408,
        -0.52890982,  0.87949339],
       [-0.36769786, -0.57805598, -1.42930373, ..., -0.51435408,
        -0.52890982,  0.87949339],
       [ 0.26830789,  0.21157406,  1.77791658, ..., -0.51435408,
        -0.52890982,  0.87949339],
       ...,
       [ 0.69231172, -0.80366456, -1.07294592, ..., -0.51435408,
        -0.52890982,  0.87949339],
       [ 0.65489961, -0.69086027, -0.71658811, ..., -0.51435408,
        -0.52890982, -1.13701821],
       [-0.44252207, -0.12683881,  0.70884314, ..., -0.51435408,
        -0.52890982, -1.13701821]])

In [21]:
data_s = pd.DataFrame(data=data_s,columns=data.columns)

In [22]:
data_s

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,0.143601,-0.578056,-0.716588,-0.882407,0.816582,0.571824,-0.993531,1.371720,-0.518063,-0.514354,-0.528910,0.879493
1,-0.367698,-0.578056,-1.429304,-0.882407,0.816582,0.571824,1.006511,-1.252174,-0.518063,-0.514354,-0.528910,0.879493
2,0.268308,0.211574,1.777917,-0.882407,0.816582,0.571824,-0.993531,1.439665,-0.518063,-0.514354,-0.528910,0.879493
3,-0.941350,-0.465252,-1.072946,1.488114,-1.014326,0.571824,1.006511,-0.555021,-0.518063,-0.514354,-0.528910,0.879493
4,0.742195,-0.578056,-0.003872,-0.882407,0.816582,0.571824,1.006511,-1.936943,-0.518063,-0.514354,1.890681,0.879493
...,...,...,...,...,...,...,...,...,...,...,...,...
275052,-1.078528,-1.029273,0.708843,0.966145,-1.014326,0.571824,1.006511,0.708275,,-0.514354,1.890681,0.879493
275053,-1.016174,-0.239643,-0.360230,1.952243,-1.014326,0.571824,1.006511,-1.397776,,-0.514354,-0.528910,-1.137018
275054,0.692312,-0.803665,-1.072946,-0.882407,0.816582,0.571824,-0.993531,-1.912711,,-0.514354,-0.528910,0.879493
275055,0.654900,-0.690860,-0.716588,-0.882407,-1.014326,0.571824,1.006511,0.921631,,-0.514354,-0.528910,-1.137018


In [34]:
data_s.isnull().sum()

CreditScore               0
Age                       0
Tenure                    0
Balance                   0
NumOfProducts             0
HasCrCard                 0
IsActiveMember            0
EstimatedSalary           0
Exited               110023
Geography_Germany         0
Geography_Spain           0
Gender_Male               0
dtype: int64

In [22]:
data.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')

# Modelling

In [24]:
def model_validation(model,trainx,trainy,testx,testy):
    m = model
    m.fit(trainx,trainy)
    ypred = m.predict(testx)
    prob = m.predict_proba(testx)[:,1]
    print('Confusion Matrix:\n',confusion_matrix(testy,ypred))
    print('Classification Report:\n',classification_report(testy,ypred))
    print('ROC AUC:\n',roc_auc_score(testy,prob))
    
    ans = input('Do You want to save this model y/n?')
    if ans.lower() == 'y':
        with open('final_model.pkl','wb') as file:
            pickle.dump(m,file)
        print('Model has been saved')
    else:
        print('Not Saved')

In [44]:
# Random Forest

model_validation(RandomForestClassifier(n_estimators=200,max_depth=7),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[25073   950]
 [ 3677  3307]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.96      0.92     26023
         1.0       0.78      0.47      0.59      6984

    accuracy                           0.86     33007
   macro avg       0.82      0.72      0.75     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.8834679282301994
Do You want to save this model y/n?n
Not Saved


In [39]:
# AdaBoost

model_validation(AdaBoostClassifier(),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24716  1307]
 [ 3340  3644]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     26023
         1.0       0.74      0.52      0.61      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.74      0.76     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.8778408486914762
Do You want to save this model y/n?y
Model has been saved


In [63]:
model_validation(AdaBoostClassifier(n_estimators=100),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24700  1323]
 [ 3312  3672]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     26023
         1.0       0.74      0.53      0.61      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.74      0.76     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.8781232366741925
Do You want to save this model y/n?n
Not Saved


In [64]:
model_validation(AdaBoostClassifier(n_estimators=200),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24704  1319]
 [ 3307  3677]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     26023
         1.0       0.74      0.53      0.61      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.74      0.76     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.8782845811919221
Do You want to save this model y/n?n
Not Saved


In [37]:
# Naive Bayes

model_validation(GaussianNB(),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24848  1175]
 [ 5625  1359]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.95      0.88     26023
         1.0       0.54      0.19      0.29      6984

    accuracy                           0.79     33007
   macro avg       0.68      0.57      0.58     33007
weighted avg       0.76      0.79      0.75     33007

ROC AUC:
 0.7658872780352599
Do You want to save this model y/n?n
Not Saved


In [45]:
# XG Boost

model_validation(XGBClassifier(n_estimators=150,gamma=2),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24636  1387]
 [ 3130  3854]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26023
         1.0       0.74      0.55      0.63      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.86      0.86      0.86     33007

ROC AUC:
 0.8866857371611394
Do You want to save this model y/n?y
Model has been saved


In [60]:
model_validation(XGBClassifier(n_estimators=100,gamma=2),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24636  1387]
 [ 3130  3854]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26023
         1.0       0.74      0.55      0.63      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.86      0.86      0.86     33007

ROC AUC:
 0.8866857371611394
Do You want to save this model y/n?n
Not Saved


In [61]:
model_validation(XGBClassifier(n_estimators=100,gamma=1),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24628  1395]
 [ 3149  3835]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26023
         1.0       0.73      0.55      0.63      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.8856743592845153
Do You want to save this model y/n?n
Not Saved


In [62]:
model_validation(XGBClassifier(n_estimators=200,gamma=3),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24636  1387]
 [ 3135  3849]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26023
         1.0       0.74      0.55      0.63      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.85      0.86      0.86     33007

ROC AUC:
 0.886292127186458
Do You want to save this model y/n?n
Not Saved


In [46]:
model_validation(XGBClassifier(n_estimators=200,gamma=4),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24618  1405]
 [ 3113  3871]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26023
         1.0       0.73      0.55      0.63      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.86      0.86      0.86     33007

ROC AUC:
 0.8862025674573981
Do You want to save this model y/n?n
Not Saved


In [43]:
# Decision Tree

model_validation(DecisionTreeClassifier(max_depth=7),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24593  1430]
 [ 3173  3811]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.95      0.91     26023
         1.0       0.73      0.55      0.62      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.880251040922078
Do You want to save this model y/n?y
Model has been saved


In [66]:
# Logistic Regression

model_validation(LogisticRegression(),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[25103   920]
 [ 6083   901]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.80      0.96      0.88     26023
         1.0       0.49      0.13      0.20      6984

    accuracy                           0.79     33007
   macro avg       0.65      0.55      0.54     33007
weighted avg       0.74      0.79      0.74     33007

ROC AUC:
 0.712773984433279
Do You want to save this model y/n?n
Not Saved


In [None]:
# KNN 

model_validation(KNeighborsClassifier(),)

# Prediction

In [43]:
with open('final_model.pkl','rb') as file:
    final_model = pickle.load(file)

In [44]:
submission = pd.read_csv('sample_submission.csv')

In [45]:
submission['Exited'] = final_model.predict_proba(pred_data.drop(columns=['Exited']))[:,1]

In [46]:
submission

Unnamed: 0,id,Exited
0,165034,0.012560
1,165035,0.781761
2,165036,0.011974
3,165037,0.162802
4,165038,0.364718
...,...,...
110018,275052,0.022700
110019,275053,0.197446
110020,275054,0.010465
110021,275055,0.150469


In [47]:
submission.to_csv('final_submission.csv',index=False)

# Stacking

In [55]:
base_estimators = [('Naive Bayes',GaussianNB()),('Decision Tree',DecisionTreeClassifier(max_depth=5)),
                   ('KNN',KNeighborsClassifier(n_neighbors=7)),
                   ('Random Forest',RandomForestClassifier(n_estimators=200,max_depth=5))]

In [56]:
model_validation(StackingClassifier(estimators=base_estimators),x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24815  1208]
 [ 3440  3544]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     26023
         1.0       0.75      0.51      0.60      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.73      0.76     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.8795791476251139
Do You want to save this model y/n?n
Not Saved


In [58]:
model_validation(StackingClassifier(estimators=base_estimators,final_estimator=DecisionTreeClassifier(max_depth=3)),
                 x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[25290   733]
 [ 4100  2884]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     26023
         1.0       0.80      0.41      0.54      6984

    accuracy                           0.85     33007
   macro avg       0.83      0.69      0.73     33007
weighted avg       0.85      0.85      0.83     33007

ROC AUC:
 0.8690641493059339
Do You want to save this model y/n?n
Not Saved


In [59]:
model_validation(StackingClassifier(estimators=base_estimators,final_estimator=DecisionTreeClassifier(max_depth=5)),
                 x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24672  1351]
 [ 3304  3680]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     26023
         1.0       0.73      0.53      0.61      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.74      0.76     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:
 0.8789224129601803
Do You want to save this model y/n?n
Not Saved


In [25]:
# CatBoost

model_validation(CatBoostClassifier(),x_train,y_train,x_test,y_test)

Learning rate set to 0.082881
0:	learn: 0.6171130	total: 192ms	remaining: 3m 11s
1:	learn: 0.5563935	total: 244ms	remaining: 2m 1s
2:	learn: 0.5101120	total: 287ms	remaining: 1m 35s
3:	learn: 0.4743986	total: 321ms	remaining: 1m 19s
4:	learn: 0.4466009	total: 358ms	remaining: 1m 11s
5:	learn: 0.4238733	total: 388ms	remaining: 1m 4s
6:	learn: 0.4065048	total: 423ms	remaining: 1m
7:	learn: 0.3926699	total: 464ms	remaining: 57.5s
8:	learn: 0.3809121	total: 507ms	remaining: 55.9s
9:	learn: 0.3717958	total: 553ms	remaining: 54.8s
10:	learn: 0.3642043	total: 595ms	remaining: 53.5s
11:	learn: 0.3585027	total: 641ms	remaining: 52.8s
12:	learn: 0.3536171	total: 688ms	remaining: 52.2s
13:	learn: 0.3489980	total: 723ms	remaining: 51s
14:	learn: 0.3456091	total: 768ms	remaining: 50.4s
15:	learn: 0.3425076	total: 807ms	remaining: 49.7s
16:	learn: 0.3398894	total: 848ms	remaining: 49s
17:	learn: 0.3373575	total: 890ms	remaining: 48.6s
18:	learn: 0.3351092	total: 926ms	remaining: 47.8s
19:	learn: 0.3

162:	learn: 0.3137221	total: 5.68s	remaining: 29.2s
163:	learn: 0.3136726	total: 5.72s	remaining: 29.2s
164:	learn: 0.3136232	total: 5.75s	remaining: 29.1s
165:	learn: 0.3135907	total: 5.79s	remaining: 29.1s
166:	learn: 0.3135482	total: 5.84s	remaining: 29.1s
167:	learn: 0.3135043	total: 5.88s	remaining: 29.1s
168:	learn: 0.3134679	total: 5.91s	remaining: 29.1s
169:	learn: 0.3134246	total: 5.95s	remaining: 29s
170:	learn: 0.3133839	total: 5.98s	remaining: 29s
171:	learn: 0.3133451	total: 6.01s	remaining: 28.9s
172:	learn: 0.3132862	total: 6.04s	remaining: 28.9s
173:	learn: 0.3132461	total: 6.08s	remaining: 28.8s
174:	learn: 0.3131914	total: 6.11s	remaining: 28.8s
175:	learn: 0.3131572	total: 6.14s	remaining: 28.8s
176:	learn: 0.3131125	total: 6.18s	remaining: 28.7s
177:	learn: 0.3130742	total: 6.21s	remaining: 28.7s
178:	learn: 0.3130370	total: 6.25s	remaining: 28.6s
179:	learn: 0.3130045	total: 6.29s	remaining: 28.7s
180:	learn: 0.3129673	total: 6.34s	remaining: 28.7s
181:	learn: 0.31

325:	learn: 0.3078705	total: 11.7s	remaining: 24.1s
326:	learn: 0.3078451	total: 11.7s	remaining: 24.1s
327:	learn: 0.3078264	total: 11.7s	remaining: 24s
328:	learn: 0.3077952	total: 11.8s	remaining: 24s
329:	learn: 0.3077612	total: 11.8s	remaining: 23.9s
330:	learn: 0.3077378	total: 11.8s	remaining: 23.9s
331:	learn: 0.3077169	total: 11.9s	remaining: 23.8s
332:	learn: 0.3076850	total: 11.9s	remaining: 23.8s
333:	learn: 0.3076617	total: 11.9s	remaining: 23.8s
334:	learn: 0.3076268	total: 12s	remaining: 23.8s
335:	learn: 0.3075932	total: 12s	remaining: 23.7s
336:	learn: 0.3075718	total: 12s	remaining: 23.7s
337:	learn: 0.3075454	total: 12.1s	remaining: 23.6s
338:	learn: 0.3075104	total: 12.1s	remaining: 23.6s
339:	learn: 0.3074877	total: 12.2s	remaining: 23.6s
340:	learn: 0.3074620	total: 12.2s	remaining: 23.6s
341:	learn: 0.3074416	total: 12.2s	remaining: 23.5s
342:	learn: 0.3074261	total: 12.2s	remaining: 23.5s
343:	learn: 0.3073826	total: 12.3s	remaining: 23.4s
344:	learn: 0.3073557	

485:	learn: 0.3033974	total: 16.7s	remaining: 17.6s
486:	learn: 0.3033636	total: 16.7s	remaining: 17.6s
487:	learn: 0.3033331	total: 16.7s	remaining: 17.5s
488:	learn: 0.3033127	total: 16.8s	remaining: 17.5s
489:	learn: 0.3032811	total: 16.8s	remaining: 17.5s
490:	learn: 0.3032612	total: 16.8s	remaining: 17.4s
491:	learn: 0.3032322	total: 16.8s	remaining: 17.4s
492:	learn: 0.3031988	total: 16.9s	remaining: 17.4s
493:	learn: 0.3031734	total: 16.9s	remaining: 17.3s
494:	learn: 0.3031419	total: 16.9s	remaining: 17.3s
495:	learn: 0.3031175	total: 17s	remaining: 17.2s
496:	learn: 0.3030841	total: 17s	remaining: 17.2s
497:	learn: 0.3030619	total: 17s	remaining: 17.2s
498:	learn: 0.3030446	total: 17.1s	remaining: 17.1s
499:	learn: 0.3030126	total: 17.1s	remaining: 17.1s
500:	learn: 0.3029920	total: 17.1s	remaining: 17.1s
501:	learn: 0.3029620	total: 17.2s	remaining: 17s
502:	learn: 0.3029378	total: 17.2s	remaining: 17s
503:	learn: 0.3029012	total: 17.2s	remaining: 16.9s
504:	learn: 0.3028679	

644:	learn: 0.2993507	total: 21.5s	remaining: 11.8s
645:	learn: 0.2993295	total: 21.5s	remaining: 11.8s
646:	learn: 0.2993176	total: 21.5s	remaining: 11.7s
647:	learn: 0.2992869	total: 21.6s	remaining: 11.7s
648:	learn: 0.2992597	total: 21.6s	remaining: 11.7s
649:	learn: 0.2992421	total: 21.6s	remaining: 11.6s
650:	learn: 0.2992221	total: 21.6s	remaining: 11.6s
651:	learn: 0.2991923	total: 21.7s	remaining: 11.6s
652:	learn: 0.2991725	total: 21.7s	remaining: 11.5s
653:	learn: 0.2991524	total: 21.7s	remaining: 11.5s
654:	learn: 0.2991271	total: 21.8s	remaining: 11.5s
655:	learn: 0.2991039	total: 21.8s	remaining: 11.4s
656:	learn: 0.2990733	total: 21.8s	remaining: 11.4s
657:	learn: 0.2990569	total: 21.9s	remaining: 11.4s
658:	learn: 0.2990306	total: 21.9s	remaining: 11.3s
659:	learn: 0.2990008	total: 21.9s	remaining: 11.3s
660:	learn: 0.2989704	total: 22s	remaining: 11.3s
661:	learn: 0.2989413	total: 22s	remaining: 11.2s
662:	learn: 0.2989096	total: 22s	remaining: 11.2s
663:	learn: 0.2988

809:	learn: 0.2953983	total: 26.9s	remaining: 6.32s
810:	learn: 0.2953771	total: 27s	remaining: 6.29s
811:	learn: 0.2953569	total: 27s	remaining: 6.25s
812:	learn: 0.2953352	total: 27s	remaining: 6.22s
813:	learn: 0.2953113	total: 27.1s	remaining: 6.18s
814:	learn: 0.2952897	total: 27.1s	remaining: 6.15s
815:	learn: 0.2952672	total: 27.1s	remaining: 6.12s
816:	learn: 0.2952461	total: 27.2s	remaining: 6.09s
817:	learn: 0.2952187	total: 27.2s	remaining: 6.05s
818:	learn: 0.2951931	total: 27.2s	remaining: 6.02s
819:	learn: 0.2951686	total: 27.3s	remaining: 5.98s
820:	learn: 0.2951465	total: 27.3s	remaining: 5.95s
821:	learn: 0.2951289	total: 27.3s	remaining: 5.92s
822:	learn: 0.2951120	total: 27.4s	remaining: 5.88s
823:	learn: 0.2950939	total: 27.4s	remaining: 5.85s
824:	learn: 0.2950662	total: 27.4s	remaining: 5.82s
825:	learn: 0.2950394	total: 27.5s	remaining: 5.78s
826:	learn: 0.2950173	total: 27.5s	remaining: 5.75s
827:	learn: 0.2949875	total: 27.5s	remaining: 5.72s
828:	learn: 0.2949

970:	learn: 0.2918022	total: 32.6s	remaining: 972ms
971:	learn: 0.2917849	total: 32.6s	remaining: 939ms
972:	learn: 0.2917687	total: 32.6s	remaining: 905ms
973:	learn: 0.2917482	total: 32.6s	remaining: 871ms
974:	learn: 0.2917181	total: 32.7s	remaining: 838ms
975:	learn: 0.2916907	total: 32.7s	remaining: 804ms
976:	learn: 0.2916734	total: 32.7s	remaining: 771ms
977:	learn: 0.2916487	total: 32.8s	remaining: 737ms
978:	learn: 0.2916318	total: 32.8s	remaining: 704ms
979:	learn: 0.2916039	total: 32.8s	remaining: 670ms
980:	learn: 0.2915857	total: 32.9s	remaining: 636ms
981:	learn: 0.2915643	total: 32.9s	remaining: 603ms
982:	learn: 0.2915408	total: 32.9s	remaining: 569ms
983:	learn: 0.2915186	total: 33s	remaining: 536ms
984:	learn: 0.2914990	total: 33s	remaining: 502ms
985:	learn: 0.2914781	total: 33s	remaining: 469ms
986:	learn: 0.2914476	total: 33s	remaining: 435ms
987:	learn: 0.2914162	total: 33.1s	remaining: 402ms
988:	learn: 0.2913974	total: 33.1s	remaining: 368ms
989:	learn: 0.291379

In [26]:
data

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,668,33.0,3,0.00,2,1.0,0.0,181449.97,0.0,0,0,1
1,627,33.0,1,0.00,2,1.0,1.0,49503.50,0.0,0,0,1
2,678,40.0,10,0.00,2,1.0,0.0,184866.69,0.0,0,0,1
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0,0,0,1
4,716,33.0,5,0.00,2,1.0,1.0,15068.83,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
275052,570,29.0,7,116099.82,1,1.0,1.0,148087.62,,0,1,1
275053,575,36.0,4,178032.53,1,1.0,1.0,42181.68,,0,0,0
275054,712,31.0,2,0.00,2,1.0,0.0,16287.38,,0,0,1
275055,709,32.0,3,0.00,1,1.0,1.0,158816.58,,0,0,0


In [34]:
data.drop(columns=['id'],inplace=True)

In [38]:
data_trains = data[data['Exited'].notnull()]
preds_data = data[data['Exited'].isnull()]

In [53]:
model_validation(CatBoostClassifier(cat_features=['Surname','Geography','Gender'],verbose=0,n_estimators=100,max_depth=7),\
                                   x_train,y_train,x_test,y_test)

Confusion Matrix:
 [[24663  1360]
 [ 3082  3902]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26023
         1.0       0.74      0.56      0.64      6984

    accuracy                           0.87     33007
   macro avg       0.82      0.75      0.78     33007
weighted avg       0.86      0.87      0.86     33007

ROC AUC:
 0.890562374353923
Do You want to save this model y/n?y
Model has been saved


In [54]:
with open('final_model.pkl','rb') as file:
    final_model = pickle.load(file)

In [55]:
submission = pd.read_csv('sample_submission.csv')

In [56]:
submission['Exited'] = final_model.predict_proba(pred_data.drop(columns=['Exited']))[:,1]

In [57]:
submission

Unnamed: 0,id,Exited
0,165034,0.012560
1,165035,0.781761
2,165036,0.011974
3,165037,0.162802
4,165038,0.364718
...,...,...
110018,275052,0.022700
110019,275053,0.197446
110020,275054,0.010465
110021,275055,0.150469


In [58]:
submission.to_csv('file_end.csv',index=False)