## Dataset Description
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Bank Customer Churn Prediction dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance (Description from Kaggle)

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
import statsmodels.formula.api as sfa
import statsmodels.api as sma

# VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor 

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, RFECV
# Terminal --> pip install mlxtend

#from mlxtend.feature_selection
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, classification_report,roc_auc_score, roc_curve
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import classification_report, cohen_kappa_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.shape

(165034, 14)

In [4]:
test.shape

(110023, 13)

In [5]:
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


### Data preprocessing

In [6]:
# Do have to continue it
# Recording in USL 10th January
data = train.merge(test,how = 'outer')
data

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0.0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0.0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0.0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275052,275052,15662091,P'eng,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62,
275053,275053,15774133,Cox,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68,
275054,275054,15728456,Ch'iu,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38,
275055,275055,15687541,Yegorova,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58,


In [7]:
data.isnull().sum()

id                      0
CustomerId              0
Surname                 0
CreditScore             0
Geography               0
Gender                  0
Age                     0
Tenure                  0
Balance                 0
NumOfProducts           0
HasCrCard               0
IsActiveMember          0
EstimatedSalary         0
Exited             110023
dtype: int64

In [8]:
# Lets remove some unnecessary columns on the data
data.drop(columns = ['id', 'CustomerId', 'Surname'],inplace = True)

In [9]:
data.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [10]:
data.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0.0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0.0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0.0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0.0


In [11]:
# Lets encode categorical columns
data['Geography'].value_counts()

France     157386
Spain       60126
Germany     57545
Name: Geography, dtype: int64

In [12]:
data['Gender'].value_counts()

Male      155092
Female    119965
Name: Gender, dtype: int64

In [13]:
data['Geography'] = np.where((data['Geography']=='France'),0,1)

In [14]:
data['Geography'].value_counts()

0    157386
1    117671
Name: Geography, dtype: int64

In [15]:
data['Gender'] = np.where((data['Gender']=='Male'),0,1)

In [16]:
data['Gender'].value_counts()

0    155092
1    119965
Name: Gender, dtype: int64

In [17]:
# Dummy Encoding
# data = pd.get_dummies(data,drop_first= True,dtype=int)

In [18]:
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,0,0,33.0,3,0.00,2,1.0,0.0,181449.97,0.0
1,627,0,0,33.0,1,0.00,2,1.0,1.0,49503.50,0.0
2,678,0,0,40.0,10,0.00,2,1.0,0.0,184866.69,0.0
3,581,0,0,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0
4,716,1,0,33.0,5,0.00,2,1.0,1.0,15068.83,0.0
...,...,...,...,...,...,...,...,...,...,...,...
275052,570,1,0,29.0,7,116099.82,1,1.0,1.0,148087.62,
275053,575,0,1,36.0,4,178032.53,1,1.0,1.0,42181.68,
275054,712,0,0,31.0,2,0.00,2,1.0,0.0,16287.38,
275055,709,0,1,32.0,3,0.00,1,1.0,1.0,158816.58,


In [19]:
data['Tenure'].value_counts()

2     30392
7     29461
4     29361
8     28966
5     28796
1     27826
3     27773
9     27730
6     26426
10     9884
0      8442
Name: Tenure, dtype: int64

In [20]:
# Checking the corelation betwee num and cols and Y variable
point_biserial_corr, p_value = stats.pointbiserialr(data['CreditScore'], data['Exited'].notnull())

print("Point-Biserial Correlation:", point_biserial_corr)
print("P-value:", p_value)

Point-Biserial Correlation: -0.0004668536896516082
P-value: 0.8065769046040783


In [21]:
# So we will drop this column
#data.drop(columns=['CreditScore'],inplace=True)

In [22]:
point_biserial_corr, p_value = stats.pointbiserialr(data['Age'], data['Exited'].notnull())

print("Point-Biserial Correlation:", point_biserial_corr)
print("P-value:", p_value)
# We will drop this column too

Point-Biserial Correlation: 0.00020351573628611707
P-value: 0.9149991132746385


In [23]:
#data.drop(columns=['Age'],inplace=True)

In [24]:
point_biserial_corr, p_value = stats.pointbiserialr(data['Tenure'], data['Exited'].notnull())

print("Point-Biserial Correlation:", point_biserial_corr)
print("P-value:", p_value)
# We will drip this column too

Point-Biserial Correlation: 0.004140370872147737
P-value: 0.029896927603803372


In [25]:
#data.drop(columns=['Tenure'],inplace=True)

In [26]:
point_biserial_corr, p_value = stats.pointbiserialr(data['Balance'], data['Exited'].notnull())

print("Point-Biserial Correlation:", point_biserial_corr)
print("P-value:", p_value)
# We will drip this column too

Point-Biserial Correlation: 0.0011269368499709008
P-value: 0.554500712606028


In [27]:
#data.drop(columns=['Balance'],inplace=True)

In [28]:
import sklearn
print(sklearn.__version__)

1.3.2


In [29]:
pip install --upgrade scikit-learn

In [30]:
!pip install pingouin

In [31]:
import pingouin as pg


In [32]:
from scipy.stats import chi2_contingency
from scipy.stats import chi2_contingency

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))


conf_matrix = pd.crosstab(data['Exited'], data['Geography']).values
cramers_value = cramers_v(conf_matrix)

print(f"Cramer's V Value: {cramers_value}")

Cramer's V Value: 0.1307592983108886


In [33]:
from scipy.stats import chi2_contingency
from scipy.stats import chi2_contingency

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Example usage:
# Assuming 'cat_col1' and 'cat_col2' are your categorical columns in the DataFrame 'data'
conf_matrix = pd.crosstab(data['Exited'], data['Gender']).values
cramers_value = cramers_v(conf_matrix)

print(f"Cramer's V Value: {cramers_value}")

Cramer's V Value: 0.14640635014546477


In [34]:
from scipy.stats import chi2_contingency
from scipy.stats import chi2_contingency

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Example usage:
# Assuming 'cat_col1' and 'cat_col2' are your categorical columns in the DataFrame 'data'
conf_matrix = pd.crosstab(data['Exited'], data['NumOfProducts']).values
cramers_value = cramers_v(conf_matrix)

print(f"Cramer's V Value: {cramers_value}")

Cramer's V Value: 0.4201129333659976


In [35]:
from scipy.stats import chi2_contingency
from scipy.stats import chi2_contingency

def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Example usage:
# Assuming 'cat_col1' and 'cat_col2' are your categorical columns in the DataFrame 'data'
conf_matrix = pd.crosstab(data['Exited'], data['HasCrCard']).values
cramers_value = cramers_v(conf_matrix)

print(f"Cramer's V Value: {cramers_value}")
# We will drop this column

Cramer's V Value: 0.021986810188878585


In [36]:
def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Example usage:
# Assuming 'cat_col1' and 'cat_col2' are your categorical columns in the DataFrame 'data'
conf_matrix = pd.crosstab(data['Exited'], data['IsActiveMember']).values
cramers_value = cramers_v(conf_matrix)

print(f"Cramer's V Value: {cramers_value}")

Cramer's V Value: 0.21020841483931277


In [37]:
point_biserial_corr, p_value = stats.pointbiserialr(data['EstimatedSalary'], data['Exited'].notnull())

print("Point-Biserial Correlation:", point_biserial_corr)
print("P-value:", p_value)
# WE will drop this column too

Point-Biserial Correlation: 0.0025297888946729675
P-value: 0.18458608029254853


In [38]:
#data.drop(columns=['EstimatedSalary'],inplace =True)

In [39]:
data.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [40]:
#data.drop(columns=['HasCrCard'],inplace=True)

In [41]:
data.dtypes

CreditScore          int64
Geography            int32
Gender               int32
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited             float64
dtype: object

### Split the data

In [42]:
train_data = data[data['Exited'].notnull()]
pred_data = data[data['Exited'].isnull()]
train_data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,0,0,33.0,3,0.0,2,1.0,0.0,181449.97,0.0
1,627,0,0,33.0,1,0.0,2,1.0,1.0,49503.5,0.0
2,678,0,0,40.0,10,0.0,2,1.0,0.0,184866.69,0.0
3,581,0,0,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0
4,716,1,0,33.0,5,0.0,2,1.0,1.0,15068.83,0.0


In [43]:
x = train_data.drop(columns=['Exited'])
y = train_data['Exited']
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8,stratify=y, random_state=2)

### Modeling

In [44]:
def model_validation(model, trainx, trainy, testx, testy):
    m = model
    m.fit(trainx,trainy)
    ypred = m.predict(testx)
    print('confusion matrix:\n',confusion_matrix(testy,ypred))
    print('classification matrix:\n',classification_report(testy,ypred))
    prob = m.predict_proba(testx)[:,1]
    print('ROC AUC:\,', roc_auc_score(testy,prob))
    
    ans = input('do you want to savde this model y/n')
    if ans.lower() == 'y':
        with open('final_model.pkl','wb') as file:
            pickle.dump(m,file)
        print('Model has been saved')
    else:
        print('Not Saved')

In [45]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

In [46]:
model_validation(GaussianNB(),x_train,y_train,x_test,y_test)

confusion matrix:
 [[24852  1171]
 [ 5627  1357]]
classification matrix:
               precision    recall  f1-score   support

         0.0       0.82      0.96      0.88     26023
         1.0       0.54      0.19      0.29      6984

    accuracy                           0.79     33007
   macro avg       0.68      0.57      0.58     33007
weighted avg       0.76      0.79      0.75     33007

ROC AUC:\, 0.7654645475306252
do you want to savde this model y/nn
Not Saved


In [47]:
# Random forest
model_validation(RandomForestClassifier(n_estimators=200,max_depth=5),x_train,
               y_train,x_test, y_test)

confusion matrix:
 [[25258   765]
 [ 4178  2806]]
classification matrix:
               precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     26023
         1.0       0.79      0.40      0.53      6984

    accuracy                           0.85     33007
   macro avg       0.82      0.69      0.72     33007
weighted avg       0.84      0.85      0.83     33007

ROC AUC:\, 0.874675833616918
do you want to savde this model y/nn
Not Saved


In [48]:
# Ada Boost
model_validation(AdaBoostClassifier(n_estimators=200),
                x_train,y_train,x_test, y_test)

confusion matrix:
 [[24683  1340]
 [ 3453  3531]]
classification matrix:
               precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     26023
         1.0       0.72      0.51      0.60      6984

    accuracy                           0.85     33007
   macro avg       0.80      0.73      0.75     33007
weighted avg       0.85      0.85      0.84     33007

ROC AUC:\, 0.8728230361158615
do you want to savde this model y/nn
Not Saved


In [49]:
from xgboost import XGBClassifier

from sklearn.ensemble import GradientBoostingClassifier

# Xtreme Gradient Boosting
model_validation(XGBClassifier(n_estimators = 150, gamma = 2),
                x_train,y_train,x_test, y_test)

confusion matrix:
 [[24663  1360]
 [ 3240  3744]]
classification matrix:
               precision    recall  f1-score   support

         0.0       0.88      0.95      0.91     26023
         1.0       0.73      0.54      0.62      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.74      0.77     33007
weighted avg       0.85      0.86      0.85     33007

ROC AUC:\, 0.8823015554043984
do you want to savde this model y/nn
Not Saved


In [None]:
from sklearn.naive_bayes import GaussianNB
model_validation(BaggingClassifier(base_estimator=GaussianNB(),n_estimators=200),
                x_train,y_train,x_test, y_test)

confusion matrix:
 [[24851  1172]
 [ 5627  1357]]
classification matrix:
               precision    recall  f1-score   support

         0.0       0.82      0.95      0.88     26023
         1.0       0.54      0.19      0.29      6984

    accuracy                           0.79     33007
   macro avg       0.68      0.57      0.58     33007
weighted avg       0.76      0.79      0.75     33007

ROC AUC:\, 0.7654781985527914


In [None]:
# from sklearn.linear_model import LogisticRegression

# model_validation(LogisticRegression(),x_train,y_train,x_test,y_test)


In [None]:
# model_validation(DecisionTreeClassifier(max_depth=20,random_state=123),x_train,y_train,x_test, y_test)

In [None]:
# base_estimators = [('Decision Tree',DecisionTreeClassifier(max_depth=5)),
                  ('KNN',KNeighborsClassifier(n_neighbors=7)),
                  ('Random Forest',RandomForestClassifier(n_estimators=200,max_depth=5))]

In [None]:
# model_validation(StackingClassifier(estimators=base_estimators,final_estimator=DecisionTreeClassifier(max_depth=3)),
               x_train,y_train,x_test, y_test)                  

### Predictions

In [None]:
with open('final_model.pkl','rb') as file:
    final_model = pickle.load(file)
    

In [None]:
submission = pd.read_csv('sample_submission.csv')

In [None]:
submission.head(2)

In [None]:
submission['Exited'] = final_model.predict_proba(pred_data.drop(columns=['Exited']))[:,1]

In [None]:
submission

In [None]:
submission.to_csv('final_submission.csv',index=False)