In [1]:
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Unpack the competition archive next to the notebook's data folder.
# The context manager guarantees the archive's file handle is closed even if
# extraction fails part-way through.
with zipfile.ZipFile('Data/playground-series-s4e1.zip', 'r') as archive:
    archive.extractall('Data/Bank_Customer_Churn_Data/')

In [3]:
# Load the extracted training split and preview its first ten rows.
train_csv = 'Data/Bank_Customer_Churn_Data/train.csv'
df = pd.read_csv(train_csv)
df.head(n=10)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0
5,5,15771669,Genovese,588,Germany,Male,36.0,4,131778.58,1,1.0,0.0,136024.31,1
6,6,15692819,Ch'ang,593,France,Female,30.0,8,144772.69,1,1.0,0.0,29792.11,0
7,7,15669611,Chukwuebuka,678,Spain,Male,37.0,1,138476.41,1,1.0,0.0,106851.6,0
8,8,15691707,Manna,676,France,Male,43.0,4,0.0,2,1.0,0.0,142917.13,0
9,9,15591721,Cattaneo,583,Germany,Male,40.0,4,81274.33,1,1.0,1.0,170843.07,0


In [4]:
# Column dtypes and non-null counts for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


In [5]:
# Summary statistics for the numeric columns of the training frame.
df.describe()

Unnamed: 0,id,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0
mean,82516.5,15692010.0,656.454373,38.125888,5.020353,55478.086689,1.554455,0.753954,0.49777,112574.822734,0.211599
std,47641.3565,71397.82,80.10334,8.867205,2.806159,62817.663278,0.547154,0.430707,0.499997,50292.865585,0.408443
min,0.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,41258.25,15633140.0,597.0,32.0,3.0,0.0,1.0,1.0,0.0,74637.57,0.0
50%,82516.5,15690170.0,659.0,37.0,5.0,0.0,2.0,1.0,0.0,117948.0,0.0
75%,123774.75,15756820.0,710.0,42.0,7.0,119939.5175,2.0,1.0,1.0,155152.4675,0.0
max,165033.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [6]:
# Class balance of the target column.
exited_counts = df['Exited'].value_counts()
exited_counts

Exited
0    130113
1     34921
Name: count, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import RobustScaler,FunctionTransformer,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix,get_scorer_names

In [8]:
# Partition the training frame by target value so the classes can be
# rebalanced independently.
target = df['Exited']
df_exited = df.loc[target.eq(1)]
df_not_exited = df.loc[target.eq(0)]

In [9]:
# Undersample the majority (non-churn) class to roughly match the churners.
# - Sample WITHOUT replacement: duplicated rows could otherwise end up on both
#   sides of the later train/test split and inflate the held-out scores.
# - Fix the seed so the rebalanced frame is identical on every run.
# - Use the public pd.concat API instead of the private DataFrame._append.
not_exited_sample = df_not_exited.sample(30000, replace=False, random_state=42)
dfe = pd.concat([df_exited, not_exited_sample])
dfe['Exited'].value_counts()

Exited
1    34921
0    30000
Name: count, dtype: int64

In [10]:
# Correlation of every numeric column with the target on the rebalanced frame.
corr_matrix = dfe.corr(numeric_only=True)
corr_matrix['Exited']

id                 0.002120
CustomerId        -0.014458
CreditScore       -0.032786
Age                0.390828
Tenure            -0.026836
Balance            0.158375
NumOfProducts     -0.236119
HasCrCard         -0.025405
IsActiveMember    -0.262532
EstimatedSalary    0.019674
Exited             1.000000
Name: Exited, dtype: float64

In [11]:
# Separate features from the target and hold out 20% for evaluation.
# The split is stratified on the target so both parts keep the same class
# ratio, and seeded so the reported metrics are reproducible across runs.
X = dfe.drop(columns='Exited')
y = dfe['Exited']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [12]:
def add_avg_balance_per_tenure(frame):
    """Return ``frame`` with an extra ``avg_balance_per_tenure`` column.

    The new column is Balance divided by Tenure, rounded to two decimals.
    A tiny epsilon is added to Tenure to avoid division by zero.

    NOTE(review): for rows with Tenure == 0 and a positive Balance the epsilon
    makes this ratio extremely large -- confirm that is the intended encoding.
    """
    ratio = frame['Balance'] / (frame['Tenure'] + 0.00000001)
    # `decimals` must be an argument of np.round; handing it to `assign`
    # would instead create a constant feature column named "decimals".
    return frame.assign(avg_balance_per_tenure=np.round(ratio, decimals=2))


# Column-wise preprocessing:
#   * identifier columns are dropped,
#   * Balance/Tenure pass through with the derived ratio appended,
#   * the two categorical columns are ordinal-encoded,
#   * the remaining numeric columns pass through unchanged.
CT = ColumnTransformer([
    ('drop', "drop", ['CustomerId', 'id', 'Surname']),
    ('add_avg_bal_tenure', FunctionTransformer(add_avg_balance_per_tenure, validate=False), ['Balance', 'Tenure']),
    ('cat_to_num', OrdinalEncoder(), ['Geography', 'Gender']),
    ('Pass', "passthrough", ['CreditScore', 'Age', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'])
])

In [13]:
# End-to-end model: column preprocessing -> robust scaling -> gradient boosting.
# `scoring` is the metric the booster monitors for early stopping.
# `random_state` is fixed because the booster's internal validation split
# (used for early stopping) is otherwise random, which makes fits and
# downstream metrics differ between runs.
pipe = Pipeline([
    ('Preprocessing', CT),
    ('Scaling', RobustScaler()),
    ('model', HistGradientBoostingClassifier(scoring='average_precision', random_state=42))
])

pipe.fit(X_train, y_train)

In [14]:
# Hard-label predictions on the held-out split, summarised per class.
preds = pipe.predict(X_test)
report = classification_report(y_true=y_test, y_pred=preds)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.79      0.79      6000
           1       0.82      0.82      0.82      6985

    accuracy                           0.81     12985
   macro avg       0.81      0.81      0.81     12985
weighted avg       0.81      0.81      0.81     12985



In [15]:
# Score the competition test split.
# Ranking metrics such as ROC AUC need a continuous score, so emit the
# predicted probability of the positive class (column 1) rather than 0/1
# labels, which would collapse every prediction onto two ranks.
# NOTE(review): confirm the competition's submission format expects
# probabilities for `Exited`.
test_df = pd.read_csv('Data/Bank_Customer_Churn_Data/test.csv')
fpreds = pipe.predict_proba(test_df)[:, 1]

In [16]:
# Column dtypes and non-null counts for the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110023 entries, 0 to 110022
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               110023 non-null  int64  
 1   CustomerId       110023 non-null  int64  
 2   Surname          110023 non-null  object 
 3   CreditScore      110023 non-null  int64  
 4   Geography        110023 non-null  object 
 5   Gender           110023 non-null  object 
 6   Age              110023 non-null  float64
 7   Tenure           110023 non-null  int64  
 8   Balance          110023 non-null  float64
 9   NumOfProducts    110023 non-null  int64  
 10  HasCrCard        110023 non-null  float64
 11  IsActiveMember   110023 non-null  float64
 12  EstimatedSalary  110023 non-null  float64
dtypes: float64(5), int64(5), object(3)
memory usage: 10.9+ MB


In [18]:
# Assemble the submission (one row per test id) and write it to disk.
# The output directory is created first so the write does not fail on a
# fresh checkout where `Predictions/...` does not exist yet.
submission_path = Path('Predictions/Bank_Customer_Churn_Dataset/submission_1.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

fdf = pd.DataFrame({'id': test_df['id'], 'Exited': fpreds}).set_index('id')
fdf.to_csv(submission_path)