## Import Essential Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Download Dataset From Kaggle

In [2]:
!pip install -q kaggle

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"alirezabinayiaan","key":"6695753f607fbe237d61a47bae7a758b"}'}

In [4]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [5]:
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle datasets download gauravtopre/bank-customer-churn-dataset

Downloading bank-customer-churn-dataset.zip to /content
  0% 0.00/187k [00:00<?, ?B/s]
100% 187k/187k [00:00<00:00, 61.1MB/s]


In [7]:
!unzip -xq /content/bank-customer-churn-dataset.zip

## Read Dataset

In [8]:
data = pd.read_csv('/content/Bank Customer Churn Prediction.csv')
data.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
data.shape

(10000, 12)

In [10]:
data.isnull().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

## Separate Features and Targets

In [11]:
y = data.iloc[:,-1].values

In [12]:
y

array([1, 0, 1, ..., 1, 1, 0])

## PreProcessing

In [13]:
data['gender'].unique()

array(['Female', 'Male'], dtype=object)

### Drop not important Columns

In [14]:
data.drop(['customer_id', 'churn'], axis=1, inplace=True)

In [15]:
data.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


### Convert Categorical into Numeric

#### One-Hot Encoding

In [16]:
data['country'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [17]:
pd.get_dummies(data, columns=['country'], prefix='country')

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_France,country_Germany,country_Spain
0,619,Female,42,2,0.00,1,1,1,101348.88,1,0,0
1,608,Female,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,Female,42,8,159660.80,3,1,0,113931.57,1,0,0
3,699,Female,39,1,0.00,2,0,0,93826.63,1,0,0
4,850,Female,43,2,125510.82,1,1,1,79084.10,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,Male,39,5,0.00,2,1,0,96270.64,1,0,0
9996,516,Male,35,10,57369.61,1,1,1,101699.77,1,0,0
9997,709,Female,36,7,0.00,1,0,1,42085.58,1,0,0
9998,772,Male,42,3,75075.31,2,1,0,92888.52,0,1,0


In [18]:
data = pd.get_dummies(data, columns=['country'], prefix='country')

#### Label Encoding

In [19]:
data['gender'] = data['gender'].apply(lambda x: 1 if x == 'Male' else 0)

In [20]:
data.head()

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,country_France,country_Germany,country_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,1,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,1


In [21]:
x = data.values

In [22]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

sc = StandardScaler()
x = sc.fit_transform(x)

In [23]:
x

array([[-0.32622142, -1.09598752,  0.29351742, ...,  0.99720391,
        -0.57873591, -0.57380915],
       [-0.44003595, -1.09598752,  0.19816383, ..., -1.00280393,
        -0.57873591,  1.74273971],
       [-1.53679418, -1.09598752,  0.29351742, ...,  0.99720391,
        -0.57873591, -0.57380915],
       ...,
       [ 0.60498839, -1.09598752, -0.27860412, ...,  0.99720391,
        -0.57873591, -0.57380915],
       [ 1.25683526,  0.91241915,  0.29351742, ..., -1.00280393,
         1.72790383, -0.57380915],
       [ 1.46377078, -1.09598752, -1.04143285, ...,  0.99720391,
        -0.57873591, -0.57380915]])

In [24]:
print(x.shape)
print(y.shape)

(10000, 12)
(10000,)


### Split Dataset

$0.6$ train, $0.2$ validation, and $0.2$ test

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)

In [26]:
print(X_train.shape)
print(X_test.shape)

(6000, 12)
(4000, 12)


In [27]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [28]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)

(6000, 12)
(2000, 12)
(2000, 12)


### Model Selection

#### Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

In [30]:
lr.fit(X_train, y_train)

In [31]:
y_pred_train = lr.predict(X_train)
y_pred_val = lr.predict(X_val)

In [32]:
from sklearn.metrics import accuracy_score

print(f"Train accuracy: {accuracy_score(y_train, y_pred_train)}")
print(f"Val accuracy: {accuracy_score(y_val, y_pred_val)}")

Train accuracy: 0.8128333333333333
Val accuracy: 0.8165


#### Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier().fit(X_train, y_train)

In [34]:
y_pred_train = rfc.predict(X_train)
y_pred_val = rfc.predict(X_val)

In [35]:
print(f"Train accuracy: {accuracy_score(y_train, y_pred_train)}")
print(f"Val accuracy: {accuracy_score(y_val, y_pred_val)}")

Train accuracy: 1.0
Val accuracy: 0.862
