In [1]:
# data manipulation
import numpy as np
import pandas as pd

# from feature-engine
from feature_engine.encoding import (
    OrdinalEncoder,
    OneHotEncoder,
)

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Pickle for data preprocessing
import pickle

In [2]:
df= pd.read_csv(r"C:\Users\furka\JupyterNotebookProjects\End-to-End-Churn-Modelling-Binary-Classification\Churn_Modelling.csv")

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
## Drop the unnecessary columns
df=df.drop(['RowNumber','CustomerId','Surname'],axis=1)
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [5]:
# Drop the null values.
# Since the number of missing data is low, we can directly delete the missing data instead of filling it in. 
# Because it will not affect the model. We can use dropna() for this.
df = df.dropna()
# Dropping the duplicate values
df = df.drop_duplicates()

print('Rows containing missing data and duplicate data were deleted.')

Rows containing missing data and duplicate data were deleted.


In [6]:
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [7]:
# Separate into train and test set
# Remember to set the seed (random_state for this sklearn function)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['Exited'], axis=1), # predictive variables
    df['Exited'], # target
    test_size=0.2, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

print('The dataset separate dataset into train and test successfully')
X_train.shape, X_test.shape

The dataset separate dataset into train and test successfully


((8000, 10), (2000, 10))

- Label Encoding for 'Gender' column

In [8]:
label_encoder = OrdinalEncoder(encoding_method='arbitrary', variables='Gender')

# fit the encoder
label_encoder.fit(X_train)

# mappings are stored and class can be saved
label_encoder.encoder_dict_

{'Gender': {'Female': 0, 'Male': 1}}

In [9]:
X_train = label_encoder.transform(X_train)
X_test = label_encoder.transform(X_test)

In [10]:
X_train

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
7389,667,Spain,0,34,5,0.00,2,1,0,163830.64
9275,427,Germany,1,42,1,75681.52,1,1,1,57098.00
2995,535,France,0,29,2,112367.34,1,1,0,185630.76
5316,654,Spain,1,40,5,105683.63,1,1,0,173617.09
356,850,Spain,0,57,8,126776.30,2,1,1,132298.49
...,...,...,...,...,...,...,...,...,...,...
9225,594,Germany,0,32,4,120074.97,2,1,1,162961.79
4859,794,Spain,0,22,4,114440.24,1,1,1,107753.07
3264,738,France,1,35,5,161274.05,2,1,0,181429.87
9845,590,Spain,0,38,9,0.00,2,1,1,148750.16


In [11]:
X_test

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9394,597,Germany,0,35,8,131101.04,1,1,1,192852.67
898,523,France,0,40,2,102967.41,1,1,0,128702.10
2398,706,Spain,0,42,8,95386.82,1,1,1,75732.25
5906,788,France,1,32,4,112079.58,1,0,0,89368.59
2343,706,Germany,1,38,5,163034.82,2,1,1,135662.17
...,...,...,...,...,...,...,...,...,...,...
1037,625,France,0,24,1,0.00,2,1,1,180969.55
2899,586,France,0,35,7,0.00,2,1,0,70760.69
9549,578,Spain,1,36,1,157267.95,2,1,0,141533.19
2740,650,Germany,1,34,4,142393.11,1,1,1,11276.48


- One Hot Encoding for 'Geography' column

In [12]:
# set up encoder
oh_encoder = OneHotEncoder(
    variables='Geography',
    drop_last=False,  # to return k-1, use drop=false to return k dummies
)

# fit the encoder
oh_encoder.fit(X_train)

# mappings are stored and class can be saved
oh_encoder.encoder_dict_

{'Geography': ['Spain', 'Germany', 'France']}

In [13]:
X_train = oh_encoder.transform(X_train)
X_test = oh_encoder.transform(X_test)

In [14]:
X_train

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Spain,Geography_Germany,Geography_France
7389,667,0,34,5,0.00,2,1,0,163830.64,1,0,0
9275,427,1,42,1,75681.52,1,1,1,57098.00,0,1,0
2995,535,0,29,2,112367.34,1,1,0,185630.76,0,0,1
5316,654,1,40,5,105683.63,1,1,0,173617.09,1,0,0
356,850,0,57,8,126776.30,2,1,1,132298.49,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9225,594,0,32,4,120074.97,2,1,1,162961.79,0,1,0
4859,794,0,22,4,114440.24,1,1,1,107753.07,1,0,0
3264,738,1,35,5,161274.05,2,1,0,181429.87,0,0,1
9845,590,0,38,9,0.00,2,1,1,148750.16,1,0,0


In [15]:
X_test

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Spain,Geography_Germany,Geography_France
9394,597,0,35,8,131101.04,1,1,1,192852.67,0,1,0
898,523,0,40,2,102967.41,1,1,0,128702.10,0,0,1
2398,706,0,42,8,95386.82,1,1,1,75732.25,1,0,0
5906,788,1,32,4,112079.58,1,0,0,89368.59,0,0,1
2343,706,1,38,5,163034.82,2,1,1,135662.17,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1037,625,0,24,1,0.00,2,1,1,180969.55,0,0,1
2899,586,0,35,7,0.00,2,1,0,70760.69,0,0,1
9549,578,1,36,1,157267.95,2,1,0,141533.19,1,0,0
2740,650,1,34,4,142393.11,1,1,1,11276.48,0,1,0


In [16]:
# Save the encoders
with open('label_encoder_gender.pkl','wb') as file:
    pickle.dump(label_encoder,file)

with open('oh_encoder_geo.pkl','wb') as file:
    pickle.dump(oh_encoder,file)

In [17]:
X_train.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Spain,Geography_Germany,Geography_France
7389,667,0,34,5,0.0,2,1,0,163830.64,1,0,0
9275,427,1,42,1,75681.52,1,1,1,57098.0,0,1,0
2995,535,0,29,2,112367.34,1,1,0,185630.76,0,0,1
5316,654,1,40,5,105683.63,1,1,0,173617.09,1,0,0
356,850,0,57,8,126776.3,2,1,1,132298.49,1,0,0


In [18]:
X_test.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Spain,Geography_Germany,Geography_France
9394,597,0,35,8,131101.04,1,1,1,192852.67,0,1,0
898,523,0,40,2,102967.41,1,1,0,128702.1,0,0,1
2398,706,0,42,8,95386.82,1,1,1,75732.25,1,0,0
5906,788,1,32,4,112079.58,1,0,0,89368.59,0,0,1
2343,706,1,38,5,163034.82,2,1,1,135662.17,0,1,0


- Feature Scaling

In [19]:
# create scaler
scaler = StandardScaler()

#  fit  the scaler to the train set
scaler.fit(X_train) 

# transform the train and test set

# sklearn returns numpy arrays, so we wrap the
# array with a pandas dataframe

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns
)

In [22]:
X_train.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Spain,Geography_Germany,Geography_France
0,0.169582,-1.091687,-0.464608,0.006661,-1.215717,0.809503,0.642595,-1.03227,1.106432,1.74309,-0.569844,-1.014607
1,-2.304559,0.916013,0.301026,-1.37744,-0.006312,-0.921591,0.642595,0.968738,-0.748664,-0.573694,1.754865,-1.014607
2,-1.191196,-1.091687,-0.943129,-1.031415,0.579935,-0.921591,0.642595,-1.03227,1.485335,-0.573694,-0.569844,0.985604
3,0.035566,0.916013,0.109617,0.006661,0.473128,-0.921591,0.642595,-1.03227,1.276528,1.74309,-0.569844,-1.014607
4,2.056114,-1.091687,1.736588,1.044737,0.810193,0.809503,0.642595,0.968738,0.558378,1.74309,-0.569844,-1.014607


In [23]:
X_test.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Spain,Geography_Germany,Geography_France
0,-0.552043,-1.091687,-0.368904,1.044737,0.879303,-0.921591,0.642595,0.968738,1.610857,-0.573694,1.754865,-1.014607
1,-1.314903,-1.091687,0.109617,-1.031415,0.429722,-0.921591,0.642595,-1.03227,0.49587,-0.573694,-0.569844,0.985604
2,0.57163,-1.091687,0.301026,1.044737,0.308583,-0.921591,0.642595,0.968738,-0.424787,1.74309,-0.569844,-1.014607
3,1.416961,0.916013,-0.656016,-0.339364,0.575336,-0.921591,-1.55619,-1.03227,-0.187777,-0.573694,-0.569844,0.985604
4,0.57163,0.916013,-0.081791,0.006661,1.389611,0.809503,0.642595,0.968738,0.616842,-0.573694,1.754865,-1.014607


In [24]:
with open('scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

- All Preprocessing and feature engineering is done