In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data = pd.read_csv('Churn_Modelling.csv')

## Exploratory data analysis

In [3]:
data.head(n=10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [6]:
data[['Geography', 'Gender']].describe()

Unnamed: 0,Geography,Gender
count,10000,10000
unique,3,2
top,France,Male
freq,5014,5457


## PreProcessing
Removing features that doesn't contribute to the output.

In [7]:
data = data.iloc[:,3:]
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Encoding categorical values

#### Encoding Geograohy

In [8]:
#Labelling Geography
label_encoder_Geo = LabelEncoder()
data['Geography'] = label_encoder_Geo.fit_transform(data['Geography'])


#Labelling Gender
label_encoder_Gender = LabelEncoder()
data['Gender'] = label_encoder_Gender.fit_transform(data['Gender'])

#Encoding Geography
encoder = OneHotEncoder(categorical_features = [1], sparse=False)
data = pd.DataFrame(encoder.fit_transform(data))
data = data.iloc[:,1:]

In [9]:
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.0,619.0,0.0,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,1.0
1,0.0,1.0,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0.0
2,0.0,0.0,502.0,0.0,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1.0
3,0.0,0.0,699.0,0.0,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,0.0
4,0.0,1.0,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.1,0.0


In [15]:
X = data.iloc[:,:11]
y = data.iloc[:,11]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=42)

## Modelling

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier as KNeighbors
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DecisionTree

In [20]:
#Logistic Regression
logReg = LogisticRegression()
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)
acc_logReg = round(logReg.score(X_test, y_test) * 100, 2)
acc_logReg

80.15

85.75

In [21]:
#Random Forest
rf = RandomForest(n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
acc_rf = round(rf.score(X_test, y_test) * 100, 2)
acc_rf

86.45

In [35]:
# Ada Boost
AdaBoost = AdaBoostClassifier()
AdaBoost.fit(X_train, y_train)
y_pred = AdaBoost.predict(X_test)
acc_AdaBoost = round(AdaBoost.score(X_test, y_test) * 100, 2)
acc_AdaBoost

85.75

In [36]:
# Extra Trees
ExtraTrees = ExtraTreesClassifier()
ExtraTrees.fit(X_train, y_train)
y_pred = ExtraTrees.predict(X_test)
acc_ExtraTrees = round(ExtraTrees.score(X_test, y_test) * 100, 2)
acc_ExtraTrees

84.8

In [28]:
#KNN
knn = KNeighbors(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_test, y_test) * 100, 2)
acc_knn

79.6

In [29]:
#SVM
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_test, y_test) * 100, 2)
acc_svc

80.35

In [30]:
# Decision Tree

dt = DecisionTree()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
acc_dt = round(dt.score(X_test, y_test) * 100, 2)
acc_dt

77.45

## Evaluation

In [37]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest','Ada Boost', 'Extra Trees',  'KNN', 'SVC', 'Decision Tree'],\
    'Score': [acc_logReg, acc_rf,acc_AdaBoost,acc_ExtraTrees,  acc_knn, acc_svc, acc_dt]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
1,Random Forest,86.45
2,Ada Boost,85.75
3,Extra Trees,84.8
5,SVC,80.35
0,Logistic Regression,80.15
4,KNN,79.6
6,Decision Tree,77.45


# Optimization - Hyper Parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import fbeta_score, make_scorer


clf = RandomForest(random_state=42)

param_grid = {"n_estimators": [100, 400, 700, 1000, 1500],\
              "min_samples_leaf" : [1, 5, 10, 25, 50, 70],\
              "min_samples_split" : [2, 4, 10, 12, 16, 18, 25, 35],\
              "criterion" : ["gini", "entropy"]}

scorer = make_scorer(fbeta_score, beta=0.5)

grid_obj = GridSearchCV(clf, param_grid=param_grid, cv=10, scoring=scorer)

grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

grid_y_pred = grid_obj.predict(X_test)

grid_score = round(grid_obj.score(X_test, y_test) * 100, 2)