In [28]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('Churn_Modelling.csv')

In [3]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10002.0,10002.0,10002.0,10001.0,10002.0,10002.0,10002.0,10001.0,10001.0,10002.0,10002.0
mean,5001.4996,15690930.0,650.555089,38.922311,5.012498,76491.112875,1.530194,0.705529,0.514949,100083.331145,0.203759
std,2887.472338,71931.77,96.661615,10.4872,2.891973,62393.474144,0.581639,0.455827,0.499801,57508.117802,0.402812
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2501.25,15628520.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,50983.75,0.0
50%,5001.5,15690730.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100185.24,0.0
75%,7501.75,15753230.0,718.0,44.0,7.0,127647.84,2.0,1.0,1.0,149383.6525,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10002 non-null  int64  
 1   CustomerId       10002 non-null  int64  
 2   Surname          10002 non-null  object 
 3   CreditScore      10002 non-null  int64  
 4   Geography        10001 non-null  object 
 5   Gender           10002 non-null  object 
 6   Age              10001 non-null  float64
 7   Tenure           10002 non-null  int64  
 8   Balance          10002 non-null  float64
 9   NumOfProducts    10002 non-null  int64  
 10  HasCrCard        10001 non-null  float64
 11  IsActiveMember   10001 non-null  float64
 12  EstimatedSalary  10002 non-null  float64
 13  Exited           10002 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [5]:
#Finding missing values 
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
#Removing null values
data = data.dropna(axis=0)

In [7]:
#Removing column surname
data.drop(columns="Surname", inplace=True)

In [8]:
#Converting category columns into numeric type
cat_cols = data.select_dtypes("object").columns

for col in cat_cols:
    data[col] = pd.factorize(data[col])[0]

In [25]:
data.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,619,0,0,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,608,1,0,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,502,0,0,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,699,0,0,39.0,1,0.0,2,0.0,0.0,93826.63,0
5,6,15574012,645,1,1,44.0,8,113755.78,2,1.0,0.0,149756.71,1


In [10]:
X = data.drop(columns="Exited")
y = data["Exited"]

In [11]:
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.3)

##### Model

In [12]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train,y_train)

In [13]:
y_predictions = model.predict(x_test)

##### Metrics

In [20]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [21]:
print(f"Classification Report: \n {classification_report(y_test,y_predictions)}")

Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.96      0.87      2374
           1       0.28      0.05      0.09       626

    accuracy                           0.77      3000
   macro avg       0.54      0.51      0.48      3000
weighted avg       0.69      0.77      0.71      3000



In [22]:
print(f"Confusion Matrix:\n {confusion_matrix(y_test,y_predictions)}")
print(f"Accuracy: {accuracy_score(y_test,y_predictions)}")

Confusion Matrix:
 [[2286   88]
 [ 592   34]]
Accuracy: 0.7733333333333333


#### Grid Search Cross Validation

In [25]:
from sklearn.model_selection import GridSearchCV

In [24]:
hyperparameters = {'penalty':['l1', 'l2', 'elasticnet'],
                  'C':[0.01,0.1,1.0],
                   'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

In [26]:
model_GSCV = GridSearchCV(estimator=LogisticRegression(), param_grid=hyperparameters, cv=3)

In [29]:
model_GSCV.fit(x_train,y_train)

In [30]:
model_GSCV.best_params_

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

In [31]:
model_LR = LogisticRegression(C=0.1,penalty='l1',solver='liblinear')

In [32]:
model_LR.fit(x_train,y_train)

In [34]:
predictions = model_LR.predict(x_test)

In [35]:
print(f"Classification Report: \n {classification_report(y_test,predictions)}")

Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.96      0.89      2374
           1       0.61      0.22      0.33       626

    accuracy                           0.81      3000
   macro avg       0.72      0.59      0.61      3000
weighted avg       0.78      0.81      0.77      3000



In [36]:
print(f"Confusion Matrix:\n {confusion_matrix(y_test,predictions)}")
print(f"Accuracy: {accuracy_score(y_test,predictions)}")

Confusion Matrix:
 [[2284   90]
 [ 486  140]]
Accuracy: 0.808
