In [None]:
import os

import pandas as pd
import numpy as np

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# Function for creating model pipelines - sklearn
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE

In [None]:
# Load the analytical base table (ABT) from the CSV published in the project's GitHub repository
ABT_URL = "https://raw.githubusercontent.com/SumantaSethi/Customer-Churn-Prediction-/master/Resources/Analytical_Base_Table.csv"
abt = pd.read_csv(ABT_URL)

# Preview the first rows
abt.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Model Training

Split the DataFrame into separate objects:

`X` for the input features

`y` for the target variable

In [None]:
# Target variable: the 'Exited' column
y = abt['Exited']

# Input features: every column except the target
X = abt.drop(columns=['Exited'])

# Show the dimensions of the feature table and the target vector
print(X.shape, y.shape)

(10000, 10) (10000,)


In [None]:
# Names of the feature columns that hold numeric data
num_columns = list(X.select_dtypes(include='number').columns)
num_columns

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [None]:
# Names of the feature columns stored with object dtype (the categorical features)
cat_columns = list(X.select_dtypes(include=['object']).columns)
cat_columns

['Geography', 'Gender']

In [None]:
def class_count(a):
    """Tabulate how often each class label occurs in ``a``.

    Parameters
    ----------
    a : iterable of hashable labels
        The target values. Any iterable is accepted (pandas Series, NumPy
        array, list, tuple, ...).

    Returns
    -------
    pandas.DataFrame
        One row per distinct label with the columns ``'Exited'`` (the label),
        ``'Count'`` (number of occurrences, int64) and ``'%'`` (share of all
        observations in percent, rounded to two decimals). Rows are sorted by
        ``'Count'`` in descending order; the index keeps the order in which
        the labels were first seen.
    """
    counter = Counter(a)

    # Build the columns separately so labels and counts each keep their own
    # dtype instead of being coerced to one common type.
    abt2 = pd.DataFrame({
        'Exited': list(counter.keys()),
        'Count': list(counter.values()),
    })
    abt2['Count'] = abt2['Count'].astype('int64')

    # Every element is counted exactly once, so the counts add up to the
    # number of observations; this avoids relying on a ``.shape`` attribute.
    total = sum(counter.values())
    abt2['%'] = (abt2['Count'] / total * 100).round(2)

    return abt2.sort_values('Count', ascending=False)

In [None]:
# Show the class balance of the target: count and percentage per class
class_count(y)

Unnamed: 0,Exited,Count,%
1,0,7963,79.63
0,1,2037,20.37


Split the data into separate training and test sets:

Training set (70%): used to teach the model

Test set (30%): used to evaluate performance

In [None]:
# Seed used for the split here and reused later for SMOTE and the classifier
random_state = 10

# Hold out 30% of the rows as the test set, stratifying on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=random_state,
    stratify=y,
)

# Report how many observations ended up in each of the four partitions
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

7000 3000 7000 3000


In [None]:
# Inspect column dtypes and non-null counts of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 8061 to 4741
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      7000 non-null   int64  
 1   Geography        7000 non-null   object 
 2   Gender           7000 non-null   object 
 3   Age              7000 non-null   int64  
 4   Tenure           7000 non-null   int64  
 5   Balance          7000 non-null   float64
 6   NumOfProducts    7000 non-null   int64  
 7   HasCrCard        7000 non-null   int64  
 8   IsActiveMember   7000 non-null   int64  
 9   EstimatedSalary  7000 non-null   float64
dtypes: float64(2), int64(6), object(2)
memory usage: 601.6+ KB


# Preprocessing pipeline

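Scale the numerical data and encode the categorical data.

The numerical columns are scaled with `MinMaxScaler` and the categorical columns are encoded with `OneHotEncoder`. Create lists of integer column indexes from the lists of column names: the column transformer is given numeric positions rather than string column names.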

In [None]:
# Integer position of every numerical column within X
# (Index.get_loc maps a column label to its position)
num_features = [X.columns.get_loc(col) for col in num_columns]

# Print the finished list once instead of after every append
print(num_features)

[0]
[0, 3]
[0, 3, 4]
[0, 3, 4, 5]
[0, 3, 4, 5, 6]
[0, 3, 4, 5, 6, 7]
[0, 3, 4, 5, 6, 7, 8]
[0, 3, 4, 5, 6, 7, 8, 9]


In [None]:
# Integer position of every categorical column within X
# (Index.get_loc maps a column label to its position)
cat_features = [X.columns.get_loc(col) for col in cat_columns]

# Print the finished list once instead of after every append
print(cat_features)

[1]
[1, 2]


In [None]:
# Define column transformer
# Need to be numeric not string to specify columns name
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(), cat_features)
)
preprocess

In [None]:
# Import classifier
from sklearn.linear_model import LogisticRegression

# Define model with pipeline
model = imbl_pipe(preprocess,
                  SMOTE(sampling_strategy='auto', random_state=random_state),
                  LogisticRegression(random_state=random_state))

model

In [None]:
# Grid search over the hyper-parameters of the logistic-regression step
from sklearn.model_selection import GridSearchCV

# Keys follow the "<pipeline step>__<parameter>" naming so they reach the
# 'logisticregression' step of the pipeline.
lr_param_grid = {
    'logisticregression__C': [0.01, 0.05, 0.1, 0.5, 1, 5],
    'logisticregression__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'logisticregression__max_iter': [5000, 10000],
}

# 5-fold cross-validated search, candidates ranked by accuracy.
# NOTE(review): the target is imbalanced (about 80/20 per class_count), so
# confirm that accuracy is the intended selection metric.
lr_grid = GridSearchCV(
    estimator=model,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
)

In [None]:
# NOTE(review): both assignments are no-ops (each name is bound to itself), so
# X_train and X_test stay exactly as train_test_split produced them.
# Presumably a leftover from an earlier conversion step — confirm and remove.
X_train = X_train
X_test = X_test

In [None]:
# Run the cross-validated grid search on the training data
lr_grid.fit(X_train, y_train)

# Save the fitted model
import joblib
# Output file for the fitted grid-search object (relative to the working directory)
filename = 'Nate_lr_model.sav'
joblib.dump(lr_grid, filename)
print(f"Fitted model saved to {filename}")

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END logisticregression__C=0.01, logisticregression__max_iter=5000, logisticregression__solver=liblinear;, score=0.663 total time=   0.1s
[CV 2/5] END logisticregression__C=0.01, logisticregression__max_iter=5000, logisticregression__solver=liblinear;, score=0.685 total time=   0.1s
[CV 3/5] END logisticregression__C=0.01, logisticregression__max_iter=5000, logisticregression__solver=liblinear;, score=0.681 total time=   0.1s
[CV 4/5] END logisticregression__C=0.01, logisticregression__max_iter=5000, logisticregression__solver=liblinear;, score=0.688 total time=   0.1s
[CV 5/5] END logisticregression__C=0.01, logisticregression__max_iter=5000, logisticregression__solver=liblinear;, score=0.687 total time=   0.1s
[CV 1/5] END logisticregression__C=0.01, logisticregression__max_iter=5000, logisticregression__solver=newton-cg;, score=0.663 total time=   0.1s
[CV 2/5] END logisticregression__C=0.01, logisticregression__m

In [None]:
# Hyper-parameter combination selected by the grid search
print(lr_grid.best_params_)

{'logisticregression__C': 5, 'logisticregression__solver': 'liblinear'}


In [None]:
# Cross-validation score of the selected combination (scoring='accuracy')
print(lr_grid.best_score_)

0.7197142857142858


In [None]:
# Score the tuned model on the training data and on the held-out test data
# to compare fit against generalisation
print(f"Training Data Score: {lr_grid.score(X_train, y_train)}")
print(f"Testing Data Score: {lr_grid.score(X_test, y_test)}")

Training Data Score: 0.722
Testing Data Score: 0.707


In [None]:
# Predict class labels for the whole test set
predictions = lr_grid.predict(X_test)
# Compare the first ten predictions with the corresponding true labels
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 1 1 0 0 0 0]
First 10 Actual labels: [1, 0, 0, 0, 0, 1, 0, 0, 0, 0]


In [None]:

# Side-by-side table of predicted and actual labels, re-indexed from 0
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)


Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,0
3,0,0
4,1,0
...,...,...
2995,0,0
2996,0,0
2997,0,0
2998,0,0


In [None]:
# Confusion matrix of true test labels (first argument) against predictions (second argument)
cm = confusion_matrix(y_test, predictions)
print(cm)

[[1708  681]
 [ 198  413]]


In [None]:
# Row-normalise the confusion matrix: divide every row by its row total so
# each cell becomes a share of that row, rounded to two decimals.
# The result gets its own name so the raw counts in `cm` are preserved and
# re-running this cell always produces the same output.
cm_normalized = np.around(cm / cm.sum(axis=1, keepdims=True), 2)
print(cm_normalized)

[[0.71 0.29]
 [0.32 0.68]]


In [None]:
# Per-class precision, recall and F1 on the test set
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.71      0.80      2389
           1       0.38      0.68      0.48       611

    accuracy                           0.71      3000
   macro avg       0.64      0.70      0.64      3000
weighted avg       0.79      0.71      0.73      3000



In [None]:
# Predict the class of the first test record only
pred = lr_grid.predict(X_test[:1])

In [None]:
# Show the single prediction next to the true label of the same record
print(f"Predicted classes: {pred}")
print(f"Actual Labels: {list(y_test[:1])}")

Predicted classes: [0]
Actual Labels: [1]


In [None]:
import joblib

# Use a local filename to save the model
# NOTE(review): the same object is already written to this filename right
# after fitting; this cell repeats that save.
filename = 'Nate_lr_model.sav'
joblib.dump(lr_grid, filename)

['Nate_lr_model.sav']

In [None]:
import joblib
# Reload the saved grid-search object from disk.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
lr_model = joblib.load('Nate_lr_model.sav')
# Score the reloaded model on the test set to confirm it matches the in-memory one
print(f"Model loaded successfully. Score: {lr_model.score(X_test, y_test)}")

Model loaded successfully. Score: 0.707


# Predict class for new data

In [None]:
# Use the first X_test record as a stand-in for new, unseen data
X_test[:1]

array([[638, 'France', 'Male', 36, 6, 188455.19, 1, 0, 0, 47031.4]],
      dtype=object)

In [None]:
# Predict the class of the stand-in record with the in-memory grid-search model
pred_new = lr_grid.predict(X_test[:1])

In [None]:
# Show the prediction for the stand-in record next to its true label
print(f"Predicted classes: {pred_new}")
print(f"Actual Labels: {list(y_test[:1])}")

Predicted classes: [0]
Actual Labels: [1]


In [None]:
# Underlying values of the first test record; `.values` requires X_test to be a DataFrame.
# NOTE(review): the stored output shows an AttributeError because X_test was a
# NumPy array when this cell was last run — confirm which type is intended.
X_test[:1].values

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [None]:
# Predict the first test record once more and display the resulting array
pred_new1 = lr_grid.predict(X_test[:1])
pred_new1

array([0])