### This notebook will used for model development

#### Import libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler       # scaling
from sklearn.model_selection import train_test_split   # spiltting the data

from sklearn.linear_model import LogisticRegression    # model building algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier    # ensemble algorithms
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from keras.models import Sequential                    # neural network
from keras.layers import Input, Dense, Flatten, Dropout, BatchNormalization
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

                                                       # evalution metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve, recall_score, precision_score

from sklearn.model_selection import GridSearchCV       # hyperparameter tuning

from imblearn.over_sampling import SMOTE               # for handling imbalamce data

import warnings
warnings.filterwarnings('ignore')




In [2]:
df = pd.read_csv('new_custchurn.csv')
df

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.50,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1990.50,0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,7362.90,0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,346.45,0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,306.60,1


##### We begin with our model building first step, Spiltting the data into independent and target variable.

In [3]:
df.shape

(7043, 20)

In [4]:
x=df.iloc[:,0:-1]                                      #independent variables
y=df.iloc[:,-1]                                        #Target variable

In [5]:
print(x.shape)
print(y.shape)

(7043, 19)
(7043,)


##### Scaling

In [6]:
np.set_printoptions(suppress=True)

In [7]:
scaler = StandardScaler()                               
scaler.fit(x)                                           #training on data
x=scaler.transform(x)                                   #transforming the data

print(x)

[[-1.00955867 -0.43991649  1.03453023 ...  0.39855772 -1.16032292
  -0.99424193]
 [ 0.99053183 -0.43991649 -0.96662231 ...  1.33486261 -0.25962894
  -0.17324412]
 [ 0.99053183 -0.43991649 -0.96662231 ...  1.33486261 -0.36266036
  -0.95967407]
 ...
 [-1.00955867 -0.43991649  1.03453023 ...  0.39855772 -1.1686319
  -0.85446944]
 [ 0.99053183  2.27315869  1.03453023 ...  1.33486261  0.32033821
  -0.87206241]
 [ 0.99053183 -0.43991649 -0.96662231 ... -1.47405205  1.35896134
   2.01428802]]


Here, we conclude the exploratory data analysis (EDA) and preprocessing steps, transitioning into the model-building phase

##### Building the model and spilting the data into train and test

In [8]:
#split the data into train and test

x_train, x_test,y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                   random_state=77)

In [9]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print("Percentage of train data",x_train.shape[0]/x.shape[0]*100)

(4930, 19)
(4930,)
(2113, 19)
(2113,)
Percentage of train data 69.99858015050404


### Applying different algorithms to determine which one performs the best , if possible, for tuning

Machine learning alogrithms

In [10]:
# first, initialize the classificators

tree= DecisionTreeClassifier(random_state=77)                       #using the random state for reproducibility
knn= KNeighborsClassifier(metric='euclidean')
svm=SVC(random_state=77)
logreg=LogisticRegression(random_state=77)

In [11]:
#create a list with the objects
models = [tree, knn, svm, logreg]

# Initialize an empty list to store results
model_performance = []

for model in models:
    model.fit(x_train, y_train)                                  # Fit the model
    y_pred = model.predict(x_test)                               # Then predict on the test set
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    class_repo = classification_report(y_test, y_pred)
    
    # Store each model's performance metrics in a dictionary and add it to the list
    performance = {
        'Model': type(model).__name__,
        'Accuracy': round(accuracy*100,2),
        'Precision': round(precision*100,2),
        'Recall': round(recall*100,2)
    }
    model_performance.append(performance)
    
    print(f"Model: {type(model).__name__}")
    print(f'Accuracy: {round(accuracy*100,2)}')
    print(f'Classification report: \n{class_repo}')
    print("----------------------")

Model: DecisionTreeClassifier
Accuracy: 72.55
Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.80      0.81      1568
           1       0.47      0.52      0.49       545

    accuracy                           0.73      2113
   macro avg       0.65      0.66      0.65      2113
weighted avg       0.73      0.73      0.73      2113

----------------------
Model: KNeighborsClassifier
Accuracy: 76.34
Classification report: 
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      1568
           1       0.54      0.54      0.54       545

    accuracy                           0.76      2113
   macro avg       0.69      0.69      0.69      2113
weighted avg       0.76      0.76      0.76      2113

----------------------
Model: SVC
Accuracy: 80.03
Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1568
  

In [12]:
# Convert the list of performance metrics into a DataFrame for a nice table view
performance_df = pd.DataFrame(model_performance)
performance_df

Unnamed: 0,Model,Accuracy,Precision,Recall
0,DecisionTreeClassifier,72.55,64.83,65.7
1,KNeighborsClassifier,76.34,69.08,69.03
2,SVC,80.03,74.46,69.42
3,LogisticRegression,79.93,73.88,71.22


Applying Ensemble Techniques

In [13]:
# first, initialize the classificators

ensemble_models = [RandomForestClassifier(n_estimators=100, random_state=77),
                   ExtraTreesClassifier(n_estimators=100, random_state=77),
                   GradientBoostingClassifier(n_estimators=100, random_state=77),
                   AdaBoostClassifier(n_estimators=100, random_state=77),
                   XGBClassifier(random_state=77)]

In [14]:
# Initialize an empty list to store results
model_performance = []

for model in ensemble_models:
    model.fit(x_train, y_train)                                  # Fit the model
    y_pred = model.predict(x_test)                               # Then predict on the test set
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    class_repo = classification_report(y_test, y_pred)
    
    # Store each model's performance metrics in a dictionary and add it to the list
    performance = {
        'Model': type(model).__name__,
        'Accuracy': round(accuracy*100,2),
        'Precision': round(precision*100,2),
        'Recall': round(recall*100,2)
    }
    model_performance.append(performance)
    
    print(f"Model: {type(model).__name__}")
    print(f'Accuracy: {round(accuracy*100,2)}')
    print(f'Classification report: \n{class_repo}')
    print("----------------------")

Model: RandomForestClassifier
Accuracy: 78.47
Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1568
           1       0.61      0.48      0.53       545

    accuracy                           0.78      2113
   macro avg       0.72      0.68      0.70      2113
weighted avg       0.77      0.78      0.78      2113

----------------------
Model: ExtraTreesClassifier
Accuracy: 78.14
Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1568
           1       0.59      0.48      0.53       545

    accuracy                           0.78      2113
   macro avg       0.71      0.68      0.70      2113
weighted avg       0.77      0.78      0.77      2113

----------------------
Model: GradientBoostingClassifier
Accuracy: 80.36
Classification report: 
              precision    recall  f1-score   support

           0       0.84      0.91

In [15]:
# Convert the list of performance metrics into a DataFrame for a nice table view
performance_df_ens = pd.DataFrame(model_performance)
performance_df_ens

Unnamed: 0,Model,Accuracy,Precision,Recall
0,RandomForestClassifier,78.47,71.77,68.37
1,ExtraTreesClassifier,78.14,71.24,68.45
2,GradientBoostingClassifier,80.36,74.75,70.67
3,AdaBoostClassifier,80.12,74.19,71.28
4,XGBClassifier,77.57,70.4,68.07


Building a neural Network

In [16]:
df.shape

(7043, 20)

In [18]:
# Initialising the ANN
model_ann = Sequential()

# Adding the input layer and the first hidden layer
model_ann.add(Dense(units = 19, activation = 'relu', input_dim = 19))


# Adding the second hidden layer 
model_ann.add(Dense(units=19, activation='relu'))

# dropout for second layer 
model_ann.add(Dropout(0.1))

# Adding the third hidden layer 
model_ann.add(Dense(units=19, activation='relu'))

# dropout for third layer 
model_ann.add(Dropout(0.1))

# Adding the fourth hidden layer 
model_ann.add(Dense(units=19, activation='relu'))


# Adding the output layer 
model_ann.add(Dense(units=1, activation='sigmoid'))

# Compiling the ANN
model_ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)

# Fitting the ANN to the Training set
model_ann.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, callbacks=[early_stopping, model_checkpoint])

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.45506, saving model to best_model.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.45506 to 0.44538, saving model to best_model.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.44538 to 0.43458, saving model to best_model.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.43458 to 0.43361, saving model to best_model.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.43361 to 0.42943, saving model to best_model.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.42943 to 0.42641, saving model to best_model.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.42641 to 0.42430, saving model to best_model.h5
Epoch 8/100
Epoch 8: val_loss did not improve from 0.42430
Epoch 9/100
Epoch 9: val_loss did not improve from 0.42430
Epoch 10/100
Epoch 10: val_loss improved from 0.42430 to 0.42424, saving model to best_model.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.42424 to 0.42245, saving model to best_model.h5
Epoch 12/100
Epoch 12: val_loss did 

<keras.src.callbacks.History at 0x1499e526390>

In [19]:
model_ann.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 19)                380       
                                                                 
 dense_6 (Dense)             (None, 19)                380       
                                                                 
 dropout_2 (Dropout)         (None, 19)                0         
                                                                 
 dense_7 (Dense)             (None, 19)                380       
                                                                 
 dropout_3 (Dropout)         (None, 19)                0         
                                                                 
 dense_8 (Dense)             (None, 19)                380       
                                                                 
 dense_9 (Dense)             (None, 1)                

In [20]:
# Load the best model
model_ann = load_model('best_model.h5')

# Predict class probabilities
y_pred_probs = model_ann.predict(x_test)

# Convert probabilities to class labels based on a 0.5 threshold
y_pred = (y_pred_probs >= 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate a classification report
class_repo = classification_report(y_test, y_pred)

# Print the results
print(f'Accuracy: {round(accuracy * 100, 2)}')
print(f'Classification report: \n{class_repo}')

Accuracy: 80.08
Classification report: 
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1568
           1       0.64      0.52      0.57       545

    accuracy                           0.80      2113
   macro avg       0.74      0.71      0.72      2113
weighted avg       0.79      0.80      0.79      2113



We can clearly see that the accuracy of most algorithms, including the neural network, is around 80%, with the best-performing models being AdaBoost and Gradient Boosting. However, it's important to remember that the dataset is imbalanced. To address this issue, we will use SMOTE.

#### SMOTE
Synthetic minority oversampling Technique. This technique allows us to upsample the minority class observation to reach the level of majority class by creating synthetic samples similar to existing samples

In [21]:
df.Churn.value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [22]:
X = df.values[:,:-1]
Y = df.values[:,-1]
print(X.shape)
print(Y.shape)

(7043, 19)
(7043,)


In [23]:
#scaling
scaler.fit(X)
X = scaler.transform(X)

In [24]:
#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=10)

In [25]:
print("Before OverSampling, counts of label '1': ", (sum(Y_train == 1)))
print("Before OverSampling, counts of label '0': ", (sum(Y_train == 0)))
  
# import SMOTE from imblearn library
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 77,k_neighbors=10)
x_train_new, y_train_new = sm.fit_resample(X_train, Y_train)
  
print('After OverSampling, the shape of train_X: ', (x_train_new.shape))
print('After OverSampling, the shape of train_y: ', (y_train_new.shape))
  
print("After OverSampling, counts of label '1': ", (sum(y_train_new == 1)))
print("After OverSampling, counts of label '0': ", (sum(y_train_new == 0)))

Before OverSampling, counts of label '1':  1312
Before OverSampling, counts of label '0':  3618
After OverSampling, the shape of train_X:  (7236, 19)
After OverSampling, the shape of train_y:  (7236,)
After OverSampling, counts of label '1':  3618
After OverSampling, counts of label '0':  3618


Applying tradtional ML algo on SMOTE

In [26]:
# first, initialize the classificators

tree= DecisionTreeClassifier(random_state=77)                       #using the random state for reproducibility
knn= KNeighborsClassifier(metric='euclidean')
svm=SVC(random_state=77)
logreg=LogisticRegression(random_state=77)

#create a list with the objects
models = [tree, knn, svm, logreg]

# Initialize an empty list to store results
model_performance = []

for model in models:
    model.fit(x_train_new, y_train_new)                                  # Fit the model
    y_pred = model.predict(X_test)                               # Then predict on the test set
    
    # Calculate performance metrics
    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred, average='macro')
    recall = recall_score(Y_test, y_pred, average='macro')
    class_repo = classification_report(Y_test, y_pred)
    
    # Store each model's performance metrics in a dictionary and add it to the list
    performance = {
        'Model': type(model).__name__,
        'Accuracy': round(accuracy*100,2),
        'Precision': round(precision*100,2),
        'Recall': round(recall*100,2)
    }
    model_performance.append(performance)
    
    print(f"Model: {type(model).__name__}")
    print(f'Accuracy: {round(accuracy*100,2)}')
    print(f'Classification report: \n{class_repo}')
    print("----------------------")

# Convert the list of performance metrics into a DataFrame for a nice table view
performance_df_smote = pd.DataFrame(model_performance)
performance_df_smote

Model: DecisionTreeClassifier
Accuracy: 71.42
Classification report: 
              precision    recall  f1-score   support

         0.0       0.82      0.78      0.80      1556
         1.0       0.46      0.53      0.49       557

    accuracy                           0.71      2113
   macro avg       0.64      0.66      0.65      2113
weighted avg       0.73      0.71      0.72      2113

----------------------
Model: KNeighborsClassifier
Accuracy: 66.26
Classification report: 
              precision    recall  f1-score   support

         0.0       0.85      0.66      0.74      1556
         1.0       0.41      0.68      0.52       557

    accuracy                           0.66      2113
   macro avg       0.63      0.67      0.63      2113
weighted avg       0.74      0.66      0.68      2113

----------------------
Model: SVC
Accuracy: 75.77
Classification report: 
              precision    recall  f1-score   support

         0.0       0.88      0.78      0.83      1556
  

Unnamed: 0,Model,Accuracy,Precision,Recall
0,DecisionTreeClassifier,71.42,64.31,65.55
1,KNeighborsClassifier,66.26,63.35,66.89
2,SVC,75.77,70.42,73.81
3,LogisticRegression,75.11,71.22,76.24


Applying Ensemble Techniques on SMOTE

In [27]:
# first, initialize the classificators

ensemble_models = [RandomForestClassifier(n_estimators=100, random_state=77),
                   ExtraTreesClassifier(n_estimators=100, random_state=77),
                   GradientBoostingClassifier(n_estimators=100, random_state=77),
                   AdaBoostClassifier(n_estimators=100, random_state=77),
                   XGBClassifier(random_state=77)]

# Initialize an empty list to store results
model_performance = []

for model in ensemble_models:
    model.fit(x_train_new, y_train_new)                                  # Fit the model
    y_pred = model.predict(X_test)                               # Then predict on the test set
    
    # Calculate performance metrics
    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred, average='macro')
    recall = recall_score(Y_test, y_pred, average='macro')
    class_repo = classification_report(Y_test, y_pred)
    
    # Store each model's performance metrics in a dictionary and add it to the list
    performance = {
        'Model': type(model).__name__,
        'Accuracy': round(accuracy*100,2),
        'Precision': round(precision*100,2),
        'Recall': round(recall*100,2)
    }
    model_performance.append(performance)
    
    print(f"Model: {type(model).__name__}")
    print(f'Accuracy: {round(accuracy*100,2)}')
    print(f'Classification report: \n{class_repo}')
    print("----------------------")

# Convert the list of performance metrics into a DataFrame for a nice table view
performance_df_ens_smote = pd.DataFrame(model_performance)
performance_df_ens_smote

Model: RandomForestClassifier
Accuracy: 77.8
Classification report: 
              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85      1556
         1.0       0.58      0.58      0.58       557

    accuracy                           0.78      2113
   macro avg       0.71      0.71      0.71      2113
weighted avg       0.78      0.78      0.78      2113

----------------------
Model: ExtraTreesClassifier
Accuracy: 76.67
Classification report: 
              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84      1556
         1.0       0.56      0.51      0.54       557

    accuracy                           0.77      2113
   macro avg       0.70      0.68      0.69      2113
weighted avg       0.76      0.77      0.76      2113

----------------------
Model: GradientBoostingClassifier
Accuracy: 79.46
Classification report: 
              precision    recall  f1-score   support

         0.0       0.88      0.83 

Unnamed: 0,Model,Accuracy,Precision,Recall
0,RandomForestClassifier,77.8,71.4,71.33
1,ExtraTreesClassifier,76.67,69.7,68.48
2,GradientBoostingClassifier,79.46,73.86,76.03
3,AdaBoostClassifier,78.04,72.53,75.47
4,XGBClassifier,78.42,72.2,72.15


Building a neural Network on SMOTE

In [28]:
# Initialising the ANN
model_ann_sm = Sequential()

# Adding the input layer and the first hidden layer
model_ann_sm.add(Dense(units = 19, activation = 'relu', input_dim = 19))


# Adding the second hidden layer 
model_ann_sm.add(Dense(units=19, activation='relu'))

# dropout for second layer 
model_ann_sm.add(Dropout(0.1))

# Adding the third hidden layer 
model_ann_sm.add(Dense(units=19, activation='relu'))

# dropout for third layer 
model_ann_sm.add(Dropout(0.1))

# Adding the fourth hidden layer 
model_ann_sm.add(Dense(units=19, activation='relu'))


# Adding the output layer 
model_ann_sm.add(Dense(units=1, activation='sigmoid'))

# Compiling the ANN
model_ann_sm.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# early stopping and model checkpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')
model_checkpoint = ModelCheckpoint('best_model1.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)

# Fitting the ANN to the Training set
model_ann_sm.fit(x_train_new, y_train_new, validation_data=(X_test, Y_test), epochs=100, callbacks=[early_stopping, model_checkpoint])

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.51738, saving model to best_model1.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.51738 to 0.50740, saving model to best_model1.h5
Epoch 3/100
Epoch 3: val_loss did not improve from 0.50740
Epoch 4/100
Epoch 4: val_loss improved from 0.50740 to 0.47519, saving model to best_model1.h5
Epoch 5/100
Epoch 5: val_loss did not improve from 0.47519
Epoch 6/100
Epoch 6: val_loss did not improve from 0.47519
Epoch 7/100
Epoch 7: val_loss did not improve from 0.47519
Epoch 8/100
Epoch 8: val_loss did not improve from 0.47519
Epoch 9/100
Epoch 9: val_loss did not improve from 0.47519
Epoch 10/100
Epoch 10: val_loss did not improve from 0.47519
Epoch 11/100
Epoch 11: val_loss did not improve from 0.47519
Epoch 12/100
Epoch 12: val_loss did not improve from 0.47519
Epoch 13/100
Epoch 13: val_loss did not improve from 0.47519
Epoch 14/100
Epoch 14: val_loss did not improve from 0.47519
Epoch 14: early stopping


<keras.src.callbacks.History at 0x149a4eebb10>

In [29]:
# Load the best model
model_ann_sm = load_model('best_model1.h5')

# Predict class probabilities
y_pred_probs = model_ann_sm.predict(X_test)

# Convert probabilities to class labels based on a 0.5 threshold
y_pred = (y_pred_probs >= 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(Y_test, y_pred)

# Generate a classification report
class_repo = classification_report(Y_test, y_pred)

# Print the results
print(f'Accuracy: {round(accuracy * 100, 2)}')
print(f'Classification report: \n{class_repo}')

Accuracy: 75.72
Classification report: 
              precision    recall  f1-score   support

         0.0       0.89      0.76      0.82      1556
         1.0       0.53      0.74      0.62       557

    accuracy                           0.76      2113
   macro avg       0.71      0.75      0.72      2113
weighted avg       0.80      0.76      0.77      2113



We can infer that out of all the algorithms, Gradient Boosting and AdaBoost perform the best. However, we have selected Gradient Boosting for further fine-tuning the model because it has almost the same recall for the minority class but with better accuracy.

###### Hyperparameter Tuinig using RandomizedSearchCV.<br>

RandomizedSearchCV is a hyperparameter tuning technique in machine learning that is used to find the best parameters for a model. It generates a grid of hyperparameter values and randomly selects combinations to train the model and score. This allows users to control the number of parameter combinations that are attempted.

In [148]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distribtuion to search
param_dist = {
    'criterion': ['friedman_mse', 'squared_error'],
    'n_estimators': randint(100, 200),               # Uniformly sample over the given range
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 6),
    'max_features': ['sqrt', 'log2', None]
}

model_GB = GradientBoostingClassifier(random_state=42)

random_search = RandomizedSearchCV(estimator=model_GB, param_distributions=param_dist,
                                   n_iter=100, cv=5, verbose=3, random_state=42, n_jobs=-1,
                                   scoring='recall')

# Fit RandomizedSearchCV to the data
random_search.fit(x_train_new, y_train_new)

# Print the best parameters and the corresponding score
print("Best Hyperparameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'criterion': 'friedman_mse', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 126}
Best Score: 0.8570952064372663


In [149]:
# buidling a model using Randomsearchcv parameters

y_pred = random_search.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(Y_test, y_pred)

# Generate a classification report
class_repo = classification_report(Y_test, y_pred)

# Print the results
print(f'Accuracy: {round(accuracy * 100, 2)}')
print(f'Classification report: \n{class_repo}')

Accuracy: 78.23
Classification report: 
              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85      1556
         1.0       0.59      0.58      0.58       557

    accuracy                           0.78      2113
   macro avg       0.72      0.72      0.72      2113
weighted avg       0.78      0.78      0.78      2113



Base Gradient boosting model on Smote

In [30]:
model_gb = GradientBoostingClassifier(random_state=77)

model_gb.fit(x_train_new,y_train_new)
y_pred = model_gb.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(Y_test, y_pred)

# Generate a classification report
class_repo = classification_report(Y_test, y_pred)

# Print the results
print(f'Accuracy: {round(accuracy * 100, 2)}')
print(f'Classification report: \n{class_repo}')

Accuracy: 79.46
Classification report: 
              precision    recall  f1-score   support

         0.0       0.88      0.83      0.86      1556
         1.0       0.60      0.69      0.64       557

    accuracy                           0.79      2113
   macro avg       0.74      0.76      0.75      2113
weighted avg       0.81      0.79      0.80      2113



In [61]:
model_gb_tuned = GradientBoostingClassifier(
    n_estimators=1000,  
    learning_rate=0.001, 
    max_depth=7,  # Shallower trees
    min_samples_split=3,  
    min_samples_leaf=1,
    #subsample=0.8,  # Subsampling 80% of the data
    random_state=77  # For reproducibility
)

# Train the model on the entire training set
model_gb_tuned.fit(x_train_new, y_train_new)

# Predict on the test set
y_pred = model_gb_tuned.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(Y_test, y_pred)
class_repo = classification_report(Y_test, y_pred)

print(f'Accuracy: {round(accuracy * 100, 2)}')
print(f'Classification report: \n{class_repo}')

Accuracy: 77.95
Classification report: 
              precision    recall  f1-score   support

         0.0       0.88      0.81      0.84      1556
         1.0       0.57      0.70      0.63       557

    accuracy                           0.78      2113
   macro avg       0.72      0.75      0.73      2113
weighted avg       0.80      0.78      0.79      2113



Since there is no increase in performace of model, we'll try subsetting some features using PCA

In [53]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)                  #to view the entire eigen vector,no subsetting
x_train_pca = pca.fit_transform(x_train_new)
x_test_pca = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.22587257 0.12599686 0.07656962 0.06444913 0.05827133 0.05386417
 0.0527947  0.04386427 0.04146521 0.03932528 0.03886149 0.03463311
 0.03367991 0.03127884 0.02835668 0.02312644]


In [54]:
len(explained_variance)

16

In [55]:
model_gb_pca = GradientBoostingClassifier(random_state=77)

model_gb_pca.fit(x_train_pca,y_train_new)
y_pred = model_gb_pca.predict(x_test_pca)

# Calculate accuracy
accuracy = accuracy_score(Y_test, y_pred)

# Generate a classification report
class_repo = classification_report(Y_test, y_pred)

# Print the results
print(f'Accuracy: {round(accuracy * 100, 2)}')
print(f'Classification report: \n{class_repo}')

Accuracy: 74.73
Classification report: 
              precision    recall  f1-score   support

         0.0       0.90      0.74      0.81      1556
         1.0       0.51      0.76      0.61       557

    accuracy                           0.75      2113
   macro avg       0.71      0.75      0.71      2113
weighted avg       0.80      0.75      0.76      2113



After using PCA and exploring various options for tuning to optimize our model, we found that our base Gradient Boosting model performs better in terms of accuracy and recall. Therefore, it will be used for further model deployment and predictions.

Deployment

In [57]:
import pickle

In [58]:
filname = 'model.sav'

In [59]:
pickle.dump(model_gb, open(filname, 'wb'))

In [60]:
load_model = pickle.load(open(filname, 'rb'))

In [61]:
load_model.score(X_test,Y_test)

0.7946048272598202