In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Enable inline plotting for Jupyter Notebook
%matplotlib inline

# Load dataset
data = pd.read_csv('E:\\College\\Python\\Churn_Modelling.csv')

# Display the first few rows
data.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
# Check for missing values
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [3]:
# Encode categorical variables
label_encoder = LabelEncoder()
data['Geography'] = label_encoder.fit_transform(data['Geography'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])

# Drop unnecessary columns
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Define feature matrix X and target vector y
X = data.drop(columns=['Exited'])
y = data['Exited']

# Feature scaling (excluding categorical variables)
scaler = StandardScaler()
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
X[numeric_features] = scaler.fit_transform(X[numeric_features])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

# Function to evaluate models
def evaluate_model(y_true, y_pred):
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    roc_auc = roc_auc_score(y_true, y_pred)
    print(f"ROC AUC Score: {roc_auc:.4f}")
    return roc_auc

# Train and evaluate models
roc_auc_scores = {}
for name, model in models.items():
    print(f"\n{name}:")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    roc_auc_scores[name] = evaluate_model(y_test, y_pred)


Logistic Regression:
Accuracy: 0.8050
Confusion Matrix:
[[1552   41]
 [ 349   58]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1593
           1       0.59      0.14      0.23       407

    accuracy                           0.81      2000
   macro avg       0.70      0.56      0.56      2000
weighted avg       0.77      0.81      0.75      2000

ROC AUC Score: 0.5584

Random Forest:
Accuracy: 0.8585
Confusion Matrix:
[[1530   63]
 [ 220  187]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.96      0.92      1593
           1       0.75      0.46      0.57       407

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.74      2000
weighted avg       0.85      0.86      0.84      2000

ROC AUC Score: 0.7100

Gradient Boosting:
Accuracy: 0.8675
Confusion Matrix:
[[1541   52]
 [ 213  194]]
Classificati

In [5]:
# Define parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Display the best parameters
print("\nBest parameters found:", grid_search.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits

Best parameters found: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}


In [None]:
# Train the optimized Gradient Boosting model
best_gb = grid_search.best_estimator_
best_gb.fit(X_train, y_train)

# Evaluate the optimized model
print("\nOptimized Gradient Boosting:")
evaluate_model(y_test, best_gb.predict(X_test))


Optimized Gradient Boosting:
Accuracy: 0.8675
Confusion Matrix:
[[1541   52]
 [ 213  194]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1593
           1       0.79      0.48      0.59       407

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.76      2000
weighted avg       0.86      0.87      0.85      2000

ROC AUC Score: 0.7220


np.float64(0.7220078321773237)