In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV


In [2]:

data = pd.read_csv("Churn_Modelling.csv")


In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"])

In [6]:
data = pd.get_dummies(data, columns=["Geography", "Gender"], drop_first=True)

In [7]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [8]:
X = data.drop(columns=["Exited"])  # Features
y = data["Exited"]  # Target variable

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
print("Training set - Features:", X_train.shape, "Target:", y_train.shape)
print("Testing set - Features:", X_test.shape, "Target:", y_test.shape)

Training set - Features: (8000, 11) Target: (8000,)
Testing set - Features: (2000, 11) Target: (2000,)


In [13]:
model_results = {}

In [14]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

LogisticRegression()

In [15]:
lr_predictions = lr_model.predict(X_test)

In [16]:
# Evaluate the Logistic Regression model
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_roc_auc = roc_auc_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions)
lr_confusion = confusion_matrix(y_test, lr_predictions)


In [17]:
# Store the results in the dictionary
model_results['Logistic Regression'] = {
    'Accuracy': lr_accuracy,
    'ROC AUC': lr_roc_auc,
    'F1 Score': lr_f1,
    'Confusion Matrix': lr_confusion
}

In [18]:
# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [19]:
rf_predictions = rf_model.predict(X_test)

In [20]:
# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_roc_auc = roc_auc_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)
rf_confusion = confusion_matrix(y_test, rf_predictions)

In [21]:

# Store the results in the dictionary
model_results['Random Forest'] = {
    'Accuracy': rf_accuracy,
    'ROC AUC': rf_roc_auc,
    'F1 Score': rf_f1,
    'Confusion Matrix': rf_confusion
}

In [22]:

# Initialize and train the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [23]:

gb_predictions = gb_model.predict(X_test)

In [24]:

# Evaluate the Gradient Boosting model
gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_roc_auc = roc_auc_score(y_test, gb_predictions)
gb_f1 = f1_score(y_test, gb_predictions)
gb_confusion = confusion_matrix(y_test, gb_predictions)

In [25]:
# Store the results in the dictionary
model_results['Gradient Boosting'] = {
    'Accuracy': gb_accuracy,
    'ROC AUC': gb_roc_auc,
    'F1 Score': gb_f1,
    'Confusion Matrix': gb_confusion
}

In [26]:
# Display the results
for model_name, results in model_results.items():
    print(f"Results for {model_name}:")
    print(f"Accuracy: {results['Accuracy']:.4f}")
    print(f"ROC AUC: {results['ROC AUC']:.4f}")
    print(f"F1 Score: {results['F1 Score']:.4f}")
    print("Confusion Matrix:")
    print(results['Confusion Matrix'])
    print('\n')

Results for Logistic Regression:
Accuracy: 0.8005
ROC AUC: 0.5250
F1 Score: 0.1231
Confusion Matrix:
[[1573   34]
 [ 365   28]]


Results for Random Forest:
Accuracy: 0.8665
ROC AUC: 0.7170
F1 Score: 0.5808
Confusion Matrix:
[[1548   59]
 [ 208  185]]


Results for Gradient Boosting:
Accuracy: 0.8675
ROC AUC: 0.7244
F1 Score: 0.5917
Confusion Matrix:
[[1543   64]
 [ 201  192]]




In [28]:
param_grid_lr = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
param_grid_rf = {'n_estimators': [50, 100, 200, 300],
                 'max_depth': [10, 20, 30, 40]}
param_grid_gb = {'n_estimators': [50, 100, 200, 300],
                 'learning_rate': [0.01, 0.1, 0.2, 0.3]}

In [29]:
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='f1')
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='f1')
grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5, scoring='f1')


In [30]:
grid_search_lr.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)
grid_search_gb.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=42),
             param_grid={'learning_rate': [0.01, 0.1, 0.2, 0.3],
                         'n_estimators': [50, 100, 200, 300]},
             scoring='f1')

In [31]:
best_params_lr = grid_search_lr.best_params_
best_params_rf = grid_search_rf.best_params_
best_params_gb = grid_search_gb.best_params_


In [32]:

print("Best hyperparameters for Logistic Regression:", best_params_lr)
print("Best hyperparameters for Random Forest:", best_params_rf)
print("Best hyperparameters for Gradient Boosting:", best_params_gb)

Best hyperparameters for Logistic Regression: {'C': 0.001}
Best hyperparameters for Random Forest: {'max_depth': 30, 'n_estimators': 200}
Best hyperparameters for Gradient Boosting: {'learning_rate': 0.1, 'n_estimators': 300}
