In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [6]:
df = pd.read_csv("Churn_Modelling.csv")
print(df.head())
print(df['Exited'].value_counts()) 

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [7]:
df = df.dropna()

In [8]:
if 'customer_id' in df.columns:
    df.drop('customer_id', axis=1, inplace=True)

In [9]:
df = pd.get_dummies(df, drop_first=True)

In [11]:
X = df.drop('Exited', axis=1)
y = df['Exited']

In [12]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,test_size=0.3,random_state=42,stratify=y)

In [14]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [15]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [16]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

In [17]:
def evaluate_model(y_test, y_pred, model_name):
    print(f"\n--- {model_name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

In [18]:
evaluate_model(y_test, y_pred_lr, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_gb, "Gradient Boosting")


--- Logistic Regression ---
Accuracy: 0.7463333333333333
Confusion Matrix:
 [[2024  365]
 [ 396  215]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.84      2389
           1       0.37      0.35      0.36       611

    accuracy                           0.75      3000
   macro avg       0.60      0.60      0.60      3000
weighted avg       0.74      0.75      0.74      3000


--- Random Forest ---
Accuracy: 0.8583333333333333
Confusion Matrix:
 [[2363   26]
 [ 399  212]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.99      0.92      2389
           1       0.89      0.35      0.50       611

    accuracy                           0.86      3000
   macro avg       0.87      0.67      0.71      3000
weighted avg       0.86      0.86      0.83      3000


--- Gradient Boosting ---
Accuracy: 0.8673333333333333
Confusion Matrix:
 [[2311   78]
 [ 320  

In [19]:
import joblib
joblib.dump(gb, 'churn_prediction_model.pkl')

['churn_prediction_model.pkl']