In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Churn_Modelling.csv')
print(df.shape)
df.head()


(10000, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
y = df['Exited'].astype(float)
x = df.drop('Exited', axis=1)

print("NaNs in y:", y.isna().sum())


NaNs in y: 0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], inplace=True)

y = df['Exited']
X = df.drop('Exited', axis=1)


In [6]:
num_cols = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "EstimatedSalary",
    "HasCrCard",
    "IsActiveMember"
]

cat_cols = [
    "Geography",
    "Gender"
]


In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop='first',sparse_output=False), cat_cols)
    ]
)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, Ridge,Lasso,ElasticNet
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, matthews_corrcoef, classification_report
)

X_train,X_test, y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [9]:
train_samples = len(X_train)
test_samples = len(X_test)
train_test_ratio = train_samples / (train_samples + test_samples)

print(f"Train samples: {train_samples}")
print(f"Test samples: {test_samples}")
print(f"Split ratio: {train_test_ratio:.1%}")


Train samples: 8000
Test samples: 2000
Split ratio: 80.0%


In [10]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    
    # Some regression models output continuous values
    if y_pred.ndim > 1:
        y_pred = y_pred.ravel()
        
    y_pred_class = (y_pred >= 0.5).astype(int)

    print(f"\n===== {model_name} =====")
    print("Accuracy:", accuracy_score(y_test, y_pred_class))
    print("Precision:", precision_score(y_test, y_pred_class))
    print("Recall:", recall_score(y_test, y_pred_class))
    print("F1 Score:", f1_score(y_test, y_pred_class))
    print("ROC AUC:", roc_auc_score(y_test, y_pred))
    print("MCC Score: ", matthews_corrcoef(y_test, y_pred_class))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_class))


In [11]:
log_reg = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])
log_reg.fit(X_train, y_train)

evaluate_model(log_reg, X_test, y_test, "Logistic Regression")



===== Logistic Regression =====
Accuracy: 0.811
Precision: 0.5524475524475524
Recall: 0.2010178117048346
F1 Score: 0.2947761194029851
ROC AUC: 0.5805960247074267
MCC Score:  0.2485843937619467
Confusion Matrix:
 [[1543   64]
 [ 314   79]]


In [12]:
log_reg_elastic = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        penalty='elasticnet',
        solver='saga',         # REQUIRED
        l1_ratio=0.5,          # 0 = Ridge, 1 = Lasso
        C=1.0,
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ))
])

log_reg_elastic.fit(X_train, y_train)

evaluate_model(log_reg_elastic, X_test, y_test, "Logistic Regression (Elastic Net)")



===== Logistic Regression (Elastic Net) =====
Accuracy: 0.7195
Precision: 0.38461538461538464
Recall: 0.712468193384224
F1 Score: 0.49955396966993754
ROC AUC: 0.7168439286771774
MCC Score:  0.35815599990241
Confusion Matrix:
 [[1159  448]
 [ 113  280]]




In [13]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

log_reg_ridge = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        penalty='l2',          # Ridge
        C=1.0,
        max_iter=1000,
        class_weight='balanced',
        random_state=42
    ))
])

log_reg_ridge.fit(X_train, y_train)

evaluate_model(log_reg_ridge, X_test, y_test, "Logistic Regression (Ridge / L2)")



===== Logistic Regression (Ridge / L2) =====
Accuracy: 0.719
Precision: 0.3840877914951989
Recall: 0.712468193384224
F1 Score: 0.49910873440285203
ROC AUC: 0.7165327899092869
MCC Score:  0.35753728781844946
Confusion Matrix:
 [[1158  449]
 [ 113  280]]




In [14]:
from sklearn.tree import DecisionTreeClassifier

dt_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(
        criterion='gini',
        max_depth=6,
        min_samples_split=20,
        class_weight='balanced',
        random_state=42
    ))
])

dt_pipe.fit(X_train, y_train)

evaluate_model(dt_pipe, X_test, y_test, "Decision Tree Classifier")



===== Decision Tree Classifier =====
Accuracy: 0.747
Precision: 0.42375168690958165
Recall: 0.7989821882951654
F1 Score: 0.5537918871252204
ROC AUC: 0.766634840258348
MCC Score:  0.43876213062195285
Confusion Matrix:
 [[1180  427]
 [  79  314]]


In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(
        n_neighbors=5,
        weights='distance'
    ))
])

knn_pipe.fit(X_train, y_train)

evaluate_model(knn_pipe, X_test, y_test, "K-Nearest Neighbors")



===== K-Nearest Neighbors =====
Accuracy: 0.8455
Precision: 0.6578947368421053
Recall: 0.44529262086513993
F1 Score: 0.5311077389984825
ROC AUC: 0.6943326825545364
MCC Score:  0.45479396180904996
Confusion Matrix:
 [[1516   91]
 [ 218  175]]


In [16]:
from sklearn.naive_bayes import GaussianNB

nb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

nb_pipe.fit(X_train, y_train)

evaluate_model(nb_pipe, X_test, y_test, "Gaussian Naive Bayes")



===== Gaussian Naive Bayes =====
Accuracy: 0.8335
Precision: 0.6351351351351351
Recall: 0.35877862595419846
F1 Score: 0.4585365853658537
ROC AUC: 0.6541870727779704
MCC Score:  0.39006753620209933
Confusion Matrix:
 [[1526   81]
 [ 252  141]]


In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=20,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

rf_pipe.fit(X_train, y_train)

evaluate_model(rf_pipe, X_test, y_test, "Random Forest Classifier")



===== Random Forest Classifier =====
Accuracy: 0.839
Precision: 0.5728952772073922
Recall: 0.7099236641221374
F1 Score: 0.634090909090909
ROC AUC: 0.790244968339849
MCC Score:  0.5374214102094945
Confusion Matrix:
 [[1399  208]
 [ 114  279]]


In [18]:
from xgboost import XGBClassifier

xgb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42
    ))
])

xgb_pipe.fit(X_train, y_train)

evaluate_model(xgb_pipe, X_test, y_test, "XGBoost Classifier")



===== XGBoost Classifier =====
Accuracy: 0.871
Precision: 0.7454545454545455
Recall: 0.5216284987277354
F1 Score: 0.6137724550898204
ROC AUC: 0.739034535611534
MCC Score:  0.5516123294111714
Confusion Matrix:
 [[1537   70]
 [ 188  205]]


In [None]:
import joblib
import os

os.makedirs("models", exist_ok=True)

joblib.dump(log_reg, "models/logistic_regression.pkl")
joblib.dump(rf_pipe, "models/random_forest.pkl")
joblib.dump(xgb_pipe, "models/XGBoost.pkl")


['models/XGBoost_model.pkl']