In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Seed NumPy's global random state for reproducibility. The train/test split,
# the resamplers and several estimators further down additionally pass their
# own explicit random_state=42.
np.random.seed(42)


In [4]:
# Mount Google Drive at /content/drive; the dataset loaded in the next cell is
# read from a path under this mount point. `google.colab` is imported here, so
# this cell expects a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Data preprocessing: load the churn CSV, clean it, encode it, scale it and
# split it into a feature matrix `X` and a target vector `y`.
file_path = '/content/drive/MyDrive/web3gle/Last Task/customer_churn (1).csv'

df = (
    pd.read_csv(file_path)
    # The customer identifier is not used as a feature
    .drop(columns='customerID')
    # Values of TotalCharges that cannot be parsed as numbers become NaN ...
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    # ... and every row containing a NaN is removed
    .dropna()
    # Binary gender flag: 1 for 'Male', 0 for anything else
    .assign(gender=lambda frame: frame['gender'].eq('Male').astype('int64'))
    # One-hot encode the remaining categorical columns, dropping the first
    # level of each; this is also what produces the 'Churn_Yes' target column
    .pipe(pd.get_dummies, drop_first=True)
)

# Min-max scale the continuous columns.
# NOTE(review): the scaler is fit on every row in this cell, i.e. before any
# train/test split, so the fitted min/max include rows that later end up in
# the test set.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = MinMaxScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# Separate the target from the features
y = df['Churn_Yes']
X = df.drop(columns='Churn_Yes')


Baseline Model Evaluation

In [6]:
# Split the dataset into training and testing sets (stratified on the churn label)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Guard against test-set leakage: the preprocessing cell fitted MinMaxScaler on
# every row, so the test rows influenced the scaling of the training features.
# Re-fitting on the training rows only fixes that. Because min-max scaling is an
# affine map, re-fitting on the already-scaled training rows yields exactly the
# values a scaler fitted on the raw training rows would have produced.
# Scaled test values may therefore fall slightly outside [0, 1].
X_train, X_test = X_train.copy(), X_test.copy()
train_scaler = MinMaxScaler().fit(X_train[cols_to_scale])
X_train[cols_to_scale] = train_scaler.transform(X_train[cols_to_scale])
X_test[cols_to_scale] = train_scaler.transform(X_test[cols_to_scale])

# Initialize models.
# `use_label_encoder` is no longer passed to XGBClassifier: the installed
# XGBoost reports it as an unused parameter on every fit.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Prints, in order, the accuracy, the AUC-ROC (computed from column 1 of
    ``predict_proba``) and the full classification report.

    Args:
        model: Estimator providing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Labels used for fitting.
        y_test: Ground-truth labels used for scoring.

    Returns:
        The same ``model`` object, now fitted.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the AUC
    positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    print(f"Accuracy: {accuracy:.4f}")

    auc = roc_auc_score(y_test, positive_scores)
    print(f"AUC-ROC: {auc:.4f}")

    report = classification_report(y_test, predicted_labels)
    print("Classification Report:\n", report)

    return model

# Baseline: fit and score every model on the original (un-resampled) training split
for name, model in models.items():
    print(f"\nEvaluating {name} on imbalanced data:")
    evaluate_model(
        model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )



Evaluating Logistic Regression on imbalanced data:
Accuracy: 0.8045
AUC-ROC: 0.8348
Classification Report:
               precision    recall  f1-score   support

       False       0.85      0.89      0.87      1033
        True       0.65      0.57      0.61       374

    accuracy                           0.80      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.80      0.80      1407


Evaluating K-Nearest Neighbors on imbalanced data:
Accuracy: 0.7512
AUC-ROC: 0.7668
Classification Report:
               precision    recall  f1-score   support

       False       0.84      0.82      0.83      1033
        True       0.53      0.57      0.55       374

    accuracy                           0.75      1407
   macro avg       0.68      0.69      0.69      1407
weighted avg       0.76      0.75      0.75      1407


Evaluating Random Forest on imbalanced data:
Accuracy: 0.7903
AUC-ROC: 0.8171
Classification Report:
               precision  

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7669
AUC-ROC: 0.8142
Classification Report:
               precision    recall  f1-score   support

       False       0.83      0.86      0.84      1033
        True       0.57      0.52      0.54       374

    accuracy                           0.77      1407
   macro avg       0.70      0.69      0.69      1407
weighted avg       0.76      0.77      0.76      1407



Applying Class Imbalance Techniques

In [7]:
# Random over-sampling, applied to the training split only; the test split is
# passed to the evaluation untouched.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

# Re-fit every model on the over-sampled training data and score it on the test split
for name, model in models.items():
    print(f"\nEvaluating {name} after Random Over-Sampling:")
    evaluate_model(
        model,
        X_train=X_ros,
        X_test=X_test,
        y_train=y_ros,
        y_test=y_test,
    )



Evaluating Logistic Regression after Random Over-Sampling:
Accuracy: 0.7264
AUC-ROC: 0.8346
Classification Report:
               precision    recall  f1-score   support

       False       0.90      0.70      0.79      1033
        True       0.49      0.79      0.61       374

    accuracy                           0.73      1407
   macro avg       0.70      0.75      0.70      1407
weighted avg       0.79      0.73      0.74      1407


Evaluating K-Nearest Neighbors after Random Over-Sampling:
Accuracy: 0.6851
AUC-ROC: 0.7515
Classification Report:
               precision    recall  f1-score   support

       False       0.88      0.66      0.76      1033
        True       0.45      0.75      0.56       374

    accuracy                           0.69      1407
   macro avg       0.66      0.71      0.66      1407
weighted avg       0.76      0.69      0.70      1407


Evaluating Random Forest after Random Over-Sampling:
Accuracy: 0.7783
AUC-ROC: 0.8161
Classification Report:
  

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7498
AUC-ROC: 0.8135
Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.78      0.82      1033
        True       0.52      0.66      0.58       374

    accuracy                           0.75      1407
   macro avg       0.69      0.72      0.70      1407
weighted avg       0.77      0.75      0.76      1407



In [8]:
# SMOTE, applied to the training split only; the test split is passed to the
# evaluation untouched.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Re-fit every model on the SMOTE-resampled training data and score it on the test split
for name, model in models.items():
    print(f"\nEvaluating {name} after SMOTE:")
    evaluate_model(
        model,
        X_train=X_smote,
        X_test=X_test,
        y_train=y_smote,
        y_test=y_test,
    )



Evaluating Logistic Regression after SMOTE:
Accuracy: 0.7420
AUC-ROC: 0.8276
Classification Report:
               precision    recall  f1-score   support

       False       0.89      0.74      0.81      1033
        True       0.51      0.74      0.61       374

    accuracy                           0.74      1407
   macro avg       0.70      0.74      0.71      1407
weighted avg       0.79      0.74      0.75      1407


Evaluating K-Nearest Neighbors after SMOTE:
Accuracy: 0.6979
AUC-ROC: 0.7620
Classification Report:
               precision    recall  f1-score   support

       False       0.88      0.69      0.77      1033
        True       0.46      0.73      0.56       374

    accuracy                           0.70      1407
   macro avg       0.67      0.71      0.67      1407
weighted avg       0.76      0.70      0.71      1407


Evaluating Random Forest after SMOTE:
Accuracy: 0.7704
AUC-ROC: 0.8116
Classification Report:
               precision    recall  f1-score   

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.7655
AUC-ROC: 0.8143
Classification Report:
               precision    recall  f1-score   support

       False       0.87      0.80      0.83      1033
        True       0.55      0.66      0.60       374

    accuracy                           0.77      1407
   macro avg       0.71      0.73      0.72      1407
weighted avg       0.78      0.77      0.77      1407



In [11]:
def evaluate_model_with_class_weight(model, X_train, X_test, y_train, y_test):
    """Evaluate a ``class_weight='balanced'`` copy of ``model`` on the test split.

    When the estimator exposes a ``class_weight`` attribute, an unfitted clone
    with ``class_weight='balanced'`` is fitted on the training data and scored
    via ``evaluate_model`` (accuracy, AUC-ROC and classification report are
    printed). The estimator passed in is never modified, so the shared
    ``models`` dict keeps its original hyper-parameters and earlier cells give
    the same results when re-run after this one.

    When the estimator has no ``class_weight`` attribute, a notice is printed
    and nothing is fitted.

    Args:
        model: Estimator to evaluate; left untouched.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Labels used for fitting.
        y_test: Ground-truth labels used for scoring.

    Returns:
        The fitted class-weighted clone, or ``model`` itself (unchanged) when
        it does not support ``class_weight``.
    """
    if not hasattr(model, 'class_weight'):
        print(f"{model} does not support class_weight parameter.")
        return model

    # Work on a clone: set_params on the original would permanently switch the
    # shared estimator to balanced weights for every later (re-)run.
    weighted_model = clone(model).set_params(class_weight='balanced')
    return evaluate_model(weighted_model, X_train, X_test, y_train, y_test)

# Score every model with balanced class weights on the original training split;
# models without a class_weight setting only print a notice.
for name, model in models.items():
    print(f"\nEvaluating {name} with class weight adjustment:")
    evaluate_model_with_class_weight(
        model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )



Evaluating Logistic Regression with class weight adjustment:
Accuracy: 0.7271
AUC-ROC: 0.8345
Classification Report:
               precision    recall  f1-score   support

       False       0.90      0.70      0.79      1033
        True       0.49      0.79      0.61       374

    accuracy                           0.73      1407
   macro avg       0.70      0.75      0.70      1407
weighted avg       0.79      0.73      0.74      1407


Evaluating K-Nearest Neighbors with class weight adjustment:
KNeighborsClassifier() does not support class_weight parameter.

Evaluating Random Forest with class weight adjustment:
Accuracy: 0.7875
AUC-ROC: 0.8187
Classification Report:
               precision    recall  f1-score   support

       False       0.83      0.90      0.86      1033
        True       0.63      0.48      0.54       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.77      0.79      0.78

**Conclusion**
In this study, we evaluated five classification algorithms (Logistic Regression, K-Nearest Neighbors, Random Forest, Support Vector Machine, and XGBoost) on an imbalanced customer-churn dataset. We applied three class-imbalance (CI) techniques: Random Over-Sampling, SMOTE, and class-weight adjustment (the last only for models that expose a `class_weight` parameter; K-Nearest Neighbors, for example, does not). The results show that CI techniques generally improved recall for the minority (churn) class, but often at the cost of lower minority-class precision and lower overall accuracy, while AUC-ROC stayed roughly the same.