In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

from xgboost import XGBClassifier

In [3]:
# Make sure the output folder for the generated CSV exists; a no-op when it is already there.
os.makedirs(name="data", exist_ok=True)

In [4]:
# Build a reproducible synthetic customer table and persist it to data/churn_data.csv.
#
# The global NumPy seed is fixed and the columns are drawn in the order written
# below, so re-running this cell regenerates exactly the same 2000 rows.
np.random.seed(42)
n = 2000

df = pd.DataFrame({
    # Numeric features, each drawn independently of the others.
    "tenure": np.random.randint(1, 72, n),            # integers 1..71 (upper bound exclusive)
    "monthly_charges": np.random.uniform(20, 120, n),
    "total_charges": np.random.uniform(100, 8000, n),
    # Categorical features, sampled uniformly from the listed levels.
    "contract_type": np.random.choice(
        ["Month-to-month", "One year", "Two year"], n),
    "internet_service": np.random.choice(
        ["DSL", "Fiber optic", "None"], n),
    "payment_method": np.random.choice(
        ["Electronic check", "Mailed check", "Bank transfer", "Credit card"], n),
    "tech_support": np.random.choice(["Yes", "No"], n)
})

# The label is a deterministic rule with no noise: a customer churns exactly
# when they are on a month-to-month contract AND pay more than 70 per month.
# Any model that sees both features can therefore separate the classes
# perfectly, so near-perfect scores on this data are expected.
df["churn"] = np.where(
    (df["contract_type"] == "Month-to-month") &
    (df["monthly_charges"] > 70), 1, 0
)

# Create the target folder here as well so this cell can be run on its own
# (e.g. after a kernel restart) without failing on a missing directory.
os.makedirs("data", exist_ok=True)
df.to_csv("data/churn_data.csv", index=False)
print("✅ Dataset Created")

✅ Dataset Created


In [5]:
# Reload the dataset from disk so the rest of the notebook works from the saved CSV.
df = pd.read_csv(filepath_or_buffer="data/churn_data.csv")

In [6]:
# Integer-encode every object-dtype (string) column of `df` in place.
#
# A separate LabelEncoder is fitted per column and kept in `encoders`
# (column name -> fitted encoder). Refitting one shared encoder would overwrite
# its `classes_` on each pass, leaving only the last column's mapping and
# making it impossible to invert the codes or encode new rows consistently.
# The integer codes written into `df` are the same either way.
encoders = {}
encoder = LabelEncoder()  # stays defined (unfitted) even if there is nothing to encode
for col in df.select_dtypes(include="object").columns:
    # `encoder` is rebound each pass, so after the loop it is the encoder
    # fitted on the last encoded column.
    encoder = encoders[col] = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# 6. SPLIT DATA
# Target is the `churn` column; every other column is used as a feature.
y = df["churn"]
X = df.drop(columns="churn")

# Hold out 25% of the rows for testing. Stratifying on `y` keeps the churn
# rate the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Logistic regression: fit on the training split, then score the held-out split.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class labels feed the classification report; the second probability
# column (index 1 in the model's class ordering) feeds ROC-AUC.
log_pred = log_model.predict(X_test)
log_prob = log_model.predict_proba(X_test)[:, 1]
log_auc = roc_auc_score(y_test, log_prob)

print("\n--- Logistic Regression ---")
print(classification_report(y_test, log_pred))
print("ROC-AUC:", log_auc)



--- Logistic Regression ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       417
           1       1.00      0.99      0.99        83

    accuracy                           1.00       500
   macro avg       1.00      0.99      1.00       500
weighted avg       1.00      1.00      1.00       500

ROC-AUC: 1.0


In [8]:
# Random forest with 200 trees and a fixed seed: fit on the training split,
# then score the held-out split.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model = rf_model.fit(X_train, y_train)

# Predicted labels for the report; second probability column for ROC-AUC.
rf_pred = rf_model.predict(X_test)
rf_prob = rf_model.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_prob)

print("\n--- Random Forest ---")
print(classification_report(y_test, rf_pred))
print("ROC-AUC:", rf_auc)


--- Random Forest ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       417
           1       1.00      1.00      1.00        83

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500

ROC-AUC: 1.0


In [9]:
# Gradient-boosted trees (XGBoost) with log-loss as the evaluation metric and
# a fixed seed: fit on the training split, then score the held-out split.
xgb_model = XGBClassifier(eval_metric="logloss", random_state=42)
xgb_model.fit(X_train, y_train)

# Predicted labels for the report; second probability column for ROC-AUC.
xgb_pred = xgb_model.predict(X_test)
xgb_prob = xgb_model.predict_proba(X_test)[:, 1]
xgb_auc = roc_auc_score(y_test, xgb_prob)

print("\n--- XGBoost ---")
print(classification_report(y_test, xgb_pred))
print("ROC-AUC:", xgb_auc)



--- XGBoost ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       417
           1       1.00      1.00      1.00        83

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500

ROC-AUC: 1.0


In [10]:
# Gather the three test-set ROC-AUC scores into one table, one row per model.
results = pd.DataFrame(
    [
        ("Logistic Regression", log_auc),
        ("Random Forest", rf_auc),
        ("XGBoost", xgb_auc),
    ],
    columns=["Model", "ROC_AUC"],
)

print("\n=== MODEL COMPARISON ===")
print(results)


=== MODEL COMPARISON ===
                 Model  ROC_AUC
0  Logistic Regression      1.0
1        Random Forest      1.0
2              XGBoost      1.0
