In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve
)

import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
df = pd.read_csv("Churn.csv")

In [28]:
df.drop('customerID', axis=1, inplace=True)

In [29]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [30]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [31]:
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
cat_cols = df.drop(columns=num_cols + ['Churn']).columns

In [32]:
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [33]:
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [39]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
X_train_scaled.shape
X_test_scaled.shape

(1409, 30)

In [45]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)

In [47]:
y_pred = log_model.predict(X_test_scaled)
y_prob = log_model.predict_proba(X_test_scaled)[:, 1]

In [48]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8069552874378992

In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.57      0.61       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [50]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_prob)

np.float64(0.8415846443979436)

In [51]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.57      0.61       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [52]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_prob)

np.float64(0.8415846443979436)

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)

rf_model.fit(X_train, y_train)


In [55]:
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_rf)

0.7970191625266146

In [57]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1035
           1       0.65      0.50      0.57       374

    accuracy                           0.80      1409
   macro avg       0.74      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [58]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_prob_rf)

np.float64(0.8262355007879305)

In [59]:
feature_names = X.columns

In [60]:
coefficients = log_model.coef_[0]

In [61]:
import pandas as pd

feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

feature_importance.sort_values(by='Coefficient', ascending=False).head(10)

Unnamed: 0,Feature,Coefficient
10,InternetService_Fiber optic,0.77876
2,TotalCharges,0.497246
23,StreamingMovies_Yes,0.258653
21,StreamingTV_Yes,0.258042
9,MultipleLines_Yes,0.216356
26,PaperlessBilling_Yes,0.181833
28,PaymentMethod_Electronic check,0.181456
17,DeviceProtection_Yes,0.053625
4,SeniorCitizen_1,0.052901
29,PaymentMethod_Mailed check,0.033133


In [62]:
feature_importance.sort_values(by='Coefficient', ascending=False).tail(10)

Unnamed: 0,Feature,Coefficient
18,TechSupport_No internet service,-0.092861
16,DeviceProtection_No internet service,-0.092861
22,StreamingMovies_No internet service,-0.092861
19,TechSupport_Yes,-0.100249
6,Dependents_Yes,-0.104249
13,OnlineSecurity_Yes,-0.12343
24,Contract_One year,-0.286473
25,Contract_Two year,-0.588975
1,MonthlyCharges,-0.921369
0,tenure,-1.219639
