In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import os
import joblib

# Start from matplotlib's stock style, then apply seaborn's default theme.
# The seaborn call runs second, so its settings are layered on top of the reset.
plt.style.use("default")
sns.set()

In [None]:
# Load the raw churn table; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="Telco-Customer-Churn.csv")

In [None]:
# First look at the raw table: a few rows, the column/dtype summary, and
# descriptive statistics for every column.
display(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
display(df.describe(include='all'))

In [None]:
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become
# NaN and are therefore counted in the missing-value summary below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Share of missing cells per column, in percent.
missing_pct = 100 * df.isna().mean()
print(missing_pct)

# Bar chart of the same percentages, worst column first.
plt.figure(figsize=(10, 5))
missing_pct.sort_values(ascending=False).plot.bar()
plt.title("Persentase Missing Value")
plt.ylabel("%")
plt.tight_layout()
plt.show()

In [None]:
# Raw class counts of the target column as a bar chart.
df['Churn'].value_counts().plot.bar()
plt.title("Distribusi Target (Churn)")
plt.tight_layout()
plt.show()

# The same distribution as percentages, to quantify the class balance.
print(df['Churn'].value_counts(normalize=True).mul(100))

In [None]:
# Keep only int64/float64 columns so that .corr() is computed on numeric data.
df_num = df.select_dtypes(include=['int64', 'float64'])

# Annotated heatmap of the pairwise correlations between those columns.
plt.figure(figsize=(8, 6))
sns.heatmap(data=df_num.corr(), cmap='coolwarm', annot=True)
plt.title("Heatmap Korelasi")
plt.tight_layout()
plt.show()

In [None]:
# Work on a copy so the frame used for the exploratory cells stays untouched.
df_prep = df.copy()

# Make sure TotalCharges is numeric; unparseable entries become NaN.
df_prep['TotalCharges'] = pd.to_numeric(df_prep['TotalCharges'], errors='coerce')

# Impute missing TotalCharges with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form acts on an intermediate object, raises a
# FutureWarning on recent pandas, and leaves df_prep unchanged under Copy-on-Write.
df_prep['TotalCharges'] = df_prep['TotalCharges'].fillna(df_prep['TotalCharges'].median())

# Clip each numeric feature to its 1st-99th percentile range to limit the
# influence of extreme values.
# NOTE(review): the median above and these quantiles are computed on the full
# dataset, i.e. before the train/test split performed later in the notebook, so
# test rows contribute to the statistics. Consider fitting them on the training
# split only.
for col in ['tenure', 'MonthlyCharges', 'TotalCharges']:
    q1, q99 = df_prep[col].quantile([0.01, 0.99])
    df_prep[col] = df_prep[col].clip(q1, q99)

In [None]:
# Feature matrix: every column except the target and the customerID column.
X = df_prep.drop(['Churn', 'customerID'], axis=1)
# Binary target: 'No' -> 0, 'Yes' -> 1 (any other label would map to NaN).
y = df_prep['Churn'].map({'No': 0, 'Yes': 1})

# Column groups, split by dtype, that the ColumnTransformers in the following
# cells use to route features to the right transformer.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
# Full preprocessing: one-hot encode the categorical columns (categories unseen
# during fit are ignored at transform time) and standardise the numeric columns.
preprocess_full = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', StandardScaler(), num_cols),
    ]
)

In [None]:
def evaluate_model(name, model):
    """Fit ``model`` on the training split and report its test-set performance.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints a classification report and shows a confusion-matrix heatmap.

    Parameters
    ----------
    name : str
        Label used in the printed header and as the plot title.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called on
        the object that is passed in.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Text report first, under a header identifying the model.
    print(f"\n=== {name} ===")
    print(classification_report(y_test, predictions))

    # Confusion matrix drawn as an integer-annotated heatmap.
    plt.figure(figsize=(4, 3))
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt='d',
        cmap='Blues',
    )
    plt.title(name)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

In [None]:
# "Direct" baseline preprocessing: categorical columns are one-hot encoded,
# numeric columns are passed through without scaling.
preprocess_direct = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', 'passthrough', num_cols)
])

# Baseline estimators keyed by the label shown in the report and plot title.
direct_estimators = {
    "Direct - Logistic Regression": LogisticRegression(max_iter=1000),
    "Direct - Random Forest": RandomForestClassifier(random_state=42),
    "Direct - Voting": VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(max_iter=1000)),
            ('knn', KNeighborsClassifier()),
            # probability=True fits an internal cross-validated calibration that
            # shuffles the data; seed it so soft-voting results are reproducible,
            # like the other seeded components in this notebook.
            ('svm', SVC(probability=True, random_state=42)),
        ],
        voting='soft',
    ),
}

# Evaluate every baseline in the same way, in the order declared above.
for label, estimator in direct_estimators.items():
    evaluate_model(
        label,
        Pipeline([
            ('prep', preprocess_direct),
            ('model', estimator),
        ]),
    )

In [None]:
# Estimators evaluated with full preprocessing plus SMOTE oversampling, keyed
# by the label shown in the report and plot title.
smote_estimators = {
    "Preprocess + SMOTE - Logistic Regression": LogisticRegression(max_iter=1000),
    "Preprocess + SMOTE - Random Forest": RandomForestClassifier(random_state=42),
    "Preprocess + SMOTE - Voting": VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(max_iter=1000)),
            ('knn', KNeighborsClassifier()),
            # probability=True fits an internal cross-validated calibration that
            # shuffles the data; seed it so soft-voting results are reproducible,
            # like the other seeded components in this notebook.
            ('svm', SVC(probability=True, random_state=42)),
        ],
        voting='soft',
    ),
}

# SMOTE sits inside an imblearn pipeline, between preprocessing and the model.
# Each model gets its own pipeline, evaluated in the order declared above.
for label, estimator in smote_estimators.items():
    evaluate_model(
        label,
        ImbPipeline([
            ('prep', preprocess_full),
            ('smote', SMOTE(random_state=42)),
            ('model', estimator),
        ]),
    )

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [None]:
# Untuned pipelines (preprocess -> SMOTE -> model) that the grid searches in the
# following cells take as their base estimators.

# Logistic regression.
lr_smote = ImbPipeline([
    ('prep', preprocess_full),
    ('smote', SMOTE(random_state=42)),
    ('model', LogisticRegression(max_iter=1000))
])

# Random forest.
rf_smote = ImbPipeline([
    ('prep', preprocess_full),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42))
])

# Soft-voting ensemble. The estimator order (lr, knn, svm) is the order in which
# any `weights` list passed to the VotingClassifier is interpreted.
voting_smote = ImbPipeline([
    ('prep', preprocess_full),
    ('smote', SMOTE(random_state=42)),
    ('model', VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(max_iter=1000)),
            ('knn', KNeighborsClassifier()),
            # probability=True fits an internal cross-validated calibration that
            # shuffles the data; seed it so that the tuning results are
            # reproducible, like the other seeded components in this notebook.
            ('svm', SVC(probability=True, random_state=42))
        ],
        voting='soft'
    ))
])

In [None]:
# Search grid for the pipeline step named 'model': values tried for its C parameter.
param_lr = dict(model__C=[0.01, 0.1, 1, 10])

In [None]:
# Search grid for the pipeline step named 'model': number of trees, maximum
# depth (None leaves the depth unset) and minimum samples required to split.
param_rf = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

In [None]:
# Search grid for the 'weights' parameter of the pipeline step named 'model':
# equal weights first, then one variant per position with that weight doubled.
param_voting = {
    'model__weights': [[1, 1, 1]] + [
        [2 if position == boosted else 1 for position in range(3)]
        for boosted in range(3)
    ]
}

In [None]:
# 3-fold grid search over param_lr for the logistic-regression pipeline,
# scored by F1 and run on all available cores.
grid_lr = GridSearchCV(
    estimator=lr_smote,
    param_grid=param_lr,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_lr.fit(X_train, y_train)

# Keep the best pipeline found by the search for the final comparison.
best_lr = grid_lr.best_estimator_

In [None]:
# 3-fold grid search over param_rf for the random-forest pipeline,
# scored by F1 and run on all available cores.
grid_rf = GridSearchCV(
    estimator=rf_smote,
    param_grid=param_rf,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Keep the best pipeline found by the search for the final comparison.
best_rf = grid_rf.best_estimator_

In [None]:
# 3-fold grid search over param_voting for the soft-voting pipeline,
# scored by F1 and run on all available cores.
grid_voting = GridSearchCV(
    estimator=voting_smote,
    param_grid=param_voting,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_voting.fit(X_train, y_train)

# Keep the best pipeline found by the search for the final comparison.
best_voting = grid_voting.best_estimator_

In [None]:
# Tuned pipelines to compare, keyed by the label used in reports and plots.
models = {
    "LogReg + SMOTE + Tuning": best_lr,
    "RandomForest + SMOTE + Tuning": best_rf,
    "Voting + SMOTE + Tuning": best_voting
}

# One [label, churn-class F1] row per model; reset here so that re-running the
# cell does not accumulate duplicate rows.
results = []

# NOTE(review): these scores come from the test split and are later used to pick
# the model that gets saved, so the winning score is an optimistic estimate.
for name, model in models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    # The report dict is keyed by the class label as a string; '1' is the
    # churn class under the 0/1 target encoding.
    f1_churn = report['1']['f1-score']
    results.append([name, f1_churn])

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(name)
    # Label the axes exactly like evaluate_model does, so these plots read the
    # same way as the earlier confusion matrices.
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

In [None]:
# Tabulate the collected scores and rank the models, best churn-class F1 first.
results_df = pd.DataFrame(
    results, columns=['Model', 'F1_Churn']
).sort_values(by='F1_Churn', ascending=False)

display(results_df)

In [None]:
# results_df is sorted best-first, so its first row names the winning model.
best_model_name = results_df['Model'].iloc[0]
best_model = models[best_model_name]

# Persist the winning pipeline next to the notebook.
joblib.dump(best_model, "best_churn_model_final.pkl")

print("Model terbaik:", best_model_name)
print("Model berhasil disimpan")