In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

# --- Data loading & cleaning -------------------------------------------------
print("--- Loading Data ---")
df = pd.read_csv("data/DataSet.csv")

# TotalCharges is parsed as a number; anything that cannot be parsed becomes
# NaN (errors='coerce') so that those rows can be discarded below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# One chained pass:
#   1. discard rows whose TotalCharges could not be parsed,
#   2. remove the customerID column,
#   3. collapse the two "no ... service" labels into a plain 'No' in every column.
df = (
    df.dropna(subset=['TotalCharges'])
      .drop(columns='customerID')
      .replace(['No internet service', 'No phone service'], 'No')
)

print(f"Data Shape: {df.shape}")
print(f"Class Distribution:\n{df['Churn'].value_counts()}")

# --- Features / target -------------------------------------------------------
# Everything except 'Churn' is a feature; 'Churn' itself is integer-encoded.
target = df['Churn']
X = df.drop(columns='Churn')
y = LabelEncoder().fit_transform(target)

# Column groups, chosen by dtype, that the preprocessing step will treat
# differently (numeric vs. object-typed columns).
num_cols = list(X.select_dtypes(include=['number']).columns)
cat_cols = list(X.select_dtypes(include=['object']).columns)


print(f"Numerical Columns: {num_cols}")
print(f"Categorical Columns: {cat_cols}")

# --- Train / test split ------------------------------------------------------
# 20% of the rows are held out; stratify=y keeps the class ratio the same in
# both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Preprocessing -----------------------------------------------------------
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present during fit, and sparse_output=False yields a dense array.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    cat_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# --- Full pipeline: preprocessing -> SMOTE -> LightGBM -----------------------
# The imblearn Pipeline is used (rather than sklearn's) because it accepts a
# resampling step such as SMOTE between the transformer and the classifier.
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(random_state=42, verbose=-1)),
])

# Hyperparameter Tuning
# Candidate values for the LightGBM step. The 'classifier__' prefix routes each
# key to the pipeline step named 'classifier'.
param_dist = {
    'classifier__n_estimators': [100, 200, 500],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [-1, 10, 20],
    'classifier__num_leaves': [20, 31, 50],
    'classifier__subsample': [0.8, 1.0],
    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is non-zero, and its default is 0.
    # Without this entry the two `subsample` candidates above train identical
    # models and the search wastes iterations on duplicates. A frequency of 1
    # re-draws the row sample at every boosting iteration; subsample=1.0 still
    # means "use all rows".
    'classifier__subsample_freq': [1],
    'classifier__colsample_bytree': [0.8, 1.0]
}

print("--- Starting Hyperparameter Tuning ---")
# Randomized search over the space above:
#   - n_iter=20 parameter combinations are sampled (seeded by random_state),
#   - each is scored with 3-fold cross-validation on the training split,
#   - scoring='f1' ranks candidates by F1 of the class encoded as 1,
#   - n_jobs=-1 runs the fits in parallel on all available cores.
# The whole pipeline is the estimator, so every step (including SMOTE) is
# re-fitted for each candidate and fold.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=0,
    random_state=42
)

search.fit(X_train, y_train)
# best_estimator_ is the pipeline refitted on the full training split with the
# best-scoring parameter combination.
best_model = search.best_estimator_

print(f"Best Parameters: {search.best_params_}")

# --- Evaluation on the held-out test split -----------------------------------
print("--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)

# Compute both summaries first, then print them under their headings.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(cm)
print("\nClassification Report:")
print(report)

# --- Persist the fitted pipeline ---------------------------------------------
# The whole pipeline (preprocessing + classifier) is written with joblib so it
# can be reloaded and applied to raw feature rows.
print("--- Saving Model ---")
model_path = 'data/best_churn_model.pkl'
joblib.dump(best_model, model_path)
print(f"Model saved as '{model_path}'")

--- Loading Data ---
Data Shape: (7032, 20)
Class Distribution:
Churn
No     5163
Yes    1869
Name: count, dtype: int64
Numerical Columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical Columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
--- Starting Hyperparameter Tuning ---




Best Parameters: {'classifier__subsample': 1.0, 'classifier__num_leaves': 20, 'classifier__n_estimators': 200, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.01, 'classifier__colsample_bytree': 0.8}
--- Evaluating Best Model ---
Confusion Matrix:
[[809 224]
 [100 274]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.78      0.83      1033
           1       0.55      0.73      0.63       374

    accuracy                           0.77      1407
   macro avg       0.72      0.76      0.73      1407
weighted avg       0.80      0.77      0.78      1407

--- Saving Model ---
Model saved as 'data/best_churn_model.pkl'


