In [1]:
# -----------------------------
# 1️⃣ Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# -----------------------------
# 2️⃣ Load Dataset
# -----------------------------
df = pd.read_excel("P585 Churn.xlsx")

df.columns = df.columns.str.strip()
df.replace(['Nan', 'nan', 'NaN', 'NAN'], np.nan, inplace=True)
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# -----------------------------
# 3️⃣ Feature & Target
# -----------------------------
X = df.drop('churn', axis=1)
y = df['churn']

# -----------------------------
# 4️⃣ Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 5️⃣ Impute + Encode
# -----------------------------
categorical_cols = ['state', 'voice.plan', 'intl.plan']
numeric_cols = X_train.select_dtypes(include=np.number).columns

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

X_train_num = num_imputer.fit_transform(X_train[numeric_cols])
X_test_num = num_imputer.transform(X_test[numeric_cols])

X_train_cat = cat_imputer.fit_transform(X_train[categorical_cols])
X_test_cat = cat_imputer.transform(X_test[categorical_cols])

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_enc = ohe.fit_transform(X_train_cat)
X_test_enc = ohe.transform(X_test_cat)

X_train_final = np.hstack([X_train_num, X_train_enc])
X_test_final = np.hstack([X_test_num, X_test_enc])

# -----------------------------
# 6️⃣ Scaling
# -----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

# -----------------------------
# 7️⃣ SMOTEENN
# -----------------------------
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X_train_scaled, y_train)

# -----------------------------
# 8️⃣ Logistic Regression + GridSearch
# -----------------------------
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

lr = LogisticRegression(max_iter=1000, random_state=42)
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='f1', n_jobs=-1)
grid_lr.fit(X_res, y_res)
best_lr = grid_lr.best_estimator_

print("Best Logistic Regression Params:", grid_lr.best_params_)

# -----------------------------
# 9️⃣ XGBoost Model
# -----------------------------
y_res_encoded = (y_res == 'yes').astype(int)
y_test_encoded = (y_test == 'yes').astype(int)

xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=y_res_encoded.value_counts()[0] / y_res_encoded.value_counts()[1]
)
xgb.fit(X_res, y_res_encoded)

# -----------------------------
# 🔟 Threshold Prediction Function
# -----------------------------
def predict_with_threshold(model, X, threshold=0.4):
    probs = model.predict_proba(X)[:, 1]
    pred = np.where(probs >= threshold, 'yes', 'no')
    return pred

# -----------------------------
# 1️⃣1️⃣ Final Evaluation on XGBoost
# -----------------------------
y_pred = predict_with_threshold(xgb, X_test_scaled, threshold=0.4)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# -----------------------------
# 1️⃣2️⃣ Save Artifacts for Deployment
# -----------------------------
joblib.dump(num_imputer, "num_imputer.pkl")
joblib.dump(cat_imputer, "cat_imputer.pkl")
joblib.dump(ohe, "encoder.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(xgb, "xgb_model.pkl")

print("✅ Saved: num_imputer.pkl, cat_imputer.pkl, encoder.pkl, scaler.pkl, xgb_model.pkl")


  df.replace(['Nan', 'nan', 'NaN', 'NAN'], np.nan, inplace=True)


Best Logistic Regression Params: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

          no       0.97      0.90      0.94       859
         yes       0.59      0.83      0.69       141

    accuracy                           0.89      1000
   macro avg       0.78      0.87      0.81      1000
weighted avg       0.92      0.89      0.90      1000

[[777  82]
 [ 24 117]]
✅ Saved: num_imputer.pkl, cat_imputer.pkl, encoder.pkl, scaler.pkl, xgb_model.pkl
