In [None]:
!pip install xgboost imbalanced-learn shap

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, f1_score
import shap
import matplotlib.pyplot as plt

In [None]:
# Step 2: Load and Clean Data
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# TotalCharges can contain blank strings, so it has to be converted to numbers.
# errors='coerce' turns every unparseable value (not only a single space) into NaN,
# and those rows are then removed.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original (which raises SettingWithCopyWarning).
df = df.dropna(subset=['TotalCharges']).copy()

# Encode the target as 1 (churned) / 0 (stayed).
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
import numpy as np

def add_features(df):
    """Return a copy of ``df`` with three engineered columns added.

    Added columns:
        AvgChargesPerMonth: ``TotalCharges`` divided by ``tenure``. A tenure of 0
            is treated as 1 so the result stays finite.
        hasStreaming: 1 when ``StreamingTV`` or ``StreamingMovies`` equals 'Yes',
            otherwise 0.
        ServiceCount: number of the eight service columns that equal 'Yes'.

    The input frame is left unmodified.
    """
    df = df.copy()

    # Dividing by a zero tenure would give inf (or NaN for 0 / 0).
    df['AvgChargesPerMonth'] = df['TotalCharges'] / df['tenure'].replace(0, 1)

    # Column-wise comparisons replace the row-by-row apply: same values, and
    # they also work on an empty frame.
    streaming_cols = ['StreamingTV', 'StreamingMovies']
    df['hasStreaming'] = df[streaming_cols].eq('Yes').any(axis=1).astype(int)

    service_cols = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
    df['ServiceCount'] = df[service_cols].eq('Yes').sum(axis=1)
    return df




In [None]:
# Step 4: separate the target from the predictors.
y = df['Churn']
# customerID is removed from the predictors together with the target column.
X = df.drop(columns=['Churn', 'customerID'])

# Step 5: hold out 20% of the rows for testing; stratifying on y keeps the
# class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:

# Step 6: Preprocessing Pipeline
# The column lists describe the frame as it looks AFTER add_features has run,
# because the preprocessor sits behind the feature-engineering step.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgChargesPerMonth', 'ServiceCount']
# hasStreaming is created by add_features, so it never appears in X.columns.
# It must be listed explicitly: ColumnTransformer drops every column that is
# not assigned to a transformer (remainder='drop' is the default).
bin_cols = ['hasStreaming']
non_categorical = set(num_cols) | set(bin_cols)
cat_cols = [col for col in X.columns if col not in non_categorical]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('bin', 'passthrough', bin_cols),  # already a 0/1 flag, needs no scaling or encoding
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Step 7: Define Base Models
# Ratio of negative to positive training labels, used as the positive-class
# weight for XGBoost.
# NOTE(review): the pipeline also resamples with SMOTETomek, so the positive
# class may end up weighted twice -- confirm this is intended.
class_counts = y_train.value_counts()
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=class_counts[0] / class_counts[1],
    random_state=42,
)

log_reg = LogisticRegression(class_weight='balanced', max_iter=1000)
rf = RandomForestClassifier(random_state=42, n_estimators=100)

In [None]:
# Step 8: Stacking Classifier
# (name, estimator) pairs for the first layer of the stack.
base_learners = [('xgb', xgb), ('lr', log_reg), ('rf', rf)]

# The meta-learner combines the base models' predictions. passthrough=True
# additionally feeds it the original features, and cv=3 sets the number of
# folds used to produce the base predictions it is trained on.
meta_learner = LogisticRegression()
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
    passthrough=True,
)

In [None]:

# Step 9: make add_features importable from a real module.
# The fitted pipeline is pickled later, and pickle stores functions by module
# path, so the transformer has to reference ml_utils.add_features instead of a
# notebook-local function. On a fresh runtime ml_utils.py does not exist yet at
# this point, so it is created from the add_features definition above.
import importlib
import inspect
from pathlib import Path

ml_utils_path = Path('ml_utils.py')
if not ml_utils_path.exists():
    ml_utils_path.write_text(inspect.getsource(add_features))
    # Make sure the import system notices the file that was just written.
    importlib.invalidate_caches()

from ml_utils import add_features
from sklearn.preprocessing import FunctionTransformer
from imblearn.pipeline import Pipeline as ImbPipeline  # if not already imported

# Step 10: build and fit the full pipeline.
feature_engineering = FunctionTransformer(add_features, validate=False)

pipeline = ImbPipeline(steps=[
    ('feature_engineering', feature_engineering),
    ('preprocessing', preprocessor),   # your existing preprocessor
    ('resample', SMOTETomek(random_state=42)),  # optional, if used
    ('classifier', stack_model)
])
pipeline.fit(X_train, y_train)

# Step 11: Evaluation
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]  # probability of the positive (churn) class


In [None]:
# Multi-line summaries: heading on one line, the report/matrix below it.
for heading, body in (
    ("Classification Report:", classification_report(y_test, y_pred)),
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
):
    print(heading)
    print(body)

# Scalar metrics: label and value on the same line.
for label, value in (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("ROC AUC Score:", roc_auc_score(y_test, y_proba)),
):
    print(label, value)

In [None]:
from sklearn.metrics import f1_score
import numpy as np

# Scan candidate cut-offs and keep the one with the highest F1 on y_test.
# Requires y_test and y_proba from the evaluation step.
# NOTE(review): y_test is also the evaluation set, so a threshold chosen here is
# optimistic -- consider selecting it on a separate validation split.
thresholds = np.arange(0.1, 0.9, 0.05)
scores = []
for cutoff in thresholds:
    labels_at_cutoff = (y_proba > cutoff).astype(int)
    scores.append((cutoff, f1_score(y_test, labels_at_cutoff)))

# max() keeps the first entry on ties, i.e. the lowest threshold with the top F1.
best_pair = max(scores, key=lambda pair: pair[1])
best_thresh = best_pair[0]
print(f"Best threshold for F1-score: {best_thresh}")




In [None]:
def predict_churn(new_customer_df, threshold=0.5):
    """Print the churn prediction for every customer in ``new_customer_df``.

    Args:
        new_customer_df: DataFrame with a ``customerID`` column plus the raw
            feature columns expected by the fitted global ``pipeline``. It may
            hold one row or many; every row is reported.
        threshold: A customer is reported as churning when the predicted
            probability is strictly greater than this value.

    Returns:
        None. Three lines are printed per customer (ID, decision, probability).
    """
    customer_ids = new_customer_df['customerID'].values
    input_df = new_customer_df.drop(columns=['customerID'])

    # Let the full pipeline handle feature engineering and preprocessing.
    # Column 1 of predict_proba is the probability of the positive class.
    churn_probs = pipeline.predict_proba(input_df)[:, 1]

    for customer_id, prob in zip(customer_ids, churn_probs):
        pred = int(prob > threshold)

        print(f"CustomerID: {customer_id}")
        print(f"Predicted Churn: {'Yes' if pred else 'No'}")
        print(f"Churn Probability: {prob:.2f}")


In [None]:
# One example customer, assembled from three groups of raw input columns.
# NOTE(review): the variable is called "no_churn" while the ID reads
# "8888-CHURN" -- confirm which outcome this example is meant to illustrate.
account_fields = {
    "customerID": "8888-CHURN",
    "gender": "Female",
    "SeniorCitizen": 1,
    "Partner": "No",
    "Dependents": "No",
    "tenure": 1,
}
service_fields = {
    "PhoneService": "Yes",
    "MultipleLines": "No",
    "InternetService": "Fiber optic",
    "OnlineSecurity": "No",
    "OnlineBackup": "No",
    "DeviceProtection": "No",
    "TechSupport": "No",
    "StreamingTV": "No",
    "StreamingMovies": "No",
}
billing_fields = {
    "Contract": "Month-to-month",
    "PaperlessBilling": "Yes",
    "PaymentMethod": "Electronic check",
    "MonthlyCharges": 95.0,
    "TotalCharges": 95.0,
}

# Merging in this order keeps the columns in the same order as the raw data layout.
customer_no_churn = pd.DataFrame([{**account_fields, **service_fields, **billing_fields}])



In [None]:
# Score the example customer with the default 0.5 threshold.
predict_churn(customer_no_churn)

In [None]:
%%writefile ml_utils.py
def add_features(df):
    """Return a copy of ``df`` with three engineered columns added.

    Added columns:
        AvgChargesPerMonth: ``TotalCharges`` divided by ``tenure``. A tenure of 0
            is treated as 1 so the result stays finite.
        hasStreaming: 1 when ``StreamingTV`` or ``StreamingMovies`` equals 'Yes',
            otherwise 0.
        ServiceCount: number of the eight service columns that equal 'Yes'.

    The input frame is left unmodified.
    """
    df = df.copy()

    # Dividing by a zero tenure would give inf (or NaN for 0 / 0).
    df['AvgChargesPerMonth'] = df['TotalCharges'] / df['tenure'].replace(0, 1)

    # Column-wise comparisons replace the row-by-row apply: same values, and
    # they also work on an empty frame.
    streaming_cols = ['StreamingTV', 'StreamingMovies']
    df['hasStreaming'] = df[streaming_cols].eq('Yes').any(axis=1).astype(int)

    service_cols = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
    df['ServiceCount'] = df[service_cols].eq('Yes').sum(axis=1)
    return df


In [None]:
from ml_utils import add_features


In [None]:


import joblib
# Persist the fitted pipeline to disk. The pipeline's feature-engineering step
# wraps add_features imported from ml_utils, and pickling stores functions by
# reference, so ml_utils.py must be importable wherever churn_model.pkl is loaded.
joblib.dump(pipeline, 'churn_model.pkl')  # Save with imported add_features


In [None]:
# Write the currently installed package versions to colab_requirements.txt.
!pip freeze > colab_requirements.txt


In [None]:
# google.colab is only available on Colab runtimes. Guard the import so the
# notebook still runs to the end elsewhere instead of stopping here.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Hands the file to the browser as a download.
    files.download("colab_requirements.txt")
else:
    print("Not running on Colab; skipping the browser download of colab_requirements.txt.")


In [None]:
pip freeze > requirements.txt
