
# CIS 9660 – Project #1 (Unified)
## Part A: Telco Customer Churn Classification  
## Part B: Telco CLV (Total Charges) **Regression AI Agent** with Streamlit Deployment

**Dataset:** aai510-group1/telco-customer-churn (Hugging Face)  
**Due:** August 13, 2025

This single notebook contains:
- **Part A (Classification):** your original churn modeling pipeline with EDA, XGBoost, feature importance, SHAP, and insights.
- **Part B (Regression Agent - Option D):** CLV proxy prediction (`Total Charges`) with multiple models, 70/15/15 split, 5-fold CV, MAE/RMSE/R2, 95% conformal intervals, and a Streamlit app + deployment files.



## Part A – Churn Classification

In [None]:

# If on Colab, you may need:
# !pip install datasets pandas scikit-learn xgboost matplotlib plotly pyarrow shap

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from datasets import load_dataset
import shap
import warnings
warnings.filterwarnings('ignore')

# --- Load the Telco churn data and take a first look ---
print("Loading dataset...")
dataset = load_dataset("aai510-group1/telco-customer-churn")
df = pd.DataFrame(dataset['train'])

print("\n=== Dataset Overview ===")
print(f"Shape: {df.shape}")
display(df.head())
print(df.dtypes)

# Coerce the columns the models rely on to numeric dtype; anything that cannot
# be parsed becomes NaN and is imputed just below.
numeric_candidates = ["Churn", "Monthly Charge", "Total Charges", "Tenure in Months"]
for name in [c for c in numeric_candidates if c in df.columns]:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Imputation: missing totals become 0.0, the other two use their column median.
df["Total Charges"] = df["Total Charges"].fillna(0.0)
for name in ("Monthly Charge", "Tenure in Months"):
    df[name] = df[name].fillna(df[name].median())

# Target as integer 0/1; labels that cannot be parsed are treated as 0.
churn_numeric = pd.to_numeric(df["Churn"], errors="coerce")
df["Churn"] = churn_numeric.fillna(0).astype(int)

# Class balance at a glance.
fig = px.histogram(df, x='Churn', color='Churn', title='Churn Distribution')
fig.show()

# Features/target: drop the identifier and the label from the predictors.
# Columns that are not named in the two feature lists below remain in X but are
# not passed to the model, because ColumnTransformer defaults to remainder='drop'.
X = df.drop([c for c in ["Customer ID","Churn"] if c in df.columns], axis=1)
y = df["Churn"]

numeric_features = [c for c in ["Tenure in Months","Monthly Charge","Total Charges"] if c in X.columns]
categorical_features = [c for c in [
    "Gender","Senior Citizen","Partner","Dependents","Phone Service","Multiple Lines",
    "Internet Type","Online Security","Online Backup","Device Protection Plan",
    "Premium Tech Support","Streaming TV","Streaming Movies","Contract",
    "Paperless Billing","Payment Method"
] if c in X.columns]

# Scale numeric columns, one-hot encode categoricals; categories unseen during
# fit are ignored at predict time instead of raising.
preprocessor_cls = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# 80/20 split, stratified so both parts keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Up-weight the positive (churn) class by the negative/positive ratio of the
# training labels; max(1, ...) avoids a division by zero if there are no positives.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost
# and is not needed for integer 0/1 labels.
model_cls = Pipeline([
    ('preprocessor', preprocessor_cls),
    ('classifier', XGBClassifier(
        scale_pos_weight=n_negative / max(1, n_positive),
        eval_metric='aucpr',
        random_state=42
    ))
])

print("\nTraining churn model...")
model_cls.fit(X_train, y_train)

# Hard labels for the report/confusion matrix, probabilities for AUC.
y_pred = model_cls.predict(X_test)
y_proba = model_cls.predict_proba(X_test)[:, 1]

print("\n=== Classification Metrics ===")
print(classification_report(y_test, y_pred))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.4f}")

cm = confusion_matrix(y_test, y_pred)
fig = px.imshow(cm, text_auto=True, labels=dict(x="Predicted", y="Actual"),
                x=['Not Churn', 'Churn'], y=['Not Churn', 'Churn'], title="Confusion Matrix")
fig.show()

# Feature importances: rebuild the post-encoding column names in the same order
# the preprocessor lists its transformers (numeric first, then one-hot).
fitted_steps = model_cls.named_steps
cat_encoder = fitted_steps['preprocessor'].named_transformers_['cat']
cat_features = list(cat_encoder.get_feature_names_out(categorical_features))
all_features = [*numeric_features, *cat_features]

importance = fitted_steps['classifier'].feature_importances_
importance_table = pd.DataFrame({'Feature': all_features, 'Importance': importance})
feature_importance = importance_table.sort_values('Importance', ascending=False)

top_20 = feature_importance.head(20)
fig = px.bar(top_20, x='Importance', y='Feature', title='Top 20 Important Features')
fig.show()

# Plain-text summary for the write-up.
print("\n=== Business Insights (Classification) ===")
print("Top churn drivers (by importance):")
top_5 = feature_importance.head(5)
print(top_5.to_string(index=False))

if "Contract" in df.columns:
    print("\nChurn rate by contract type:")
    churn_rate_by_contract = df.groupby('Contract')['Churn'].mean()
    print(churn_rate_by_contract.sort_values(ascending=False))

if "Monthly Charge" in df.columns:
    print("\nMonthly charges by churn segment:")
    mean_charge_by_churn = df.groupby('Churn')['Monthly Charge'].mean()
    print(mean_charge_by_churn)

# SHAP
print("\nComputing SHAP values on a sample (for speed)...")
X_train_proc = model_cls.named_steps['preprocessor'].transform(X_train)
explainer = shap.TreeExplainer(model_cls.named_steps['classifier'])

# shap_values can be large; explain at most 200 rows. The sampler is seeded so
# the plot is reproducible, in line with random_state=42 used for the split/model.
rng = np.random.default_rng(42)
n_rows = X_train_proc.shape[0]
sample_idx = rng.choice(n_rows, size=min(200, n_rows), replace=False)

X_shap = X_train_proc[sample_idx]
# The transformer may return a SciPy sparse matrix when the one-hot output is
# sparse enough; hand SHAP a dense array in either case.
if hasattr(X_shap, "toarray"):
    X_shap = X_shap.toarray()

shap_values = explainer.shap_values(X_shap)
shap.summary_plot(shap_values, X_shap, feature_names=all_features, max_display=20, show=False)
plt.show()


## Part B – Regression AI Agent (CLV via Total Charges)

In [None]:

# If on Colab, you may need:
# !pip install xgboost datasets scikit-learn pandas numpy joblib plotly streamlit

import json
from pathlib import Path

import joblib
from sklearn.base import clone
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBRegressor

# Part B works on the DataFrame prepared in Part A.
print("Using same Telco dataset. Target = Total Charges (continuous).")

# Regression target.
y_reg = df["Total Charges"].copy()

# Predictors: the same columns as Part A minus the target itself.
candidate_numeric_reg = ["Tenure in Months","Monthly Charge"]
candidate_categorical_reg = [
    "Gender","Senior Citizen","Partner","Dependents","Phone Service","Multiple Lines",
    "Internet Type","Online Security","Online Backup","Device Protection Plan",
    "Premium Tech Support","Streaming TV","Streaming Movies","Contract",
    "Paperless Billing","Payment Method"
]
numeric_features_reg = [c for c in candidate_numeric_reg if c in df.columns]
categorical_features_reg = [c for c in candidate_categorical_reg if c in df.columns]

X_reg = df[numeric_features_reg + categorical_features_reg].copy()

preprocessor_reg = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features_reg),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features_reg)
    ]
)

# 70/15/15 split: hold out 15% for test first, then take 0.1765 of the
# remaining 85% (about 15% of the full data) for validation.
X_train_temp, X_test_r, y_train_temp, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.15, random_state=42
)
X_train_r, X_val_r, y_train_r, y_val_r = train_test_split(
    X_train_temp, y_train_temp, test_size=0.1765, random_state=42
)


def _with_preprocessing(estimator):
    """Wrap *estimator* in a Pipeline behind the shared regression preprocessor."""
    return Pipeline([("prep", preprocessor_reg), ("model", estimator)])


# Candidate pipelines.
pipe_lr = _with_preprocessing(LinearRegression())
pipe_ridge = _with_preprocessing(Ridge(random_state=42))
pipe_xgb = _with_preprocessing(
    XGBRegressor(random_state=42, objective="reg:squarederror", n_estimators=400, n_jobs=-1)
)

# Hyper-parameter grids; keys address the "model" step of each pipeline.
param_grid_ridge = {"model__alpha": [0.1, 1.0, 5.0, 10.0]}
param_grid_xgb = {
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [4, 6, 8],
    "model__subsample": [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0]
}

# Shuffled 5-fold CV shared by both grid searches.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def _regression_metrics(y_true, y_hat):
    """Return MAE, RMSE and R2 for one set of predictions as a dict.

    RMSE is computed as sqrt(MSE) rather than with
    ``mean_squared_error(..., squared=False)``: that keyword was deprecated in
    scikit-learn 1.4 and removed in 1.6, so the explicit square root works on
    every supported version.
    """
    return {
        "MAE": mean_absolute_error(y_true, y_hat),
        "RMSE": float(np.sqrt(mean_squared_error(y_true, y_hat))),
        "R2": r2_score(y_true, y_hat),
    }


# Baseline Linear (no hyper-parameters, so fitted directly on the training split)
pipe_lr.fit(X_train_r, y_train_r)
y_val_lr = pipe_lr.predict(X_val_r)
metrics_lr = {"Model": "LinearRegression", **_regression_metrics(y_val_r, y_val_lr)}

# Ridge: 5-fold grid search on the training split, scored on validation
gs_ridge = GridSearchCV(pipe_ridge, param_grid_ridge, scoring="neg_root_mean_squared_error", cv=cv, n_jobs=-1, verbose=1)
gs_ridge.fit(X_train_r, y_train_r)
y_val_ridge = gs_ridge.predict(X_val_r)
metrics_ridge = {"Model": "Ridge", **_regression_metrics(y_val_r, y_val_ridge),
                 "BestParams": gs_ridge.best_params_}

# XGBoost: same procedure
gs_xgb = GridSearchCV(pipe_xgb, param_grid_xgb, scoring="neg_root_mean_squared_error", cv=cv, n_jobs=-1, verbose=1)
gs_xgb.fit(X_train_r, y_train_r)
y_val_xgb = gs_xgb.predict(X_val_r)
metrics_xgb = {"Model": "XGBoost", **_regression_metrics(y_val_r, y_val_xgb),
               "BestParams": gs_xgb.best_params_}

# Lowest validation RMSE first.
comparison = pd.DataFrame([metrics_lr, metrics_ridge, metrics_xgb]).sort_values("RMSE")
display(comparison)

# Select best and evaluate on Test
best_name = comparison.iloc[0]["Model"]
fitted_candidates = {
    "LinearRegression": pipe_lr,
    "Ridge": gs_ridge.best_estimator_,
    "XGBoost": gs_xgb.best_estimator_,
}
final_model = fitted_candidates[best_name]

# Refit the winner on train + validation before the one-off test evaluation.
X_train_full = pd.concat([X_train_r, X_val_r], axis=0)
y_train_full = pd.concat([y_train_r, y_val_r], axis=0)
final_model.fit(X_train_full, y_train_full)

y_test_pred = final_model.predict(X_test_r)
test_metrics = _regression_metrics(y_test_r, y_test_pred)
display(pd.DataFrame([test_metrics]))

# Predicted vs actual, with the y = x reference line across the observed range.
fig = px.scatter(x=y_test_r, y=y_test_pred, labels={"x":"Actual Total Charges","y":"Predicted Total Charges"},
                 title="Test Set: Predicted vs Actual Total Charges")
diag_min = float(y_test_r.min())
diag_max = float(y_test_r.max())
fig.add_shape(type="line", x0=diag_min, y0=diag_min, x1=diag_max, y1=diag_max)
fig.show()

# Split conformal (95% PI)
# Calibrate on the validation split using a copy of the selected model that is
# fitted on the training split only, so the calibration residuals are
# out-of-sample for the model that produced them.
base_estimator = clone(final_model)
base_estimator.fit(X_train_r, y_train_r)
val_pred = base_estimator.predict(X_val_r)
residuals = np.abs(np.asarray(y_val_r, dtype=float) - val_pred)
alpha = 0.05

# Finite-sample split-conformal quantile: use level ceil((n + 1)(1 - alpha)) / n
# and take an actual calibration residual (method="higher"), not an interpolated
# value. The level is capped at 1, which falls back to the largest residual when
# the calibration set is very small.
n_cal = residuals.size
q_level = min(1.0, float(np.ceil((n_cal + 1) * (1 - alpha))) / n_cal)
q_hat = float(np.quantile(residuals, q_level, method="higher"))

def predict_with_interval(model, X_new, q_hat):
    """Return point predictions with a symmetric band of half-width ``q_hat``.

    Calls ``model.predict(X_new)`` once and returns the tuple
    ``(preds, preds - q_hat, preds + q_hat)``.
    """
    point = model.predict(X_new)
    return point, point - q_hat, point + q_hat

# Show the first ten test-set intervals next to the actual values.
preds, lo, hi = predict_with_interval(base_estimator, X_test_r, q_hat)
display(pd.DataFrame({"pred": preds[:10], "lower_95": lo[:10], "upper_95": hi[:10], "actual": y_test_r.iloc[:10].values}))


def _json_safe(value):
    """Map a float NaN to None so it is written as JSON null, not the bare token NaN."""
    return None if isinstance(value, float) and np.isnan(value) else value


# Save artifacts and write app + requirements
ART_DIR = Path("artifacts_telco_regression")
ART_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): q_hat was calibrated with a copy of the model fitted on the
# training split only, while final_model (saved here) was refit on
# train + validation. Confirm this pairing is intended for the deployed interval.
joblib.dump(final_model, ART_DIR / "final_model.pkl")

# Rows without a "BestParams" entry (the un-tuned linear baseline) carry NaN in
# the comparison table; convert those cells so meta.json stays strict JSON.
validation_records = [
    {key: _json_safe(val) for key, val in row.items()}
    for row in comparison.to_dict(orient="records")
]
meta = {"alpha": alpha, "q_hat": q_hat, "best_model_name": best_name,
        "metrics_validation": validation_records,
        "metrics_test": test_metrics,
        "numeric_features": numeric_features_reg,
        "categorical_features": categorical_features_reg}
with open(ART_DIR / "meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("Saved:", list(ART_DIR.glob("*")))


### Write Streamlit app (`app_telco_regression.py`) and `requirements.txt`

In [None]:

from pathlib import Path

# Source of the Streamlit app, kept as one string and written verbatim to
# app_telco_regression.py below. At runtime the app:
#   - loads final_model.pkl and meta.json from artifacts_telco_regression/,
#   - collects one customer's inputs in the sidebar,
#   - predicts Total Charges and reports prediction +/- meta["q_hat"],
#   - shows the saved test metrics and the name of the selected model.
# NOTE(review): the select-box options are hard-coded here; confirm they match
# the category values present in the training data. The regression preprocessor
# uses OneHotEncoder(handle_unknown='ignore'), so a value it never saw would be
# encoded as all zeros rather than raising.
app_code = '''import json
import joblib
import numpy as np
import pandas as pd
import streamlit as st
from pathlib import Path

st.set_page_config(page_title="Telco CLV (Total Charges) Regression Agent", layout="wide")

ARTIFACT_DIR = Path("artifacts_telco_regression")
MODEL_PATH = ARTIFACT_DIR / "final_model.pkl"
META_PATH = ARTIFACT_DIR / "meta.json"

@st.cache_resource
def load_artifacts():
    model = joblib.load(MODEL_PATH)
    with open(META_PATH, "r") as f:
        meta = json.load(f)
    return model, meta

model, meta = load_artifacts()
q_hat = meta["q_hat"]

st.title("Telco CLV Prediction (Total Charges)")
st.markdown("Estimate Customer Lifetime Value using a regression model trained on a public Telco dataset. Includes 95% prediction intervals via split conformal.")

with st.sidebar:
    st.header("Inputs")
    tenure = st.number_input("Tenure in Months", min_value=0.0, max_value=120.0, value=24.0, step=1.0)
    monthly = st.number_input("Monthly Charge", min_value=0.0, max_value=1000.0, value=70.0, step=1.0)

    def pick(label, options, default_idx=0):
        return st.selectbox(label, options, index=default_idx) if options else None

    gender = pick("Gender", ["Male", "Female"])
    senior = pick("Senior Citizen", ["No", "Yes"])
    partner = pick("Partner", ["No", "Yes"])
    dependents = pick("Dependents", ["No", "Yes"])
    phone = pick("Phone Service", ["Yes", "No"])
    multiline = pick("Multiple Lines", ["No", "Yes"])
    internet = pick("Internet Type", ["Fiber Optic", "DSL", "Cable", "None"])
    online_sec = pick("Online Security", ["No", "Yes"])
    online_bkp = pick("Online Backup", ["No", "Yes"])
    device_plan = pick("Device Protection Plan", ["No", "Yes"])
    tech_support = pick("Premium Tech Support", ["No", "Yes"])
    stream_tv = pick("Streaming TV", ["No", "Yes"])
    stream_movies = pick("Streaming Movies", ["No", "Yes"])
    contract = pick("Contract", ["Month-to-Month", "One Year", "Two Year"])
    paperless = pick("Paperless Billing", ["Yes", "No"])
    pay_method = pick("Payment Method", ["Credit Card", "Bank Transfer", "Mailed Check", "Electronic Check"])

if st.button("Predict CLV"):
    rec = {
        "Tenure in Months": tenure,
        "Monthly Charge": monthly,
        "Gender": gender,
        "Senior Citizen": senior,
        "Partner": partner,
        "Dependents": dependents,
        "Phone Service": phone,
        "Multiple Lines": multiline,
        "Internet Type": internet,
        "Online Security": online_sec,
        "Online Backup": online_bkp,
        "Device Protection Plan": device_plan,
        "Premium Tech Support": tech_support,
        "Streaming TV": stream_tv,
        "Streaming Movies": stream_movies,
        "Contract": contract,
        "Paperless Billing": paperless,
        "Payment Method": pay_method
    }
    X_new = pd.DataFrame([rec])

    y_pred = model.predict(X_new)[0]
    lower = y_pred - q_hat
    upper = y_pred + q_hat

    st.subheader("Result")
    st.metric("Predicted Total Charges (USD)", f"${y_pred:,.2f}")
    st.write(f"95% Prediction Interval: ${lower:,.2f} — ${upper:,.2f}")
    st.caption("Note: Interval uses split conformal prediction.")

st.divider()
st.subheader("Model Performance (Test Summary)")
st.write(pd.DataFrame([meta["metrics_test"]]))

st.divider()
st.subheader("About")
st.markdown(f"""
- Final Model: {meta['best_model_name']}
- Target: Total Charges (USD) - CLV proxy
- Use Case: Retention budgeting, discount caps, upsell targeting
- Disclaimer: Educational demonstration, not financial advice.
""")
'''
# Write the app next to the notebook (current working directory) as UTF-8.
Path("app_telco_regression.py").write_text(app_code, encoding="utf-8")

# Dependency list for deployment: one lower-bounded requirement per line.
requirement_lines = [
    "datasets>=2.19\n",
    "scikit-learn>=1.3\n",
    "xgboost>=1.7\n",
    "pandas>=2.0\n",
    "numpy>=1.25\n",
    "joblib>=1.2\n",
    "plotly>=5.15\n",
    "streamlit>=1.32\n",
]
reqs = "".join(requirement_lines)

# Written to the current working directory, alongside the app file.
requirements_file = Path("requirements.txt")
requirements_file.write_text(reqs, encoding="utf-8")

print("Wrote app_telco_regression.py and requirements.txt")



## Submission Checklist
- This notebook (contains **Part A** and **Part B**).
- `artifacts_telco_regression/final_model.pkl` and `meta.json` (generated after running Part B).
- `app_telco_regression.py` and `requirements.txt` (written in Part B).
- Live Streamlit URL (deploy with `app_telco_regression.py`).
- 1-page technical report (use the template from the earlier response or your own).
