# Exported from a Jupyter notebook cell (In [None]).
# churn_prediction.py
"""
Telco Customer Churn Prediction (IBM Telco Customer Churn)
- Loads dataset (local CSV first; falls back to auto-download from Kaggle if enabled)
- Preprocesses (encoding, scaling)
- Trains Logistic Regression and RandomForest
- Evaluates models (accuracy, ROC AUC, confusion matrix, classification report)
- Saves artifacts (models + preprocessing)
- Interactive prediction mode (choose model and enter feature values)

Dataset reference: Kaggle "Telco Customer Churn" (blastchar). See dataset page for CSV and details.
"""

import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
)

# ---------- User-configurable ----------
# When the local CSV is missing, load_dataset() attempts a Kaggle download
# (kaggle python package first, then the 'kaggle' CLI) only if this is True.
KAGGLE_AUTO_DOWNLOAD = True   # Needs Kaggle credentials (e.g. ~/.kaggle/kaggle.json)
# CSV file name; it is looked up as a relative path, i.e. in the current working directory.
DATA_FILENAME = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
# Directory that receives the pickled preprocessor/models and the JSON summaries.
ARTIFACTS_DIR = "churn_artifacts"
# Seed shared by the train/test split and both models so runs are repeatable.
RANDOM_STATE = 42
# --------------------------------------

# Runs at import time: make sure the artifacts directory exists before anything is saved.
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

def download_from_kaggle():
    """
    Attempt to download the Telco churn dataset into the current directory.

    The 'kaggle' python package is tried first. If importing or using it fails
    for any reason, the system 'kaggle' CLI is tried instead. Either route
    requires Kaggle to be configured (e.g. ~/.kaggle/kaggle.json).

    Returns:
        bool: True if one of the two routes reported success, False otherwise.
        This function never raises for a failed download; it prints the reason.
    """
    dataset = "blastchar/telco-customer-churn"

    try:
        # prefer Kaggle python package if installed
        import kaggle
        print("Found kaggle python package; attempting download...")
        kaggle.api.dataset_download_files(dataset, path=".", unzip=True, quiet=False)
        print("Download complete.")
        return True
    except Exception as e:
        # Deliberately broad: a missing package, missing credentials or a
        # network error should all fall through to the CLI attempt below.
        print(f"kaggle python package not available or failed ({e}). Trying system 'kaggle' CLI...")

    # Local imports keep this block self-contained; both modules are stdlib.
    import shutil
    import subprocess

    cli_path = shutil.which("kaggle")
    if cli_path is None:
        print("Automatic download failed: 'kaggle' CLI not found on PATH.")
        return False

    # Argument list (no shell) so nothing is re-parsed by a command interpreter.
    command = [cli_path, "datasets", "download", "-d", dataset, "-p", ".", "--unzip"]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as ex:
        # e.g. the executable vanished or is not runnable
        print("Automatic download failed:", ex)
        return False

    if completed.returncode == 0:
        print("Downloaded via kaggle CLI.")
        return True

    print(f"kaggle CLI download failed (exit code {completed.returncode}).")
    return False

def load_dataset():
    """
    Return the Telco churn data as a DataFrame.

    Order of attempts:
      1. read DATA_FILENAME if it already exists locally;
      2. otherwise, when KAGGLE_AUTO_DOWNLOAD is enabled, download it and read it.
    If no CSV can be obtained the script prints instructions and exits with status 1.
    """
    csv_path = Path(DATA_FILENAME)

    # Fast path: the CSV is already on disk.
    if csv_path.exists():
        print(f"Loading dataset from local file: {DATA_FILENAME}")
        return pd.read_csv(DATA_FILENAME)

    # Nothing local and the user opted out of downloading.
    if not KAGGLE_AUTO_DOWNLOAD:
        print("Dataset not found and auto-download disabled. Please place the CSV in the script folder.")
        sys.exit(1)

    print("Local dataset not found. Attempting Kaggle download (requires kaggle.json or CLI configured).")
    downloaded = download_from_kaggle()

    # A reported success still has to have produced the expected file.
    if not (downloaded and csv_path.exists()):
        print("Automatic download failed. Please download the dataset CSV from Kaggle and place it here:")
        print("https://www.kaggle.com/datasets/blastchar/telco-customer-churn")
        sys.exit(1)

    print("Loading dataset after download.")
    return pd.read_csv(DATA_FILENAME)

def _make_dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder on both new and old scikit-learn."""
    try:
        # Newer scikit-learn names the dense/sparse switch 'sparse_output'.
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older releases only know the original 'sparse' keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def preprocess(df, fit_transform=True, preprocessor=None):
    """
    Preprocess the Telco dataset:
    - Map the 'Churn' target to 1 ('Yes') / 0 ('No')
    - Drop customerID
    - Convert TotalCharges to numeric (blank entries become MonthlyCharges * tenure,
      or 0 when those columns are not available)
    - Encode categorical features with OneHotEncoder (unknown categories are ignored)
    - Scale numeric features with StandardScaler

    Args:
        df: raw DataFrame; must contain a 'Churn' column. It is not modified.
        fit_transform: if True, build and fit a new ColumnTransformer on df;
            if False, only transform df with the given `preprocessor`.
        preprocessor: fitted ColumnTransformer, required when fit_transform=False.

    Returns:
        (X_transformed, y, preprocessor, feature_names)
        feature_names is a list of output column names when fit_transform=True
        and None when fit_transform=False.

    Raises:
        ValueError: if 'Churn' is missing, contains labels other than Yes/No,
            or if fit_transform=False is requested without a preprocessor.
    """
    if 'Churn' not in df.columns:
        raise ValueError("Expected 'Churn' column in dataset.")
    if not fit_transform and preprocessor is None:
        raise ValueError("fit_transform=False requires a fitted preprocessor.")

    df = df.copy()

    # Target: fail loudly on unexpected labels instead of an obscure cast error.
    labels = df['Churn'].astype(str).str.strip()
    y = labels.map({'Yes': 1, 'No': 0})
    unmapped = y.isna()
    if unmapped.any():
        bad_labels = sorted(labels[unmapped].unique())
        raise ValueError(f"Unexpected values in 'Churn' column: {bad_labels}")
    y = y.astype(int)

    # Drop ID if present
    if 'customerID' in df.columns:
        df = df.drop(columns=['customerID'])

    # Clean TotalCharges (some rows are blank or spaces); coercion turns them into NaN.
    if 'TotalCharges' in df.columns:
        total = pd.to_numeric(df['TotalCharges'], errors='coerce')
        if 'MonthlyCharges' in df.columns and 'tenure' in df.columns:
            estimate = df['MonthlyCharges'] * df['tenure']
        else:
            estimate = 0
        df['TotalCharges'] = total.fillna(estimate)

    # Separate features
    X = df.drop(columns=['Churn'])

    if not fit_transform:
        # The fitted transformer already knows which columns it needs.
        X_transformed = preprocessor.transform(X)
        # feature_names must be provided separately if needed
        return X_transformed, y, preprocessor, None

    # Identify categorical vs numeric
    numeric_cols = X.select_dtypes(include='number').columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('onehot', _make_dense_one_hot_encoder())
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        remainder='drop'
    )
    X_transformed = preprocessor.fit_transform(X)

    # Get feature names (for later interpretability).
    ohe_cat_names = []
    if categorical_cols:
        # Only look at the encoder when it was actually fitted on some columns.
        ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
        if hasattr(ohe, 'get_feature_names_out'):
            ohe_cat_names = list(ohe.get_feature_names_out(categorical_cols))
        else:
            # sklearn < 1.0 fallback
            for col, cats in zip(categorical_cols, ohe.categories_):
                ohe_cat_names.extend(f"{col}_{c}" for c in cats)
    feature_names = list(numeric_cols) + ohe_cat_names
    return X_transformed, y, preprocessor, feature_names

def _fit_and_report(model, header, X_train, X_test, y_train, y_test):
    """Fit `model`, print its test-set metrics under `header`, return (accuracy, roc_auc)."""
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    positive_prob = model.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, predicted)
    auc = roc_auc_score(y_test, positive_prob)

    print(header)
    print("Accuracy: {:.4f}".format(accuracy))
    print("ROC AUC: {:.4f}".format(auc))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted))
    print("Classification Report:\n", classification_report(y_test, predicted))
    return accuracy, auc


def train_and_evaluate(X_train, X_test, y_train, y_test, feature_names):
    """
    Train a logistic regression and a random forest, printing test metrics for each.

    Also prints the top 20 random-forest feature importances when `feature_names`
    lines up with the importance vector.

    Returns:
        (logistic_model, random_forest_model, results) where results maps
        'logistic' / 'random_forest' to {'accuracy': float, 'roc_auc': float}.
    """
    # Logistic Regression
    log = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    acc_log, auc_log = _fit_and_report(
        log, "\n=== Logistic Regression Results ===", X_train, X_test, y_train, y_test
    )

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1)
    acc_rf, auc_rf = _fit_and_report(
        rf, "\n=== Random Forest Results ===", X_train, X_test, y_train, y_test
    )

    # Feature importances from RF (map back to feature names)
    try:
        importances = rf.feature_importances_
        names_line_up = feature_names is not None and len(feature_names) == len(importances)
        if not names_line_up:
            print("\nFeature importances computed but feature names unavailable or mismatch.")
        else:
            ranked = sorted(
                zip(feature_names, importances), key=lambda pair: pair[1], reverse=True
            )
            print("\nTop 20 feature importances (Random Forest):")
            for feature, weight in ranked[:20]:
                print(f"  {feature}: {weight:.4f}")
    except Exception as e:
        print("Could not compute feature importances:", e)

    results = {
        'logistic': {'accuracy': float(acc_log), 'roc_auc': float(auc_log)},
        'random_forest': {'accuracy': float(acc_rf), 'roc_auc': float(auc_rf)},
    }
    return log, rf, results

def save_artifacts(preprocessor, feature_names, log_model, rf_model, results):
    """
    Persist the fitted preprocessor, both models and the JSON summaries to ARTIFACTS_DIR.

    Files written, in order: preprocessor.pkl, feature_names.json,
    logistic_model.pkl, rf_model.pkl, results_summary.json.
    """
    # (file name, open mode, writer) - each writer receives the open file handle.
    outputs = [
        ('preprocessor.pkl', 'wb', lambda fh: pickle.dump(preprocessor, fh)),
        ('feature_names.json', 'w', lambda fh: json.dump(feature_names, fh)),
        ('logistic_model.pkl', 'wb', lambda fh: pickle.dump(log_model, fh)),
        ('rf_model.pkl', 'wb', lambda fh: pickle.dump(rf_model, fh)),
        ('results_summary.json', 'w', lambda fh: json.dump(results, fh, indent=2)),
    ]
    for file_name, mode, write in outputs:
        with open(os.path.join(ARTIFACTS_DIR, file_name), mode) as handle:
            write(handle)
    print(f"Saved artifacts to '{ARTIFACTS_DIR}'")

def _input_columns(preprocessor):
    """Return (all_columns, numeric_columns) that the fitted preprocessor consumes.

    Columns are listed in transformer order without duplicates. Entries whose
    transformer is the string 'drop' are skipped, because those columns are
    discarded by the preprocessor and must not be prompted for.
    """
    all_cols = []
    numeric_cols = []
    for entry in getattr(preprocessor, 'transformers_', ()):
        try:
            name, trans, cols = entry
        except (TypeError, ValueError):
            continue  # not a (name, transformer, columns) triple
        if isinstance(trans, str) and trans == 'drop':
            continue
        if isinstance(cols, str):
            cols = [cols]  # a single column name, not a sequence of characters
        else:
            try:
                cols = list(cols)
            except TypeError:
                continue  # slice/callable column spec: names cannot be recovered
        all_cols.extend(cols)
        if name == 'num':
            numeric_cols = cols
    return list(dict.fromkeys(all_cols)), numeric_cols


def interactive_prediction(preprocessor, feature_names, log_model, rf_model):
    """
    Prompt the user for one customer's feature values and print a churn prediction.

    The original input columns are recovered from the fitted preprocessor.
    Blank answers become 0 for numeric columns and an empty string for
    categorical columns (which the encoder treats as an unknown category).
    Numeric answers that cannot be parsed are reported and replaced by 0.

    Args:
        preprocessor: fitted ColumnTransformer exposing `transformers_` and `transform`.
        feature_names: unused here; kept for interface compatibility.
        log_model: fitted logistic regression model.
        rf_model: fitted random forest model.

    Returns:
        None. Returns early if no input columns can be recovered.
    """
    print("\n--- Interactive Prediction Mode ---")
    print("You will be prompted to enter values for features. "
          "Press ENTER to leave a value blank (numeric -> 0, categorical -> unknown category).")

    col_names, numeric_cols = _input_columns(preprocessor)
    if not col_names:
        print("Could not recover original column names from preprocessor. Please provide a CSV row instead.")
        return

    answers = {}
    for c in col_names:
        answers[c] = input(f"Enter value for {c} (example/default empty): ").strip()

    # Convert to DataFrame with single row
    single_df = pd.DataFrame([answers], columns=col_names)

    # Fill by column role (taken from the preprocessor), not by inferred dtype:
    # a blank categorical answer must stay a string, never become numeric 0.
    for c in numeric_cols:
        if c not in single_df.columns:
            continue
        coerced = pd.to_numeric(single_df[c], errors='coerce')
        if answers[c] and coerced.isna().any():
            print(f"  Warning: '{answers[c]}' is not a valid number for {c}; using 0.")
        single_df[c] = coerced.fillna(0)

    # Transform
    X_single = preprocessor.transform(single_df)

    # Choose model
    choice = input("Which model to use for prediction? [logistic / rf] (default: rf): ").strip().lower()
    if choice not in ('logistic', 'rf'):
        choice = 'rf'

    model = log_model if choice == 'logistic' else rf_model
    prob = model.predict_proba(X_single)[0, 1]
    pred = model.predict(X_single)[0]

    print(f"\nModel: {choice.upper()}  -> Predicted Churn: {'Yes' if pred==1 else 'No'}  (probability={prob:.4f})")

# --------------------- Main -----------------------
if __name__ == "__main__":
    # End-to-end run: load -> split -> preprocess -> train/evaluate -> save -> optional prompts.
    print("Telco Customer Churn Prediction Script")
    print("Dataset reference: Kaggle 'Telco Customer Churn' (blastchar). If you haven't downloaded it, place the CSV named:\n  ", DATA_FILENAME)

    # 1) Load dataset
    df = load_dataset()
    print("Dataset shape:", df.shape)
    print("Columns:", df.columns.tolist())
    if 'Churn' not in df.columns:
        raise ValueError("Expected 'Churn' column in dataset.")

    # 2) Train-test split on the raw rows, stratified on the target.
    #    Splitting before fitting keeps test rows out of the scaler/encoder statistics.
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=RANDOM_STATE, stratify=df['Churn']
    )
    print("Train/Test sizes:", train_df.shape[0], test_df.shape[0])

    # 3) Preprocess: fit on the training rows only, then reuse the fitted
    #    preprocessor to transform the held-out rows.
    X_train, y_train, preprocessor, feature_names = preprocess(
        train_df, fit_transform=True, preprocessor=None
    )
    X_test, y_test, _, _ = preprocess(
        test_df, fit_transform=False, preprocessor=preprocessor
    )
    print("Preprocessing complete. Number of features after transform:", X_train.shape[1])

    # 4) Train and evaluate
    log_model, rf_model, results = train_and_evaluate(X_train, X_test, y_train, y_test, feature_names)

    # 5) Save artifacts
    save_artifacts(preprocessor, feature_names, log_model, rf_model, results)

    # 6) Interactive prediction loop (optional)
    while True:
        try:
            resp = input("\nDo you want to make an interactive prediction now? (y/n): ").strip().lower()
        except EOFError:
            # No stdin available (e.g. non-interactive run): treat as "no".
            resp = 'n'
        if resp in ['y', 'yes']:
            interactive_prediction(preprocessor, feature_names, log_model, rf_model)
        else:
            print("Exiting interactive mode.")
            break

    print("Script finished. Artifacts saved in:", ARTIFACTS_DIR)
