# In [1]:
"""
customer_churn.py
=================
A single‑file, end‑to‑end machine‑learning workflow for Customer Churn Prediction.

Usage examples
--------------
# Train on a CSV and evaluate on a held‑out test split
python customer_churn.py --data data/telco.csv --target Churn --test_size 0.2 --model xgboost

# Predict churn probability for a JSON record (after training)
python customer_churn.py --predict sample_customer.json --model_path models/best_model.pkl --target Churn

Dependencies
------------
- pandas
- numpy
- scikit‑learn
- imbalanced‑learn
- xgboost (optional but recommended)
- joblib
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Tuple, Dict

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# xgboost is an optional dependency: remember whether the import succeeded so
# that get_model() can raise a clear ImportError only when "xgboost" is
# actually requested, instead of the whole script failing at import time.
try:
    from xgboost import XGBClassifier  # type: ignore
    _HAS_XGB = True
except ImportError:  # pragma: no cover
    _HAS_XGB = False
    XGBClassifier = None  # type: ignore

# Seed passed to train_test_split and to every estimator built in this file.
RANDOM_STATE = 42


def load_data(path: str | Path) -> pd.DataFrame:
    """Read the CSV at ``path`` and return its contents as a DataFrame."""
    return pd.read_csv(path)


def split_X_y(df: pd.DataFrame, target: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Separate features ``X`` and label ``y``.

    Parameters
    ----------
    df : the full dataset, including the target column.
    target : name of the label column.

    Returns
    -------
    ``(X, y)`` where ``X`` is ``df`` without the target column and ``y`` is a
    copy of the target column. Text labels ``"Yes"``/``"No"`` are encoded as
    ``1``/``0``; any other value is left as it was.

    Raises
    ------
    ValueError
        If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in data.")
    X = df.drop(columns=[target])
    y = df[target].copy()

    # Text labels can arrive as ``object`` or as a dedicated pandas string
    # dtype; checking only for ``object`` would skip the encoding for the
    # latter and leave "Yes"/"No" strings in ``y``.
    is_text = pd.api.types.is_object_dtype(y.dtype) or pd.api.types.is_string_dtype(y.dtype)
    if is_text:
        encoded = y.map({"Yes": 1, "No": 0})
        # Values outside the Yes/No vocabulary map to NaN; restore the
        # original value for those rows rather than losing it.
        y = encoded.fillna(y) if encoded.isna().any() else encoded
    return X, y


def build_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Return a ColumnTransformer that preprocesses numeric & categorical columns.

    Numeric columns are median-imputed and standardised; every other column is
    imputed with its most frequent value and one-hot encoded (categories not
    seen during fitting are ignored at transform time).

    Parameters
    ----------
    X : feature frame whose dtypes decide which pipeline each column gets.

    Returns
    -------
    An unfitted ``ColumnTransformer`` bound to the column names of ``X``.
    """
    # Select on the generic "number" kind so that every numeric width
    # (int32, float32, uint8, nullable ints, ...) is scaled as a number.
    # Listing only int64/float64 would push the rest into one-hot encoding.
    numeric_cols = X.select_dtypes(include="number").columns.tolist()
    categorical_cols = X.select_dtypes(exclude="number").columns.tolist()

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    return ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ])


def get_model(name: str) -> Tuple[str, object]:
    """Look up an estimator by its short name (case-insensitive).

    Returns the lower-cased name together with a freshly constructed,
    unfitted estimator. Raises ``ImportError`` when ``xgboost`` is requested
    but not installed, and ``ValueError`` for any unrecognised name.
    """
    key = name.lower()

    def _make_xgboost() -> object:
        # The availability check is deferred so it only fires when this
        # model is the one being asked for.
        if not _HAS_XGB:
            raise ImportError("xgboost is not installed. pip install xgboost")
        return XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=RANDOM_STATE,
            objective="binary:logistic",
        )

    # Factories are lazy: only the requested estimator is ever instantiated.
    factories = {
        "logreg": lambda: LogisticRegression(max_iter=1000, n_jobs=-1, random_state=RANDOM_STATE),
        "rf": lambda: RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
        "gb": lambda: GradientBoostingClassifier(random_state=RANDOM_STATE),
        "xgboost": _make_xgboost,
    }

    factory = factories.get(key)
    if factory is None:
        raise ValueError(f"Unknown model '{key}'. Choose from logreg, rf, gb, xgboost.")
    return key, factory()


def evaluate_model(model, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, float]:
    """Score ``model`` on the held-out data and collect the results in a dict.

    The dict holds ``accuracy``, ``precision``, ``recall`` and ``f1``;
    ``roc_auc`` is added only when the model exposes ``predict_proba``
    (column 1 of its output is used as the score). The four cells of the
    confusion matrix are stored under ``tn``, ``fp``, ``fn`` and ``tp``.
    """
    predictions = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = None

    results = {"accuracy": accuracy_score(y_test, predictions)}
    # These three share one signature; zero_division=0 yields 0 instead of
    # a warning when a denominator is empty.
    for key, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        results[key] = scorer(y_test, predictions, zero_division=0)

    if scores is not None:
        results["roc_auc"] = roc_auc_score(y_test, scores)

    # ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP.
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    results.update(tn=tn, fp=fp, fn=fn, tp=tp)
    return results


def pretty_print_metrics(metrics: Dict[str, float]):
    """Print the metrics dict as a short, human-readable report.

    One line per score (three decimals), with the ROC-AUC line present only
    when ``metrics`` contains ``roc_auc``, followed by a single line listing
    the confusion-matrix cells in TN, FP, FN, TP order.
    """
    report_lines = [
        f"Accuracy:  {metrics['accuracy']:.3f}",
        f"Precision: {metrics['precision']:.3f}",
        f"Recall:    {metrics['recall']:.3f}",
        f"F1‑score:  {metrics['f1']:.3f}",
    ]
    if "roc_auc" in metrics:
        report_lines.append(f"ROC‑AUC:  {metrics['roc_auc']:.3f}")
    print("\n".join(report_lines))

    cells = [metrics[key] for key in ("tn", "fp", "fn", "tp")]
    print("Confusion matrix [TN FP; FN TP] ⇒", *cells)


def train_and_evaluate(
    df: pd.DataFrame,
    target: str,
    model_name: str,
    test_size: float = 0.2,
) -> Tuple[Pipeline, Dict[str, float]]:
    """Fit a preprocessing + model pipeline and score it on a held-out split.

    The data is split with stratification on the label, the preprocessor is
    built from the training features only, and the fitted pipeline is
    returned together with its test-set metrics.
    """
    features, labels = split_X_y(df, target)
    splits = train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=RANDOM_STATE,
    )
    X_train, X_test, y_train, y_test = splits

    column_prep = build_preprocessor(X_train)
    step_name, estimator = get_model(model_name)

    # The estimator step is named after the model so it can be looked up in
    # the saved pipeline by that name.
    pipeline = Pipeline(steps=[("pre", column_prep), (step_name, estimator)])
    pipeline.fit(X_train, y_train)

    return pipeline, evaluate_model(pipeline, X_test, y_test)


def predict_single(model_path: str | Path, json_path: str | Path):
    """Print the churn probability and 0/1 label for one customer record.

    Parameters
    ----------
    model_path : path to a pipeline previously saved with ``joblib.dump``.
    json_path : path to a UTF-8 JSON file holding a single object whose keys
        are feature names.

    Raises
    ------
    ValueError
        If the JSON document is not an object (e.g. a list or a scalar).
    """
    # NOTE(security): joblib.load unpickles the file and can therefore run
    # arbitrary code; only load model files that come from a trusted source.
    model = joblib.load(model_path)

    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which is not UTF-8 on e.g. Windows.
    with open(json_path, encoding="utf-8") as f:
        record = json.load(f)
    if not isinstance(record, dict):
        raise ValueError(
            f"Expected a JSON object describing one customer in '{json_path}', "
            f"got {type(record).__name__}."
        )

    df = pd.DataFrame([record])
    # Column 1 of predict_proba is taken as the churn (positive) probability.
    prob = model.predict_proba(df)[0][1]
    label = int(prob >= 0.5)
    print(f"Churn probability: {prob:.3f} → Label: {label}")


def main(argv: list[str] | None = None):
    """Command-line entry point: train and save a model, or predict one record.

    Parameters
    ----------
    argv : argument list to parse instead of ``sys.argv[1:]``. Passing it
        explicitly (e.g. ``main(["--data", "telco.csv"])``) lets the workflow
        run from a notebook or another script, where ``sys.argv`` holds the
        host process's own arguments.

    Prediction mode (``--predict`` together with ``--model_path``) prints the
    result and returns. Otherwise ``--data`` is required: the model is
    trained, evaluated, and saved as ``best_model.pkl`` under ``--save_dir``.
    Invalid argument combinations exit through ``parser.error`` (status 2).
    """
    parser = argparse.ArgumentParser(description="Customer Churn Prediction")
    parser.add_argument("--data", type=str, help="Path to CSV data file")
    parser.add_argument("--target", type=str, default="Churn", help="Target column name")
    parser.add_argument("--model", type=str, default="xgboost", help="Model: logreg, rf, gb, xgboost")
    parser.add_argument("--test_size", type=float, default=0.2, help="Test size fraction")
    parser.add_argument("--save_dir", type=str, default="models", help="Directory to save trained model")
    parser.add_argument("--predict", type=str, help="Path to JSON file for single prediction")
    parser.add_argument("--model_path", type=str, help="Path to saved model for prediction")
    args = parser.parse_args(argv)

    if args.predict:
        # Fail loudly instead of falling through to training when the saved
        # model was not specified.
        if not args.model_path:
            parser.error("--predict requires --model_path")
        predict_single(args.model_path, args.predict)
        # Plain return (not sys.exit) so callers importing main() keep running.
        return

    if not args.data:
        parser.error("--data is required unless using --predict with --model_path")

    df = load_data(args.data)
    print(f"Loaded data shape: {df.shape}")

    pipe, metrics = train_and_evaluate(df, args.target, args.model, args.test_size)
    pretty_print_metrics(metrics)

    # parents=True so a nested --save_dir such as "out/models" is created too.
    save_dir = Path(args.save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    model_fp = save_dir / "best_model.pkl"
    joblib.dump(pipe, model_fp)
    print(f"Model saved to {model_fp.resolve()}")


if __name__ == "__main__":
    main()


# --- Output captured when the cell above was run inside Jupyter ---
# argparse read the kernel's own command line and rejected its --f=... argument.
#
# usage: ipykernel_launcher.py [-h] [--data DATA] [--target TARGET]
#                              [--model MODEL] [--test_size TEST_SIZE]
#                              [--save_dir SAVE_DIR] [--predict PREDICT]
#                              [--model_path MODEL_PATH]
# ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\Hp\AppData\Roaming\jupyter\runtime\kernel-v385f5e451f6670c4a935d292b217a2dba180d9465.json
#
#
# SystemExit: 2
#
#   warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
