# In [None]:
# Cell 1: generate synthetic churn dataset, train a pipeline, save model and CSV
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

RANDOM_SEED = 42

# 1) Create synthetic dataset
# All random columns are drawn from NumPy's global generator, seeded once here.
# The draws are made one column at a time, in a fixed order, so the generated
# values are reproducible from RANDOM_SEED.
np.random.seed(RANDOM_SEED)
n = 1000

# customer_id consumes no randomness; it is just a zero-padded running index.
synthetic_columns = {"customer_id": [f"CUST_{i:05d}" for i in range(n)]}
synthetic_columns["age"] = np.random.randint(18, 80, size=n)
synthetic_columns["tenure_months"] = np.random.randint(0, 72, size=n)
synthetic_columns["monthly_charges"] = np.round(
    np.random.uniform(20, 150, size=n), 2
)
synthetic_columns["num_support_tickets"] = np.random.poisson(1.2, size=n)
synthetic_columns["contract_type"] = np.random.choice(
    ["month-to-month", "one-year", "two-year"],
    size=n,
    p=[0.6, 0.25, 0.15],
)
synthetic_columns["payment_method"] = np.random.choice(
    ["electronic_check", "mailed_check", "bank_transfer", "credit_card"],
    size=n,
)
synthetic_columns["has_internet"] = np.random.choice(
    [0, 1], size=n, p=[0.1, 0.9]
)
df = pd.DataFrame(synthetic_columns)

# 2) Create target 'churn'
# Build a linear score from the features: higher charges, more support tickets,
# a month-to-month contract and no internet raise it; longer tenure lowers it.
centered_charges = df["monthly_charges"] - df["monthly_charges"].mean()
is_month_to_month = (df["contract_type"] == "month-to-month").astype(int)
lacks_internet = (df["has_internet"] == 0).astype(int)

logit = (
    0.02 * centered_charges
    + -0.03 * df["tenure_months"]
    + 0.5 * df["num_support_tickets"]
    + 0.6 * is_month_to_month
    + 0.3 * lacks_internet
)

# Damp the score by 10, shift by -1.0, and squash through a sigmoid to get a
# per-customer churn probability; then sample a 0/1 label from it.
linear_score = -1.0 + logit / 10
prob = 1 / (1 + np.exp(-linear_score))
uniform_draws = np.random.rand(n)
df["churn"] = (uniform_draws < prob).astype(int)

# 3) Shuffle rows and save CSV
# The shuffle uses its own random_state and then renumbers the index 0..n-1.
shuffled = df.sample(frac=1, random_state=RANDOM_SEED)
df = shuffled.reset_index(drop=True)
df.to_csv("customer_data.csv", index=False)
print("Saved customer_data.csv with shape:", df.shape)

# 4) Prepare features and pipeline
# customer_id is an identifier and churn is the target, so neither is a feature.
feature_cols = ["age", "tenure_months", "monthly_charges", "num_support_tickets", "contract_type", "payment_method", "has_internet"]
X = df[feature_cols]
y = df["churn"]

# has_internet is a 0/1 flag; it is standardized like the other numeric columns.
numeric_features = ["age", "tenure_months", "monthly_charges", "num_support_tickets", "has_internet"]
categorical_features = ["contract_type", "payment_method"]

numeric_transformer = StandardScaler()
# scikit-learn >= 1.2 names the dense-output flag `sparse_output`; earlier
# releases only accept `sparse` and raise TypeError on the unknown keyword.
# Try the current spelling first and fall back so the cell runs on both.
try:
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Scale numeric columns, one-hot encode categorical ones, and drop anything
# else that might be present in the input frame.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop"
)

clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

# Bundling preprocessing with the classifier means the saved model accepts raw
# feature columns directly at prediction time.
pipeline = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", clf)
])

# 5) Train pipeline
# NOTE: the pipeline is fitted on the full dataset; no hold-out split is made
# here, so this cell produces no generalization estimate.
pipeline.fit(X, y)
print("Trained pipeline.")

# 6) Show feature names after preprocessing (robust across sklearn versions)
prep_step = pipeline.named_steps["prep"]
try:
    # scikit-learn >= 1.0: the fitted ColumnTransformer reports its own output names.
    feat_names = prep_step.get_feature_names_out()
except AttributeError:
    # Older scikit-learn has no get_feature_names_out; build the names manually.
    # Only AttributeError is caught so that genuine failures (e.g. an unfitted
    # pipeline) still surface instead of being silently masked.
    encoder = prep_step.named_transformers_["cat"]
    if hasattr(encoder, "get_feature_names_out"):
        cat_names = encoder.get_feature_names_out(categorical_features)
    else:
        # Pre-1.0 encoders expose the legacy get_feature_names API instead.
        cat_names = encoder.get_feature_names(categorical_features)
    # Numeric columns pass through the scaler one-to-one, so they keep their names.
    feat_names = list(numeric_features) + list(cat_names)
print("Number of features after preprocessing:", len(feat_names))
print(feat_names[:20])

# 7) Save model
# Persist the whole fitted pipeline (preprocessing + classifier) as one artifact.
joblib.dump(value=pipeline, filename="churn_model.pkl")
print("Saved churn_model.pkl")
