In [9]:
# 0) Imports & settings
import os, warnings
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

PROCESSED_DIR = "../data/processed"
REPORTS_DIR = "../reports"
os.makedirs(REPORTS_DIR, exist_ok=True)

RAW_FILE = f"{PROCESSED_DIR}/telco_churn_step1_clean.csv"

# Utility
def title(s): 
    print("\n" + "="*len(s)); print(s); print("="*len(s))

print("Imports & settings ✅")


Imports & settings ✅


In [10]:
title("1) LOAD STEP 1 CLEAN DATA")
df = pd.read_csv(RAW_FILE)
print("Shape:", df.shape)
display(df.head(3))



1) LOAD STEP 1 CLEAN DATA
Shape: (7043, 22)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_bin
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0-6
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25-36
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0-6


In [11]:
title("2) TARGET & FEATURE SPLIT")

# Define target
TARGET = "Churn"
df[TARGET] = df[TARGET].map({"Yes":1, "No":0})  # binary encoding

# Drop customerID (identifier, not predictive)
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])

X = df.drop(columns=[TARGET])
y = df[TARGET]

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts(normalize=True).mul(100).round(2))



2) TARGET & FEATURE SPLIT
X shape: (7043, 20)
y distribution:
 Churn
0    73.46
1    26.54
Name: proportion, dtype: float64


In [12]:
title("3) FEATURE TYPES")

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)



3) FEATURE TYPES
Numeric columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Categorical columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_bin']


In [13]:
title("4) PREPROCESSING PIPELINES")

# Numeric pipeline
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ]
)



4) PREPROCESSING PIPELINES


In [14]:
title("5) TRAIN/TEST SPLIT")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)
print("Churn % in train:", y_train.mean().round(3))
print("Churn % in test:", y_test.mean().round(3))



5) TRAIN/TEST SPLIT
Train size: (5634, 20) Test size: (1409, 20)
Churn % in train: 0.265
Churn % in test: 0.265


In [15]:
title("6) FIT & TRANSFORM")

# Fit only on training data
preprocessor.fit(X_train)

# Transform train & test
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed  = preprocessor.transform(X_test)

print("Transformed shapes:", X_train_transformed.shape, X_test_transformed.shape)

joblib.dump(preprocessor, "../models/preprocessor.joblib")



6) FIT & TRANSFORM
Transformed shapes: (5634, 51) (1409, 51)


['../models/preprocessor.joblib']

In [16]:
title("7) SAVE PROCESSED DATA & METADATA")

# Save transformed arrays
np.save(f"{PROCESSED_DIR}/X_train.npy", X_train_transformed)
np.save(f"{PROCESSED_DIR}/X_test.npy",  X_test_transformed)
np.save(f"{PROCESSED_DIR}/y_train.npy", y_train.values)
np.save(f"{PROCESSED_DIR}/y_test.npy",  y_test.values)

# Save feature names (important for interpretation later)
ohe = preprocessor.named_transformers_["cat"]["onehot"]
cat_features = list(ohe.get_feature_names_out(cat_cols))
final_features = num_cols + cat_features

pd.Series(final_features).to_csv(f"{REPORTS_DIR}/final_features.csv", index=False)
print("Saved processed data & feature list.")



7) SAVE PROCESSED DATA & METADATA
Saved processed data & feature list.
