In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


In [3]:
data = pd.read_csv("../data/raw/hmQOVnDvRN.csv")
print("Original Shape:", data.shape)

# Data Cleaning
# ================================

# Drop customerID
data.drop("customerID", axis=1, inplace=True)

# Convert TotalCharges to numeric and handle missing
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors='coerce')
data["TotalCharges"] = data["TotalCharges"].fillna(data["TotalCharges"].median())

# Encode target
data["Churn"] = data["Churn"].map({"Yes": 1, "No": 0})



Original Shape: (7043, 21)


In [4]:
# Feature Engineering
# ================================

# Tenure categories
bins = [0, 12, 24, 48, 72]
labels = ["New", "Established", "Loyal", "Very Loyal"]
data["TenureCategory"] = pd.cut(data["tenure"], bins=bins, labels=labels, right=False)

# Service adoption score (count number of services)
service_cols = ["PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", 
                "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
data["ServiceAdoptionScore"] = data[service_cols].apply(lambda row: sum(val == "Yes" for val in row), axis=1)

# Average monthly charges per service
num_services = data["ServiceAdoptionScore"].replace(0, 1)
data["AvgChargesPerService"] = data["MonthlyCharges"] / num_services

# Payment reliability indicators (e.g., electronic check vs auto-pay)
data["IsElectronicCheck"] = (data["PaymentMethod"] == "Electronic check").astype(int)
data["IsAutoPay"] = data["PaymentMethod"].str.contains("auto|bank|credit", case=False).astype(int)


In [5]:
# Train-Test Split (after feature engineering)
# ================================

X = data.drop("Churn", axis=1)
y = data["Churn"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (5634, 24) (5634,)
Test shape: (1409, 24) (1409,)


In [6]:
# Encoding Strategies + Scaling
# ================================

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

# Fit transform on training data, transform test data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("After preprocessing:")
print("Train:", X_train_processed.shape)
print("Test:", X_test_processed.shape)

After preprocessing:
Train: (5634, 54)
Test: (1409, 54)


In [7]:
# Apply SMOTE (only on training data) # ================================ 
smote = SMOTE(random_state=42) 
X_train_res, y_train_res = smote.fit_resample(X_train_processed, y_train) 
print("After SMOTE:") 
print("Train:", X_train_res.shape, y_train_res.shape) 
print("Test:", X_test_processed.shape, y_test.shape)

After SMOTE:
Train: (8278, 54) (8278,)
Test: (1409, 54) (1409,)


In [8]:
# Save Preprocessed Data # ================================ 
np.savez("../new_artifacts/X_train_smote.npz", X_train_res) 
np.savez("../new_artifacts/Y_train_smote.npz", y_train_res) 
np.savez("../new_artifacts/X_test_smote.npz", X_test_processed) 
np.savez("../new_artifacts/Y_test_smote.npz", y_test)

In [10]:

# Get feature names from ColumnTransformer
ohe = preprocessor.named_transformers_["cat"]["encoder"]

# Numeric + OneHotEncoded categorical names
num_feature_names = list(numeric_features)
cat_feature_names = list(ohe.get_feature_names_out(categorical_features))

all_feature_names = num_feature_names + cat_feature_names
np.save("../new_artifacts/feature_names.npy", np.array(all_feature_names))

