In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


In [2]:
df = pd.read_csv("../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")
print("Original Shape:", df.shape)

Original Shape: (7043, 21)


### 1. Data Cleaning

In [3]:
df.drop("customerID", axis=1, inplace=True)

df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

### 2. Feature Engineering

In [4]:

bins = [0, 12, 24, 48, 72]
labels = ["New", "Established", "Loyal", "Very Loyal"]
df["TenureCategory"] = pd.cut(df["tenure"], bins=bins, labels=labels, right=False)


service_cols = ["PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", 
                "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]
df["ServiceAdoptionScore"] = df[service_cols].apply(lambda row: sum(val == "Yes" for val in row), axis=1)


num_services = df["ServiceAdoptionScore"].replace(0, 1)
df["AvgChargesPerService"] = df["MonthlyCharges"] / num_services


df["IsElectronicCheck"] = (df["PaymentMethod"] == "Electronic check").astype(int)
df["IsAutoPay"] = df["PaymentMethod"].str.contains("auto|bank|credit", case=False).astype(int)


### 3. Split Data

In [8]:

X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (5634, 24) (5634,)
Test shape: (1409, 24) (1409,)


### 4. Feature Encoding and Scaling

In [9]:


numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

# Fit transform on training data, transform test data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("After preprocessing:")
print("Train:", X_train_processed.shape)
print("Test:", X_test_processed.shape)


After preprocessing:
Train: (5634, 54)
Test: (1409, 54)


### 5. Class ImbalanceHandling on Training Data

In [8]:
# Save preprocessed data before SMOTE
np.savez("../new_artifacts/X_train_preprocessed.npz", X_train_processed)
np.savez("../new_artifacts/X_test_preprocessed.npz", X_test_processed)
np.savez("../new_artifacts/y_train_preprocessed.npz", y_train)
np.savez("../new_artifacts/y_test_preprocessed.npz", y_test)

print("Saved preprocessed data (before SMOTE):")
print("Train:", X_train_processed.shape, y_train.shape)
print("Test:", X_test_processed.shape, y_test.shape)

Saved preprocessed data (before SMOTE):
Train: (5634, 54) (5634,)
Test: (1409, 54) (1409,)


### 4.1 Save Preprocessed Data (Before SMOTE)

Saving the preprocessed data before applying SMOTE allows us to:
1. Have access to the original class distribution
2. Use this data for different sampling techniques if needed
3. Compare results with and without SMOTE

In [9]:
smote = SMOTE(random_state=42) 
X_train_res, y_train_res = smote.fit_resample(X_train_processed, y_train) 
print("After SMOTE:") 
print("Train:", X_train_res.shape, y_train_res.shape) 
print("Test:", X_test_processed.shape, y_test.shape)

After SMOTE:
Train: (8278, 54) (8278,)
Test: (1409, 54) (1409,)


### 6. Saving Data

In [10]:
np.savez("../new_artifacts/X_train_smote.npz", X_train_res) 
np.savez("../new_artifacts/Y_train_smote.npz", y_train_res) 
np.savez("../new_artifacts/X_test_smote.npz", X_test_processed) 
np.savez("../new_artifacts/Y_test_smote.npz", y_test)

In [11]:

ohe = preprocessor.named_transformers_["cat"]["encoder"]

num_feature_names = list(numeric_features)
cat_feature_names = list(ohe.get_feature_names_out(categorical_features))

all_feature_names = num_feature_names + cat_feature_names
np.save("../new_artifacts/feature_names.npy", np.array(all_feature_names))