# Notebook 02 – Data Preprocessing & Feature Engineering

In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


Load cleaned dataset from previous notebook

In [21]:
data_path = r"C:/Users/USER/PycharmProjects/ML/data/Telco-Customer-Churn.csv"
df = pd.read_csv(data_path)

df.head()



Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Basic Cleaning

In [22]:
# Remove customerID from processing (not useful for training)
df = df.drop("customerID", axis=1)

# Convert TotalCharges to numeric
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Fill missing values — correct future-proof method
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

df.isna().sum()


gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Encode Target Variable

In [23]:
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
df["Churn"].value_counts()


Churn
0    5174
1    1869
Name: count, dtype: int64

Feature Split

In [24]:
X = df.drop("Churn", axis=1)
y = df["Churn"]


Identify Column Types

In [25]:
numeric_features = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]

categorical_features = X.select_dtypes(include=["object"]).columns.tolist()


Preprocessing Transformer

In [26]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


Fit the Preprocessor

In [28]:
preprocessor.fit(X)

# Save preprocessor for reuse in NN and Decision Tree notebooks
joblib.dump(preprocessor, "results/preprocessor.pkl")

print("Preprocessor saved successfully.")


Preprocessor saved successfully.


Transform Training Data

In [29]:
X_preprocessed = preprocessor.transform(X)

X_preprocessed


array([[-0.43991649, -1.27744458, -1.16032292, ...,  0.        ,
         1.        ,  0.        ],
       [-0.43991649,  0.06632742, -0.25962894, ...,  0.        ,
         0.        ,  1.        ],
       [-0.43991649, -1.23672422, -0.36266036, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.43991649, -0.87024095, -1.1686319 , ...,  0.        ,
         1.        ,  0.        ],
       [ 2.27315869, -1.15528349,  0.32033821, ...,  0.        ,
         0.        ,  1.        ],
       [-0.43991649,  1.36937906,  1.35896134, ...,  0.        ,
         0.        ,  0.        ]], shape=(7043, 45))

Train-Test Split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((5634, 45), (1409, 45))