# Notebook 02 – Data Preprocessing & Feature Engineering

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

Load cleaned dataset from previous notebook

In [3]:
# Read the EDA-ready Telco dataset written by the previous notebook.
file_path = "results/EDA_ready_telco.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

Drop customerID

In [4]:
# customerID is a row identifier, not a feature, so it is removed.
# NOTE: df is reassigned here, so re-running this cell without reloading the
# data raises KeyError because the column is already gone.
df = df.drop(columns=['customerID'])

Convert target variable

In [5]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The mapping is guarded so that re-running the cell is a no-op: pushing an
# already-encoded column through the Yes/No dictionary would silently turn
# every label into NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Fail loudly on anything other than the two expected labels; map() leaves
# unmapped values as NaN, which would otherwise surface much later.
if not df['Churn'].isin([0, 1]).all():
    raise ValueError("Churn contains values other than 'Yes'/'No'")

Identify categorical and numerical columns

In [6]:
# Object-dtype columns are treated as categorical features.
categorical_cols = list(df.select_dtypes(include='object').columns)

# 64-bit integer/float columns are treated as numerical features. The encoded
# label is numeric as well, so it is taken out of the feature list explicitly.
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
numerical_cols.remove('Churn')

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

Categorical Columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical Columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']


Train-test Split

In [7]:
# Separate the label from the feature matrix.
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# churn ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train-test split completed.")

Train-test split completed.


Preprocessing Pipeline

In [8]:
# Numerical features: standardise with StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical features: one-hot encode. handle_unknown='ignore' means a category
# that was not seen during fit is encoded as all zeros instead of raising.
categorical_transformer = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer. Columns that appear in neither
# list are dropped, because ColumnTransformer's remainder defaults to 'drop'.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

Fit transformer and save

In [9]:
# Fit on the training split only, so the scaling statistics and the one-hot
# categories are learned without looking at the held-out test rows.
preprocessor.fit(X_train)

# Persist the *fitted* transformer. Without the fit above, the pickle would
# hold an unfitted ColumnTransformer that cannot transform data after loading.
joblib.dump(preprocessor, "results/preprocessor.pkl")

print("Preprocessing and feature engineering pipeline saved.")

Preprocessing and feature engineering pipeline saved.


Save raw (untransformed) train-test splits

In [10]:
# Write each untransformed split to its own CSV, without the index column.
for _split, _csv_path in (
    (X_train, "results/X_train_raw.csv"),
    (X_test, "results/X_test_raw.csv"),
    (y_train, "results/y_train.csv"),
    (y_test, "results/y_test.csv"),
):
    _split.to_csv(_csv_path, index=False)

print("Raw split data saved for reference.")

Raw split data saved for reference.
