# ðŸ§© Day 13 â€” Encoding + Feature Scaling

Preparing categorical data & standardizing features for ML pipelines.

**Goal:** Convert text-based categories into numeric format and normalize features for model readiness.

In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Step 2: Create a sample dataset
data = {
    'CustomerID': [1, 2, 3, 4, 5],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Subscription': ['Basic', 'Premium', 'Standard', 'Basic', 'Premium'],
    'Age': [23, 45, 31, 35, 52],
    'Monthly_Spend': [25.0, 70.5, 45.2, 35.0, 80.1]
}

df = pd.DataFrame(data)
df

Unnamed: 0,CustomerID,Gender,Subscription,Age,Monthly_Spend
0,1,Male,Basic,23,25.0
1,2,Female,Premium,45,70.5
2,3,Female,Standard,31,45.2
3,4,Male,Basic,35,35.0
4,5,Female,Premium,52,80.1


### Step 3: Label Encoding (for binary or ordinal columns)

In [3]:
label_encoder = LabelEncoder()
df['Gender_encoded'] = label_encoder.fit_transform(df['Gender'])
df[['Gender', 'Gender_encoded']]

Unnamed: 0,Gender,Gender_encoded
0,Male,1
1,Female,0
2,Female,0
3,Male,1
4,Female,0


### Step 4: One-Hot Encoding (for multi-class categories)

In [4]:
ohe = OneHotEncoder(sparse_output=False)
encoded_subscription = pd.DataFrame(
    ohe.fit_transform(df[['Subscription']]),
    columns=ohe.get_feature_names_out(['Subscription'])
)

# Combine with original dataframe
df_encoded = pd.concat([df, encoded_subscription], axis=1).drop(columns=['Subscription'])
df_encoded

Unnamed: 0,CustomerID,Gender,Age,Monthly_Spend,Gender_encoded,Subscription_Basic,Subscription_Premium,Subscription_Standard
0,1,Male,23,25.0,1,1.0,0.0,0.0
1,2,Female,45,70.5,0,0.0,1.0,0.0
2,3,Female,31,45.2,0,0.0,0.0,1.0
3,4,Male,35,35.0,1,1.0,0.0,0.0
4,5,Female,52,80.1,0,0.0,1.0,0.0


### Step 5: Feature Scaling â€” Standardization & Normalization

In [5]:
# Select numerical columns for scaling
num_cols = ['Age', 'Monthly_Spend']

scaler_std = StandardScaler()
scaler_minmax = MinMaxScaler()

df_scaled_std = df_encoded.copy()
df_scaled_minmax = df_encoded.copy()

df_scaled_std[num_cols] = scaler_std.fit_transform(df_encoded[num_cols])
df_scaled_minmax[num_cols] = scaler_minmax.fit_transform(df_encoded[num_cols])

print("\nStandard Scaled Data:")
display(df_scaled_std[num_cols].head())

print("\nMin-Max Scaled Data:")
display(df_scaled_minmax[num_cols].head())


Standard Scaled Data:


Unnamed: 0,Age,Monthly_Spend
0,-1.386042,-1.24924
1,0.761347,0.923559
2,-0.605173,-0.284613
3,-0.214739,-0.771702
4,1.444607,1.381995



Min-Max Scaled Data:


Unnamed: 0,Age,Monthly_Spend
0,0.0,0.0
1,0.758621,0.825771
2,0.275862,0.366606
3,0.413793,0.181488
4,1.0,1.0


### Step 6: Combined Preprocessing Pipeline

In [6]:
categorical_cols = ['Gender', 'Subscription']
numerical_cols = ['Age', 'Monthly_Spend']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ]
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

processed_array = pipeline.fit_transform(df)
processed_df = pd.DataFrame(processed_array)
processed_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,1.0,1.0,0.0,0.0,-1.386042,-1.24924
1,1.0,0.0,0.0,1.0,0.0,0.761347,0.923559
2,1.0,0.0,0.0,0.0,1.0,-0.605173,-0.284613
3,0.0,1.0,1.0,0.0,0.0,-0.214739,-0.771702
4,1.0,0.0,0.0,1.0,0.0,1.444607,1.381995


âœ… **Youâ€™ve just built a clean preprocessing pipeline!**

Now your data is fully encoded & scaled â€” ready for ML models.