# Task 2.2: Dataset Preprocessing Pipeline - SOLUTIONS

**Module:** 2 - Python for AI/ML  

This notebook contains solutions to all exercises from the Preprocessing Pipeline Lab.

---

In [None]:
import numpy as np
import pandas as pd
import warnings
# Install a global "ignore" filter for every warning category, presumably to
# keep the notebook output uncluttered.
# NOTE(review): this also hides pandas deprecation / FutureWarning messages
# raised by the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Banner followed by a 50-character rule.
print("Solutions Notebook for Preprocessing Pipeline")
print("=" * 50)

In [None]:
# Recreate the sample data
np.random.seed(42)
n_samples = 1000

# Category labels and their sampling probabilities. `None` in the education
# labels produces missing values directly at generation time.
education_levels = ['High School', 'Bachelor', 'Master', 'PhD', None]
education_probs = [0.3, 0.35, 0.2, 0.1, 0.05]
employment_types = ['Full-time', 'Part-time', 'Self-employed', 'Unemployed']
employment_probs = [0.6, 0.15, 0.15, 0.1]

# The columns are drawn one after another from the same seeded generator, so
# the order of the statements below determines the generated values.
data = {}
data['age'] = np.random.randint(18, 80, n_samples).astype(float)
data['income'] = np.random.lognormal(10.5, 0.5, n_samples)
data['credit_score'] = np.random.randint(300, 850, n_samples).astype(float)
data['years_employed'] = np.random.exponential(5, n_samples)
data['education'] = np.random.choice(education_levels, n_samples, p=education_probs)
data['employment_type'] = np.random.choice(employment_types, n_samples, p=employment_probs)
data['default'] = np.random.choice([0, 1], n_samples, p=[0.85, 0.15])

df = pd.DataFrame(data)

# Add missing values: blank out a fixed number of distinct random rows
# (sampling without replacement) in each of three numeric columns.
for column, n_missing in (('age', 50), ('income', 80), ('credit_score', 30)):
    missing_rows = np.random.choice(n_samples, n_missing, replace=False)
    df.loc[missing_rows, column] = np.nan

# Report only the columns that actually contain missing values.
missing_counts = df.isnull().sum()
print(f"Dataset shape: {df.shape}")
print(f"\nMissing values:\n{missing_counts[missing_counts > 0]}")

---

## Exercise 1: Group-Based Imputation

**Task:** Impute missing income using the median income for each education level.

In [None]:
# SOLUTION - Exercise 1
df_group_imputed = df.copy()

# First, fill education missing values (we need groups to exist).
# The filled column is assigned back explicitly: calling
# `fillna(..., inplace=True)` on `df_group_imputed['education']` operates on a
# column selection and, under pandas Copy-on-Write, leaves the DataFrame
# unchanged. Rows with a missing education would then be excluded from the
# groupby below and their income would come back as NaN.
education_mode = df_group_imputed['education'].mode()[0]
df_group_imputed['education'] = df_group_imputed['education'].fillna(education_mode)

# Show median income by education BEFORE imputation (uses the untouched `df`)
print("Median income by education (before imputation):")
print(df.groupby('education')['income'].median().round(0))

# Method 1: Using groupby + transform + fillna
# `transform` returns a Series aligned with the original rows, so each missing
# income is replaced by the median of its own education group.
df_group_imputed['income'] = df_group_imputed.groupby('education')['income'].transform(
    lambda x: x.fillna(x.median())
)

print(f"\nMissing values after imputation: {df_group_imputed['income'].isna().sum()}")

# Verify by checking a sample: look up rows that were missing in the original
# `df` and show the values they received.
print("\nSample of imputed data (first 5 originally missing):")
original_missing = df[df['income'].isna()].head(5).index
print(df_group_imputed.loc[original_missing, ['education', 'income']])

### Alternative Method: Manual Group Imputation

In [None]:
# Alternative: More explicit approach
df_alt = df.copy()

# Fill missing education with the most frequent level. The mode is computed
# here so the cell does not rely on a variable from another cell, and the
# result is assigned back because `fillna(..., inplace=True)` on a column
# selection does not update the DataFrame under pandas Copy-on-Write.
education_mode = df_alt['education'].mode()[0]
df_alt['education'] = df_alt['education'].fillna(education_mode)

# Calculate group medians (NaN incomes are skipped by `median`)
group_medians = df_alt.groupby('education')['income'].median()
print("Group medians:")
print(group_medians.round(0))

# Fill missing values one education level at a time. Iterating over the
# computed medians guarantees that every level looked up actually has an entry.
for education_level, median_income in group_medians.items():
    mask = (df_alt['education'] == education_level) & (df_alt['income'].isna())
    df_alt.loc[mask, 'income'] = median_income

print(f"\nMissing after: {df_alt['income'].isna().sum()}")
# Compare against the groupby/transform result from the Exercise 1 solution.
print(f"Same result? {np.allclose(df_group_imputed['income'], df_alt['income'], equal_nan=True)}")

---

## Exercise 2: Extended Preprocessor with Log Transform

**Task:** Add log transformation capability to the Preprocessor class.

In [None]:
# SOLUTION - Exercise 2: Extended Preprocessor

class PreprocessorWithLog:
    """
    Extended Preprocessor with log transformation support.

    ``fit`` learns imputation values, category lists and scaling parameters
    from one DataFrame; ``transform`` applies them to any DataFrame with the
    same columns without modifying its input.

    Parameters
    ----------
    numeric_features : list of str
        Columns to impute and (optionally) scale.
    categorical_features : list of str, optional
        Columns to impute with their most frequent value and one-hot encode
        (unless they also appear in ``ordinal_mappings``).
    ordinal_mappings : dict, optional
        ``{column: {category: code}}``. Each listed column is replaced by
        ``<column>_encoded``; categories missing from the mapping become -1.
    scaling : {'standard', 'minmax'} or None
        ``'standard'`` subtracts the mean and divides by the population
        standard deviation; ``'minmax'`` subtracts the minimum and divides by
        the range. Any falsy value disables scaling.
    impute_strategy : str
        ``'median'`` uses the column median; any other value uses the mean.
    log_features : list of str, optional
        Columns for which a ``log_<column>`` feature is added, computed as
        ``log1p(max(value, 0))``.
    """

    def __init__(
        self,
        numeric_features,
        categorical_features=None,
        ordinal_mappings=None,
        scaling='standard',
        impute_strategy='median',
        log_features=None  # NEW PARAMETER
    ):
        self.numeric_features = numeric_features
        self.categorical_features = categorical_features or []
        self.ordinal_mappings = ordinal_mappings or {}
        self.scaling = scaling
        self.impute_strategy = impute_strategy
        self.log_features = log_features or []  # NEW

        # Learned state, populated by fit().
        self.numeric_stats_ = {}       # column -> imputation value
        self.categorical_values_ = {}  # column -> {'mode': ..., 'categories': [...]}
        self.scale_params_ = {}        # {'center': array, 'scale': array} or {}
        self._is_fitted = False

    def fit(self, df):
        """
        Learn imputation values, categories and scaling parameters from ``df``.

        Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``scaling`` is set to a name other than 'standard' or 'minmax'.
        """
        # Learn imputation values
        for col in self.numeric_features:
            if self.impute_strategy == 'median':
                self.numeric_stats_[col] = df[col].median()
            else:
                self.numeric_stats_[col] = df[col].mean()

        for col in self.categorical_features:
            mode_values = df[col].mode()
            self.categorical_values_[col] = {
                'mode': mode_values[0] if len(mode_values) > 0 else 'Unknown',
                'categories': sorted(df[col].dropna().unique())
            }

        if not self.scaling:
            # Scaling disabled: transform() skips the scaling step, so there
            # is nothing to learn (and no 'scale' entry to post-process).
            self.scale_params_ = {}
            self._is_fitted = True
            return self

        if self.scaling not in ('standard', 'minmax'):
            raise ValueError(
                f"Unknown scaling {self.scaling!r}; expected 'standard', 'minmax' or None"
            )

        # Prepare data for scale param calculation: the statistics must be
        # computed on the values transform() will actually scale, i.e. after
        # imputation and after the log transform. Columns are assigned back
        # explicitly because in-place fillna on a column selection does not
        # update the frame under pandas Copy-on-Write.
        df_temp = df[list(self.numeric_features)].copy()
        for col in self.numeric_features:
            df_temp[col] = df_temp[col].fillna(self.numeric_stats_[col])

        # Apply log transform BEFORE computing scale params
        for col in self.log_features:
            if col in df_temp.columns:
                df_temp[col] = np.log1p(np.maximum(df_temp[col], 0))

        # Learn scaling parameters (one entry per numeric feature, same order)
        X = df_temp.values

        if self.scaling == 'standard':
            center = X.mean(axis=0)
            scale = X.std(axis=0)
        else:  # 'minmax'
            center = X.min(axis=0)
            scale = X.max(axis=0) - center

        # Prevent division by zero for constant columns
        self.scale_params_ = {
            'center': center,
            'scale': np.where(scale == 0, 1, scale)
        }

        self._is_fitted = True
        return self

    def transform(self, df):
        """
        Apply the learned preprocessing to a copy of ``df``.

        Steps: impute numeric and categorical columns, add ``log_<col>``
        columns, ordinal-encode, one-hot encode, then scale. For a numeric
        feature that is also a log feature only ``log_<col>`` is scaled; the
        original column stays imputed but unscaled.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if not self._is_fitted:
            raise ValueError("Not fitted!")

        result = df.copy()

        # 1. Impute numeric
        for col in self.numeric_features:
            result[col] = result[col].fillna(self.numeric_stats_[col])

        # 2. Impute categorical
        for col in self.categorical_features:
            result[col] = result[col].fillna(self.categorical_values_[col]['mode'])

        # 3. Apply log transform (NEW!) — negative values are clipped to 0 first
        for col in self.log_features:
            if col in result.columns:
                result[f'log_{col}'] = np.log1p(np.maximum(result[col], 0))

        # 4. Ordinal encoding (categories absent from the mapping become -1)
        for col, mapping in self.ordinal_mappings.items():
            result[f'{col}_encoded'] = result[col].map(mapping).fillna(-1)
            result = result.drop(col, axis=1)

        # 5. One-hot encoding, using only the categories seen during fit
        for col in self.categorical_features:
            if col in self.ordinal_mappings:
                # Already encoded (and dropped) in step 4
                continue
            for cat in self.categorical_values_[col]['categories']:
                result[f'{col}_{cat}'] = (result[col] == cat).astype(int)
            result = result.drop(col, axis=1)

        # 6. Scale numeric (including log-transformed)
        if self.scaling:
            for i, col in enumerate(self.numeric_features):
                # If this column was log-transformed, scale the log version:
                # that is the column the scale parameters were learned on.
                if col in self.log_features:
                    target_col = f'log_{col}'
                else:
                    target_col = col

                result[target_col] = (
                    (result[target_col] - self.scale_params_['center'][i]) /
                    self.scale_params_['scale'][i]
                )

        return result

    def fit_transform(self, df):
        """Fit on ``df`` and return the transformed copy of the same data."""
        return self.fit(df).transform(df)

print("Extended Preprocessor with log transform defined!")

In [None]:
# Test the extended preprocessor
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

numeric_columns = ['age', 'income', 'credit_score', 'years_employed']
categorical_columns = ['education', 'employment_type']
education_order = {'High School': 0, 'Bachelor': 1, 'Master': 2, 'PhD': 3}

preprocessor = PreprocessorWithLog(
    numeric_features=numeric_columns,
    categorical_features=categorical_columns,
    ordinal_mappings={'education': education_order},
    scaling='standard',
    log_features=['income'],  # Apply log transform to income
)

# Statistics are learned from the training split only and then reused,
# unchanged, on the test split.
train_processed = preprocessor.fit_transform(train_df)
test_processed = preprocessor.transform(test_df)

print("Processed columns:")
print(list(train_processed.columns))

print("\nlog_income statistics:")
print(train_processed['log_income'].describe())

---

## Challenge: Titanic Dataset Preprocessing

Complete preprocessing pipeline for the Titanic dataset.

In [None]:
# SOLUTION - Titanic Challenge
import seaborn as sns

# Load seaborn's copy of the Titanic data
titanic = sns.load_dataset('titanic')

# Count missing values per column once and report only the affected columns.
titanic_missing = titanic.isnull().sum()
print(f"Titanic dataset shape: {titanic.shape}")
print(f"\nColumns: {titanic.columns.tolist()}")
print(f"\nMissing values:\n{titanic_missing[titanic_missing > 0]}")

In [None]:
# Feature engineering (on a copy, so the loaded `titanic` frame stays intact)
df_titanic = titanic.copy()

# 1. Extract title from name
# Note: 'name' column is not in seaborn's version, so we'll skip this
# If using Kaggle version:
# df_titanic['title'] = df_titanic['name'].str.extract(' ([A-Za-z]+)\.')

# 2. Family size = siblings/spouses + parents/children + the passenger
siblings_spouses = df_titanic['sibsp']
parents_children = df_titanic['parch']
df_titanic['family_size'] = siblings_spouses + parents_children + 1

# 3. Travelling alone means the family consists of the passenger only
travels_alone = df_titanic['family_size'] == 1
df_titanic['is_alone'] = travels_alone.astype(int)

# 4. Fare per person: the fare divided by the family size
df_titanic['fare_per_person'] = df_titanic['fare'] / df_titanic['family_size']

engineered_columns = ['family_size', 'is_alone', 'fare_per_person']
print("New features created:")
print(df_titanic[engineered_columns].head())

In [None]:
# Handle missing values

# Age: Impute with median by class and sex
# `transform` keeps the original row alignment, so each missing age receives
# the median age of passengers sharing the same (pclass, sex) combination.
df_titanic['age'] = df_titanic.groupby(['pclass', 'sex'])['age'].transform(
    lambda x: x.fillna(x.median())
)

# Embarked: Impute with mode
# The result is assigned back explicitly: `fillna(..., inplace=True)` on
# `df_titanic['embarked']` acts on a column selection and does not update the
# DataFrame under pandas Copy-on-Write.
embarked_mode = df_titanic['embarked'].mode()[0]
df_titanic['embarked'] = df_titanic['embarked'].fillna(embarked_mode)

# Deck (from cabin): Extract first letter, fill unknown
# df_titanic['deck'] = df_titanic['deck'].fillna('Unknown')

# Report the columns that still contain missing values.
remaining_missing = df_titanic.isnull().sum()
print("Missing values after imputation:")
print(remaining_missing[remaining_missing > 0])

In [None]:
# Encode categorical variables

# Sex: binary flag, 1 for 'male' and 0 for every other value
is_male = df_titanic['sex'] == 'male'
df_titanic['sex_encoded'] = is_male.astype(int)

# Embarked: one integer 0/1 indicator column per port, appended column-wise
embarked_dummies = pd.get_dummies(df_titanic['embarked'], prefix='embarked', dtype=int)
df_titanic = pd.concat([df_titanic, embarked_dummies], axis='columns')

# Class: Already numeric (1, 2, 3)

encoded_preview = ['sex_encoded', 'embarked_C', 'embarked_Q', 'embarked_S']
print("Encoded features:")
print(df_titanic[encoded_preview].head())

In [None]:
# Scale numeric features
numeric_cols = ['age', 'fare', 'fare_per_person', 'family_size']

# Standard scaling: each column is centred on its mean and divided by its
# standard deviation (as computed by pandas' `Series.std`); the result is
# stored next to the original as `<column>_scaled`.
for col in numeric_cols:
    column_values = df_titanic[col]
    mean, std = column_values.mean(), column_values.std()
    centered = column_values - mean
    df_titanic[f'{col}_scaled'] = centered / std

scaled_cols = [f'{c}_scaled' for c in numeric_cols]
print("Scaled features statistics:")
print(df_titanic[scaled_cols].describe().round(2))

In [None]:
# Final feature set for modeling
# NOTE(review): 'family_size' is used in its raw form here, while the
# 'family_size_scaled' and 'fare_scaled' columns are not selected —
# confirm this is intentional.
feature_columns = (
    ['pclass', 'sex_encoded']
    + ['age_scaled', 'fare_per_person_scaled']
    + ['family_size', 'is_alone']
    + [f'embarked_{port}' for port in ('C', 'Q', 'S')]
)

# Feature matrix and target vector
X = df_titanic[feature_columns]
y = df_titanic['survived']

feature_names = X.columns.tolist()
target_counts = y.value_counts()
print(f"Final feature matrix shape: {X.shape}")
print(f"\nFeatures:\n{feature_names}")
print(f"\nTarget distribution:\n{target_counts}")

---

## Key Takeaways

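1. **Group-based imputation** (e.g. median income per education level) usually gives more realistic fill values than a single global statistic, because it uses information from a related feature
2. **Log transforms** compress right-skewed distributions such as income, so extreme values dominate less
3. **Feature engineering** (e.g. `family_size`, `is_alone`) can expose signal that the raw columns contain only implicitly
4. **Always fit preprocessing statistics on the training data only** (imputation values, scaling parameters) and reuse them unchanged on the test data to prevent data leakage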

---

**End of Solutions**