In [None]:
# Binary Encoding Implementation
# Based on LAB4 materials - efficient for high cardinality categorical features

import category_encoders as ce

def apply_binary_encoding(df, categorical_columns):
    """
    Apply binary encoding for high cardinality categorical features.

    Every requested column that exists in ``df`` has its missing values
    replaced by the literal string 'Unknown' and is then encoded with its own
    ``ce.BinaryEncoder``. The encoder's output columns are appended under the
    name ``f"{col}_bin_{encoded_col}"``; the (filled) source column is kept.

    Parameters:
    df: DataFrame with categorical features (left untouched; a copy is encoded)
    categorical_columns: list of column names to encode; names that are not
        columns of the DataFrame are skipped

    Returns:
    df_encoded: copy of df with the binary encoded columns appended
    encoder: the encoder fitted on the LAST column that was encoded, or None
        when no column was encoded (empty list, or no name matched a column)
    """
    df_encoded = df.copy()

    # Stays None unless at least one column is encoded, so the final return
    # can never reference an unassigned local.
    encoder = None

    print("Applying Binary Encoding...")
    print("Binary encoding reduces dimensionality compared to one-hot encoding")
    print("Example: 8 categories need only 3 binary columns (2^3 = 8)")

    for col in categorical_columns:
        if col not in df_encoded.columns:
            continue

        # Treat missing values as their own category
        df_encoded[col] = df_encoded[col].fillna('Unknown')

        # Fit a dedicated encoder on this single column
        encoder = ce.BinaryEncoder(cols=[col])
        encoded_cols = encoder.fit_transform(df_encoded[[col]])

        # Append the encoder output next to the original column
        for encoded_col in encoded_cols.columns:
            df_encoded[f"{col}_bin_{encoded_col}"] = encoded_cols[encoded_col]

        n_categories = df_encoded[col].nunique()
        n_binary_cols = len(encoded_cols.columns)
        print(f"  {col}: {n_categories} categories -> {n_binary_cols} binary columns")

        # Print the category -> bit pattern mapping, but only for small examples
        if n_categories <= 8:
            unique_vals = df_encoded[col].unique()[:8]
            print(f"    Encoding pattern for {col}:")
            for val in unique_vals:
                mask = df_encoded[col] == val
                if mask.any():
                    # First row holding this value is representative of its pattern
                    binary_vals = encoded_cols[mask].iloc[0].values
                    print(f"      '{val}' -> {binary_vals}")

    return df_encoded, encoder

# Demonstration: binary-encode the first categorical feature of moderate cardinality
if not df.empty and categorical_features:
    # Candidates are features with more than 5 and at most 50 distinct values
    high_cardinality_features = []
    for col in categorical_features:
        if col not in df.columns:
            continue
        n_unique = df[col].nunique()
        if n_unique > 5 and n_unique <= 50:
            high_cardinality_features.append(col)

    if not high_cardinality_features:
        print("No suitable high cardinality features found for Binary Encoding demonstration")
    else:
        # Only the first candidate is used for the demo
        demo_feature = high_cardinality_features[0]
        print(f"Demonstrating Binary Encoding on: {demo_feature}")
        print(f"  Unique values: {df[demo_feature].nunique()}")

        df_binary_encoded, binary_encoder = apply_binary_encoding(df, [demo_feature])

        # Count the generated columns and compare against a one-hot layout
        binary_cols = [
            name for name in df_binary_encoded.columns
            if name.startswith(demo_feature + '_bin_')
        ]
        print(f"\nDimensionality comparison:")
        print(f"  One-hot would create: {df[demo_feature].nunique()} columns")
        print(f"  Binary encoding creates: {len(binary_cols)} columns")
        print(f"  Space savings: {((df[demo_feature].nunique() - len(binary_cols)) / df[demo_feature].nunique() * 100):.1f}%")

In [None]:
# One-Hot Encoding Implementation
# Based on LAB4 materials - safe for linear models, avoids ordinal assumptions

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def apply_onehot_encoding(df, categorical_columns, drop_first=True, max_categories=10):
    """
    Apply one-hot encoding to categorical features.

    Dummy columns are built with ``pd.get_dummies`` and appended to a copy of
    the input; the original categorical column is kept. When a column contains
    missing values an extra ``<col>_nan`` indicator column is produced, so a
    missing entry is not encoded as an all-zero row (which, with
    ``drop_first=True``, would be indistinguishable from the dropped baseline
    category).

    Parameters:
    df: DataFrame with categorical features (left untouched; a copy is encoded)
    categorical_columns: list of column names to encode; names that are not
        columns of the DataFrame are skipped
    drop_first: whether to drop first category to avoid multicollinearity
    max_categories: maximum number of distinct non-missing values a column may
        have to be encoded; columns above the threshold are reported and skipped

    Returns:
    df_encoded: DataFrame with one-hot encoded features appended
    """
    df_encoded = df.copy()

    print("Applying One-Hot Encoding...")
    for col in categorical_columns:
        if col not in df_encoded.columns:
            continue

        column = df_encoded[col]
        # nunique() ignores missing values, so the threshold applies to real categories only
        n_categories = column.nunique()

        if n_categories > max_categories:
            print(f"  {col}: Skipped ({n_categories} categories > {max_categories} threshold)")
            continue

        # Request the NaN indicator only when it is needed, so columns without
        # missing values produce exactly the same dummies as before.
        has_missing = bool(column.isna().any())
        dummies = pd.get_dummies(
            column,
            prefix=col,
            drop_first=drop_first,
            dummy_na=has_missing,
        )

        # Add dummy columns to dataframe
        df_encoded = pd.concat([df_encoded, dummies], axis=1)

        print(f"  {col}: {n_categories} categories -> {len(dummies.columns)} dummy columns")

    return df_encoded

# Demonstration: one-hot encode low-cardinality features among the first three categorical ones
if not df.empty and categorical_features:
    # A feature qualifies when it exists in df and has at most 10 distinct values
    suitable_features = []
    for col in categorical_features[:3]:
        if col not in df.columns:
            continue
        if df[col].nunique() <= 10:
            suitable_features.append(col)

    if not suitable_features:
        print("No suitable categorical features found for One-Hot Encoding demonstration")
    else:
        print(f"Demonstrating One-Hot Encoding on: {suitable_features}")
        df_onehot_encoded = apply_onehot_encoding(df, suitable_features)

        # Preview the dummy columns generated for each encoded feature
        for col in suitable_features:
            dummy_cols = [
                name for name in df_onehot_encoded.columns
                if name.startswith(col + '_')
            ]
            if not dummy_cols:
                continue
            print(f"\n{col} dummy columns: {dummy_cols}")
            print(f"Sample values:")
            print(df_onehot_encoded[dummy_cols].head())

In [None]:
# Label Encoding Implementation
# Based on LAB4 materials - simple encoding for binary/ordinal categorical variables

from sklearn.preprocessing import LabelEncoder

def apply_label_encoding(df, categorical_columns):
    """
    Label-encode the requested categorical columns of a DataFrame.

    Works on a copy of the input. For each requested column that exists,
    missing values are replaced by the string 'Unknown', a fresh LabelEncoder
    is fitted on the column, and the integer codes are stored in a new
    ``<col>_encoded`` column.

    Parameters:
    df: DataFrame with categorical features
    categorical_columns: list of column names to encode; names that are not
        columns of the DataFrame are skipped

    Returns:
    df_encoded: DataFrame with label encoded features
    encoders: dict mapping each encoded column name to its fitted encoder
    """
    result = df.copy()
    fitted = {}

    print("Applying Label Encoding...")
    for col in categorical_columns:
        if col not in result.columns:
            continue

        encoder = LabelEncoder()

        # Missing values become their own 'Unknown' class
        result[col] = result[col].fillna('Unknown')
        result[col + '_encoded'] = encoder.fit_transform(result[col])

        # Keep the fitted encoder so the codes can be mapped back later
        fitted[col] = encoder

        n_classes = len(encoder.classes_)
        print(f"  {col}: {n_classes} unique values -> 0 to {n_classes-1}")

    return result, fitted

# Demonstration: label-encode up to the first three categorical features
if not df.empty and categorical_features:
    # Use the first three features when that many exist, otherwise the list itself
    if len(categorical_features) >= 3:
        sample_categorical = categorical_features[:3]
    else:
        sample_categorical = categorical_features

    print(f"Demonstrating Label Encoding on: {sample_categorical}")
    df_label_encoded, label_encoders = apply_label_encoding(df, sample_categorical)

    # Side-by-side preview of the first ten raw values and their integer codes
    for col in sample_categorical:
        if col not in df.columns:
            continue
        print(f"\n{col} - Original vs Encoded:")
        comparison = pd.DataFrame({
            'Original': df[col].head(10),
            'Encoded': df_label_encoded[col + '_encoded'].head(10),
        })
        print(comparison)

# Feature Engineering for Lending Club Data
## COMP647 Assignment 03
### Student ID: 1163127

This notebook implements feature engineering techniques including:
- Data loading and initial exploration
- Categorical encoding methods
- Feature scaling techniques

Based on LAB4 materials and course teachings.

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Display options: show every column, cap printed rows at 20
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

# Suppress all Python warnings for the rest of the session
warnings.filterwarnings(action='ignore')

In [None]:
# Load preprocessed data from Assignment 02

# Defined up front so later cells can always reference these names, even when
# the data file is missing or the loaded frame is empty.
categorical_features = []
numerical_features = []

try:
    # Load the processed sample data
    df = pd.read_csv('../data/processed/accepted_sample_10000.csv')
    print(f"Data loaded successfully: {df.shape}")
    print(f"Columns: {len(df.columns)}")
except FileNotFoundError:
    print("Processed data not found. Please run Assignment 02 notebooks first.")
    # Fall back to an empty frame so the remaining cells can guard on df.empty
    df = pd.DataFrame()

# Display basic info about the dataset
if not df.empty:
    print("\nDataset Info:")
    # info() writes its report to stdout itself and returns None, so it is
    # called directly rather than wrapped in print() (which would emit "None").
    df.info()

    # Identify categorical and numerical features by dtype
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()

    print(f"\nCategorical features: {len(categorical_features)}")
    print(f"Numerical features: {len(numerical_features)}")

    if categorical_features:
        print("\nSample categorical features:")
        for col in categorical_features[:5]:  # Show first 5
            print(f"  {col}: {df[col].nunique()} unique values")