In [None]:
# Label Encoding Implementation
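# Based on LAB4 materials - simple encoding for binary/ordinal categorical variables
# Note: LabelEncoder numbers the labels in sorted order, so an ordinal column only keeps its ranking if that ranking matches the sort order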

from sklearn.preprocessing import LabelEncoder

def apply_label_encoding(df, categorical_columns):
    """
    Label-encode the requested categorical columns of a DataFrame.

    The input frame is not modified; all work happens on a copy. For every
    requested column that exists, missing values are replaced with the string
    'Unknown' and the integer codes are written to a new column named
    '<col>_encoded'. The source column is kept, with the fill applied.

    Parameters:
    df: DataFrame with categorical features
    categorical_columns: list of column names to encode; names that are not
        present in df are reported and skipped

    Returns:
    df_encoded: copy of df with the filled source columns plus one
        '<col>_encoded' column per encoded feature
    encoders: dict mapping column name -> fitted LabelEncoder, for inverse transform
    """
    df_encoded = df.copy()
    encoders = {}

    print("Applying Label Encoding...")
    for col in categorical_columns:
        if col not in df_encoded.columns:
            # Make the skip visible instead of dropping the request silently
            print(f"  {col}: not found in DataFrame, skipped")
            continue

        values = df_encoded[col]
        # A pandas categorical column rejects fill values outside its declared
        # categories, so fillna('Unknown') would raise on it. Converting to
        # plain object dtype first lets the placeholder be inserted.
        if values.dtype.name == 'category':
            values = values.astype(object)

        # Handle missing values by filling with 'Unknown'
        df_encoded[col] = values.fillna('Unknown')

        # Create and fit encoder, writing the codes to a sibling column
        le = LabelEncoder()
        df_encoded[col + '_encoded'] = le.fit_transform(df_encoded[col])

        # Store encoder for later use
        encoders[col] = le

        print(f"  {col}: {len(le.classes_)} unique values -> 0 to {len(le.classes_)-1}")

    return df_encoded, encoders

# Demonstration: label-encode a handful of categorical features and preview the result.
# This cell uses `df`, `categorical_features`, `pd` and `apply_label_encoding`,
# which must already exist in the kernel.
if not df.empty and categorical_features:
    # Work with at most the first three categorical features
    if len(categorical_features) >= 3:
        sample_categorical = categorical_features[:3]
    else:
        sample_categorical = categorical_features

    print(f"Demonstrating Label Encoding on: {sample_categorical}")
    df_label_encoded, label_encoders = apply_label_encoding(df, sample_categorical)

    # Side-by-side preview: first ten raw values next to their integer codes
    for col in sample_categorical:
        if col not in df.columns:
            continue
        print(f"\n{col} - Original vs Encoded:")
        original_preview = df[col].head(10)
        encoded_preview = df_label_encoded[col + '_encoded'].head(10)
        comparison = pd.DataFrame({'Original': original_preview, 'Encoded': encoded_preview})
        print(comparison)

# Feature Engineering for Lending Club Data
## COMP647 Assignment 03
### Student ID: 1163127

This notebook implements feature engineering techniques including:
- Data loading and initial exploration
- Categorical encoding methods
- Feature scaling techniques

Based on LAB4 materials and course teachings.

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket 'ignore' filter so warnings do not clutter cell output
warnings.filterwarnings('ignore')

# Display options: show every column, cap printed rows at 20
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

In [None]:
# Load preprocessed data from Assignment 02
# The feature lists are initialised up front so they exist even when loading
# fails; other cells can then test them without hitting a NameError.
categorical_features = []
numerical_features = []

try:
    # Load the processed sample data
    df = pd.read_csv('../data/processed/accepted_sample_10000.csv')
    print(f"Data loaded successfully: {df.shape}")
    print(f"Columns: {len(df.columns)}")
except FileNotFoundError:
    print("Processed data not found. Please run Assignment 02 notebooks first.")
    # Fall back to an empty frame so the `df.empty` guards skip cleanly
    df = pd.DataFrame()

# Display basic info about the dataset
if not df.empty:
    print("\nDataset Info:")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None" line.
    df.info()

    # Identify categorical and numerical features
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()

    print(f"\nCategorical features: {len(categorical_features)}")
    print(f"Numerical features: {len(numerical_features)}")

    if categorical_features:
        print("\nSample categorical features:")
        for col in categorical_features[:5]:  # Show first 5
            print(f"  {col}: {df[col].nunique()} unique values")