# 02. Feature Engineering & Preprocessing

## Goal
- Load the split data (from Google Drive).
- Handle missing values (imputation).
- Feature engineering (clean columns, encode categorical features).
- Standardize numeric features (zero mean, unit variance).
- Save ready-to-train datasets back to Drive.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import os
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# --- DRIVE SETUP ---
# On Colab, mount Google Drive and work inside the project folder there;
# when google.colab cannot be imported, fall back to paths relative to
# this notebook.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    # Local fallback
    PROJECT_DIR, DATA_DIR = '..', '../data/processed'
else:
    PROJECT_DIR = '/content/drive/MyDrive/credit_risk_project'
    DATA_DIR = os.path.join(PROJECT_DIR, 'data/processed')

print(f"Data Directory: {DATA_DIR}")

Mounted at /content/drive
Data Directory: /content/drive/MyDrive/credit_risk_project/data/processed


In [3]:
# Load Processed Data from Step 01
# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that this project wrote itself.
try:
    df_train = pd.read_pickle(os.path.join(DATA_DIR, 'train_2014_2016.pkl'))
    df_ood = pd.read_pickle(os.path.join(DATA_DIR, 'ood_2018_2019.pkl'))
except FileNotFoundError:
    print("Error: Pickle files not found. Did you run 01_eda.ipynb?")
    # Stop here: continuing would leave df_train / df_ood undefined (or stale
    # from an earlier kernel run) and fail later with a less helpful error.
    raise
else:
    print("Loaded data successfully.")

Loaded data successfully.


In [4]:
# Shape check: report (rows, columns) for each loaded frame
for label, frame in (("Train", df_train), ("OOD", df_ood)):
    print(f"{label}: {frame.shape}")

Train: (891754, 33)
OOD: (56318, 33)


In [5]:
# --- 1. HANDLING MISSING VALUES ---
# Fraction of null entries per training column
missing_train = df_train.isna().mean()
# Keep only columns that have at least one null, worst first
missing_cols = (
    missing_train
    .loc[lambda frac: frac > 0]
    .sort_values(ascending=False)
)
print("Top missing features:")
print(missing_cols.head(10))

Top missing features:
emp_title     0.060440
emp_length    0.058864
title         0.018661
revol_util    0.000512
dti           0.000045
dtype: float64


In [6]:
# Column groups used by the later imputation / encoding / scaling cells.
# Each list keeps only names that exist in df_train, in the order written here.
present = set(df_train.columns)

numeric_cols = [
    name
    for name in (
        'loan_amnt', 'term', 'int_rate', 'installment', 'annual_inc', 'dti',
        'fico_range_low', 'fico_range_high', 'open_acc', 'pub_rec',
        'revol_bal', 'revol_util', 'total_acc', 'mort_acc', 'pub_rec_bankruptcies',
    )
    if name in present
]

categorical_cols = [
    name
    for name in (
        'grade', 'sub_grade', 'home_ownership', 'verification_status', 'purpose',
        'application_type', 'initial_list_status',
    )
    if name in present
]

# Clean 'term' column (numeric conversion)
def clean_term(df):
    """Return the 'term' column of ``df`` as a numeric Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'term' column.

    Returns
    -------
    pandas.Series
        The column unchanged when it already has a numeric dtype. Otherwise
        the first run of digits in each value, converted to float; values
        without any digits (including missing values) become NaN.

    Raises
    ------
    KeyError
        If ``df`` has no 'term' column.
    """
    term = df['term']
    if pd.api.types.is_numeric_dtype(term):
        return term
    # expand=False makes str.extract return a Series rather than a one-column
    # DataFrame, so both branches return the same type. astype(str) lets
    # category / string-dtype columns take this path too.
    digits = term.astype(str).str.extract(r'(\d+)', expand=False)
    return digits.astype(float)

# Replace 'term' in both frames with its cleaned version
for frame in (df_train, df_ood):
    frame['term'] = clean_term(frame)

In [7]:
# Imputation
# Numeric columns get the training median; categorical columns get the
# constant label 'Missing'. Each imputer is fitted on the training frame only
# and then applied unchanged to the OOD frame.
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='constant', fill_value='Missing')

for imputer, cols in ((imputer_num, numeric_cols), (imputer_cat, categorical_cols)):
    df_train[cols] = imputer.fit_transform(df_train[cols])
    df_ood[cols] = imputer.transform(df_ood[cols])

print("Imputation Completed.")

Imputation Completed.


In [8]:
# --- 2. ENCODING CATEGORICAL FEATURES ---
# Ordinal Encoding for Grade: 'A'..'G' -> 1..7.
# Any value that is not a key of grade_map becomes NaN after .map().
grade_map = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7}
if 'grade' in df_train.columns:
    df_train['grade'] = df_train['grade'].map(grade_map)
    df_ood['grade'] = df_ood['grade'].map(grade_map)

# One-Hot Encoding for others
cat_encode_list = ['home_ownership', 'verification_status', 'purpose', 'application_type', 'initial_list_status']
cat_encode_list = [c for c in cat_encode_list if c in df_train.columns]

# Pin every one-hot column to the category levels observed in TRAIN before
# encoding. get_dummies(drop_first=True) drops the first level *of the frame it
# is given*; encoding the two frames independently can therefore drop different
# reference levels when OOD lacks (or adds) a category, after which aligning
# with fill_value=0 would silently zero-out a real category in OOD.
# With shared levels both frames produce the same dummy columns and drop the
# same reference level; OOD values unseen in train become NaN and encode as
# all zeros.
for col in cat_encode_list:
    train_levels = sorted(df_train[col].dropna().unique())
    df_train[col] = pd.Categorical(df_train[col], categories=train_levels)
    df_ood[col] = pd.Categorical(df_ood[col], categories=train_levels)

# Using int dtype for cleaner output
df_train = pd.get_dummies(df_train, columns=cat_encode_list, drop_first=True, dtype=int)
df_ood = pd.get_dummies(df_ood, columns=cat_encode_list, drop_first=True, dtype=int)

# Align columns: OOD takes train's column set and order; columns missing from
# OOD are created and filled with 0.
df_train, df_ood = df_train.align(df_ood, join='left', axis=1, fill_value=0)

# --- CRITICAL STEP: Drop Remaining Object Columns ---
# Drop string columns that were not encoded (e.g. sub_grade, emp_title, zip_code, dates etc.)
# This prevents errors in model training.
# NOTE(review): this keeps only numeric/bool columns, so a target column stored
# as strings would be dropped here too - confirm the label is numeric.
df_train = df_train.select_dtypes(include=['number', 'bool'])
df_ood = df_ood.select_dtypes(include=['number', 'bool'])

print(f"Encoded Shapes - Train: {df_train.shape}, OOD: {df_ood.shape}")

Encoded Shapes - Train: (891754, 38), OOD: (56318, 38)


In [9]:
# --- 3. NORMALIZATION ---
# Columns to standardize: the numeric features, plus 'grade' when that column
# is present in the training frame.
scale_cols = list(numeric_cols)
if 'grade' in df_train.columns:
    scale_cols.append('grade')

# Ignore any requested column that is no longer in the training frame
scale_cols = [name for name in scale_cols if name in df_train.columns]

# Fit the scaler on train only, then reuse the fitted statistics for OOD
scaler = StandardScaler()
train_scaled = scaler.fit_transform(df_train[scale_cols])
ood_scaled = scaler.transform(df_ood[scale_cols])
df_train[scale_cols] = train_scaled
df_ood[scale_cols] = ood_scaled

print("Normalization Completed.")

Normalization Completed.


In [10]:
# Save Final Ready-to-Model datasets to DRIVE
# Each frame is pickled into DATA_DIR under the file name given here.
final_outputs = {
    'train_final.pkl': df_train,
    'ood_final.pkl': df_ood,
}
for filename, frame in final_outputs.items():
    frame.to_pickle(os.path.join(DATA_DIR, filename))

print(f"Saved final datasets to {DATA_DIR}")
print("IMPORTANT: Now you must RESTART RUNTIME before running 03_models.ipynb")

Saved final datasets to /content/drive/MyDrive/credit_risk_project/data/processed
IMPORTANT: Now you must RESTART RUNTIME before running 03_models.ipynb
