In [14]:
# Import required libraries
import pandas as pd
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader

# Seed every random number generator used by the notebook so that a
# "Restart Kernel -> Run All" reproduces the same splits and batches.
import random

SEED = 42  # single source of truth for the seeding calls below

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

# cuDNN determinism flags
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Resolve the project source directory once; later cells read the data
# relative to `path_to_py`.
path_to_py = os.path.abspath('../src')
if path_to_py not in sys.path:
    # The guard keeps sys.path free of duplicates when the cell is re-run.
    sys.path.append(path_to_py)

In [15]:
# Read the cleaned training data from the "data" folder under path_to_py
cleaned_train_csv = os.path.join(path_to_py, "data", "cleaned_train.csv")
df = pd.read_csv(cleaned_train_csv)

# Quick sanity check: dimensions, column names and dtypes
n_rows_cols = df.shape
print("Dataset shape:", n_rows_cols)
print("\nFeature names:", list(df.columns))
print("\nData types:\n", df.dtypes)

Dataset shape: (189457, 23)

Feature names: ['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti', 'inq_last_6mths', 'mths_since_recent_inq', 'revol_util', 'total_bc_limit', 'mths_since_last_major_derog', 'tot_hi_cred_lim', 'tot_cur_bal', 'internal_score', 'bad_flag', 'no_credit_card_history', 'no_derog_history', 'credit_limit_group', 'emp_length_missing']

Data types:
 loan_amnt                        int64
term                            object
int_rate                       float64
emp_length                     float64
home_ownership                  object
annual_inc                     float64
purpose                         object
percent_bc_gt_75               float64
bc_util                        float64
dti                            float64
inq_last_6mths                 float64
mths_since_recent_inq          float64
revol_util                     float64
total_bc_limit                 float64
mths_si

## Categorical Feature Encoding

Based on our EDA:
- Home ownership: Ordinal encoding based on default rates
    - MORTGAGE (5.83%) -> 0
    - OWN (7%) -> 1
    - RENT (8.4%) -> 2
    - NONE (9.5%) -> 3
    - OTHER (10.87%) -> 4

- Loan Purpose: Risk-based groupings, then ordinal encoding
    - Low risk (credit_card, home_improvement, car, major_purchase) -> 0
    - Medium risk (debt_consolidation, medical, moving, vacation, house, wedding) -> 1
    - High risk (small_business, renewable_energy, other, and any purpose not listed above) -> 2

- Credit Limit Group: Ordinal encoding (quintiles ranked from lowest risk, Q5, to highest risk, Q1)
    - Q5 (4.73%) -> 0
    - Q4 -> 1
    - Q3 -> 2
    - Q2 -> 3
    - Q1 (9.87%) -> 4

In [16]:
# Ordinal rank for each home-ownership category, lowest default rate first:
# MORTGAGE 5.83%, OWN 7.00%, RENT 8.40%, NONE 9.50%, OTHER 10.87%.
home_ownership_risk = dict(
    zip(('MORTGAGE', 'OWN', 'RENT', 'NONE', 'OTHER'), range(5))
)

# Ordinal rank for each credit-limit quintile: Q5 (4.73%) ranks lowest,
# Q1 (9.87%) ranks highest.
credit_limit_risk = dict(
    zip(('Q5', 'Q4', 'Q3', 'Q2', 'Q1'), range(5))
)

# Loan purposes bucketed into three risk tiers (based on default rates);
# encode_purpose turns the tier into an integer code.
low_risk_purpose = [
    'credit_card',
    'home_improvement',
    'car',
    'major_purchase',
]
medium_risk_purpose = [
    'debt_consolidation',
    'medical',
    'moving',
    'vacation',
    'house',
    'wedding',
]
high_risk_purpose = [
    'small_business',
    'renewable_energy',
    'other',
]

def encode_purpose(purpose):
    """Return the ordinal risk code for a loan purpose.

    Returns 0 when ``purpose`` is in ``low_risk_purpose``, 1 when it is in
    ``medium_risk_purpose``, and 2 otherwise. The fallback means any value
    outside the first two lists is coded 2, whether or not it appears in
    ``high_risk_purpose``.
    """
    risk_tiers = ((low_risk_purpose, 0), (medium_risk_purpose, 1))
    for members, code in risk_tiers:
        if purpose in members:
            return code
    return 2

# Apply encodings
def _map_categories(values, mapping):
    """Map a categorical Series through ``mapping``, refusing unknown values.

    ``Series.map`` yields NaN for every value that is not a key of
    ``mapping``; left unchecked those NaNs would flow into the feature
    matrix. Raise instead and list the offending categories.

    Raises
    ------
    ValueError
        If any value (including a missing one) has no entry in ``mapping``.
    """
    encoded = values.map(mapping)
    unmapped = values[encoded.isna()].unique().tolist()
    if unmapped:
        raise ValueError(f"Unmapped {values.name} categories: {unmapped}")
    return encoded


df['home_ownership_encoded'] = _map_categories(df['home_ownership'], home_ownership_risk)
df['credit_limit_encoded'] = _map_categories(df['credit_limit_group'], credit_limit_risk)
# encode_purpose has its own fallback (code 2), so it cannot produce NaN.
df['purpose_encoded'] = df['purpose'].apply(encode_purpose)

# Binary encoding for term: a term string containing '36' -> 0, anything
# else -> 1 (intended as 36 months = 0, 60 months = 1).
df['term_encoded'] = df['term'].apply(lambda x: 0 if '36' in x else 1)

# Drop original categorical columns now that each has an encoded counterpart
df = df.drop(['home_ownership', 'credit_limit_group', 'purpose', 'term'], axis=1)

# Verify encodings
print("\nEncoded features check:")
for col in ['home_ownership_encoded', 'credit_limit_encoded', 'purpose_encoded', 'term_encoded']:
    print(f"\n{col} value counts:")
    print(df[col].value_counts().sort_index())

print("\nUpdated dataset shape:", df.shape)


Encoded features check:

home_ownership_encoded value counts:
home_ownership_encoded
0    97647
1    15573
2    76149
3       42
4       46
Name: count, dtype: int64

credit_limit_encoded value counts:
credit_limit_encoded
0    37892
1    37891
2    37891
3    37891
4    37892
Name: count, dtype: int64

purpose_encoded value counts:
purpose_encoded
0     59436
1    118194
2     11827
Name: count, dtype: int64

term_encoded value counts:
term_encoded
0    144800
1     44657
Name: count, dtype: int64

Updated dataset shape: (189457, 23)


## Train-Val-Test Split

In [17]:
# Step 2: Train-Val-Test Split
# Hold out 30% of the rows, then halve that hold-out into validation and
# test. Both splits are stratified on bad_flag.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['bad_flag'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['bad_flag'],
)

print("Dataset shapes:")
print(f"Train: {train_df.shape}")
print(f"Validation: {val_df.shape}")
print(f"Test: {test_df.shape}")

# Class proportions per split, to confirm the stratification held
train_class_share = train_df['bad_flag'].value_counts(normalize=True)
val_class_share = val_df['bad_flag'].value_counts(normalize=True)
test_class_share = test_df['bad_flag'].value_counts(normalize=True)

print("\nClass distribution in splits:")
print("Train:", train_class_share)
print("Validation:", val_class_share)
print("Test:", test_class_share)

Dataset shapes:
Train: (132619, 23)
Validation: (28419, 23)
Test: (28419, 23)

Class distribution in splits:
Train: bad_flag
0.0    0.930704
1.0    0.069296
Name: proportion, dtype: float64
Validation: bad_flag
0.0    0.930715
1.0    0.069285
Name: proportion, dtype: float64
Test: bad_flag
0.0    0.930715
1.0    0.069285
Name: proportion, dtype: float64


## Feature Transformations and Feature Engineering
1. Log transformation for right-skewed features
- annual_inc
- loan_amnt

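2. Standard scaling (applied to the normally distributed features and to the remaining continuous credit features):
- int_rate
- dti
- revol_util
- bc_util
- percent_bc_gt_75
- inq_last_6mths
- mths_since_recent_inq
- total_bc_limit
- tot_hi_cred_lim
- tot_cur_bal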

3. Feature Engineering
- Composite utilization feature (from revol_util and bc_util)
- Income-to-loan ratio (using log transformed values)
- Relevant interaction terms based on our EDA insights

**Implementation Strategy**:
1. Apply log transformations to all splits (no fitting required)
2. Fit scalers only on training data, then transform all splits
3. Create engineered features using the properly scaled values
4. Drop original features that have been transformed


In [18]:
# Scaling and Feature Transformations (fit only on training data)

# Log transformations first. log1p needs no fitting, so every split is
# transformed the same way.
for df_ in [train_df, val_df, test_df]:
    df_['log_annual_inc'] = np.log1p(df_['annual_inc'])
    df_['log_loan_amt'] = np.log1p(df_['loan_amnt'])

# Standard scaling
# NOTE(review): `scaler` is never fitted or used in this cell (the fitted
# scalers live in `scaler_dict`); it is kept only in case another cell
# refers to the name.
scaler = StandardScaler()
# NOTE(review): columns not listed here (e.g. emp_length, internal_score,
# the log_* columns) stay unscaled in the output frames -- confirm intended.
scale_features = [
    'int_rate',
    'dti',
    'revol_util',
    'bc_util',
    'percent_bc_gt_75',
    'inq_last_6mths',
    'mths_since_recent_inq',
    'total_bc_limit',
    'tot_hi_cred_lim',
    'tot_cur_bal'
]

# One scaler per feature, fitted on the training split only; validation and
# test are transformed with the training statistics.
scaler_dict = {}
for feature in scale_features:
    scaler_dict[feature] = StandardScaler()
    scaled_col = f'{feature}_scaled'
    # The scaler returns a single-column 2-D result; flatten it to 1-D
    # before storing it as a column.
    train_df[scaled_col] = np.ravel(scaler_dict[feature].fit_transform(train_df[[feature]]))
    val_df[scaled_col] = np.ravel(scaler_dict[feature].transform(val_df[[feature]]))
    test_df[scaled_col] = np.ravel(scaler_dict[feature].transform(test_df[[feature]]))

# Feature Engineering (using scaled features)
for df_ in [train_df, val_df, test_df]:
    # Composite utilization: mean of the two scaled utilization features
    df_['composite_util'] = (df_['revol_util_scaled'] + df_['bc_util_scaled']) / 2

    # Ratio of the two log1p values (not the log of income / loan).
    # A log_loan_amt of 0 would make this +/-inf; the finite-value check
    # further down catches that.
    df_['income_to_loan_ratio'] = df_['log_annual_inc'] / df_['log_loan_amt']

    # Interest rate * DTI interaction
    df_['int_rate_dti'] = df_['int_rate_scaled'] * df_['dti_scaled']

    # Risk score: the scaled risk drivers added together, minus the
    # (unscaled) income-to-loan ratio
    df_['risk_score'] = (df_['int_rate_scaled'] +
                        df_['dti_scaled'] +
                        df_['composite_util'] -
                        df_['income_to_loan_ratio'])

# Drop original features that have been transformed, plus the two scaled
# utilization columns that composite_util replaces
columns_to_drop = scale_features + ['annual_inc', 'loan_amnt', 'revol_util_scaled', 'bc_util_scaled']
train_df = train_df.drop(columns_to_drop, axis=1)
val_df = val_df.drop(columns_to_drop, axis=1)
test_df = test_df.drop(columns_to_drop, axis=1)

# Verify final shapes and features
print("\nFinal shapes:")
print(f"Train: {train_df.shape}")
print(f"Validation: {val_df.shape}")
print(f"Test: {test_df.shape}")

# Check for any missing values
print("\nMissing values in final datasets:")
print("Train:", train_df.isnull().sum().sum())
print("Validation:", val_df.isnull().sum().sum())
print("Test:", test_df.isnull().sum().sum())

# Fail loudly rather than only printing: isnull() does not flag +/-inf,
# which the division above can produce, and either would poison training.
for split_name, split_df in [("Train", train_df), ("Validation", val_df), ("Test", test_df)]:
    numeric_values = split_df.select_dtypes(include="number").to_numpy(dtype=float)
    if not np.isfinite(numeric_values).all():
        raise ValueError(
            f"{split_name} split contains NaN or infinite values after feature transformation"
        )

# Verify engineered features in training set
print("\nEngineered features summary in training set:")
engineered_features = ['composite_util', 'income_to_loan_ratio', 'int_rate_dti', 'risk_score']
print(train_df[engineered_features].describe())


Final shapes:
Train: (132619, 25)
Validation: (28419, 25)
Test: (28419, 25)

Missing values in final datasets:
Train: 0
Validation: 0
Test: 0

Engineered features summary in training set:
       composite_util  income_to_loan_ratio   int_rate_dti     risk_score
count    1.326190e+05         132619.000000  132619.000000  132619.000000
mean     4.671980e-17              1.181601       0.146952      -1.181601
std      9.298377e-01              0.080174       1.003699       2.095974
min     -2.395327e+00              1.050047      -5.712039      -7.821150
25%     -6.408539e-01              1.124076      -0.295388      -2.562022
50%      1.410476e-01              1.164254       0.024781      -1.078546
75%      7.514079e-01              1.218175       0.532429       0.307960
max      4.246996e+00              1.766755       6.099598       5.128489


**Strategy for Handling Class Imbalance**:
- We calculate class weights from the training data instead of using SMOTE or other resampling techniques, which can lead to overfitting.
- The weights will be used later in the loss function when training the neural network model.

In [19]:
# Derive "balanced" class weights from the training labels via sklearn
from sklearn.utils.class_weight import compute_class_weight

train_labels = train_df['bad_flag']
label_values = np.unique(train_labels)
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_values,
    y=train_labels,
)
balanced_weights_dict = {
    label: weight for label, weight in zip(label_values, balanced_weights)
}

# Pack the weights as [class 0, class 1] in a float tensor for later use
class_weights_tensor = torch.FloatTensor(
    [balanced_weights_dict[label] for label in (0, 1)]
)

print("Balanced class weights:")
print(balanced_weights_dict)
print("\nClass weights tensor:")
print(class_weights_tensor)

Balanced class weights:
{0.0: 0.537227879995787, 1.0: 7.215397170837868}

Class weights tensor:
tensor([0.5372, 7.2154])


**Observation**:
The class weights look good! The weights reflect the imbalance in our data:
- Majority class (0.0): weight ≈ 0.54
- Minority class (1.0): weight ≈ 7.22

This means the loss function will weigh errors on the minority class (bad loans) about 13.4 times more heavily than errors on the majority class (good loans), which aligns with our class distribution.

## PyTorch Tensors and Data Loaders

In [20]:
# Create PyTorch Dataset and DataLoader

# First, let's separate features and target for each split
def prepare_features_target(df):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : DataFrame that contains a ``bad_flag`` column.

    Returns
    -------
    (X, y)
        ``X`` holds the values of every column except ``bad_flag``;
        ``y`` holds the values of ``bad_flag``. ``df`` itself is not modified.
    """
    target = df['bad_flag']
    features = df.drop(columns='bad_flag')
    return features.values, target.values

# Build (features, target) arrays for the train / validation / test frames
(X_train, y_train), (X_val, y_val), (X_test, y_test) = map(
    prepare_features_target, (train_df, val_df, test_df)
)

# Create PyTorch Dataset class
class LoanDataset(Dataset):
    """In-memory dataset that pairs each feature row with its target.

    Parameters
    ----------
    X : array-like of features, one row per sample.
    y : array-like of targets, one entry per sample.

    Both inputs are converted to float32 tensors once, at construction.
    ``torch.as_tensor`` replaces the legacy ``torch.FloatTensor`` constructor;
    it avoids a copy when the input is already float32.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)
        # __len__ is driven by y alone, so a length mismatch would otherwise
        # surface later as an index error or silently ignored rows.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap each split's arrays in a LoanDataset
train_dataset, val_dataset, test_dataset = (
    LoanDataset(features, targets)
    for features, targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Create DataLoaders
batch_size = 512  # Can be tuned

# Only the training loader shuffles; validation and test keep row order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Report the feature count and the number of batches per loader
n_features = X_train.shape[1]
print("Number of features:", n_features)
print("\nDataLoader sizes:")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

# Pull one training batch to confirm the tensor shapes
first_batch = next(iter(train_loader))
X_batch, y_batch = first_batch
print("\nBatch shapes:")
print(f"X batch shape: {X_batch.shape}")
print(f"y batch shape: {y_batch.shape}")

Number of features: 24

DataLoader sizes:
Training batches: 260
Validation batches: 56
Test batches: 56

Batch shapes:
X batch shape: torch.Size([512, 24])
y batch shape: torch.Size([512])
