In [85]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
import joblib
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [86]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)

Using device: cuda


In [87]:
data_path = Path('../data/processed/processed.csv')
model_dir = Path('../models')
model_dir.mkdir(exist_ok=True)

In [88]:
df = pd.read_csv(data_path)
print(f"Loaded dataset: {df.shape}")

Loaded dataset: (503, 61)


In [89]:
target_col = 'ESG Risk Level'
exclude_cols = ['Symbol', 'Name', 'Address', 'Description', 'Sector', 'Industry', 
                'Controversy Level', 'ESG Risk Level', 'ESG Risk Percentile', 'Employee Size']
leakage_features = [
    'ESG_Component_Balance', 'ESG_Max_Component', 'ESG_Min_Component',
    'ESG_Risk_Above_Average', 'ESG_Risk_Category', 'Sector_Risk_Average',
    'ESG_vs_Sector_Average', 'High_Risk_Percentile', 'ESG Risk Percentile_Encoded',
    'ESG Risk Level_High', 'ESG Risk Level_Low', 'ESG Risk Level_Medium',
    'ESG Risk Level_Negligible', 'ESG Risk Level_Severe', 'ESG Risk Level_nan',
    'ESG_Risk_Category_Low', 'ESG_Risk_Category_Medium', 'ESG_Risk_Category_High',
    'ESG_Risk_Category_Severe', 'ESG_Risk_Category_nan'
]
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [col for col in numeric_cols if col not in exclude_cols and col not in leakage_features]

In [90]:
X = df[feature_cols].copy()
y = df[target_col].copy()
X = X.fillna(0)
y = y.fillna('Medium')
print(f"Features: {len(feature_cols)}")
print(f"Target distribution:\n{y.value_counts()}")

Features: 31
Target distribution:
Low           260
Medium        184
High           50
Negligible      6
Severe          3
Name: ESG Risk Level, dtype: int64


In [91]:
label_mapping = {'Low': 0, 'Negligible': 0, 'Medium': 1, 'High': 2, 'Severe': 2}
y_encoded = y.map(label_mapping)
y_encoded = y_encoded.fillna(1).astype(int)
num_classes = len(y_encoded.unique())
print(f"Classes: {sorted(y_encoded.unique())}")
print(f"Class distribution:\n{y_encoded.value_counts().sort_index()}")

Classes: [0, 1, 2]
Class distribution:
0    266
1    184
2     53
Name: ESG Risk Level, dtype: int64


In [92]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_encoded, test_size=0.15, random_state=RANDOM_SEED, stratify=y_encoded
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.176, random_state=RANDOM_SEED, stratify=y_temp
)
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Train: (351, 31), Val: (76, 31), Test: (76, 31)


In [93]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [94]:
smote = SMOTE(random_state=RANDOM_SEED, k_neighbors=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
print(f"Original training set: {X_train_scaled.shape}")
print(f"Resampled training set: {X_train_resampled.shape}")
print(f"Class distribution after SMOTE:\\n{pd.Series(y_train_resampled).value_counts().sort_index()}")

Original training set: (351, 31)
Resampled training set: (558, 31)
Class distribution after SMOTE:\n0    186
1    186
2    186
Name: ESG Risk Level, dtype: int64


In [95]:
X_train_tensor = torch.FloatTensor(X_train_resampled)
y_train_tensor = torch.LongTensor(y_train_resampled)
X_val_tensor = torch.FloatTensor(X_val_scaled)
y_val_tensor = torch.LongTensor(y_val.values)
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.LongTensor(y_test.values)

In [96]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [97]:
class ESGRiskClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims, num_classes, dropout=0.5):
        super(ESGRiskClassifier, self).__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, num_classes))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

In [98]:
input_dim = X_train_scaled.shape[1]
hidden_dims = [512, 256, 128]
model = ESGRiskClassifier(input_dim, hidden_dims, num_classes, dropout=0.4).to(device)
print(f"Model architecture:\n{model}")
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Model architecture:
ESGRiskClassifier(
  (network): Sequential(
    (0): Linear(in_features=31, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=256, out_features=128, bias=True)
    (9): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=128, out_features=3, bias=True)
  )
)

Total parameters: 182,787
Trainable parameters: 182,787


In [99]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss
        
        if self.alpha is not None:
            alpha_t = self.alpha[targets]
            focal_loss = alpha_t * focal_loss
        
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

In [100]:
class_counts = torch.bincount(y_train_tensor)
class_weights = 1.0 / class_counts.float()
class_weights = class_weights / class_weights.sum() * len(class_weights)
class_weights = class_weights.to(device)
criterion = FocalLoss(alpha=class_weights, gamma=2.0)

In [101]:
def train_with_config(batch_size, learning_rate, weight_decay, num_epochs):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    model_instance = ESGRiskClassifier(input_dim, hidden_dims, num_classes, dropout=0.4).to(device)
    optimizer = optim.AdamW(model_instance.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=False)
    
    best_val_loss = float('inf')
    best_val_acc = 0.0
    patience_counter = 0
    early_stop_patience = 15
    
    for epoch in range(num_epochs):
        model_instance.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model_instance(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model_instance.parameters(), max_norm=1.0)
            optimizer.step()
            
            train_loss += loss.item() * batch_X.size(0)
            _, predicted = torch.max(outputs.data, 1)
            train_total += batch_y.size(0)
            train_correct += (predicted == batch_y).sum().item()
        
        model_instance.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model_instance(batch_X)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item() * batch_X.size(0)
                _, predicted = torch.max(outputs.data, 1)
                val_total += batch_y.size(0)
                val_correct += (predicted == batch_y).sum().item()
        
        train_loss = train_loss / train_total
        train_acc = train_correct / train_total
        val_loss = val_loss / val_total
        val_acc = val_correct / val_total
        
        scheduler.step(val_loss)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_acc = val_acc
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= early_stop_patience:
            break
    
    return best_val_loss, best_val_acc, model_instance

In [102]:
hyperparameter_grid = {
    'batch_size': [32, 64],
    'learning_rate': [0.0003, 0.0002, 0.0001],
    'weight_decay': [5e-4, 1e-4],
    'num_epochs': [200]
}

In [103]:
best_config = None
best_val_loss = float('inf')
best_model_state = None
results = []

for batch_size in hyperparameter_grid['batch_size']:
    for lr in hyperparameter_grid['learning_rate']:
        for wd in hyperparameter_grid['weight_decay']:
            for epochs in hyperparameter_grid['num_epochs']:
                print(f"\nTesting: bs={batch_size}, lr={lr}, wd={wd}, epochs={epochs}")
                val_loss, val_acc, trained_model = train_with_config(batch_size, lr, wd, epochs)
                print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
                
                results.append({
                    'batch_size': batch_size,
                    'learning_rate': lr,
                    'weight_decay': wd,
                    'num_epochs': epochs,
                    'val_loss': val_loss,
                    'val_acc': val_acc
                })
                
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    best_config = {
                        'batch_size': batch_size,
                        'learning_rate': lr,
                        'weight_decay': wd,
                        'num_epochs': epochs
                    }
                    best_model_state = trained_model.state_dict()

print(f"\nBest configuration: {best_config}")
print(f"Best validation loss: {best_val_loss:.4f}")


Testing: bs=32, lr=0.0003, wd=0.0005, epochs=200
Val Loss: 0.1599, Val Acc: 0.8026

Testing: bs=32, lr=0.0003, wd=0.0001, epochs=200
Val Loss: 0.1728, Val Acc: 0.8026

Testing: bs=32, lr=0.0002, wd=0.0005, epochs=200
Val Loss: 0.1733, Val Acc: 0.8158

Testing: bs=32, lr=0.0002, wd=0.0001, epochs=200
Val Loss: 0.1918, Val Acc: 0.7632

Testing: bs=32, lr=0.0001, wd=0.0005, epochs=200
Val Loss: 0.1816, Val Acc: 0.8026

Testing: bs=32, lr=0.0001, wd=0.0001, epochs=200
Val Loss: 0.1917, Val Acc: 0.7763

Testing: bs=64, lr=0.0003, wd=0.0005, epochs=200
Val Loss: 0.1867, Val Acc: 0.8026

Testing: bs=64, lr=0.0003, wd=0.0001, epochs=200
Val Loss: 0.1588, Val Acc: 0.7763

Testing: bs=64, lr=0.0002, wd=0.0005, epochs=200
Val Loss: 0.1704, Val Acc: 0.8026

Testing: bs=64, lr=0.0002, wd=0.0001, epochs=200
Val Loss: 0.1788, Val Acc: 0.8026

Testing: bs=64, lr=0.0001, wd=0.0005, epochs=200
Val Loss: 0.1791, Val Acc: 0.8026

Testing: bs=64, lr=0.0001, wd=0.0001, epochs=200
Val Loss: 0.1732, Val Acc:

In [104]:
X_full_train = pd.concat([X_train, X_val], axis=0)
y_full_train = pd.concat([y_train, y_val], axis=0)
X_full_train_scaled = scaler.fit_transform(X_full_train)
X_full_train_tensor = torch.FloatTensor(X_full_train_scaled)
y_full_train_tensor = torch.LongTensor(y_full_train.values)
full_train_dataset = TensorDataset(X_full_train_tensor, y_full_train_tensor)
print(f"Retraining on full training set: {X_full_train.shape[0]} samples")

Retraining on full training set: 427 samples


In [105]:
final_model = ESGRiskClassifier(input_dim, hidden_dims, num_classes, dropout=0.5).to(device)
optimizer = optim.AdamW(final_model.parameters(), 
                        lr=best_config['learning_rate'], 
                        weight_decay=best_config['weight_decay'])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)
train_loader = DataLoader(full_train_dataset, batch_size=best_config['batch_size'], shuffle=True)
best_train_loss = float('inf')
patience_counter = 0
early_stop_patience = 20
for epoch in range(best_config['num_epochs'] + 50):
    final_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = final_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(final_model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item() * batch_X.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_total += batch_y.size(0)
        train_correct += (predicted == batch_y).sum().item()
    train_loss = train_loss / train_total
    train_acc = train_correct / train_total
    scheduler.step(train_loss)
    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{best_config['num_epochs']+50}] Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= early_stop_patience:
        print(f"Early stopping at epoch {epoch+1}")
        break
final_model.eval()
print(f"Final model trained on {X_full_train.shape[0]} samples")

Epoch [20/250] Loss: 0.1824, Acc: 0.7869
Epoch [40/250] Loss: 0.1200, Acc: 0.8501
Epoch [60/250] Loss: 0.0992, Acc: 0.8618
Epoch 00073: reducing learning rate of group 0 to 1.5000e-04.
Epoch [80/250] Loss: 0.0994, Acc: 0.8735
Epoch 00097: reducing learning rate of group 0 to 7.5000e-05.
Epoch [100/250] Loss: 0.0770, Acc: 0.8689
Epoch [120/250] Loss: 0.0805, Acc: 0.8665
Epoch 00137: reducing learning rate of group 0 to 3.7500e-05.
Epoch [140/250] Loss: 0.0830, Acc: 0.8665
Epoch 00155: reducing learning rate of group 0 to 1.8750e-05.
Epoch [160/250] Loss: 0.0901, Acc: 0.8665
Early stopping at epoch 164
Final model trained on 427 samples


In [106]:
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        outputs = final_model(batch_X)
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_y.numpy())

test_accuracy = accuracy_score(all_labels, all_preds)
test_f1 = f1_score(all_labels, all_preds, average='weighted')
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.8026
Test F1 Score: 0.7950


In [107]:
class_names = ['Low', 'Medium', 'High']
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=class_names))
print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))


Classification Report:
              precision    recall  f1-score   support

         Low       0.88      0.93      0.90        40
      Medium       0.72      0.75      0.74        28
        High       0.60      0.38      0.46         8

    accuracy                           0.80        76
   macro avg       0.74      0.68      0.70        76
weighted avg       0.79      0.80      0.80        76


Confusion Matrix:
[[37  3  0]
 [ 5 21  2]
 [ 0  5  3]]


In [108]:
torch.save({
    'model_state_dict': final_model.state_dict(),
    'model_architecture': {
        'input_dim': input_dim,
        'hidden_dims': hidden_dims,
        'num_classes': num_classes,
        'dropout': 0.5
    },
    'best_config': best_config,
    'scaler_params': {
        'mean': scaler.mean_.tolist(),
        'scale': scaler.scale_.tolist()
    },
    'feature_columns': feature_cols,
    'label_mapping': label_mapping,
    'test_accuracy': test_accuracy,
    'test_f1_score': test_f1
}, model_dir / 'esg_risk_model.pt')

print(f"Model saved to {model_dir / 'esg_risk_model.pt'}")

Model saved to ..\models\esg_risk_model.pt


In [109]:
joblib.dump(scaler, model_dir / 'scaler.pkl')
print(f"Scaler saved to {model_dir / 'scaler.pkl'}")

Scaler saved to ..\models\scaler.pkl


In [110]:
metadata = {
    'model_type': 'ESGRiskClassifier',
    'input_features': len(feature_cols),
    'num_classes': num_classes,
    'class_names': class_names,
    'training_samples': len(X_train),
    'validation_samples': len(X_val),
    'test_samples': len(X_test),
    'test_accuracy': float(test_accuracy),
    'test_f1_score': float(test_f1),
    'best_hyperparameters': best_config,
    'feature_columns': feature_cols,
    'label_mapping': label_mapping,
    'architecture': {
        'input_dim': input_dim,
        'hidden_dims': hidden_dims,
        'num_classes': num_classes,
        'dropout': 0.5
    }
}

with open(model_dir / 'model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Metadata saved to {model_dir / 'model_metadata.json'}")

Metadata saved to ..\models\model_metadata.json


In [111]:
print("\n" + "="*60)
print("TRAINING COMPLETE")
print("="*60)
print(f"Final Test Accuracy: {test_accuracy:.4f}")
print(f"Final Test F1 Score: {test_f1:.4f}")
print(f"Best Configuration: {best_config}")
print(f"Model saved at: {model_dir / 'esg_risk_model.pt'}")
print("="*60)


TRAINING COMPLETE
Final Test Accuracy: 0.8026
Final Test F1 Score: 0.7950
Best Configuration: {'batch_size': 64, 'learning_rate': 0.0003, 'weight_decay': 0.0001, 'num_epochs': 200}
Model saved at: ..\models\esg_risk_model.pt
