In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import joblib
import json
from pathlib import Path
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including the deprecation warnings libraries raise for outdated arguments.
warnings.filterwarnings('ignore')

In [None]:
# Reproducibility: one seed shared by NumPy and PyTorch.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Output directory for every artefact this notebook writes; created on first run.
model_dir = Path('../models')
model_dir.mkdir(exist_ok=True)

# Input table read by the loading cell.
data_path = Path('../data/processed/processed.csv')

In [None]:
# Load the processed dataset into `df` and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=data_path)
print(f"Loaded dataset: {df.shape}")

In [None]:
# Column holding the label to predict.
target_col = 'ESG Risk Level'

# Columns that are never used as model inputs.
exclude_cols = [
    'Symbol', 'Name', 'Address', 'Description', 'Sector', 'Industry',
    'Controversy Level', 'ESG Risk Level', 'ESG Risk Percentile', 'Employee Size',
]

# Columns listed as leaking the target; kept out of the feature matrix.
leakage_features = [
    'ESG_Component_Balance', 'ESG_Max_Component', 'ESG_Min_Component',
    'ESG_Risk_Above_Average', 'ESG_Risk_Category', 'Sector_Risk_Average',
    'ESG_vs_Sector_Average', 'High_Risk_Percentile', 'ESG Risk Percentile_Encoded',
    'ESG Risk Level_High', 'ESG Risk Level_Low', 'ESG Risk Level_Medium',
    'ESG Risk Level_Negligible', 'ESG Risk Level_Severe', 'ESG Risk Level_nan',
    'ESG_Risk_Category_Low', 'ESG_Risk_Category_Medium', 'ESG_Risk_Category_High',
    'ESG_Risk_Category_Severe', 'ESG_Risk_Category_nan',
]

# Features = numeric columns, in their original order, minus both lists above.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
non_feature_cols = set(exclude_cols).union(leakage_features)
feature_cols = [name for name in numeric_cols if name not in non_feature_cols]

In [None]:
# Feature matrix: selected columns with missing values replaced by 0.
X = df[feature_cols].fillna(0)

# Target series.
# NOTE(review): rows with a missing label are imputed as 'Medium' rather than
# dropped; confirm this is intended, since it adds labels the data never had.
y = df[target_col].fillna('Medium')

print(f"Features: {len(feature_cols)}")
print(f"Target distribution:\n{y.value_counts()}")

In [None]:
# Collapse the five risk levels into three ordinal classes (0, 1, 2).
label_mapping = {'Low': 0, 'Negligible': 0, 'Medium': 1, 'High': 2, 'Severe': 2}

# Labels absent from the mapping become NaN under .map(); they fall back to class 1.
y_encoded = y.map(label_mapping).fillna(1).astype(int)

# The class count is taken from the labels actually present in the data.
num_classes = y_encoded.nunique()
print(f"Classes: {sorted(y_encoded.unique())}")
print(f"Class distribution:\n{y_encoded.value_counts().sort_index()}")

In [None]:
# Stage 1: hold out 15% of all rows as the test set, stratified by class.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.15,
    stratify=y_encoded,
    random_state=RANDOM_SEED,
)

# Stage 2: carve validation out of the remaining 85%.
# 0.176 * 0.85 is roughly 0.15, i.e. about 15% of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.176,
    stratify=y_temp,
    random_state=RANDOM_SEED,
)
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
# Fit standardisation statistics on the training split only, then apply the
# same transform to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Features become float32 tensors; integer class ids become int64 tensors.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [None]:
# Pair each split's features with its labels so DataLoader can batch them.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(features, labels)
    for features, labels in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

In [None]:
class ESGRiskClassifier(nn.Module):
    """Fully connected classifier built from repeated hidden stages.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; a final
    Linear layer maps the last hidden width to ``num_classes`` logits.

    Args:
        input_dim: Number of input features.
        hidden_dims: Iterable of hidden-layer widths, in order.
        num_classes: Number of output logits.
        dropout: Dropout probability used after every hidden stage.
    """

    def __init__(self, input_dim, hidden_dims, num_classes, dropout=0.5):
        super().__init__()
        # widths[i] -> widths[i + 1] describes hidden stage i.
        widths = [input_dim, *hidden_dims]
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        # Output head producing raw (unnormalised) class scores.
        stages.append(nn.Linear(widths[-1], num_classes))
        self.network = nn.Sequential(*stages)

    def forward(self, x):
        """Return the logits produced by running ``x`` through the stack."""
        logits = self.network(x)
        return logits

In [None]:
# Network shape: one input per scaled feature column, three hidden layers.
input_dim = X_train_scaled.shape[1]
hidden_dims = [256, 128, 64]

# Build one instance purely to display the architecture and its size.
model = ESGRiskClassifier(input_dim, hidden_dims, num_classes, dropout=0.5).to(device)
print(f"Model architecture:\n{model}")

# Count all parameters, and separately those that receive gradients.
total_params = 0
trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
# Inverse-frequency class weights computed from the training labels, rescaled
# so that the weights sum to the number of classes.
class_counts = torch.bincount(y_train_tensor)
inverse_frequency = 1.0 / class_counts.float()
class_weights = (
    inverse_frequency / inverse_frequency.sum() * len(inverse_frequency)
).to(device)

# Weighted cross-entropy shared by the search and the final training run.
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
def train_with_config(batch_size, learning_rate, weight_decay, num_epochs):
    """Train a fresh ESGRiskClassifier for one hyperparameter configuration.

    Relies on the notebook-level ``train_dataset``, ``val_dataset``,
    ``criterion``, ``device``, ``input_dim``, ``hidden_dims`` and
    ``num_classes``.

    Args:
        batch_size: Mini-batch size for the training and validation loaders.
        learning_rate: Initial AdamW learning rate.
        weight_decay: AdamW weight-decay coefficient.
        num_epochs: Maximum number of epochs; training stops earlier after 15
            consecutive epochs without a lower validation loss.

    Returns:
        Tuple ``(best_val_loss, best_val_acc, model_instance)``. The metrics
        are those of the epoch with the lowest validation loss, and the
        returned model carries the weights from that same epoch.
    """
    # BatchNorm1d cannot normalise a one-sample batch in training mode, so the
    # trailing batch is dropped only when it would contain exactly one row.
    drop_single_tail = len(train_dataset) % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_single_tail)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model_instance = ESGRiskClassifier(input_dim, hidden_dims, num_classes, dropout=0.5).to(device)
    optimizer = optim.AdamW(model_instance.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # The deprecated `verbose` argument is omitted; it was False, so nothing is lost.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    best_val_loss = float('inf')
    best_val_acc = 0.0
    best_state = None  # snapshot of the weights at the best validation epoch
    patience_counter = 0
    early_stop_patience = 15

    for _epoch in range(num_epochs):
        # ---- optimisation pass over the training split ----
        model_instance.train()
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model_instance(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model_instance.parameters(), max_norm=1.0)
            optimizer.step()

        # ---- evaluation pass over the validation split ----
        model_instance.eval()
        val_loss_sum = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model_instance(batch_X)
                loss = criterion(outputs, batch_y)
                # Weight each batch loss by its size to get a per-sample average.
                val_loss_sum += loss.item() * batch_X.size(0)
                predicted = outputs.argmax(dim=1)
                val_total += batch_y.size(0)
                val_correct += (predicted == batch_y).sum().item()

        val_loss = val_loss_sum / val_total
        val_acc = val_correct / val_total

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_acc = val_acc
            patience_counter = 0
            # Clone the tensors: state_dict() returns live references that
            # later optimiser steps would otherwise overwrite.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model_instance.state_dict().items()
            }
        else:
            patience_counter += 1

        if patience_counter >= early_stop_patience:
            break

    # Return the weights that produced the reported metrics, not the
    # last-epoch weights.
    if best_state is not None:
        model_instance.load_state_dict(best_state)

    return best_val_loss, best_val_acc, model_instance

In [None]:
# Candidate values per hyperparameter; the search cell tries every combination.
hyperparameter_grid = dict(
    batch_size=[16, 32],
    learning_rate=[0.0001, 0.00005, 0.0002],
    weight_decay=[1e-3, 5e-4, 1e-4],
    num_epochs=[150],
)

In [None]:
# Running record of the best configuration seen so far.
best_config = None
best_val_loss = float('inf')
best_model_state = None
results = []

# Every (batch size, learning rate, weight decay, epochs) combination, in the
# order nested loops over the grid would visit them.
search_space = [
    (bs, rate, decay, n_epochs)
    for bs in hyperparameter_grid['batch_size']
    for rate in hyperparameter_grid['learning_rate']
    for decay in hyperparameter_grid['weight_decay']
    for n_epochs in hyperparameter_grid['num_epochs']
]

for batch_size, lr, wd, epochs in search_space:
    print(f"\nTesting: bs={batch_size}, lr={lr}, wd={wd}, epochs={epochs}")
    val_loss, val_acc, trained_model = train_with_config(batch_size, lr, wd, epochs)
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    config = {
        'batch_size': batch_size,
        'learning_rate': lr,
        'weight_decay': wd,
        'num_epochs': epochs
    }
    # One result row per configuration: its settings plus validation metrics.
    results.append({**config, 'val_loss': val_loss, 'val_acc': val_acc})

    # Keep the configuration with the lowest validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_config = config
        best_model_state = trained_model.state_dict()

print(f"\nBest configuration: {best_config}")
print(f"Best validation loss: {best_val_loss:.4f}")

In [None]:
# Merge the train and validation splits for the final fit.
X_full_train = pd.concat([X_train, X_val], axis=0)
y_full_train = pd.concat([y_train, y_val], axis=0)

# Refit the scaler on the merged data; this overwrites the statistics that
# were learned from the training split alone.
X_full_train_scaled = scaler.fit_transform(X_full_train)
X_full_train_tensor = torch.FloatTensor(X_full_train_scaled)
y_full_train_tensor = torch.LongTensor(y_full_train.values)
full_train_dataset = TensorDataset(X_full_train_tensor, y_full_train_tensor)

# The test features were scaled with the previous fit. Re-scale them with the
# refit scaler so the final model is evaluated on inputs standardised exactly
# like its training data (and like the scaler that gets saved).
X_test_scaled = scaler.transform(X_test)
X_test_tensor = torch.FloatTensor(X_test_scaled)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

print(f"Retraining on full training set: {X_full_train.shape[0]} samples")

In [None]:
# Final model: same architecture, trained on train+val with the best settings.
final_model = ESGRiskClassifier(input_dim, hidden_dims, num_classes, dropout=0.5).to(device)
optimizer = optim.AdamW(final_model.parameters(),
                        lr=best_config['learning_rate'],
                        weight_decay=best_config['weight_decay'])
# The deprecated `verbose` argument is omitted; learning-rate reductions are
# reported explicitly inside the loop instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

final_batch_size = best_config['batch_size']
# BatchNorm1d cannot normalise a one-sample batch in training mode, so the
# trailing batch is dropped only when it would contain exactly one row.
train_loader = DataLoader(full_train_dataset, batch_size=final_batch_size, shuffle=True,
                          drop_last=len(full_train_dataset) % final_batch_size == 1)

# No validation split is left, so the scheduler and early stopping both
# monitor the training loss.
total_epochs = best_config['num_epochs'] + 50
best_train_loss = float('inf')
patience_counter = 0
early_stop_patience = 20
for epoch in range(total_epochs):
    final_model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = final_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(final_model.parameters(), max_norm=1.0)
        optimizer.step()
        # Weight each batch loss by its size to get a per-sample average.
        train_loss += loss.item() * batch_X.size(0)
        predicted = outputs.argmax(dim=1)
        train_total += batch_y.size(0)
        train_correct += (predicted == batch_y).sum().item()
    train_loss = train_loss / train_total
    train_acc = train_correct / train_total

    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(train_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after < lr_before:
        print(f"Epoch {epoch+1}: reducing learning rate to {lr_after:.4e}")

    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{total_epochs}] Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= early_stop_patience:
        print(f"Early stopping at epoch {epoch+1}")
        break
# Switch to inference mode for the evaluation that follows.
final_model.eval()
print(f"Final model trained on {X_full_train.shape[0]} samples")

In [None]:
# Score the held-out test split in fixed-order batches of 64.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
all_preds, all_labels = [], []

with torch.no_grad():
    for features, labels in test_loader:
        logits = final_model(features.to(device))
        # Predicted class = index of the largest logit in each row.
        predicted = logits.argmax(dim=1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.numpy())

test_accuracy = accuracy_score(all_labels, all_preds)
# Weighted F1 averages the per-class scores by class support.
test_f1 = f1_score(all_labels, all_preds, average='weighted')
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

In [None]:
# Display names for encoded classes 0, 1 and 2, in that order.
class_names = ['Low', 'Medium', 'High']

print("\nClassification Report:")
report_text = classification_report(all_labels, all_preds, target_names=class_names)
print(report_text)

print("\nConfusion Matrix:")
test_confusion = confusion_matrix(all_labels, all_preds)
print(test_confusion)

In [None]:
# Bundle the weights with everything needed to rebuild the model and its
# preprocessing at inference time.
checkpoint_path = model_dir / 'esg_risk_model.pt'
torch.save({
    'model_state_dict': final_model.state_dict(),
    'model_architecture': {
        'input_dim': input_dim,
        'hidden_dims': hidden_dims,
        'num_classes': num_classes,
        'dropout': 0.5
    },
    'best_config': best_config,
    'scaler_params': {
        'mean': scaler.mean_.tolist(),
        'scale': scaler.scale_.tolist()
    },
    'feature_columns': feature_cols,
    'label_mapping': label_mapping,
    # Cast to built-in floats, matching the JSON metadata, so the checkpoint
    # holds only tensors and plain Python values. NumPy scalars would be
    # pickled as NumPy objects, which torch.load(weights_only=True) rejects.
    'test_accuracy': float(test_accuracy),
    'test_f1_score': float(test_f1)
}, checkpoint_path)

print(f"Model saved to {checkpoint_path}")

In [None]:
# Persist the fitted scaler next to the model checkpoint.
scaler_path = model_dir / 'scaler.pkl'
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to {scaler_path}")

In [None]:
# Constructor arguments needed to rebuild the network.
architecture_spec = {
    'input_dim': input_dim,
    'hidden_dims': hidden_dims,
    'num_classes': num_classes,
    'dropout': 0.5
}

# Human-readable summary of the run: split sizes, test metrics, chosen
# hyperparameters and the feature/label schema.
metadata = {
    'model_type': 'ESGRiskClassifier',
    'input_features': len(feature_cols),
    'num_classes': num_classes,
    'class_names': class_names,
    'training_samples': len(X_train),
    'validation_samples': len(X_val),
    'test_samples': len(X_test),
    'test_accuracy': float(test_accuracy),
    'test_f1_score': float(test_f1),
    'best_hyperparameters': best_config,
    'feature_columns': feature_cols,
    'label_mapping': label_mapping,
    'architecture': architecture_spec
}

metadata_path = model_dir / 'model_metadata.json'
with open(metadata_path, 'w') as f:
    f.write(json.dumps(metadata, indent=2))

print(f"Metadata saved to {metadata_path}")

In [None]:
# Closing banner that recaps the headline numbers and the artefact location.
banner = "="*60
print("\n" + banner)
print("TRAINING COMPLETE")
print(banner)
for summary_line in (
    f"Final Test Accuracy: {test_accuracy:.4f}",
    f"Final Test F1 Score: {test_f1:.4f}",
    f"Best Configuration: {best_config}",
    f"Model saved at: {model_dir / 'esg_risk_model.pt'}",
):
    print(summary_line)
print(banner)