In [9]:
# 1. Imports & configuration
import os, sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Put the parent directory on sys.path so the `scripts` package can be imported.
sys.path.append(os.path.abspath('../'))

from scripts.config import PROCESSED_DATA_PATH

# Seed shared by every stochastic step (data split and forest construction).
RANDOM_STATE = 42

# Where the fitted pipeline is written; the path is relative to the notebook's
# working directory.
MODEL_DIR = Path('../models')
MODEL_PATH = MODEL_DIR.joinpath('esg_risk_model.joblib')

# Show up to 100 columns when a DataFrame is rendered.
pd.options.display.max_columns = 100

In [10]:
# 2. Load processed dataset
# Read the CSV at the configured path, report its (rows, columns) shape, and
# preview the first rows as the cell output.
raw_df = pd.read_csv(PROCESSED_DATA_PATH)
loaded_shape = raw_df.shape
print(f"Loaded shape: {loaded_shape}")
raw_df.head()

Loaded shape: (503, 15)


Unnamed: 0,Symbol,Name,Address,Sector,Industry,Full Time Employees,Description,Total ESG Risk score,Environment Risk Score,Governance Risk Score,Social Risk Score,Controversy Level,Controversy Score,ESG Risk Percentile,ESG Risk Level
0,ENPH,"Enphase Energy, Inc.","47281 Bayside Parkway\nFremont, CA 94538\nUnit...",Technology,Solar,3157,"Enphase Energy, Inc., together with its subsid...",21.05,4.05,6.1,8.9,,2.0,,
1,EMN,Eastman Chemical Company,"200 South Wilcox Drive\nKingsport, TN 37662\nU...",Basic Materials,Specialty Chemicals,14000,Eastman Chemical Company operates as a special...,25.3,12.8,6.6,5.8,Moderate Controversy Level,2.0,50th percentile,Medium
2,DPZ,Domino's Pizza Inc.,"30 Frank Lloyd Wright Drive\nAnn Arbor, MI 481...",Consumer Cyclical,Restaurants,6500,"Domino's Pizza, Inc., through its subsidiaries...",29.2,10.6,6.3,12.2,Moderate Controversy Level,2.0,66th percentile,Medium
3,DAY,"Dayforce, Inc.","3311 East Old Shakopee Road\nMinneapolis, MN 5...",Technology,Software - Application,9084,"Dayforce Inc., together with its subsidiaries,...",21.05,4.05,6.1,8.9,,2.0,,
4,DVA,Davita Inc.,"2000 16th Street\nDenver, CO 80202\nUnited States",Healthcare,Medical Care Facilities,70000,DaVita Inc. provides kidney dialysis services ...,22.6,0.1,8.4,14.1,Moderate Controversy Level,2.0,38th percentile,Medium


In [11]:
# 3. Basic cleaning & column normalization
#
# Produces two objects used by the following cells:
#   feature_df - model inputs with snake_case column names
#   labels     - integer target: 0 = Low, 1 = Medium, 2 = High
#
# Raises ValueError if any required source column is absent.

# Strip stray whitespace from header names so the exact-name lookups below work.
# hasattr() guards the case where the column index is not string-valued and the
# `.str` accessor is unavailable. Re-running this is a no-op.
if hasattr(raw_df.columns, 'str'):
    raw_df.columns = raw_df.columns.str.strip()

TARGET_COL = 'ESG Risk Level'

# Source column -> model feature name. Renaming by name rather than by position
# keeps every feature correctly labelled even if this selection is reordered.
FEATURE_RENAMES = {
    'Environment Risk Score': 'environment_risk_score',
    'Social Risk Score': 'social_risk_score',
    'Governance Risk Score': 'governance_risk_score',
    'Controversy Score': 'controversy_score',
}

# Placeholder constant for the employees feature.
# NOTE(review): the frame previewed after loading appears to contain a
# 'Full Time Employees' column - consider wiring it in here once the serving
# side is confirmed to send real head-counts.
DEFAULT_EMPLOYEES = 1000

# Text label (lower-cased, trimmed) -> class id.
# 'negligible' and 'severe' are folded into the nearest of the three model
# classes; previously any such value fell through to Medium.
# NOTE(review): these two come from the published five-level rating scale -
# confirm whether they actually occur in this dataset.
LABEL_TO_CLASS = {'negligible': 0, 'low': 0, 'medium': 1, 'high': 2, 'severe': 2}
FALLBACK_CLASS = 1  # Medium: used for missing or unrecognised labels

required_cols = [*FEATURE_RENAMES, TARGET_COL]
missing = [c for c in required_cols if c not in raw_df.columns]
if missing:
    raise ValueError(f"Dataset missing required columns: {missing}")

# Feature frame: select, rename, and append the placeholder column in one chain
# so a fresh frame is produced and raw_df is not modified.
feature_df = (
    raw_df[list(FEATURE_RENAMES)]
    .rename(columns=FEATURE_RENAMES)
    .assign(full_time_employees=DEFAULT_EMPLOYEES)
)


def _normalise_label(value):
    """Trim and lower-case string labels; pass non-strings (e.g. NaN) through."""
    return value.strip().lower() if isinstance(value, str) else value


# Encode the target without overwriting raw_df, so the information about which
# rows had no label survives and this cell reports the same counts on re-run.
target_raw = raw_df[TARGET_COL]
encoded = target_raw.map(_normalise_label).map(LABEL_TO_CLASS)

n_missing = int(target_raw.isna().sum())
unrecognised = sorted(
    target_raw[encoded.isna() & target_raw.notna()].astype(str).unique()
)
if n_missing:
    # Kept from the original behaviour: missing targets are treated as Medium.
    # NOTE(review): these rows carry imputed labels and also end up in the test
    # split; dropping them instead would give a more honest evaluation.
    print(f"Note: {n_missing} rows have no '{TARGET_COL}'; labelled as class {FALLBACK_CLASS} (Medium).")
if unrecognised:
    print(f"Warning: unrecognised '{TARGET_COL}' values labelled as class {FALLBACK_CLASS}: {unrecognised}")

labels = encoded.fillna(FALLBACK_CLASS).astype(int)
print(feature_df.head())
print(labels.value_counts())

   environment_risk_score  social_risk_score  governance_risk_score  \
0                    4.05                8.9                    6.1   
1                   12.80                5.8                    6.6   
2                   10.60               12.2                    6.3   
3                    4.05                8.9                    6.1   
4                    0.10               14.1                    8.4   

   controversy_score  full_time_employees  
0                2.0                 1000  
1                2.0                 1000  
2                2.0                 1000  
3                2.0                 1000  
4                2.0                 1000  
ESG Risk Level
1    266
0    187
2     50
Name: count, dtype: int64


In [12]:
# 4. Train / test split
# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and seeded with RANDOM_STATE so it is repeatable.
split_options = {
    'test_size': 0.2,
    'stratify': labels,
    'random_state': RANDOM_STATE,
}
X_train, X_test, y_train, y_test = train_test_split(feature_df, labels, **split_options)
(X_train.shape, X_test.shape)

((402, 5), (101, 5))

In [13]:
# 5. Build pipeline
# Two stages: feature standardisation followed by a 300-tree random forest
# configured with class_weight='balanced' and the shared seed.
forest = RandomForestClassifier(
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_estimators=300,
)
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('rf', forest)])

pipeline

In [14]:
# 6. Train pipeline
# Fit scaler and forest together on the training split.
pipeline.fit(X_train, y_train)
print("Pipeline trained.")

# Basic sanity check: the fitted object must expose probabilities and must have
# learned exactly the three expected class ids.
expected_classes = {0, 1, 2}
has_proba = hasattr(pipeline, 'predict_proba')
assert has_proba, "Pipeline missing predict_proba"
assert set(pipeline.classes_) == expected_classes, "Unexpected target classes"

Pipeline trained.


In [15]:
# 7. Evaluation
# Score the fitted pipeline on the held-out split: per-class report, confusion
# matrix, and macro-averaged F1.
#
# `f1_score` is imported with the other sklearn.metrics names in the imports
# cell rather than here, so all dependencies are declared in one place.

# Class ids and display names, matching the 0=Low / 1=Medium / 2=High encoding
# of the target. Passing the ids explicitly fixes the row/column order of every
# metric below and keeps all three classes present even if one is absent from
# a particular test split.
CLASS_IDS = [0, 1, 2]
CLASS_NAMES = ['Low', 'Medium', 'High']

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, labels=CLASS_IDS, target_names=CLASS_NAMES))
cm = confusion_matrix(y_test, y_pred, labels=CLASS_IDS)
print("Confusion Matrix:\n", cm)
macro_f1 = f1_score(y_test, y_pred, labels=CLASS_IDS, average='macro')
print(f"Macro F1: {macro_f1:.3f}")

              precision    recall  f1-score   support

           0       0.88      0.92      0.90        38
           1       0.89      0.89      0.89        53
           2       0.88      0.70      0.78        10

    accuracy                           0.88       101
   macro avg       0.88      0.84      0.85       101
weighted avg       0.88      0.88      0.88       101

Confusion Matrix:
 [[35  3  0]
 [ 5 47  1]
 [ 0  3  7]]
Macro F1: 0.854


In [16]:
# 8. Persist pipeline
# Create the model directory (and any missing parents) if needed, then write
# the fitted pipeline to MODEL_PATH with joblib.
MODEL_DIR.mkdir(exist_ok=True, parents=True)
joblib.dump(value=pipeline, filename=MODEL_PATH)
print(f"✅ Saved unified pipeline to {MODEL_PATH}")

✅ Saved unified pipeline to ..\models\esg_risk_model.joblib
