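# ML Model Training

Trains Random Forest and XGBoost classifiers to predict `ESG_Risk_Label` from ESG score features, evaluates both on a held-out test split, and saves the Random Forest model and the label encoders for the Streamlit app.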

In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


In [2]:
# --- Read the labeled, feature-engineered ESG table into `df` ---
df = pd.read_csv(filepath_or_buffer="final_esg_dataset_labeled.csv")


In [3]:
# --- Choose the model inputs (`features`) and the column to predict (`target`) ---
# NOTE(review): the evaluation output recorded in this notebook shows 1.00 on
# every metric for both models. That usually means the label can be derived
# directly from one of the inputs (presumably 'Total_ESG_Risk_Score' or
# 'Predicted_ESG_Score' -- TODO confirm how 'ESG_Risk_Label' was built) rather
# than that the models generalise perfectly.
target = 'ESG_Risk_Label'
features = [
    'Total_ESG_Risk_Score',
    'Predicted_ESG_Score',
    'ESG_Risk_Exposure',
    'ESG_Risk_Management',
    'Environment_Score',
    'Governance_Score',
    'Social_Score',
    'Controversy_Score',
]

# Target vector and feature matrix, both sliced from the loaded frame.
y = df[target]
X = df[features]


In [4]:
# --- Map the class names in `y` to integer codes ---
# `le` is kept around because later cells use it to name classes in reports
# and it is pickled alongside the model.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Learn the class set and encode `y` in a single pass.
y_encoded = le.fit_transform(y)


In [5]:
# Build a {class name -> integer code} lookup from the fitted encoder so that
# predictions can be decoded later. Nothing is written to disk here.
#
# Each code is cast to a plain Python int: `le.transform` returns NumPy integer
# scalars, which NumPy >= 2 prints as `np.int64(0)` and which the `json` module
# cannot serialise. The cast keeps the printed mapping in the `{'High': 0, ...}`
# form regardless of NumPy version.
label_map = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print("Label Encoding:", label_map)


Label Encoding: {'High': 0, 'Low': 1, 'Medium': 2}


In [6]:
# --- Hold out 20% of the rows for testing ---
# Stratifying on the encoded label keeps the class proportions the same in
# both partitions; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)


In [7]:
# --- Random Forest: fit on the training split, predict the held-out rows ---
# `fit` returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

rf_preds = rf.predict(X_test)


In [8]:
# --- Train XGBoost ---
# `use_label_encoder` is deliberately not passed. It is a deprecated parameter
# that current XGBoost releases ignore with a "Parameters ... are not used"
# warning. The target was already converted to integer codes by `le`, so
# XGBoost's own label encoding was never needed here.
# 'mlogloss' is the multi-class log-loss evaluation metric.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)


In [9]:
# --- Evaluation: print the same report for each fitted model ---
# Every entry pairs the heading to print with that model's test-set predictions.
for heading, predictions in (
    ("🔍 Random Forest Results:", rf_preds),
    ("\n🔍 XGBoost Results:", xgb_preds),
):
    print(heading)
    # Per-class precision / recall / F1, labelled with the original class names.
    print(classification_report(y_test, predictions, target_names=le.classes_))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))


🔍 Random Forest Results:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        30
         Low       1.00      1.00      1.00        56
      Medium       1.00      1.00      1.00        95

    accuracy                           1.00       181
   macro avg       1.00      1.00      1.00       181
weighted avg       1.00      1.00      1.00       181

Confusion Matrix:
 [[30  0  0]
 [ 0 56  0]
 [ 0  0 95]]

🔍 XGBoost Results:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00        30
         Low       1.00      1.00      1.00        56
      Medium       1.00      1.00      1.00        95

    accuracy                           1.00       181
   macro avg       1.00      1.00      1.00       181
weighted avg       1.00      1.00      1.00       181

Confusion Matrix:
 [[30  0  0]
 [ 0 56  0]
 [ 0  0 95]]


In [10]:
# --- Persist the Random Forest and the target label encoder to disk ---
import joblib

# Each (object, filename) pair is pickled with joblib into the working directory.
for artifact, filename in ((rf, "rf_esg_model.pkl"), (le, "label_encoder.pkl")):
    joblib.dump(artifact, filename)

print("✅ Saved Random Forest model and label encoder for Streamlit app.")

✅ Saved Random Forest model and label encoder for Streamlit app.


In [11]:
# --- Encode the categorical columns and persist their encoders ---
# Uses `pd`, `LabelEncoder` and `joblib` from the notebook's import cell.

# Reload the dataset from disk; this rebinds the `df` that was used for training.
df = pd.read_csv("final_esg_dataset_labeled.csv")

# One encoder per categorical column. Each encoder is ALWAYS fitted on its
# source column, even when "<column>_encoded" is already present in the CSV.
# That is the case on every run after the first, because the encoded frame is
# written back to the same file below. Skipping the fit in that situation would
# pickle unfitted encoders over the usable ones saved by the first run.
encoders = {}
for column in ("Sector", "Industry", "Controversy_Level"):
    encoder = LabelEncoder()
    # LabelEncoder numbers classes in sorted order, so refitting on the same
    # values yields the same codes and re-running this cell is idempotent.
    df[f"{column}_encoded"] = encoder.fit_transform(df[column])
    encoders[column] = encoder

# Keep the individual encoder names available to any later cell that uses them.
sector_encoder = encoders["Sector"]
industry_encoder = encoders["Industry"]
controversy_level_encoder = encoders["Controversy_Level"]

# Write the frame, with the encoded columns included, back to the source CSV.
# NOTE(review): this overwrites the input file in place; the original columns
# are retained, so nothing is lost.
df.to_csv("final_esg_dataset_labeled.csv", index=False)

# Save all three fitted encoders in one pickle, keyed by source column name.
joblib.dump(encoders, "label_encoders.pkl")

print("✅ Categorical columns encoded and saved.")


✅ Categorical columns encoded and saved.


In [18]:
# Display the column labels of `df` to confirm the three `*_encoded` columns are present.
df.columns

Index(['Symbol', 'Company Name', 'Sector', 'Industry', 'Description',
       'Total_ESG_Risk_Score', 'Predicted_ESG_Score', 'ESG_Risk_Exposure',
       'ESG_Risk_Management', 'ESG_Risk_Level', 'Environment_Score',
       'Governance_Score', 'Social_Score', 'Controversy_Level',
       'Controversy_Score', 'ESG_Risk_Label', 'Sector_encoded',
       'Industry_encoded', 'Controversy_Level_encoded'],
      dtype='object')