# Feature Engineering For ML Model

In [None]:
import pandas as pd
import numpy as np

# --- Load combined dataset ---
# Reads the CSV (path is relative, so it is resolved against the current
# working directory) into `df`, the frame the later cells operate on.
df = pd.read_csv("final_esg_dataset.csv")

In [2]:
# Show the column labels of the loaded frame (rendered as the cell output).
df.columns

Index(['Symbol', 'Company Name', 'Sector', 'Industry', 'Description',
       'Total_ESG_Risk_Score', 'Predicted_ESG_Score', 'ESG_Risk_Exposure',
       'ESG_Risk_Management', 'ESG_Risk_Level', 'Environment_Score',
       'Governance_Score', 'Social_Score', 'Controversy_Level',
       'Controversy_Score'],
      dtype='object')

In [3]:
# --- Coerce score columns to a numeric dtype ---
# Columns that are expected to hold numbers. Any entry that cannot be parsed
# as a number is turned into NaN by errors='coerce' instead of raising.
numeric_cols = [
    'Total_ESG_Risk_Score',
    'Predicted_ESG_Score',
    'ESG_Risk_Exposure',
    'ESG_Risk_Management',
    'Environment_Score',
    'Governance_Score',
    'Social_Score',
    'Controversy_Score',
]

# Convert each listed column independently and write it back under its own name.
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce')
    for name in numeric_cols
})


In [7]:
# --- Remove rows with a missing value in any numeric column ---
# A row is discarded as soon as one of the `numeric_cols` entries is NaN;
# afterwards the index is renumbered from 0 and the old index is thrown away.
df = (
    df
    .dropna(subset=numeric_cols, how="any")
    .reset_index(drop=True)
)

In [9]:
# --- ESG Risk Labeling ---
def label_risk(score, low_max=20, medium_max=40):
    """Map a numeric ESG risk score to a categorical risk label.

    Parameters
    ----------
    score : int, float or missing
        The risk score to classify.
    low_max : int or float, default 20
        Inclusive upper bound of the "Low" band.
    medium_max : int or float, default 40
        Inclusive upper bound of the "Medium" band.

    Returns
    -------
    str or float
        "Low" if ``score <= low_max``, "Medium" if ``score <= medium_max``,
        "High" otherwise. A missing score (None / NaN) yields NaN rather than
        a label.
    """
    # Every comparison against NaN evaluates to False, so without this guard a
    # missing score would fall through both checks and be labeled "High".
    if pd.isna(score):
        return np.nan
    if score <= low_max:
        return "Low"
    if score <= medium_max:
        return "Medium"
    return "High"

# Label every row by passing its total risk score through label_risk, element by element.
df['ESG_Risk_Label'] = df['Total_ESG_Risk_Score'].map(label_risk)


In [11]:
# --- Show how many rows received each risk label ---
# Header and counts are emitted on separate lines by using a newline separator.
print("Label counts:", df['ESG_Risk_Label'].value_counts(), sep="\n")


Label counts:
ESG_Risk_Label
Medium    475
Low       281
High      147
Name: count, dtype: int64


In [13]:
# --- Save feature-engineered dataset ---
# index=False leaves the row index out of the CSV, so only the data columns
# are written.
df.to_csv("final_esg_dataset_labeled.csv", index=False)
print("✅ Feature-engineered dataset saved as 'final_esg_dataset_labeled.csv'")
# Print the (rows, columns) shape of the frame that was just written.
print("Final shape:", df.shape)


✅ Feature-engineered dataset saved as 'final_esg_dataset_labeled.csv'
Final shape: (903, 16)
