In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.pipeline import Pipeline
import joblib

# Plot styling shared by every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')
plt.rcParams['figure.dpi'] = 100     # on-screen resolution
plt.rcParams['savefig.dpi'] = 300    # resolution used when a figure is saved
plt.rcParams['font.size'] = 10
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['axes.labelsize'] = 10

# pandas display options: three decimals, never truncate the column list.
pd.set_option('display.precision', 3)
pd.set_option('display.max_columns', None)

# Printed last so the message only appears once all configuration succeeded.
print('Libraries loaded successfully')

In [None]:
# Paths are relative to the notebook's working directory.
DATA_PATH = Path('../data/raw/dataset.csv')
MODEL_DIR = Path('../models')
MODEL_PATH = MODEL_DIR / 'esg_risk_model.joblib'
RANDOM_STATE = 42  # seed reused by the train/test split and the classifier

# Load the raw dataset and report its size before any cleaning is applied.
df = pd.read_csv(DATA_PATH)
print(f'Dataset Shape: {df.shape[0]:,} rows x {df.shape[1]} columns')
print(f'Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\n')
df.head(10)

In [None]:
# Schema overview: column dtypes and non-null counts, printed by DataFrame.info().
print('Dataset Information', '=' * 80, sep='\n')
df.info(verbose=True, show_counts=True)

# Descriptive statistics; kept as the final expression so the notebook
# renders the resulting table as rich output.
print('\n' + '=' * 80, 'Statistical Summary', '=' * 80, sep='\n')
df.describe()

In [None]:
# Normalise header whitespace so the column names below match exactly.
df.columns = df.columns.str.strip()

feature_cols = ['Environment Risk Score', 'Social Risk Score', 'Governance Risk Score', 'Controversy Score']
target_col = 'ESG Risk Level'

print(f'Target Variable: {target_col}')
print(f'Feature Variables: {feature_cols}\n')

# Rows with a missing target are assigned to the 'Medium' class.
df[target_col] = df[target_col].fillna('Medium')

# Left panel: absolute class counts. Right panel: the same counts as percentages.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

df[target_col].value_counts().plot(kind='bar', ax=axes[0], color='steelblue', edgecolor='black', alpha=0.8)
axes[0].set_title('ESG Risk Level Distribution', fontweight='bold')
axes[0].set_xlabel('Risk Level')
axes[0].set_ylabel('Count')
axes[0].grid(axis='y', alpha=0.3)

df[target_col].value_counts().plot(kind='pie', ax=axes[1], autopct='%1.1f%%', startangle=90, colors=sns.color_palette('Set2'))
axes[1].set_title('ESG Risk Level Percentage', fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

# Print the header before the counts it introduces.
print('\nClass Distribution:')
print(df[target_col].value_counts().to_string())

In [None]:
# Work on a copy so the source frame keeps its original column names.
X = df[feature_cols].copy()
X.columns = ['environment_risk_score', 'social_risk_score', 'governance_risk_score', 'controversy_score']

print('Missing Values Before Imputation:')
print(X.isnull().sum())

# Fill gaps with each column's median.
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split performed later in the notebook, so test rows influence
# the imputed values. Consider imputing inside the pipeline instead.
X = X.fillna(X.median())

print('\nMissing Values After Imputation:')
print(X.isnull().sum())

# Encode the target as integers. Any label not present in label_map becomes
# NaN after .map() and is then coded as 1 (Medium) by the fillna below.
label_map = {'Low': 0, 'Medium': 1, 'High': 2}
y = df[target_col].map(label_map).fillna(1).astype(int)

print(f'\nFeature Matrix Shape: {X.shape}')
print(f'Label Vector Shape: {y.shape}')

# One histogram per feature on a 2x2 grid, with the mean marked in red.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for idx, col in enumerate(X.columns):
    ax = axes[idx // 2, idx % 2]
    X[col].hist(bins=35, ax=ax, color=sns.color_palette('Set2')[idx], edgecolor='black', alpha=0.7)
    ax.axvline(X[col].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {X[col].mean():.2f}')
    ax.set_title(f'{col.replace("_", " ").title()} Distribution', fontweight='bold')
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

# The layout must be adjusted before the figure is rendered.
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the feature columns, shown as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
correlation = X.corr()

sns.heatmap(correlation, annot=True, fmt='.3f', cmap='RdYlBu_r', square=True,
            linewidths=1.5, cbar_kws={'shrink': 0.8}, vmin=-1, vmax=1, center=0, ax=ax)
ax.set_title('Feature Correlation Matrix', fontweight='bold', fontsize=14, pad=20)

plt.tight_layout()
plt.show()

# List every unordered feature pair once; slicing from i+1 skips the diagonal
# and the mirrored lower triangle.
print('\nCorrelation Analysis:')
for i, col1 in enumerate(X.columns):
    for col2 in X.columns[i+1:]:
        corr_val = correlation.loc[col1, col2]
        print(f'{col1} <-> {col2}: {corr_val:.3f}')

In [None]:
# Hold out 20% of the rows for testing, stratifying on y and seeding the
# shuffle with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print('Train/Test Split (80/20):', '=' * 80, sep='\n')
print(
    f'Training Samples: {X_train.shape[0]:,} ({X_train.shape[0]/len(X)*100:.1f}%)',
    f'Testing Samples:  {X_test.shape[0]:,} ({X_test.shape[0]/len(X)*100:.1f}%)',
    sep='\n',
)

# Per-class counts for each partition, ordered by encoded label.
for split_title, split_labels in (
    ('\nTraining Set - Class Distribution:', y_train),
    ('\nTesting Set - Class Distribution:', y_test),
):
    print(split_title)
    print(split_labels.value_counts().sort_index().to_string())

In [None]:
# Two-step pipeline: standardise the features, then classify with a random forest.
# The step names matter: later cells look the forest up as named_steps['classifier'].
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=300,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            class_weight='balanced',
            random_state=RANDOM_STATE,
            n_jobs=-1,
            verbose=0,
        ),
    ),
])

# Summarise the pipeline as "<step name>: <estimator class>".
print('ML Pipeline Architecture:', '=' * 80, sep='\n')
for step_name, step_obj in pipeline.steps:
    print(f'{step_name}: {step_obj.__class__.__name__}')
print('=' * 80)

In [None]:
# Fit the full pipeline (scaler + classifier) on the training partition only.
print('Training model...')
pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the final estimator's score (mean accuracy
# for a classifier).
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print('\nTraining Complete')
print('=' * 80)
print(f'Training Accuracy:   {train_score:.4f} ({train_score*100:.2f}%)')
print(f'Testing Accuracy:    {test_score:.4f} ({test_score*100:.2f}%)')
# Absolute train/test difference, reported as a rough overfitting indicator.
print(f'Generalization Gap:  {abs(train_score - test_score):.4f} ({abs(train_score - test_score)*100:.2f}%)')
print('=' * 80)

In [None]:
# 5-fold cross-validation on the training partition; the pipeline is re-fit
# on each fold, so the scaler only ever sees that fold's training rows.
print('Performing 5-Fold Cross-Validation...')
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

print('=' * 80)
print(f'Fold Scores: {["{:.4f}".format(score) for score in cv_scores]}')
print(f'Mean CV Accuracy: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}')
print(f'Min CV Accuracy:  {cv_scores.min():.4f}')
print(f'Max CV Accuracy:  {cv_scores.max():.4f}')
print('=' * 80)

In [None]:
# Hard predictions and per-class probabilities for the held-out test set.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)

# target_names follow the integer encoding 0/1/2 = Low/Medium/High.
print('Classification Report:')
print('=' * 80)
print(classification_report(y_test, y_pred, target_names=['Low Risk (0)', 'Medium Risk (1)', 'High Risk (2)'], digits=4))

# Macro F1 weights every class equally; weighted F1 weights by class support.
macro_f1 = f1_score(y_test, y_pred, average='macro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print('Overall Performance Metrics:')
print('=' * 80)
print(f'Accuracy:          {accuracy:.4f}')
print(f'Macro F1 Score:    {macro_f1:.4f}')
print(f'Weighted F1 Score: {weighted_f1:.4f}')
print('=' * 80)

In [None]:
# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', square=True, linewidths=2,
            xticklabels=['Low', 'Medium', 'High'],
            yticklabels=['Low', 'Medium', 'High'],
            cbar_kws={'shrink': 0.8}, ax=ax)
ax.set_title('Confusion Matrix - ESG Risk Prediction', fontweight='bold', fontsize=14, pad=20)
ax.set_xlabel('Predicted Risk Level', fontweight='bold')
ax.set_ylabel('Actual Risk Level', fontweight='bold')

plt.tight_layout()
plt.show()

print('\nConfusion Matrix (Raw Counts):')
print('=' * 80)
print(cm)

# Row-normalise so each actual-class row sums to 100%. The new axis turns
# the row totals into a column vector so the division broadcasts per row.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100

print('\nNormalized Confusion Matrix (Percentages):')
print('=' * 80)
print(pd.DataFrame(cm_normalized,
                   index=['Actual Low', 'Actual Medium', 'Actual High'],
                   columns=['Pred Low', 'Pred Medium', 'Pred High']).round(2))

In [None]:
# Importances come from the fitted forest inside the pipeline; they line up
# with X.columns because the pipeline was fitted on frames with these columns.
feature_importance = pipeline.named_steps['classifier'].feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
colors = sns.color_palette('viridis', len(importance_df))
ax.barh(importance_df['Feature'], importance_df['Importance'], color=colors, edgecolor='black')
ax.set_xlabel('Importance Score', fontweight='bold')
ax.set_ylabel('Features', fontweight='bold')
ax.set_title('Feature Importance - Random Forest Classifier', fontweight='bold', fontsize=14)
ax.invert_yaxis()  # most important feature at the top
ax.grid(axis='x', alpha=0.3)

# Annotate each bar with its value. enumerate supplies the bar position,
# since the frame's own index is no longer 0..n-1 after sorting.
for i, (idx, row) in enumerate(importance_df.iterrows()):
    ax.text(row['Importance'] + 0.01, i, f"{row['Importance']:.4f}", va='center')

plt.tight_layout()
plt.show()

print('\nFeature Importance Ranking:')
print('=' * 80)
for i, row in importance_df.iterrows():
    print(f"{row['Feature']:30s}: {row['Importance']:.4f} ({row['Importance']*100:.2f}%)")
print('=' * 80)

In [None]:
# Persist the whole fitted pipeline (scaler + classifier) as a single artifact.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)

# Read the size back from disk to confirm the file was actually written.
file_size = MODEL_PATH.stat().st_size
print('Model Saved Successfully')
print('=' * 80)
print(f'Location: {MODEL_PATH}')
print(f'Size: {file_size / 1024:.2f} KB ({file_size / (1024*1024):.2f} MB)')
print(f'Components: {len(pipeline.steps)} pipeline steps')
print(f'Estimators: {pipeline.named_steps["classifier"].n_estimators}')
print('=' * 80)

In [None]:
# Round-trip check: reload the saved artifact and predict on the first
# (up to) 10 test rows. .iloc makes the positional slicing explicit.
loaded_model = joblib.load(MODEL_PATH)
verification_predictions = loaded_model.predict(X_test.iloc[:10])
verification_probas = loaded_model.predict_proba(X_test.iloc[:10])

print('Model Verification:')
print('=' * 80)
print(f'Loaded Model Type: {type(loaded_model).__name__}')
print(f'Pipeline Steps: {list(loaded_model.named_steps.keys())}')
print(f'Number of Classes: {len(loaded_model.classes_)}')
print(f'Classes: {loaded_model.classes_}')

print('\nFirst 10 Test Samples - Predictions:')
print('=' * 80)
comparison_df = pd.DataFrame({
    'Actual': y_test.iloc[:10].values,
    'Predicted': verification_predictions,
    'Match': y_test.iloc[:10].values == verification_predictions
})
print(comparison_df.to_string())
print('=' * 80)
# The denominator is the real sample count, so the line stays correct even
# when the test set holds fewer than 10 rows.
print(f'Accuracy on sample: {comparison_df["Match"].sum()}/{len(comparison_df)} ({comparison_df["Match"].mean()*100:.1f}%)')
print('Model verified and ready for production deployment')

In [None]:
# Three hand-written examples. The column names must match the renamed
# feature columns the pipeline was trained on.
sample_data = pd.DataFrame({
    'environment_risk_score': [15.5, 35.2, 8.1],
    'social_risk_score': [12.3, 28.7, 5.6],
    'governance_risk_score': [10.1, 25.4, 4.2],
    'controversy_score': [5.0, 45.0, 2.0]
})

predictions = loaded_model.predict(sample_data)
probabilities = loaded_model.predict_proba(sample_data)

# Inverse of the integer label encoding used for training.
risk_labels = {0: 'Low Risk', 1: 'Medium Risk', 2: 'High Risk'}

print('Sample Predictions - Demonstration:')
print('=' * 80)
for i in range(len(sample_data)):
    print(f"\nSample {i+1}:")
    print(f"  Environment Risk Score: {sample_data.iloc[i]['environment_risk_score']:.1f}")
    print(f"  Social Risk Score:      {sample_data.iloc[i]['social_risk_score']:.1f}")
    print(f"  Governance Risk Score:  {sample_data.iloc[i]['governance_risk_score']:.1f}")
    print(f"  Controversy Score:      {sample_data.iloc[i]['controversy_score']:.1f}")
    print(f"  Predicted Risk Level: {risk_labels[predictions[i]]}")
    print(f"  Confidence Distribution:")
    # Probability columns are indexed 0/1/2 here, assuming the fitted model
    # saw all three encoded classes.
    print(f"    Low Risk:    {probabilities[i][0]:.4f} ({probabilities[i][0]*100:.2f}%)")
    print(f"    Medium Risk: {probabilities[i][1]:.4f} ({probabilities[i][1]*100:.2f}%)")
    print(f"    High Risk:   {probabilities[i][2]:.4f} ({probabilities[i][2]*100:.2f}%)")
print('=' * 80)