# 🧠 Regression Diagnostics: Residual Analysis for Model Validation

This notebook validates the assumptions of our ESG vs. Return linear model using 
residual plots and formal statistical tests. It helps assess linearity, heteroscedasticity, 
residual normality, autocorrelation, and outlier influence — a critical step in institutional-grade financial modeling and risk evaluation.

---

### 🔍 Techniques Used:
- Fitted vs Residual plot
- Zero-line diagnostic overlay
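- Extended diagnostics: normality (histogram, Q–Q plot, Shapiro–Wilk), heteroscedasticity (Breusch–Pagan), autocorrelation (Durbin–Watson), and outliers/influence (leverage plot sized by Cook's distance)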


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import OLSInfluence, variance_inflation_factor
from statsmodels.formula.api import ols
import numpy as np

# 📥 Read the dataset that still carries the original (untransformed) variables
df_full = pd.read_csv(
    '../data/fund_esg_scores_and_returns.csv'
)

# Re-estimate the simple OLS model (response: Annual_Return_%, regressors:
# intercept + ESG_Score) so its fitted values and residuals are available
# to every diagnostic cell below.
y = df_full['Annual_Return_%']
X = sm.add_constant(df_full['ESG_Score'])
model = sm.OLS(endog=y, exog=X).fit()

# Attach per-row fitted values and residuals as new columns for plotting.
df_full = df_full.assign(
    Fitted=model.fittedvalues,
    Residual=model.resid,
)

# Influence / leverage measures derived from the fitted model.
influence = model.get_influence()


In [None]:
# 📊 Residuals vs Fitted Plot
# Scatter of each observation's residual against its fitted value, with a
# dashed zero line as the visual reference for "no systematic error".
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df_full,
    x='Fitted',
    y='Residual',
    color='darkslateblue',
    edgecolor='white',
    ax=ax,
)
ax.axhline(0, color='red', linestyle='--', linewidth=1.2)
ax.set_title('OLS Regression Residuals vs Fitted Values')
ax.set_xlabel('Fitted Return (%)')
ax.set_ylabel('Residual')
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

In [None]:
# 📈 Histogram + QQ Plot
# Two side-by-side views of the residual distribution: an empirical
# histogram with a KDE overlay (left) and a normal Q-Q plot (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, qq_ax = axes
residuals = df_full['Residual']

# Left panel: histogram, with a dashed vertical marker at zero
sns.histplot(residuals, bins=20, kde=True, color='slateblue', ax=hist_ax)
hist_ax.axvline(0, color='black', linestyle='--')
hist_ax.set_title('Histogram of Residuals')

# Right panel: Q-Q plot; line='s' draws the standardized reference line
sm.qqplot(residuals, line='s', ax=qq_ax)
qq_ax.set_title('QQ Plot')

fig.tight_layout()
plt.show()

In [None]:
# 📐 Statistical Tests
# Formal checks on the OLS residuals, complementing the visual diagnostics.
residuals = df_full['Residual']

# Durbin-Watson (autocorrelation)
# NOTE(review): the statistic is built from successive residual differences,
# so it depends on row order — confirm the rows of the dataset have a
# meaningful ordering before interpreting it.
dw_stat = durbin_watson(residuals)

# Breusch-Pagan (heteroscedasticity), using the model's own design matrix
# (which includes the constant column) as the explanatory variables.
bp_test = het_breuschpagan(residuals, model.model.exog)
bp_labels = ['LM stat', 'LM p-value', 'F stat', 'F p-value']
# Cast to plain floats and format explicitly: printing the raw dict would
# show NumPy scalar reprs (e.g. `np.float64(...)` on NumPy >= 2) at full,
# unreadable precision.
bp_results = {label: float(value) for label, value in zip(bp_labels, bp_test)}
bp_summary = ", ".join(f"{label}={value:.4g}" for label, value in bp_results.items())

# Shapiro-Wilk (normality)
shapiro_stat, shapiro_p = stats.shapiro(residuals)

# Print results
print(f"Durbin-Watson statistic: {dw_stat:.3f}")
print(f"Breusch-Pagan test: {bp_summary}")
# `.3g` keeps very small p-values visible instead of rounding them to 0.000.
print(f"Shapiro-Wilk test: stat={shapiro_stat:.3f}, p-value={shapiro_p:.3g}")

In [None]:
# 📏 Influence and Leverage Plot
# Draws statsmodels' influence plot for the fitted model on a dedicated
# axes; criterion="cooks" bases the point sizes on Cook's distance.
fig, ax = plt.subplots(figsize=(8, 5))
sm.graphics.influence_plot(
    model,
    criterion="cooks",
    ax=ax,
)
fig.tight_layout()
plt.show()