<a href="https://colab.research.google.com/github/SZ330/EE344-Assignment-2/blob/main/Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Part 1: Outlier Detection and Removal using Cook's Distance**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# Print NumPy arrays with 4 digits of precision and in fixed-point notation
# (suppress=True turns off scientific notation for small values).
np.set_printoptions(precision=4, suppress=True)

I chose a linear regression model because Cook's distance is defined for linear regression: it measures how much the model's fitted values change when a single observation is removed, so it identifies influential data points. I fit the model with ordinary least squares (OLS) because Cook's distance is computed from the OLS residuals and leverage values.

In [7]:
def fit_ols_diagnostics(X, y):
    """Fit OLS and return (model, diagnostics dataframe).

    Parameters
    ----------
    X : array-like, shape (n,) or (n, p)
        Feature matrix (without intercept column).
    y : array-like, shape (n,)
        Target vector.

    Returns
    -------
    model : statsmodels results object
        Result of ``sm.OLS(y, X_with_const).fit()``.
    diag : pandas.DataFrame
        One row per observation with columns ``y``, ``y_hat``, ``residual``,
        ``leverage_hii`` and ``cooks_D``. If ``X`` (or, failing that, ``y``)
        is a pandas object, its index is reused so the diagnostics can be
        aligned back to the source rows; otherwise a default RangeIndex is
        used.
    """
    # Capture the row labels before the NumPy conversion below throws them
    # away. Without this, a frame whose index has gaps (e.g. after dropna())
    # cannot be matched to its diagnostics by label.
    row_index = None
    for candidate in (X, y):
        if isinstance(candidate, (pd.Series, pd.DataFrame)):
            row_index = candidate.index
            break

    X = np.asarray(X)
    y = np.asarray(y).reshape(-1)
    if X.ndim == 1:
        # A single predictor is promoted to one column
        X = X.reshape(-1, 1)

    # Add intercept column for statsmodels
    X_sm = sm.add_constant(X)
    model = sm.OLS(y, X_sm).fit()

    infl = OLSInfluence(model)
    diag = pd.DataFrame(
        {
            "y": y,
            "y_hat": model.fittedvalues,
            "residual": model.resid,
            "leverage_hii": infl.hat_matrix_diag,   # diagonal of Hat matrix H
            # cooks_distance is a (distances, p-values) pair; keep the distances
            "cooks_D": infl.cooks_distance[0],
        },
        index=row_index,
    )
    return model, diag

def plot_curve_fit(x, y, pipeline, title=""):
    """Scatter the observations and overlay the pipeline's predicted curve.

    Meant for a single predictor: ``x`` is reshaped into one column before
    it is plotted and before the prediction grid is built.
    """
    x_col = np.asarray(x).reshape(-1, 1)
    y_flat = np.asarray(y).reshape(-1)

    # 200 evenly spaced points across the observed x range give a smooth curve
    x_grid = np.linspace(x_col.min(), x_col.max(), 200).reshape(-1, 1)
    y_grid = pipeline.predict(x_grid)

    _, ax = plt.subplots(figsize=(7, 4))
    ax.scatter(x_col, y_flat)
    ax.plot(x_grid, y_grid)
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    plt.show()

In [24]:
# --- Load usina_with_outliers.csv (multivariate predictors) ---
df = pd.read_csv("usina_with_outliers.csv")
print("Rows loaded:", len(df))
print("Columns:", list(df.columns))
FEATURE_COLS = ["AT", "V", "AP", "RH"]
TARGET_COL = "PE"

# Discard every row that contains a missing value
df_clean_in = df.dropna().copy()

X = df_clean_in.loc[:, FEATURE_COLS]
y = df_clean_in.loc[:, TARGET_COL]

# Fit OLS on all remaining rows and collect per-row influence diagnostics
model_full, diag_full = fit_ols_diagnostics(X, y)
print(model_full.summary())

# Heuristic cut-off for Cook's distance: 4 divided by the number of rows
n = df_clean_in.shape[0]
threshold = 4 / n
print(f"Cook's distance heuristic threshold 4/n = {threshold:.6f}")

# Flag every observation whose Cook's distance exceeds the cut-off
diag_table = diag_full.copy()
diag_table["flag_D_gt_4_over_n"] = diag_table["cooks_D"] > threshold

# Most influential rows first; show both ends of the ranking
diag_table_sorted = diag_table.sort_values(by="cooks_D", ascending=False)
display(diag_table_sorted.head(15), diag_table_sorted.tail(15))


Rows loaded: 9568
Columns: ['AT', 'V', 'AP', 'RH', 'PE']
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.649
Model:                            OLS   Adj. R-squared:                  0.649
Method:                 Least Squares   F-statistic:                     4416.
Date:                Sat, 24 Jan 2026   Prob (F-statistic):               0.00
Time:                        16:51:30   Log-Likelihood:                -36621.
No. Observations:                9568   AIC:                         7.325e+04
Df Residuals:                    9563   BIC:                         7.329e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

Unnamed: 0,y,y_hat,residual,leverage_hii,cooks_D,flag_D_gt_4_over_n
4228,532.89197,367.627618,165.264352,0.027485,1.28376,True
49,370.64803,515.83375,-145.18572,0.027817,1.003422,True
875,541.43197,408.524788,132.907182,0.032505,0.992133,True
5922,373.10803,481.504793,-108.396763,0.041859,0.866516,True
2538,364.32803,559.133964,-194.805934,0.013496,0.851224,True
5480,358.96803,505.177891,-146.209861,0.023388,0.847862,True
9241,575.06197,410.364058,164.697912,0.017632,0.801592,True
7678,331.77803,476.20931,-144.43128,0.022602,0.798247,True
9525,343.19803,472.982395,-129.784365,0.026271,0.754847,True
9219,536.88197,394.454473,142.427497,0.019136,0.652604,True


Unnamed: 0,y,y_hat,residual,leverage_hii,cooks_D,flag_D_gt_4_over_n
3139,448.87,448.877793,-0.007793,0.000445,4.372253e-11,False
2373,463.65,463.656325,-0.006325,0.000649,4.204544e-11,False
9058,442.67,442.679633,-0.009633,0.000206,3.097179e-11,False
160,448.37,448.379786,-0.009786,0.000194,3.012523e-11,False
8950,431.19,431.194679,-0.004679,0.000817,2.897086e-11,False
3033,456.88,456.890712,-0.010712,0.000148,2.739085e-11,False
2245,449.34,449.349007,-0.009007,0.000206,2.702788e-11,False
3926,451.44,451.436356,0.003644,0.000392,8.434416e-12,False
871,433.62,433.624036,-0.004036,0.000308,8.116194e-12,False
7255,440.96,440.962203,-0.002203,0.000626,4.919034e-12,False


In [20]:
# Keep only non-outlier observations.
# diag_table holds one row per row of df_clean_in, in the same order, but its
# index labels are not guaranteed to match df_clean_in's (they differ as soon
# as dropna() removes a row). Build the mask by position and label it with
# df_clean_in's index so .loc always aligns. A length mismatch (e.g. a stale
# diag_table from an earlier run) raises here instead of mis-selecting rows.
mask_keep = pd.Series(
    ~diag_table["flag_D_gt_4_over_n"].to_numpy(dtype=bool),
    index=df_clean_in.index,
)

df_no_outliers = df_clean_in.loc[mask_keep].copy()
print("Original rows:", df_clean_in.shape[0])
print("Outliers removed:", (~mask_keep).sum())
print("Cleaned rows:", df_no_outliers.shape[0])

# Persist the cleaned data without the index column
df_no_outliers.to_csv("usina.csv", index=False)
print("Saved cleaned dataset as usina.csv")

Original rows: 9568
Outliers removed: 120
Cleaned rows: 9448
Saved cleaned dataset as usina.csv
