# Project 3 — Bias–Variance with Polynomial Regression

This notebook explores bias–variance trade-offs by fitting polynomial regression models of increasing degree.

## Objectives
- Load the dataset from a URL (or local fallback).
- Perform quick EDA and visualize the target.
- Train polynomial models for degrees 1…15.
- Compare Train/Validation/Test errors.
- Optionally add **Ridge**/**Lasso** regularization.

Dataset columns:
- `x` — feature
- `y` — target
- `split` — one of `train`, `val`, `test`


## 1) Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from urllib.error import URLError
import io, sys, os
# Print the numpy and pandas versions in use.
print(f"Versions -> numpy {np.__version__} pandas {pd.__version__}")

## 2) Load data (URL first, local fallback)
Set `DATA_URL` to your **raw** CSV URL (GitHub/Drive), e.g.

```
DATA_URL = "https://raw.githubusercontent.com/<user>/<repo>/main/project3_bias_variance_curve.csv"
```

If loading from the URL fails, the loader falls back to reading the CSV from the local path in `LOCAL_PATH`.

In [None]:
# Raw CSV URL that load_dataset() tries first.
DATA_URL = "https://raw.githubusercontent.com/PereEs/Project_AI_ML/main/project3_bias_variance_curve.csv"  # <- change if needed
# Local CSV path that load_dataset() reads when the URL cannot be loaded.
LOCAL_PATH = "project3_bias_variance_curve.csv"  # fallback

def load_dataset(url: str, local_path: str) -> pd.DataFrame:
    """Read the dataset CSV from *url*, falling back to *local_path*.

    Args:
        url: Location passed to ``pd.read_csv`` first.
        local_path: Location passed to ``pd.read_csv`` if reading *url* fails.

    Returns:
        The DataFrame parsed from whichever source was read successfully.

    Raises:
        Exception: The error raised while reading *local_path* is re-raised
            when both attempts fail.
    """
    # Only the read itself sits inside each ``try`` so that an unrelated
    # failure (e.g. while printing) is never mistaken for a load failure.
    try:
        frame = pd.read_csv(url)
    except Exception as url_error:
        print('URL load failed:', url_error)
        # The fallback stays nested in this handler so that a local failure
        # is chained to the URL failure in the traceback.
        try:
            frame = pd.read_csv(local_path)
        except Exception as local_error:
            print('Local load failed:', local_error)
            raise
        else:
            print('Loaded local file:', local_path)
            return frame
    else:
        print('Loaded from URL:', url)
        return frame

# Load the dataset (URL first, local file as fallback) and preview its first rows.
df = load_dataset(DATA_URL, LOCAL_PATH)
df.head()

## 3) Quick EDA

In [None]:
# Summary statistics for every column, followed by the row count of each split.
display(df.describe(include='all'))
print('\nCounts by split:')
print(df['split'].value_counts())

# One scatter plot of y against x per split, each in its own figure.
for split_name in ('train', 'val', 'test'):
    rows = df.loc[df['split'] == split_name]
    plt.figure()
    plt.scatter(rows['x'], rows['y'])
    plt.title(f'x vs y — {split_name}')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()

## 4) Helpers — fit & evaluate across degrees

In [None]:
def fit_poly_model(x, y, degree=1, reg=None, alpha=1.0):
    """Fit a polynomial regression pipeline on a single feature.

    The pipeline expands ``x`` into polynomial features (no bias column),
    standardizes them, and fits the selected linear estimator.

    Args:
        x: Feature values; must expose ``.reshape`` (it is reshaped to one
            column before fitting).
        y: Target values passed to ``Pipeline.fit``.
        degree: Degree given to ``PolynomialFeatures``.
        reg: ``None`` for plain least squares, ``'ridge'`` or ``'lasso'``.
        alpha: Regularization strength for Ridge/Lasso; unused when ``reg``
            is ``None``.

    Returns:
        The fitted ``Pipeline``.

    Raises:
        ValueError: If ``reg`` is not one of the supported values.
    """
    # Pick the final estimator first; everything else in the pipeline is shared.
    if reg is None:
        estimator_step = ('lin', LinearRegression())
    elif reg == 'ridge':
        estimator_step = ('ridge', Ridge(alpha=alpha, random_state=0))
    elif reg == 'lasso':
        estimator_step = ('lasso', Lasso(alpha=alpha, random_state=0, max_iter=20000))
    else:
        raise ValueError('reg must be None, "ridge", or "lasso"')

    pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
        ('scaler', StandardScaler()),
        estimator_step,
    ])
    pipeline.fit(x.reshape(-1, 1), y)
    return pipeline

def evaluate_model(model, x, y):
    """Score *model* on ``(x, y)``.

    Args:
        model: Object with a ``predict`` method accepting a single-column array.
        x: Feature values; must expose ``.reshape`` (reshaped to one column).
        y: True target values.

    Returns:
        Dict with keys ``'mse'``, ``'rmse'`` (square root of the MSE) and
        ``'r2'``.
    """
    predictions = model.predict(x.reshape(-1, 1))
    mean_sq_error = mean_squared_error(y, predictions)
    return {
        'mse': mean_sq_error,
        'rmse': np.sqrt(mean_sq_error),
        'r2': r2_score(y, predictions),
    }

def split_xy(df):
    """Extract feature/target arrays for the train, val and test splits.

    Args:
        df: DataFrame with columns ``x``, ``y`` and ``split``; rows are
            selected where ``split`` equals ``'train'``, ``'val'`` or
            ``'test'``.

    Returns:
        A 6-tuple ``(x_train, y_train, x_val, y_val, x_test, y_test)`` of the
        columns' ``.values`` arrays.
    """
    # Bracket access avoids any clash between the column name and DataFrame
    # attributes; the label column is fetched once and each mask built once.
    labels = df['split']
    arrays = []
    for split_name in ('train', 'val', 'test'):
        rows = df[labels == split_name]
        arrays.extend((rows['x'].values, rows['y'].values))
    return tuple(arrays)

## 5) Train degrees 1…15 (no regularization)

In [None]:
# Fit one unregularized polynomial model per degree on the training split and
# record its RMSE and R^2 on the train, validation and test splits.
degrees = list(range(1, 16))
x_tr, y_tr, x_va, y_va, x_te, y_te = split_xy(df)

models = {}   # degree -> fitted pipeline
results = []  # one metrics row per degree
for degree in degrees:
    fitted = fit_poly_model(x_tr, y_tr, degree=degree, reg=None)
    models[degree] = fitted
    # Evaluated in train, val, test order.
    train_scores, val_scores, test_scores = (
        evaluate_model(fitted, xs, ys)
        for xs, ys in ((x_tr, y_tr), (x_va, y_va), (x_te, y_te))
    )
    results.append({
        'degree': degree,
        'train_rmse': train_scores['rmse'],
        'val_rmse': val_scores['rmse'],
        'test_rmse': test_scores['rmse'],
        'train_r2': train_scores['r2'],
        'val_r2': val_scores['r2'],
        'test_r2': test_scores['r2'],
    })
res = pd.DataFrame(results)
res

## 6) Error vs degree plots

In [None]:
# One figure per metric (RMSE, then R2), each showing the train, validation
# and test curves against polynomial degree.
metric_plots = (
    (('train_rmse', 'val_rmse', 'test_rmse'), 'RMSE', 'RMSE vs Degree (no regularization)'),
    (('train_r2', 'val_r2', 'test_r2'), '$R^2$', 'R^2 vs Degree (no regularization)'),
)
curve_labels = ('Train', 'Validation', 'Test')

for columns, y_label, figure_title in metric_plots:
    plt.figure()
    for column, curve_label in zip(columns, curve_labels):
        plt.plot(res['degree'], res[column], marker='o', label=curve_label)
    plt.xlabel('Polynomial degree')
    plt.ylabel(y_label)
    plt.title(figure_title)
    plt.legend()
    plt.show()

## 7) Best degree by validation RMSE & residual plot

In [None]:
# idxmin() returns the index *label* of the smallest validation RMSE, so the
# row is fetched with label-based .loc. Positional .iloc would only agree with
# it while `res` keeps its default 0..n-1 index.
best_row = res.loc[res['val_rmse'].idxmin()]
best_degree = int(best_row['degree'])
print('Best degree (by Val RMSE):', best_degree)
best_model = models[best_degree]

# Residuals on train and validation: (actual - predicted) against predicted.
for split_name, (x, y) in {'train': (x_tr, y_tr), 'val': (x_va, y_va)}.items():
    yhat = best_model.predict(x.reshape(-1, 1))
    resid = y - yhat
    plt.figure()
    plt.scatter(yhat, resid)
    plt.axhline(0)  # zero-residual reference line
    plt.xlabel('Predicted y')
    plt.ylabel('Residuals')
    plt.title(f'Residuals — {split_name} (degree={best_degree})')
    plt.show()

# Fit curve visualization: predict on 400 evenly spaced x values spanning the
# dataset's x range and overlay the curve on the train and validation points.
x_grid = np.linspace(df['x'].min(), df['x'].max(), 400)
y_grid = best_model.predict(x_grid.reshape(-1, 1))
plt.figure()
plt.scatter(x_tr, y_tr, alpha=0.6, label='train')
plt.scatter(x_va, y_va, alpha=0.6, label='val')
plt.plot(x_grid, y_grid, linewidth=2, label=f'fit degree {best_degree}')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Best model fit')
plt.legend()
plt.show()

## 8) Regularization sweep (Ridge/Lasso) [optional]
Try different `alpha` values to stabilize high-degree polynomials.

In [None]:
def sweep_regularization(kind='ridge', degree=12, alphas=(0.01,0.1,1,10,100)):
    """Fit a regularized polynomial model per alpha and compare RMSEs.

    Uses the notebook-level arrays ``x_tr``/``y_tr`` for fitting and
    ``x_va``/``y_va``/``x_te``/``y_te`` for evaluation. The results table is
    displayed and plotted against alpha on a log-scaled x axis.

    Args:
        kind: Regularizer name forwarded to ``fit_poly_model`` as ``reg``.
        degree: Polynomial degree used for every fit.
        alphas: Regularization strengths to try, in order.

    Returns:
        DataFrame with columns ``alpha``, ``train_rmse``, ``val_rmse`` and
        ``test_rmse``, one row per alpha.
    """
    records = []
    for alpha in alphas:
        fitted = fit_poly_model(x_tr, y_tr, degree=degree, reg=kind, alpha=alpha)
        # Evaluated in train, val, test order.
        train_scores, val_scores, test_scores = (
            evaluate_model(fitted, xs, ys)
            for xs, ys in ((x_tr, y_tr), (x_va, y_va), (x_te, y_te))
        )
        records.append({
            'alpha': alpha,
            'train_rmse': train_scores['rmse'],
            'val_rmse': val_scores['rmse'],
            'test_rmse': test_scores['rmse'],
        })
    out = pd.DataFrame(records)
    display(out)

    plt.figure()
    curves = (
        ('train_rmse', 'Train'),
        ('val_rmse', 'Validation'),
        ('test_rmse', 'Test'),
    )
    for column, curve_label in curves:
        plt.plot(out['alpha'], out[column], marker='o', label=curve_label)
    plt.xscale('log')
    plt.xlabel('alpha (log scale)')
    plt.ylabel('RMSE')
    plt.title(f'{kind.title()} regularization — degree {degree}')
    plt.legend()
    plt.show()
    return out

# Example (uncomment to run):
# _ = sweep_regularization(kind='ridge', degree=12)
# _ = sweep_regularization(kind='lasso', degree=12)

## 9) Conclusions
- Discuss the degree at which validation error is minimized and how that compares with the train and test errors at the same degree.
- Explain bias (underfitting at low degrees) vs variance (overfitting at high degrees).
- Reflect on the effect of regularization (if used).
