In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Read the dataset and separate the predictor from the response.
df = pd.read_csv("polynomial_regression.csv")
# X stays two-dimensional (a one-column DataFrame); Y is the matching one-dimensional Series.
X, Y = df[['x']], df['y']

In [None]:
# Scatter the raw observations to inspect the x-y relationship before modelling.
plt.plot(X, Y, 'o', markersize=3)
plt.xlabel("x")
plt.ylabel("y")
plt.title("Data")
# Render explicitly, as the other plotting cells do; otherwise the cell's last
# expression echoes the Text object returned by plt.title.
plt.show()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Settings for the repeated-subsampling experiment.
degrees = range(1, 11)   # polynomial degrees to compare: 1 through 10
num_samples = 30         # number of subsamples drawn
sample_size = 20         # rows per subsample

# Seed NumPy's global generator, which the np.random.choice calls in later cells draw from.
np.random.seed(42)

In [None]:
# One independent, initially empty list per degree; results are appended to these lists.
test_errors_by_degree = dict((deg, []) for deg in degrees)
train_minus_test_errors_by_degree = dict((deg, []) for deg in degrees)

In [None]:
# Repeated subsampling: every round draws a fresh training subsample, fits one
# polynomial regression per degree on it, and scores it on the fixed test split.
for _ in range(num_samples):
    # Row positions drawn without replacement, so a subsample has no duplicate rows.
    idx = np.random.choice(len(X_train), size=sample_size, replace=False)
    X_sample, Y_sample = X_train.iloc[idx], Y_train.iloc[idx]

    for degree in degrees:
        # Fit the feature expansion on the subsample, then apply the same expansion to the test set.
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_sample_poly = poly.fit_transform(X_sample)
        X_test_poly = poly.transform(X_test)

        model = LinearRegression().fit(X_sample_poly, Y_sample)
        Y_sample_pred = model.predict(X_sample_poly)
        Y_test_pred = model.predict(X_test_poly)

        train_error = mean_squared_error(Y_sample, Y_sample_pred)
        test_error = mean_squared_error(Y_test, Y_test_pred)

        # Record the test MSE and the train-minus-test gap for this degree.
        test_errors_by_degree[degree].append(test_error)
        train_minus_test_errors_by_degree[degree].append(train_error - test_error)

In [None]:
def _to_long_frame(values_by_degree, value_column):
    """Flatten a {degree: [values]} mapping into one row per (degree, value) pair.

    The result has a "Degree" column and a column named ``value_column``. Rows follow
    the mapping's iteration order, then the order of each list.
    """
    rows = []
    for deg, values in values_by_degree.items():
        for value in values:
            rows.append({"Degree": deg, value_column: value})
    return pd.DataFrame(rows)


# Long-format frames: one row per recorded error, labelled with its degree.
test_error_df = _to_long_frame(test_errors_by_degree, "Test Error")

train_minus_test_df = _to_long_frame(train_minus_test_errors_by_degree, "Train - Test Error")

In [None]:
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)

# Distribution of the test MSE across subsamples, one violin per degree.
# The sample count in the title is read from num_samples so it cannot drift from
# the experiment settings.
plt.figure(figsize=(12, 4))
sns.violinplot(data=test_error_df, x="Degree", y="Test Error")
plt.title(f"Degree vs Test Error ({num_samples} Samples)")
plt.tight_layout()
plt.show()

# Distribution of (train MSE - test MSE) across subsamples, one violin per degree.
plt.figure(figsize=(12, 6))
sns.violinplot(data=train_minus_test_df, x="Degree", y="Train - Test Error")
plt.title(f"Degree vs (Train Error - Test Error) ({num_samples} Samples)")
plt.tight_layout()
plt.show()

In [None]:
# Low and high degrees are drawn in separate panels, each with its own y-axis.
# The split point is defined once and the panel titles are derived from it, so the
# titles always describe the degrees that are actually plotted.
split_degree = 5
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# (frame, y column, label used in the title) for each row of the grid.
panels = [
    (test_error_df, "Test Error", "Test Error"),
    (train_minus_test_df, "Train - Test Error", "(Train Error - Test Error)"),
]

for row, (frame, column, label) in enumerate(panels):
    # Outer bounds for the titles come from the data itself.
    min_degree, max_degree = frame["Degree"].min(), frame["Degree"].max()

    # Left column: degrees up to and including the split point.
    sns.violinplot(
        data=frame[frame["Degree"] <= split_degree],
        x="Degree", y=column, ax=axes[row, 0]
    )
    axes[row, 0].set_title(f"Degree ({min_degree}–{split_degree}) vs {label}")

    # Right column: degrees above the split point.
    sns.violinplot(
        data=frame[frame["Degree"] > split_degree],
        x="Degree", y=column, ax=axes[row, 1]
    )
    axes[row, 1].set_title(f"Degree ({split_degree + 1}–{max_degree}) vs {label}")

plt.tight_layout()
plt.show()

In [None]:
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
fig, axes = plt.subplots(2, 3, figsize=(18, 8))

# Columns: three consecutive degree bands (both bounds inclusive).
degree_bands = [(1, 4), (5, 8), (9, 10)]
# Rows: (frame, y column, label used in the title) for each error measure.
measures = [
    (test_error_df, "Test Error", "Test Error"),
    (train_minus_test_df, "Train - Test Error", "(Train Error - Test Error)"),
]

for row, (frame, column, label) in enumerate(measures):
    for col, (low, high) in enumerate(degree_bands):
        in_band = frame["Degree"].between(low, high)
        sns.violinplot(data=frame[in_band], x="Degree", y=column, ax=axes[row, col])
        axes[row, col].set_title(f"Degree ({low}–{high}) vs {label}")

plt.tight_layout()
plt.show()

In [None]:
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
fig, axes = plt.subplots(2, 5, figsize=(22, 10))
axes = axes.flatten()  # 2x5 grid -> flat sequence of 10 axes

degrees = range(1, 11)

# One panel per degree, so every violin is drawn against its own y-axis.
for idx, (ax, deg) in enumerate(zip(axes, degrees)):
    one_degree = test_error_df[test_error_df["Degree"] == deg]
    sns.violinplot(data=one_degree, x="Degree", y="Test Error", ax=ax)
    ax.set_title(f"Degree {deg} - Test Error")
    # The title already names the degree, so the x label and tick are dropped.
    ax.set_xlabel("")
    ax.set_xticks([])

plt.tight_layout()
plt.show()

In [None]:
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
fig, axes = plt.subplots(2, 5, figsize=(22, 10))
axes = axes.flatten()  # 2x5 grid -> flat sequence of 10 axes

degrees = range(1, 11)

# One panel per degree, so every violin is drawn against its own y-axis.
for idx, (ax, deg) in enumerate(zip(axes, degrees)):
    one_degree = train_minus_test_df[train_minus_test_df["Degree"] == deg]
    sns.violinplot(data=one_degree, x="Degree", y="Train - Test Error", ax=ax)
    ax.set_title(f"Degree {deg} - Train - Test Error")
    # The title already names the degree, so the x label and tick are dropped.
    ax.set_xlabel("")
    ax.set_xticks([])

plt.tight_layout()
plt.show()

In [None]:
# Draw one more subsample, used for the cross-validation study.
# The size is taken from sample_size rather than a repeated literal, so it stays in
# step with the resampling experiment. The draw uses NumPy's global generator, so
# the rows picked depend on the draws made before this cell runs.
idx = np.random.choice(len(X_train), size=sample_size, replace=False)
X_sample = X_train.iloc[idx]
Y_sample = Y_train.iloc[idx]

In [None]:
# Score every candidate degree with 5-fold cross-validation on the subsample.
mean_cv_scores = []
for degree in degrees:
    pipeline = make_pipeline(
        PolynomialFeatures(degree, include_bias=False),
        LinearRegression(),
    )
    scores = cross_val_score(
        pipeline, X_sample, Y_sample,
        cv=5, scoring='neg_mean_squared_error',
    )
    mean_cv_scores.append(scores.mean())

# The scores are negated MSE, so the largest mean corresponds to the smallest error.
best_position = np.argmax(mean_cv_scores)
best_degree = degrees[best_position]
print(f"Best degree from 5-fold CV on sample: {best_degree}")

In [None]:
# Refit on the whole subsample at the CV-selected degree, then score on the test split.
final_poly = PolynomialFeatures(degree=best_degree, include_bias=False)
X_sample_poly = final_poly.fit_transform(X_sample)
X_test_poly = final_poly.transform(X_test)

model = LinearRegression().fit(X_sample_poly, Y_sample)
Y_test_pred = model.predict(X_test_poly)

final_test_error = mean_squared_error(Y_test, Y_test_pred)
print(f"Test error of best degree model from CV: {final_test_error:.4f}")

In [None]:
# Candidate regularization strengths: 100 values evenly spaced on a log scale from 1e-3 to 1e3.
alphas = np.logspace(start=-3, stop=3, num=100)

In [None]:
# Polynomial expansion -> standardization -> ridge regression, with alpha picked
# from `alphas` by 10-fold cross-validation on negated MSE.
ridge_pipeline = make_pipeline(
    PolynomialFeatures(degree=best_degree, include_bias=False),
    StandardScaler(),
    RidgeCV(alphas=alphas, cv=10, scoring='neg_mean_squared_error')
)

ridge_pipeline.fit(X_train, Y_train)

# Predict on the test split once and reuse the result for both metrics.
ridge_test_pred = ridge_pipeline.predict(X_test)
ridge_test_error = mean_squared_error(Y_test, ridge_test_pred)
ridge_r2_score = r2_score(Y_test, ridge_test_pred)

# make_pipeline names each step after its lowercased class name.
best_ridge_model = ridge_pipeline.named_steps['ridgecv']
print(f"Ridge best alpha: {best_ridge_model.alpha_:.4f}, Test error: {ridge_test_error:.4f}, R^2: {ridge_r2_score:.4f}")

In [None]:
# Polynomial expansion -> standardization -> lasso regression, with alpha picked
# from `alphas` by 10-fold cross-validation.
lasso_pipeline = make_pipeline(
    PolynomialFeatures(degree=best_degree, include_bias=False),
    StandardScaler(),
    LassoCV(alphas=alphas, cv=10, max_iter=100000)
)

lasso_pipeline.fit(X_train, Y_train)

# Predict on the test split once and reuse the result for both metrics.
lasso_test_pred = lasso_pipeline.predict(X_test)
lasso_test_error = mean_squared_error(Y_test, lasso_test_pred)
lasso_r2_score = r2_score(Y_test, lasso_test_pred)

# make_pipeline names each step after its lowercased class name.
best_lasso_model = lasso_pipeline.named_steps['lassocv']
print(f"Lasso best alpha: {best_lasso_model.alpha_:.4f}, Test error: {lasso_test_error:.4f}, R^2: {lasso_r2_score:.4f}")