# Polynomial regression

## Task

- Load dataset with sizes and prices
- Explore and display the data
- Create a linear model and find out its R2
- Create polynomial models with different degrees, find their R2
- Select the best model 

# Data loading

In [None]:
# Scatter plot of the raw data: one point per row, size on x and price on y.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['size'], data['price'], alpha=0.6)
ax.set_xlabel('Size')
ax.set_ylabel('Price')
ax.set_title('Size vs Price')
ax.grid(True)
plt.show()

In [None]:
# Display descriptive statistics of the loaded data.
data.describe()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the price dataset; the file uses ';' as its field separator.
data = pd.read_csv(
    "../dataset/prices.csv",
    sep=';',
)

# Linear model
- standardize the data
- create and train a linear model
- display model parameters
- visualize the linear model
- display model scores

In [None]:
def print_model_score(Y, Y_predicted, label):
    """Print the R2 score and RMSE of a set of predictions.

    Args:
        Y: Ground-truth target values.
        Y_predicted: Model predictions for the same samples as ``Y``.
        label: Text placed in front of each printed metric (e.g. "Train").

    Both metrics are printed with four decimal places; nothing is returned.
    """
    r2 = r2_score(Y, Y_predicted)

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(Y, Y_predicted)
    rmse = np.sqrt(mse)

    print(f"{label} R2 score: {r2:.4f}")
    print(f"{label} RMSE: {rmse:.4f}")

# Predict on the held-out split first so both score reports can be printed together.
Y_test_pred = lin_model.predict(X_test_reshaped)

print("Linear Model Scores:")
print_model_score(Y_train, Y_train_pred, "Train")
print_model_score(Y_test, Y_test_pred, "Test")

In [None]:
# Predictions of the fitted linear model on the training features.
Y_train_pred = lin_model.predict(X_train_reshaped)

# Plot the fitted line against the actual training targets.
visualize_model_2d(
    X_train,
    Y_train,
    Y_train_pred,
    title="Linear Model - Training Data",
)

In [None]:
def visualize_model_2d(X, Y, Y_predicted, title="Model", x_label="Size", y_label="Price"):
    """Plot actual targets and model predictions against a single feature.

    The samples are sorted by feature value so the prediction is drawn as a
    continuous line from left to right; actual values are drawn as markers.

    Args:
        X: Feature values, one per sample. Any array-like is accepted,
            including lists, pandas Series and ``(n, 1)`` column vectors.
        Y: Actual target values, same number of samples as ``X``.
        Y_predicted: Predicted target values, same number of samples as ``X``.
        title: Figure title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.

    Raises:
        ValueError: If the three inputs do not hold the same number of values.
    """
    # Flatten everything to 1-D float arrays so column vectors (such as the
    # reshaped inputs fed to scikit-learn) and plain lists work as well.
    x_values = np.ravel(np.asarray(X, dtype=float))
    actual = np.ravel(np.asarray(Y, dtype=float))
    predicted = np.ravel(np.asarray(Y_predicted, dtype=float))

    if not (x_values.size == actual.size == predicted.size):
        raise ValueError(
            "X, Y and Y_predicted must contain the same number of values, got "
            f"{x_values.size}, {actual.size} and {predicted.size}"
        )

    # Sort by feature value; otherwise the line plot would zig-zag.
    order = np.argsort(x_values)
    x_sorted = x_values[order]

    plt.figure(figsize=(10, 6))
    plt.plot(x_sorted, predicted[order], c='b', label="Prediction", linewidth=2)
    plt.scatter(x_sorted, actual[order], marker='x', c='r', label="Actual", alpha=0.6)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Turn each feature array into a single-column 2-D array before handing it to the model.
X_train_reshaped, X_test_reshaped = (
    features.reshape(-1, 1) for features in (X_train, X_test)
)

# fit() returns the estimator itself, so construction and training can be chained.
lin_model = LinearRegression().fit(X_train_reshaped, Y_train)

print("Linear Model Parameters:")
print(f"Bias (intercept): {lin_model.intercept_:.4f}")
print(f"Weight (coefficient): {lin_model.coef_[0]:.4f}")

In [None]:
# Hold out 25% of the standardized samples for testing; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_std,
    Y_std,
    test_size=0.25,
    random_state=42,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
def _standardize(values):
    """Return ``(z_scores, mean, std)`` for ``values``."""
    mean = values.mean()
    std = values.std()
    return (values - mean) / std, mean, std


# NOTE(review): the statistics are computed on the full X and Y arrays; if the
# train/test split happens afterwards, the test samples influence the scaling.
# Confirm this is acceptable for the exercise.
X_std, mean_X, std_X = _standardize(X)
Y_std, mean_Y, std_Y = _standardize(Y)

print(f"X standardized - mean: {X_std.mean():.4f}, std: {X_std.std():.4f}")
print(f"Y standardized - mean: {Y_std.mean():.4f}, std: {Y_std.std():.4f}")

In [None]:
# Pull the feature (size) and the target (price) out of the DataFrame as arrays.
X, Y = (data[column].values for column in ('size', 'price'))

print(f"X shape: {X.shape}")
print(f"Y shape: {Y.shape}")

# Create a polynomial regression model
- create 1D polynomial models with degrees 2 through 8
- display models
- display their parameters
- show their scores
- Which model would you choose?

In [None]:
# Pick the row with the highest test R2. idxmax() returns an index *label*,
# so the row is fetched with .loc (label-based) rather than .iloc (positional);
# the two only coincide while results_df keeps its default integer index.
best_test_r2_idx = results_df['test_r2'].idxmax()
best_model = results_df.loc[best_test_r2_idx]

print("="*70)
print("BEST MODEL SELECTION")
print("="*70)
print("\nBest model based on Test R2 score:")
# A row that mixes the integer degree with float metrics comes back as a
# float Series, so cast the degree back to int to print "3" instead of "3.0".
print(f"  Polynomial Degree: {int(best_model['degree'])}")
print(f"  Train R2: {best_model['train_r2']:.4f}")
print(f"  Test R2: {best_model['test_r2']:.4f}")
print(f"  Train RMSE: {best_model['train_rmse']:.4f}")
print(f"  Test RMSE: {best_model['test_rmse']:.4f}")

print("\n" + "="*70)
print("OBSERVATIONS:")
print("="*70)
print("- Higher degree polynomials fit training data better (higher Train R2)")
print("- Watch for overfitting: when Train R2 >> Test R2")
print("- Best model balances good fit with generalization to test data")
print("- Too low degree = underfitting (poor fit)")
print("- Too high degree = overfitting (poor generalization)")
print("="*70)

In [None]:
# Side-by-side comparison of train/test R2 (left) and RMSE (right) per degree.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Take the tick positions from the evaluated degrees instead of hard-coding
# the range, so the plot stays correct if the set of degrees changes.
degrees = list(results_df['degree'])

# (axis, column suffix, legend label, y-axis label) for each panel.
panels = (
    (axes[0], 'r2', 'R2', 'R2 Score'),
    (axes[1], 'rmse', 'RMSE', 'RMSE'),
)

for axis, suffix, metric_label, y_label in panels:
    axis.plot(degrees, results_df[f'train_{suffix}'], marker='o', label=f'Train {metric_label}', linewidth=2)
    axis.plot(degrees, results_df[f'test_{suffix}'], marker='s', label=f'Test {metric_label}', linewidth=2)
    axis.set_xlabel('Polynomial Degree')
    axis.set_ylabel(y_label)
    axis.set_title(f'{y_label} vs Polynomial Degree')
    axis.legend()
    axis.grid(True)
    axis.set_xticks(degrees)

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

# Collect the per-degree metric dictionaries into one table.
results_df = pd.DataFrame(results)

# Print the table framed by horizontal rules, one item per line.
print(
    "="*70,
    "POLYNOMIAL REGRESSION MODELS COMPARISON",
    "="*70,
    results_df.to_string(index=False),
    "="*70,
    sep="\n",
)

# Model Comparison and Selection

In [None]:
# One metrics dictionary per polynomial degree, filled in by the loop below.
results = []

for degree in range(2, 9):
    # Fit on the training split only, then plot the curve over both splits.
    p = polynomial_fit(X_train, Y_train, degree)
    visualize_polynomial(p, X_train, Y_train, X_test, Y_test, degree)

    print(f"\n{'='*60}")
    print(f"Polynomial Degree {degree}")
    print(f"{'='*60}")
    print(f"Coefficients: {p.coef}")

    # Evaluate the fitted polynomial on both splits.
    Y_train_pred_poly = p(X_train)
    Y_test_pred_poly = p(X_test)

    train_r2 = r2_score(Y_train, Y_train_pred_poly)
    train_mse = mean_squared_error(Y_train, Y_train_pred_poly)
    train_rmse = np.sqrt(train_mse)

    test_r2 = r2_score(Y_test, Y_test_pred_poly)
    test_mse = mean_squared_error(Y_test, Y_test_pred_poly)
    test_rmse = np.sqrt(test_mse)

    print_model_score(Y_train, Y_train_pred_poly, "Train")
    print_model_score(Y_test, Y_test_pred_poly, "Test")

    # Keep the metrics so the degrees can be compared afterwards.
    results.append(dict(
        degree=degree,
        train_r2=train_r2,
        train_rmse=train_rmse,
        test_r2=test_r2,
        test_rmse=test_rmse,
    ))

In [None]:
def visualize_polynomial(p, X_train, Y_train, X_test, Y_test, degree, x_label="Size", y_label="Price"):
    """Plot a fitted polynomial over the training and test samples.

    Args:
        p: Callable evaluating the polynomial on an array of x values.
        X_train: Training feature values (must provide ``min()``/``max()``).
        Y_train: Training target values.
        X_test: Test feature values (must provide ``min()``/``max()``).
        Y_test: Test target values.
        degree: Polynomial degree, used only in the legend and the title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
    """
    plt.figure(figsize=(12, 6))
    plt.scatter(X_train, Y_train, c='r', label="Train", alpha=0.6)
    plt.scatter(X_test, Y_test, c='b', label="Test", alpha=0.6)

    # Sample the curve on a fixed number of points spanning the full data
    # range. Unlike a fixed 0.01 step this includes the right-most sample and
    # does not depend on the scale of the feature.
    lower = min(X_train.min(), X_test.min())
    upper = max(X_train.max(), X_test.max())
    curve_x = np.linspace(lower, upper, 500)
    plt.plot(curve_x, p(curve_x), label=f"Polynomial degree {degree}", linewidth=2, color='green')

    plt.title(f"Polynomial Regression - Degree {degree}")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def polynomial_fit(X, Y, degree=1):
    """Fit a least-squares polynomial of the given degree to the points (X, Y).

    Args:
        X: x-coordinates of the sample points.
        Y: y-coordinates of the sample points.
        degree: Degree of the fitted polynomial (defaults to 1, a straight line).

    Returns:
        A ``np.poly1d`` object; call it to evaluate the polynomial, or read
        ``.coef`` for the coefficients (highest power first).
    """
    coefficients = np.polyfit(X, Y, deg=degree)
    return np.poly1d(coefficients)