# Polynomial Regression

The goal of this notebook is to empirically find the best polynomial regression by testing different degrees. 

In [163]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [164]:
# Import data
df = pd.read_csv("cleaned_data.csv")

df

Unnamed: 0,Time,Cases,Close,old_Close
0,7,1,4315.410156,4352.100098
1,8,1,4322.740234,4324.770020
2,9,1,4300.080078,4344.259766
3,12,1,4041.560059,4246.680176
4,13,1,3294.219971,4323.930176
...,...,...,...,...
377,735,464394,5274.229980,5494.209961
378,736,470665,5251.640137,5525.129883
379,737,475431,5227.370117,5356.509766
380,740,489870,5301.799805,5126.620117


In [165]:
"""
Selecting the features(X) and the label(y)
    Label: "Close"
    Features: "Time", "Cases", "old_Close"
"""

y = df["Close"].to_numpy()
X = df[["Time","Cases","old_Close"]].to_numpy().reshape(-1,3)

In [166]:
# Split the dataset into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42) #The answer

# Define a list of polynomial degrees
degrees = list(range(1,10))

# These variables are for storing the errors for each polynomial degree
tr_errors = []
val_errors = []

# Loop over the degrees
for i, degree in enumerate(degrees):
    lin_regr = LinearRegression(fit_intercept=False)

    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    lin_regr.fit(X_train_poly, y_train)

    y_pred_train = lin_regr.predict(X_train_poly)
    tr_error = mean_squared_error(y_train, y_pred_train)
    
    X_val_poly = poly.transform(X_val)
    y_pred_val = lin_regr.predict(X_val_poly)
    val_error = mean_squared_error(y_val, y_pred_val)

    tr_errors.append(tr_error)
    val_errors.append(val_error)

    tr_errors.append(tr_error)
    val_errors.append(val_error)



In [167]:
print("{:<10s}{:<20s}{:<20s}".format("Degree", "Training error", "Validation error"))
for i in range(len(degrees)):
    print("{:<10d}{:<20.0f}{:<20.0f}".format(degrees[i], tr_errors[i], val_errors[i]))

Degree    Training error      Validation error    
1         106704              107973              
2         106704              107973              
3         89336               93781               
4         89336               93781               
5         81582               89280               
6         81582               89280               
7         12802275            17613947            
8         12802275            17613947            
9         54569172            149943247           
