# Polynomial linear regression with degree = 2 and features = 5

In [53]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [54]:
# Import X and y (pre-processed data) from csv files

X = pd.read_csv('X.csv', index_col = 'Unnamed: 0')
y = pd.read_csv('y.csv', index_col = 'Unnamed: 0')

# For polynomial regression, choose 5 columns that were originally
# numerical data (scaled by ScandardScaler())
# Target variable y is scaled, but retains (+/-) sign, denoting whether
# flight arrived early (-) or late (+)

X = X[['month', 'crs_dep_timeFT', 'crs_arr_timeFT',
      'crs_elapsed_timeFT', 'distanceFT']]
y = y['yFT']

print(X.shape)
print(y.shape)

(9970, 5)
(9970,)


In [56]:
# First try polynomial degree = 2

X = np.array(X)
y = np.array(y)
pf = PolynomialFeatures(2)
X = pf.fit_transform(X)
print(X.shape)

(9970, 253)


In [67]:
# Split the training data into sub-splits of training and 
# testing data (70% training, 30% testing)

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 1)

In [68]:
# Fit the polynomial feature-transformed data as a linear regression model

lr = LinearRegression()
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)

In [69]:
# Calculate the R-squared score and RMSE (root mean square error) to
# determine how well the polynomial linear regression performed

print('RMSE: ', mean_squared_error(y_test, y_lr))
print('R-squared score: ', r2_score(y_test, y_lr))

RMSE:  1.1176475599746347
R-squared score:  -0.011075665029054438


# The R-squared score is negative, so this model performs worse than random guessing. I am abandoning it.