# Ridge regression with default parameters

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error

In [4]:
# Import X and y (pre-processed data) from csv files

X = pd.read_csv('X.csv', index_col = 'Unnamed: 0')
y = pd.read_csv('y.csv', index_col = 'Unnamed: 0')

# For polynomial regression, choose 5 columns that were originally
# numerical data (scaled by ScandardScaler())
# Target variable y is scaled, but retains (+/-) sign, denoting whether
# flight arrived early (-) or late (+)

X = X[['month', 'crs_dep_timeFT', 'crs_arr_timeFT',
      'crs_elapsed_timeFT', 'distanceFT']]
y = y['yFT']

print(X.shape)
print(y.shape)

(9970, 5)
(9970,)


In [5]:
# Split the training data into sub-splits of training and 
# testing data (70% training, 30% testing)

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 1)

In [7]:
# Fit the training data into a ridge regression model

rr = Ridge()
rr.fit(X_train, y_train)
y_rr = rr.predict(X_test)

In [8]:
# Calculate the R-squared score and RMSE (root mean square error) to
# determine how well the ridge regression performed

print('RMSE: ', mean_squared_error(y_test, y_rr))
print('R-squared score: ', r2_score(y_test, y_rr))

RMSE:  1.0941410565319378
R-squared score:  0.010189404973310001


# The R-squared score is very small; this model performs only slightly better than random guessing. I will abandon this model.