# Ridge regression with default parameters

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error

In [7]:
# Import X and y (pre-processed data) from csv files

X = pd.read_csv('X.csv', index_col = 'Unnamed: 0')
y = pd.read_csv('y.csv', index_col = 'Unnamed: 0')

# Target variable y is scaled, but retains (+/-) sign, denoting whether
# flight arrived early (-) or late (+)

y = y['yFT']

print(X.shape)
print(y.shape)

(9970, 657)
(9970,)


In [4]:
# Split the training data into sub-splits of training and 
# testing data (70% training, 30% testing)

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 1)

In [5]:
# Fit the training data into a ridge regression model

rr = Ridge()
rr.fit(X_train, y_train)
y_rr = rr.predict(X_test)

In [6]:
# Calculate the R-squared score and RMSE (root mean square error) to
# determine how well the ridge regression performed

print('RMSE: ', mean_squared_error(y_test, y_rr))
print('R-squared score: ', r2_score(y_test, y_rr))

RMSE:  1.1383162186882478
R-squared score:  -0.02977348946182956


# Now try again with 100000 rows of data

In [9]:
X = pd.read_csv('X100k.csv', index_col = 'Unnamed: 0', compression = 'gzip')
y = pd.read_csv('y100k.csv', index_col = 'Unnamed: 0', compression = 'gzip')
y = y['yFT']
print(X.shape)
print(y.shape)

(98047, 771)
(98047,)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 1)

rr = Ridge()
rr.fit(X_train, y_train)
y_rr = rr.predict(X_test)

print('Training score: ', rr.score(X_train, y_train))
print('R-squared test score: ', r2_score(y_test, y_rr))

Training score:  0.029931457845720932
R-squared test score:  0.006466164364790972
