# Lasso regression with default parameters

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# Import X and y (pre-processed data) from csv files

X = pd.read_csv('X.csv', index_col = 'Unnamed: 0')
y = pd.read_csv('y.csv', index_col = 'Unnamed: 0')

# For polynomial regression, choose 5 columns that were originally
# numerical data (scaled by ScandardScaler())
# Target variable y is scaled, but retains (+/-) sign, denoting whether
# flight arrived early (-) or late (+)

X = X[['month', 'crs_dep_timeFT', 'crs_arr_timeFT',
      'crs_elapsed_timeFT', 'distanceFT']]
y = y['yFT']

print(X.shape)
print(y.shape)

(9970, 5)
(9970,)


In [4]:
# Split the training data into sub-splits of training and 
# testing data (70% training, 30% testing)

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 1)

In [5]:
# Fit the training data into a lasso regression model

lasso = Lasso()
lasso.fit(X_train, y_train)
y_lasso = lasso.predict(X_test)

In [6]:
# Calculate the R-squared score and RMSE (root mean square error) to
# determine how well the ridge regression performed

print('RMSE: ', mean_squared_error(y_test, y_lasso))
print('R-squared score: ', r2_score(y_test, y_lasso))

RMSE:  1.1054069182698258
R-squared score:  -2.2145109388471695e-06


# The R-squared score is negative; this model performs slightly worse than random guessing. I will abandon this model.