In [1]:
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

In [2]:
flights_train = pd.read_csv('data/flights_train.csv', index_col=0)
flights_test = pd.read_csv('data/flights_test.csv', index_col=0)

In [3]:
flights_clean = harness.clean_train(flights_train)

In [4]:
add_features = harness.chain(
    harness.add_date_parts, harness.add_haul, harness.add_weather
)

In [5]:
flights_with_features = add_features(flights_clean)

In [6]:
x_transform = harness.chain(
    add_features,
    [harness.add_all_grouped_stats, flights_with_features],
    harness.only_numeric
)

In [7]:
transformer = harness.DataTransformer(
    x_transform, harness.normalize, harness.unnormalize
)

In [8]:
x_train, y_train = transformer.extract_transform(flights_clean)

In [9]:
def train_model():
    linreg = lm.Ridge(random_state=42)
    grid_search = ms.GridSearchCV(
        linreg, dict(alpha=[10 ** i for i in range(-6, 7)])
    )
    grid_search.fit(x_train.values, y_train.values)
    harness.save(grid_search.best_estimator_, 'time_of_day_model')

In [10]:
train_model()

Traceback (most recent call last):
  File "/home/meamer/anaconda3/envs/midterm/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/meamer/anaconda3/envs/midterm/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py", line 762, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "/home/meamer/anaconda3/envs/midterm/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py", line 542, in fit
    X, y = self._validate_data(X, y,
  File "/home/meamer/anaconda3/envs/midterm/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/meamer/anaconda3/envs/midterm/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/home/meamer/anaconda3/envs/midterm/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
linreg = harness.load('time_of_day_model')
linreg_model = harness.TrainedModel(linreg, transformer)

In [None]:
linreg.coef_

In [None]:
linreg_model.validate(harness.clean_train(flights_test))

In [None]:
linreg_model.submit(
    'data/test.csv', 'time_of_day_submission.csv', 'predicted_delay'
)

So this model is pretty bad. It's worse than just assuming the average delay. We can see why looking at the graphs below. Time of day predicts taxi time well, but this doesn't translate well into actual delays measurements.

In [None]:
sns.scatterplot(x=flights_clean.crs_dep_time, y=y_train.arr_delay, alpha=0.05)
plt.show()

In [None]:
sns.scatterplot(x=flights_clean.crs_arr_time, y=y_train.arr_delay, alpha=0.05)
plt.show()