In [14]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

In [2]:
# Load the train and test flight tables through the project's harness reader.
# Both paths are relative — presumably resolved against the kernel's working
# directory, so the notebook has to be started next to the data/ folder.
flights_train = harness.read_flights('data/flights_train.csv')
flights_test = harness.read_flights('data/flights_test.csv')

In [3]:
# Apply the training-set cleaning step and bind the result to a new name.
# NOTE(review): confirm harness.clean_train does not modify flights_train in
# place; otherwise re-running this cell would clean already-cleaned data.
flights_clean = harness.clean_train(flights_train)

In [4]:
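# NOTE: disabled alternative feature pipeline, kept for reference only.
# `add_features` is not referenced by any later cell in this notebook.
# add_features = harness.chain(
#     harness.add_date_parts, harness.add_haul,
#     harness.add_hour, harness.make_all_dummies,
# )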

In [5]:
# Steps handed to harness.chain, in the order they are passed. The second
# entry is a [function, argument] pair rather than a bare callable.
# NOTE(review): presumably chain calls make_weather_dummies with
# flights_clean as an extra argument — confirm against harness.chain.
feature_steps = (
    harness.keep_only_test_columns,
    [harness.make_weather_dummies, flights_clean],
    harness.only_numeric,
    harness.scale,
)
x_transform = harness.chain(*feature_steps)

In [6]:
# Bundle the feature pipeline with harness.remove_early into one transformer.
# This object is reused below for extracting the training matrices, for the
# grid-search scoring function, and for wrapping the loaded model.
# NOTE(review): the second argument is presumably the target/row-side step
# (dropping early flights) — confirm against harness.DataTransformer.
transformer = harness.DataTransformer(
    x_transform, harness.remove_early
)

In [7]:
# Split the cleaned training data into a feature/target pair. Later cells use
# x_train.columns and the .values of both objects, so they are treated as
# pandas-like containers (presumably DataFrames — confirm in harness).
x_train, y_train = transformer.extract_transform(flights_clean)

In [8]:
def train_model(alphas=None, model_name='weather_model'):
    """Grid-search a Ridge regression over ``alpha`` and save the best fit.

    Reads the notebook-level ``x_train``, ``y_train`` and ``transformer`` at
    call time, so the cells that define them must have been run first.

    Parameters
    ----------
    alphas : iterable of float, optional
        Candidate regularisation strengths to try. Defaults to the powers
        of ten from 1e-6 to 1e6 inclusive.
    model_name : str, optional
        Name passed to ``harness.save`` for the winning estimator. Defaults
        to ``'weather_model'``, the name the notebook later passes to
        ``harness.load``.

    Returns
    -------
    None
        The best estimator is handed to ``harness.save`` rather than
        returned.
    """
    if alphas is None:
        alphas = [10 ** i for i in range(-6, 7)]

    ridge = lm.Ridge(random_state=42)
    search = ms.GridSearchCV(
        ridge,
        {'alpha': list(alphas)},
        # Candidates are ranked with the transformer's own scorer instead
        # of the estimator's default score.
        scoring=transformer.score_transformed,
    )
    # Fit on the underlying arrays (``.values``) rather than the labelled
    # pandas-like objects.
    search.fit(x_train.values, y_train.values)
    harness.save(search.best_estimator_, model_name)

In [9]:
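# Disabled so a full re-run does not refit: the next cell loads the estimator
# that train_model() saved as 'weather_model'. Uncomment and run once if no
# saved model exists yet or the features above have changed.
# train_model()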

In [10]:
# Reload the estimator stored under 'weather_model' (the name train_model()
# saves to) instead of refitting, then pair it with the same transformer so
# raw flight data can be passed straight to validate()/submit().
# NOTE(review): the loaded model is only meaningful if it was trained with the
# x_transform defined above — retrain after changing the feature pipeline.
linreg = harness.load('weather_model')
linreg_model = harness.TrainedModel(linreg, transformer)

In [11]:
# Map each feature column to its fitted coefficient. coef_[0] takes the first
# row of the coefficient array, paired positionally with x_train.columns.
dict(zip(x_train.columns, linreg.coef_[0]))

{'crs_dep_time': 2.5062369626454886,
 'crs_arr_time': 1.7770447303057946,
 'crs_elapsed_time': 7.141477541173036,
 'distance': -7.016086796760896,
 'cold_o': -0.24968616652995726,
 'fog_o': -3.986672907873232,
 'hail_o': 0.002941947240471487,
 'precipitation_o': 1.7205392313919976,
 'rain_o': -1.6423241565919748,
 'snow_o': 0.7419678446656128,
 'storm_o': -0.059449800092904265,
 'severity_o': 6.57781546753999,
 'cold_d': -0.4578919840522657,
 'fog_d': -1.6898973084116709,
 'hail_d': 0.08740574497680006,
 'precipitation_d': 0.5638186073083732,
 'rain_d': 0.19638845594147197,
 'snow_d': 0.716213588276369,
 'storm_d': 0.4232026091226577,
 'severity_d': 3.242166548229952}

In [12]:
# Score the model on the test split after passing it through the same
# cleaning function used for the training data.
# NOTE(review): confirm that applying clean_train to the test split is
# intended (it may drop rows the model would see in practice).
# NOTE(review): the recorded output reports R squared of about zero
# (-0.00224 overall, 0.0195 without early flights), i.e. these features
# explain almost none of the delay variance — worth stating in prose.
linreg_model.validate(harness.clean_train(flights_test))

R squared: -0.00224
Median absolute error: 22.5
R squared (no early): 0.0195
Median absolute error (no early): 12.1

In [13]:
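# Disabled: presumably writes predictions for 'data/test.csv' to
# 'weather_submission.csv' in a 'predicted_delay' column — confirm against
# TrainedModel.submit. Uncomment only when a submission file is wanted.
# linreg_model.submit(
#     'data/test.csv', 'weather_submission.csv', 'predicted_delay'
# )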