In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

In [2]:
# Read the train and test flight files, in that order, with the same harness reader.
flights_train, flights_test = map(
    harness.read_flights,
    ('data/flights_train.csv', 'data/flights_test.csv'),
)

In [3]:
# Clean the raw training flights; the cleaned data is what the transformer
# turns into x_train / y_train in a later cell.
flights_clean = harness.clean_train(flights_train)

In [4]:
# add_features = harness.chain(
#     harness.add_date_parts, harness.add_haul,
#     harness.add_hour, harness.make_all_dummies,
# )

In [5]:
# Feature pipeline for the model inputs, assembled from four harness steps.
# NOTE(review): presumably harness.chain applies the steps in the order listed
# (test-column filter -> weather dummies -> numeric-only -> scaling) — confirm
# against harness.chain.
x_transform = harness.chain(
    harness.keep_only_test_columns,
    harness.make_weather_dummies,
    harness.only_numeric,
    harness.scale,
)

In [6]:
# Bundle the feature pipeline with harness.remove_early. This object is used
# later to split the cleaned data into x/y (extract_transform), to score
# grid-search candidates (score), and is handed to harness.TrainedModel.
# NOTE(review): the second argument looks like the target-side transform — confirm.
transformer = harness.DataTransformer(
    x_transform, harness.remove_early
)

In [7]:
# Build the model inputs (x_train) and targets (y_train) from the cleaned
# training data; both are passed to GridSearchCV.fit via `.values` in train_model.
x_train, y_train = transformer.extract_transform(flights_clean)

In [9]:
def train_model(alphas=None, model_name='weather_model'):
    """Grid-search a Ridge regression over ``alpha`` and save the best estimator.

    Fits on the notebook-level ``x_train`` / ``y_train`` and scores candidates
    with ``transformer.score``, so those names must already be defined when
    this function is called.

    Parameters
    ----------
    alphas : iterable of float, optional
        Candidate regularisation strengths. Defaults to the powers of ten
        from 1e-6 to 1e6 inclusive.
    model_name : str, optional
        Name passed to ``harness.save`` for the winning estimator.

    Returns
    -------
    None
        The best estimator is handed to ``harness.save`` rather than returned.
    """
    if alphas is None:
        alphas = [10 ** i for i in range(-6, 7)]
    linreg = lm.Ridge(random_state=42)
    grid_search = ms.GridSearchCV(
        linreg, dict(alpha=list(alphas)),
        scoring=transformer.score
    )
    # Fit on the underlying arrays rather than the pandas objects.
    grid_search.fit(x_train.values, y_train.values)
    harness.save(grid_search.best_estimator_, model_name)

In [10]:
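# Uncomment to rerun the grid search and save the result as 'weather_model'.
# Left disabled; the next cell loads the saved model instead.
#train_model()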

In [11]:
# Load the estimator stored under 'weather_model' (the name train_model saves
# its grid-search winner under) rather than refitting here.
linreg = harness.load('weather_model')
# Pair the estimator with the transformer that produced its training data;
# the wrapper is used below for validate().
linreg_model = harness.TrainedModel(linreg, transformer)

In [18]:
# Pair each feature column with its fitted coefficient (first row of coef_).
dict(zip(x_train.columns, linreg.coef_[0]))

{'mkt_carrier_fl_num': -0.362685088191627,
 'op_carrier_fl_num': 0.3087486251620202,
 'origin_airport_id': -0.18272107952478397,
 'dest_airport_id': 0.2194914683430145,
 'crs_dep_time': 0.32312626047834475,
 'crs_arr_time': 0.4722154353554303,
 'crs_elapsed_time': -8.642436343157227,
 'distance': 7.766420843737963,
 'cold_o': -0.09418322984862719,
 'fog_o': -0.43831196865368227,
 'hail_o': 0.014965341491729416,
 'precipitation_o': 0.10660271569428056,
 'rain_o': 0.035450621073347816,
 'snow_o': 0.395379351808152,
 'storm_o': -0.06187622081977797,
 'severity_o': 0.9002903463938772,
 'cold_d': -0.0798745899646338,
 'fog_d': -0.2504392412388941,
 'hail_d': 0.010333775391612364,
 'precipitation_d': 0.12266048070805372,
 'rain_d': 0.1574733084669595,
 'snow_d': 0.058765824699360175,
 'storm_d': -0.007758301820339872,
 'severity_d': 0.6189948917018101}

In [12]:
# Evaluate the loaded model on the test file after running it through the same
# cleaning step as the training data.
# NOTE(review): the recorded run reports a negative R squared (-0.0701), which
# under the usual definition is worse than predicting a constant mean — confirm
# how validate() computes it and whether the feature set is adequate.
# NOTE(review): confirm harness.clean_train is appropriate for the test split.
linreg_model.validate(harness.clean_train(flights_test))

R squared: -0.0701
Median absolute error: 9.6
R squared (no early): 0.0532
Median absolute error (no early): 7.22

In [13]:
# linreg_model.submit(
#     'data/test.csv', 'everything_linear_submission.csv', 'predicted_delay'
# )