In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

In [2]:
# Read the training and test flight CSVs through the project harness.
flights_train = harness.read_flights('data/flights_train.csv')
flights_test = harness.read_flights('data/flights_test.csv')

In [3]:
# Clean the training frame. flights_clean is reused twice further down: as the
# extra argument to the weather-dummy step of the feature pipeline, and as the
# input from which the training features/target are extracted.
flights_clean = harness.clean_train(flights_train)

In [4]:
# add_features = harness.chain(
#     harness.add_date_parts, harness.add_haul,
#     harness.add_hour, harness.make_all_dummies,
# )

In [5]:
# Feature pipeline, composed in the order the steps are listed.
# NOTE(review): the [callable, frame] pair is presumably how harness.chain
# binds flights_clean as an extra argument to make_weather_dummies -- confirm
# against the harness module.
weather_dummies_step = [harness.make_weather_dummies, flights_clean]

x_transform = harness.chain(
    harness.keep_only_test_columns,
    weather_dummies_step,
    harness.only_numeric,
    harness.scale,
)

In [6]:
# Wrap the feature pipeline in the harness transformer object; the same
# instance is later used to extract the training data and is handed to
# harness.TrainedModel.
transformer = harness.DataTransformer(x_transform)

In [7]:
# Run the transformer over the cleaned training data and unpack the two
# results. x_train is later read through .columns and .values, y_train
# through .values.
x_train, y_train = transformer.extract_transform(flights_clean)

In [15]:
def train_model():
    """Grid-search a Ridge regressor over ``alpha`` and persist the winner.

    Reads the notebook-level ``x_train`` and ``y_train`` and fits a
    ``GridSearchCV`` (no ``cv`` or ``scoring`` is passed, so the library
    defaults apply) over every power of ten from ``10 ** -6`` to ``10 ** 6``.
    The best estimator is passed to ``harness.save`` under the name
    ``'weather_model'``. Nothing is returned; the function is called purely
    for that side effect.
    """
    # Thirteen candidate regularisation strengths: 10^-6, 10^-5, ..., 10^6.
    alpha_grid = [10 ** exponent for exponent in range(-6, 7)]

    search = ms.GridSearchCV(
        estimator=lm.Ridge(random_state=42),
        param_grid={'alpha': alpha_grid},
    )
    # Fit on the underlying arrays rather than the pandas objects themselves.
    search.fit(x_train.values, y_train.values)

    harness.save(search.best_estimator_, 'weather_model')

In [16]:
# Fit the grid search and save the best estimator. The call is made for its
# side effect only: train_model has no return statement.
train_model()

In [17]:
# Load the estimator stored under 'weather_model' (the name train_model()
# saves to) and pair it with the same transformer that produced x_train.
linreg = harness.load('weather_model')
linreg_model = harness.TrainedModel(linreg, transformer)

In [18]:
# Map each feature column to its fitted coefficient. coef_[0] takes the first
# row of the coefficient array (presumably the only target -- confirm).
dict(zip(x_train.columns, linreg.coef_[0]))

{'crs_dep_time': 2.8584340504109327,
 'crs_arr_time': 2.2298875255389268,
 'crs_elapsed_time': -1.0207317955819768,
 'distance': 0.2952843358063764,
 'cold_o': -0.3450024902320432,
 'fog_o': -4.414159263912504,
 'hail_o': 0.018509331792886115,
 'precipitation_o': 1.826231058524534,
 'rain_o': -1.604060652835962,
 'snow_o': 1.1355474421276275,
 'storm_o': -0.12038458065655953,
 'severity_o': 7.467208591403767,
 'cold_d': -0.5341979922854408,
 'fog_d': -1.9348462153601473,
 'hail_d': 0.09814769358135515,
 'precipitation_d': 0.686296785490789,
 'rain_d': 0.35662214449956936,
 'snow_d': 0.7766777410849036,
 'storm_d': 0.4145983470704834,
 'severity_d': 3.8498514339389724}

In [19]:
# Validate the trained model on the test flights. The test frame is passed
# through the same harness.clean_train step that was applied to the training
# data, and [3] picks the fourth item of whatever validate() returns.
# NOTE(review): the "R squared" lines shown under this cell are presumably
# printed by validate() itself -- confirm in the harness module.
# NOTE(review): confirm that clean_train is the intended cleaner for the test
# split.
linreg_model.validate(harness.clean_train(flights_test))[3]

R squared: 0.024
R squared (early = 0): -0.01
R squared (only delay): -0.182

In [13]:
# linreg_model.submit(
#     'data/test.csv', 'weather_submission.csv', 'predicted_delay'
# )