In [1]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

In [2]:
# Read the train and test flight CSV files through the project's harness reader.
flights_train = harness.read_flights('../data/flights_train.csv')
flights_test = harness.read_flights('../data/flights_test.csv')

In [3]:
# Clean the training flights; the same cleaner is applied to flights_test
# before validation later in the notebook.
# NOTE(review): the function name suggests only late flights are kept -- confirm in harness.
flights_clean = harness.clean_train_late_only(flights_train)

In [4]:
# Disabled: alternative feature pipeline, not referenced by any later cell.
# add_features = harness.chain(
#     harness.add_date_parts, harness.add_haul,
#     harness.add_hour, harness.make_all_dummies,
# )

In [5]:
# Feature pipeline for the model inputs, composed with harness.chain.
# NOTE(review): steps presumably run in the order listed -- confirm against harness.chain.
x_transform = harness.chain(
    harness.keep_only_test_columns,
    # A list entry pairs a step with an extra argument (flights_clean here).
    # NOTE(review): presumably invoked as make_weather_dummies(df, flights_clean) -- confirm.
    [harness.make_weather_dummies, flights_clean],
    harness.only_numeric,
    harness.scale,
)

In [6]:
# Bundle the feature pipeline with harness.remove_early into one transformer object.
# NOTE(review): the second argument is presumably the target (y) transform --
# confirm against harness.DataTransformer.
transformer = harness.DataTransformer(
    x_transform, harness.remove_early
)

In [7]:
# Turn the cleaned training flights into the feature matrix and target that
# train_model fits on.
x_train, y_train = transformer.extract_transform(flights_clean)

In [8]:
def train_model(alphas=None, model_name='weather_model'):
    """Grid-search a Ridge regression over ``alpha`` and save the best model.

    Fits on the notebook-level ``x_train`` / ``y_train`` and scores each
    candidate with ``transformer.score_transformed``. The best estimator is
    stored through ``harness.save`` so that later cells can ``harness.load``
    it without retraining.

    Parameters
    ----------
    alphas : iterable of numbers, optional
        Regularisation strengths to try. Defaults to the powers of ten from
        10**-6 to 10**6 inclusive.
    model_name : str, optional
        Name handed to ``harness.save``. Defaults to ``'weather_model'``,
        which is the name the notebook loads afterwards.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search, so the caller can inspect ``best_params_`` and
        ``cv_results_``. Callers that ignore the return value are unaffected.
    """
    if alphas is None:
        alphas = [10 ** i for i in range(-6, 7)]

    linreg = lm.Ridge(random_state=42)
    grid_search = ms.GridSearchCV(
        linreg, dict(alpha=list(alphas)),
        scoring=transformer.score_transformed
    )
    # Fit on the raw arrays (.values), matching how the saved model was trained.
    grid_search.fit(x_train.values, y_train.values)
    harness.save(grid_search.best_estimator_, model_name)
    return grid_search

In [9]:
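# Disabled so the notebook reuses the model already saved as 'weather_model'.
# Uncomment to rerun the grid search and save the model again.
# train_model()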

In [10]:
# Load the estimator saved under 'weather_model' (the name train_model saves to)
# and pair it with the transformer that produced the training features.
linreg = harness.load('weather_model')
linreg_model = harness.TrainedModel(linreg, transformer)

In [11]:
# Map each feature column to its fitted coefficient (first row of coef_).
dict(zip(x_train.columns, linreg.coef_[0]))

{'crs_dep_time': 2.8015052110125596,
 'crs_arr_time': 2.2989973556777263,
 'crs_elapsed_time': 26.512182900827163,
 'distance': -26.699413631308413,
 'cold_o': 0.08213972165775348,
 'fog_o': -5.004781679196559,
 'hail_o': -0.4126447299487693,
 'precipitation_o': 2.714336953590364,
 'rain_o': -2.890272092025589,
 'snow_o': -1.0214417414531496,
 'storm_o': 0.05499597582487183,
 'severity_o': 8.667838540834344,
 'cold_d': -0.3826600601592919,
 'fog_d': -1.4524061547044713,
 'hail_d': 0.1255080165942015,
 'precipitation_d': 0.6048645541961506,
 'rain_d': 0.3852238099097103,
 'snow_d': 1.7449399867962574,
 'storm_d': 0.7528020388200805,
 'severity_d': 2.920261106979311}

In [12]:
# Validate on the test flights after running them through the same cleaner
# that produced flights_clean from the training data.
linreg_model.validate(harness.clean_train_late_only(flights_test))

R squared: 0.0157
Median absolute error: 27.6
R squared (no early): 0.0157
Median absolute error (no early): 27.6

In [13]:
# Generate the submission through TrainedModel.submit.
# NOTE(review): arguments look like (input CSV, output CSV, prediction column
# name) -- confirm against harness.TrainedModel.submit.
linreg_model.submit(
    '../data/test.csv', 'weather_submission.csv', 'predicted_delay'
)