In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

In [2]:
# Read the training and test flight CSVs through the project's reader helper.
# Both names are used by later cells (cleaning and validation).
flights_train = harness.read_flights('data/flights_train.csv')
flights_test = harness.read_flights('data/flights_test.csv')

In [3]:
# Clean the training data with harness.clean_train; later cells work from
# flights_clean rather than the raw flights_train.
flights_clean = harness.clean_train(flights_train)

In [4]:
# Preview the city-pair indicator columns ("<origin> to <destination>").
# The return value is displayed as the cell output and not assigned.
harness.make_city_pairs_dummies(flights_clean)

Unnamed: 0_level_0,fl_date_crs_dep_time,fl_date_crs_arr_time,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,...,"Washington, DC to Minneapolis, MN","Washington, DC to New York, NY","Washington, DC to Newark, NJ","Washington, DC to Orlando, FL","Washington, DC to Raleigh/Durham, NC","Washington, DC to San Francisco, CA","Washington, DC to St. Louis, MO","West Palm Beach/Palm Beach, FL to Atlanta, GA","West Palm Beach/Palm Beach, FL to New York, NY","West Palm Beach/Palm Beach, FL to Newark, NJ"
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
121941,2019-08-25 18:18:00,2019-08-25 19:21:00,2019-08-25,AA,AA_CODESHARE,AA,5636,OH,N723PS,5636,...,0,0,0,0,0,0,0,0,0,0
109932,2018-03-02 07:45:00,2018-03-02 09:15:00,2018-03-02,WN,WN,WN,2028,WN,N244WN,2028,...,0,0,0,0,0,0,0,0,0,0
63108,2019-03-06 12:14:00,2019-03-06 15:31:00,2019-03-06,UA,UA,UA,545,UA,N69804,545,...,0,0,0,0,0,0,0,0,0,0
91519,2019-10-11 06:00:00,2019-10-11 08:06:00,2019-10-11,DL,DL_CODESHARE,DL,3798,OO,N452SW,3798,...,0,0,0,0,0,0,0,0,0,0
8361,2019-09-26 14:26:00,2019-09-26 15:21:00,2019-09-26,UA,UA_CODESHARE,UA,5359,OO,N120SY,5359,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48069,2019-05-01 06:05:00,2019-05-01 06:38:00,2019-05-01,G4,G4,G4,2170,G4,260NV,2170,...,0,0,0,0,0,0,0,0,0,0
68804,2019-07-01 05:40:00,2019-07-01 08:51:00,2019-07-01,UA,UA_CODESHARE,UA,6259,YV,N80343,6259,...,0,0,0,0,0,0,0,0,0,0
43542,2018-07-12 13:00:00,2018-07-12 14:15:00,2018-07-12,AA,AA,AA,412,AA,N177US,412,...,0,0,0,0,0,0,0,0,0,0
100583,2018-11-21 18:55:00,2018-11-21 20:56:00,2018-11-21,AA,AA,AA,1252,AA,N136AN,1252,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Combine the feature-engineering helpers into a single step with harness.chain.
# NOTE(review): presumably the helpers are applied in the order listed —
# confirm against harness.chain before reordering them.
add_features = harness.chain(
    harness.add_date_parts, harness.add_haul,
    harness.add_hour, harness.make_all_dummies,
)

In [6]:
# Full feature pipeline: restrict columns, add engineered features, keep
# numeric columns, then scale. add_features is the chained step defined in
# an earlier cell.
# NOTE(review): presumably keep_only_test_columns drops fields that are not
# available in the test file — confirm in harness.
x_transform = harness.chain(
    harness.keep_only_test_columns,
    add_features,
    harness.only_numeric,
    harness.scale,
)

In [7]:
# Wrap the feature pipeline and harness.remove_early in a DataTransformer.
# This object later supplies extract_transform (training data) and
# score_transformed (grid-search scoring).
# NOTE(review): the second argument is presumably the target transform —
# confirm against harness.DataTransformer.
transformer = harness.DataTransformer(
    x_transform, harness.remove_early
)

In [8]:
# Build the model inputs and targets from the cleaned training data; both
# are consumed by train_model via their .values arrays.
x_train, y_train = transformer.extract_transform(flights_clean)

In [9]:
def train_model():
    """Grid-search a Ridge regression over alpha and save the best estimator.

    Reads the notebook-level ``x_train``, ``y_train`` and ``transformer``.
    Candidate alphas are the powers of ten from 1e-6 to 1e6, and candidates
    are scored with ``transformer.score_transformed``. The best estimator
    found is saved through ``harness.save`` under the name
    'everything_linear_model'.

    Returns:
        None. The fitted model is persisted rather than returned.
    """
    linreg = lm.Ridge(random_state=42)
    alphas = [10 ** i for i in range(-6, 7)]
    grid_search = ms.GridSearchCV(
        linreg, dict(alpha=alphas),
        scoring=transformer.score_transformed,
        # The keyword is ``n_jobs``; the previous spelling ``njobs`` is not a
        # GridSearchCV parameter and raised TypeError before any fitting.
        n_jobs=-1,
    )
    grid_search.fit(x_train.values, y_train.values)
    harness.save(grid_search.best_estimator_, 'everything_linear_model')

In [14]:
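# Uncomment to re-run the grid search and re-save 'everything_linear_model';
# a later cell loads the saved estimator with harness.load.
# train_model()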

In [11]:
# Load the estimator saved under 'everything_linear_model' and pair it with
# the transformer so it can be validated on raw flight data.
linreg = harness.load('everything_linear_model')
linreg_model = harness.TrainedModel(linreg, transformer)

In [12]:
# Validate on the test file after passing it through the same cleaning
# helper that was applied to the training data.
linreg_model.validate(harness.clean_train(flights_test))

R squared: 0.0129
Median absolute error: 22.1
R squared (no early): 0.0417
Median absolute error (no early): 12.3

In [13]:
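# Uncomment to call linreg_model.submit on 'data/test.csv'.
# NOTE(review): presumably writes 'everything_linear_submission.csv' with a
# 'predicted_delay' column — confirm in harness.TrainedModel.submit.
# linreg_model.submit(
#     'data/test.csv', 'everything_linear_submission.csv', 'predicted_delay'
# )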