In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

np.warnings.filterwarnings('ignore', 'Ill-conditioned matrix')

In [2]:
# Load the train and test flight tables from CSV; `index_col=0` makes the
# first CSV column the row index instead of a data column.
flights_train = pd.read_csv('data/flights_train.csv', index_col=0)
flights_test = pd.read_csv('data/flights_test.csv', index_col=0)

In [3]:
# Pass the raw training table through harness.clean_train (the cleaning rules
# live in the project's harness module and are not shown in this notebook).
flights_clean = harness.clean_train(flights_train)

In [4]:
# Combine the three feature-adding helpers into one callable via harness.chain
# (presumably applied in the listed order — TODO confirm against harness.chain).
add_features = harness.chain(
    harness.add_date_parts,
    harness.add_haul,
    harness.add_weather,
)

In [5]:
# Apply the feature chain to the cleaned training data; the resulting frame is
# handed to harness.add_all_grouped_stats when x_transform is built.
flights_with_features = add_features(flights_clean)

In [6]:
# Feature pipeline used to turn a flights table into model inputs. The steps
# are combined by harness.chain (presumably run in the listed order — TODO
# confirm against harness.chain).
x_transform = harness.chain(
    harness.keep_only_test_columns,
    add_features,
    # List-valued step: presumably harness.chain calls the first element with
    # the remaining elements as extra arguments — TODO confirm.
    [harness.add_all_grouped_stats, flights_with_features],
    harness.only_numeric,
    harness.scale,
)

In [7]:
# Wrap the feature pipeline in the harness's DataTransformer.
transformer = harness.DataTransformer(x_transform)

In [8]:
# Split the cleaned training data into two objects, bound here as the model
# inputs (x_train) and the targets (y_train). Presumably x_train is the output
# of x_transform — TODO confirm against harness.DataTransformer.
x_train, y_train = transformer.extract_transform(flights_clean)

In [1]:
# Print, for every feature column, its Pearson correlation with the arrival
# delay: np.corrcoef returns the 2x2 correlation matrix and [0, 1] is the
# off-diagonal (feature vs. target) entry.
for col in x_train:
    print(f"{col}: {np.corrcoef(x_train[col], y_train.arr_delay)[0, 1]}")

mkt_carrier_fl_num: 0.02396390030554952
op_carrier_fl_num: 0.023957695784052345
origin_airport_id: -0.0059979326027192305
dest_airport_id: 0.0007424099189047517
crs_dep_time: 0.08637986895272753
crs_arr_time: 0.08219644379892342
crs_elapsed_time: -0.013246107998264683
distance: -0.013977561031507504
month: -0.00956819291072373
day: -0.0029787768345340875
conditions_origin_delay_mean: 0.006186058477773424
conditions_origin_delay_median: 0.001537449998639927
conditions_origin_delay_std: 0.004995596611660686
conditions_origin_delay_min: 7.787678155686351e-05
conditions_origin_delay_max: 0.0013206388078667924
conditions_dest_delay_mean: 0.00553239598409512
conditions_dest_delay_median: 0.00301426650675934
conditions_dest_delay_std: 0.0034098259923459142
conditions_dest_delay_min: -0.0020022489136836685
conditions_dest_delay_max: 0.0007614044066044412
weather_origin_delay_mean: 0.002438233137473265
weather_origin_delay_median: -0.000591256779837111
weather_origin_delay_std: 0.00170684719562

In [10]:
def select_columns(df):
    """Return only the per-airport delay mean/std columns of ``df``.

    The result holds, in this order, the delay mean and delay standard
    deviation columns for the origin airport followed by the same two columns
    for the destination airport. Indexing raises ``KeyError`` when any of the
    four columns is missing from ``df``.
    """
    kept_columns = [
        'origin_airport_id_delay_mean',
        'origin_airport_id_delay_std',
        'dest_airport_id_delay_mean',
        'dest_airport_id_delay_std',
    ]
    return df[kept_columns]

In [11]:
# Extend the existing pipeline with the column selection, then rebuild the
# transformer and the training inputs/targets from it.
# NOTE: this rebinds x_transform, transformer, x_train and y_train, replacing
# the objects created by the earlier cells.
x_transform = harness.chain(x_transform, select_columns)
transformer = harness.DataTransformer(x_transform)
x_train, y_train = transformer.extract_transform(flights_clean)

In [12]:
def train_model():
    """Grid-search Ridge's ``alpha`` and save the best estimator.

    Fits on the notebook-level ``x_train`` / ``y_train``, prints the best
    cross-validated score found by the search, and passes the refit best
    estimator to ``harness.save`` under the name 'selective_linear_model'.
    Returns ``None``.
    """
    # Candidate regularisation strengths: powers of ten from 1e-6 up to 1e6.
    alphas = [10 ** exponent for exponent in range(-6, 7)]
    search = ms.GridSearchCV(lm.Ridge(random_state=42), dict(alpha=alphas))
    search.fit(x_train.values, y_train.values)
    print(search.best_score_)
    harness.save(search.best_estimator_, 'selective_linear_model')

In [13]:
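# Left disabled so the notebook reuses the saved model. Uncomment to re-run the
# grid search; train_model() saves the best estimator under the name
# 'selective_linear_model', the same name the next cell loads.
#train_model()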

In [14]:
# Load the estimator stored under the name train_model() saves to, and pair it
# with the current transformer in a harness.TrainedModel.
linreg = harness.load('selective_linear_model')
linreg_model = harness.TrainedModel(linreg, transformer)

In [15]:
# Validate on the held-out table, passed through the same harness.clean_train
# step that was applied to the training data.
linreg_model.validate(harness.clean_train(flights_test))

R squared: 0.00169
Median absolute error: 16.5

In [16]:
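# NOTE(review): the output file is named 'everything_linear_submission.csv',
# but this notebook's model is 'selective_linear_model' — confirm the file name
# before uncommenting.
# linreg_model.submit(
#     'data/test.csv', 'everything_linear_submission.csv', 'predicted_delay'
# )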