In [1]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import sklearn.ensemble as ens
import sklearn.preprocessing as pre
import sklearn.model_selection as ms

import harness

In [2]:
flights_train = harness.read_flights('../data/flights_train.csv')
flights_test = harness.read_flights('../data/flights_test.csv')

In [3]:
flights_clean = harness.clean_train(flights_train)

In [4]:
x_transform = harness.chain(
    harness.keep_only_test_columns,
    [harness.make_weather_dummies, flights_clean],
    [harness.transfer_grouped_means, flights_clean, 100, 'origin', 'dest'],
    harness.only_numeric,
    harness.scale,
    harness.powers_of_time,
    [
        harness.drop,
        'crs_dep_time', 'crs_arr_time', 'has_origin__dest_mean',
        'crs_dep_time_hours', 'crs_arr_time_hours'
    ]
)
transformer = harness.DataTransformer(x_transform)

In [5]:
x_train, y_train = transformer.extract_transform(flights_clean)

In [6]:
def train_model():
    RFR = ens.RandomForestRegressor(n_estimators=20, n_jobs=-1)
    grid_search = ms.GridSearchCV(
        RFR, {'max_depth' : [2,3,4], 'max_features' : [2,5,10]},
        scoring=transformer.score_transformed
    )
    grid_search.fit(x_train.values, y_train.values.ravel())
    harness.save(grid_search.best_estimator_, 'random_forest_no_early_model')
#     RFR.fit(x_train.values, y_train.values.ravel())
#     harness.save(RFR, 'last_hurrah_random_forest_model')

In [7]:
train_model()

In [8]:
rfr = harness.load('last_hurrah_random_forest_model')
rfr_model = harness.TrainedModel(rfr, transformer)

In [9]:
rfr_model.validate(harness.clean_train(flights_test))[3]

R squared: -0.111
R squared (early = 0): -0.116
R squared (only delay): -0.206

In [10]:
rfr_model.submit(
    '../data/test.csv', 'last_hurrah_random_forest_submission.csv', 'predicted_delay'
)