In [1]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import sklearn.ensemble as ens
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns

import harness

In [2]:
# Load both flight tables with the project reader, train first and then test.
flights_train, flights_test = (
    harness.read_flights('../data/flights_train.csv'),
    harness.read_flights('../data/flights_test.csv'),
)

In [3]:
# Cleaned copy of the training table; this frame is what the feature builder
# and the model-fitting cell below both consume.
# NOTE(review): presumably `clean_train_late_only` keeps only delayed flights —
# confirm against the harness module.
flights_clean = harness.clean_train_late_only(flights_train)

In [4]:
# Compose the feature-engineering helpers into a single callable.
# The steps are listed one per line in the order they are handed to `chain`.
add_features = harness.chain(
    harness.add_date_parts,
    harness.add_haul,
    harness.add_hour,
)

In [5]:
# Feature-augmented training frame; passed to the dummy-encoding step of
# `x_transform` below as its extra argument.
flights_with_features = add_features(flights_clean)

In [6]:
# Pipeline that turns a raw flights frame into the model's input matrix.
# NOTE(review): the behaviour of each step lives in `harness`; the remarks
# below are inferred from the helper names — confirm against the module.
x_transform = harness.chain(
    harness.keep_only_test_columns,
    add_features,
    # List-form step: a function paired with an extra argument. Presumably
    # `chain` passes `flights_with_features` along so the dummy columns are
    # derived from the training data — TODO confirm in harness.chain.
    [harness.make_all_dummies_except_city_pairs, flights_with_features],
    harness.only_numeric,
)

In [7]:
# Pair the input pipeline with `harness.remove_early` in one transformer object.
# NOTE(review): presumably the second argument is the target-side transform —
# confirm against harness.DataTransformer.
transformer = harness.DataTransformer(x_transform, harness.remove_early)

In [8]:
# Split the cleaned training frame into model inputs and targets.
# Both results are read as notebook-level names by `train_model` below.
x_train, y_train = transformer.extract_transform(flights_clean)

In [9]:
def train_model(n_estimators=100, random_state=42):
    """Fit a random-forest regressor on the notebook's training data and save it.

    Reads the notebook-level ``x_train`` and ``y_train`` objects, fits the
    forest on their underlying arrays, and stores the fitted estimator with
    ``harness.save`` under the name ``'random_forest_no_early_model'``.

    Parameters
    ----------
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed passed to the forest. It is fixed by default so that re-running
        the notebook rebuilds the same model and the same validation metrics;
        pass ``None`` to get the earlier unseeded behaviour.

    Returns
    -------
    None
        The fitted model is persisted, not returned.
    """
    # A commented-out GridSearchCV over `max_depth` / `max_leaf_nodes` used to
    # sit here; it was dead code and has been dropped (recover it from version
    # control if the search is needed again).
    forest = ens.RandomForestRegressor(
        n_estimators=n_estimators,
        n_jobs=-1,  # use every available core
        random_state=random_state,
    )
    # ravel() flattens the target array to 1-D before fitting.
    forest.fit(x_train.values, y_train.values.ravel())
    harness.save(forest, 'random_forest_no_early_model')

In [10]:
# Fit the forest and persist it under 'random_forest_no_early_model'
# (the save happens inside train_model).
train_model()

In [11]:
# Reload the estimator that train_model() saved, then wrap it together with
# the same transformer that produced its training inputs.
rfr = harness.load('random_forest_no_early_model')
rfr_model = harness.TrainedModel(rfr, transformer)

In [12]:
# Score the model on the test file, cleaned with the same
# `clean_train_late_only` helper that was applied to the training data.
# NOTE(review): the recorded run reports a negative R squared; if `validate`
# uses the standard definition, that is worse than predicting the mean —
# confirm the metric and revisit the features/model before submitting.
rfr_model.validate(harness.clean_train_late_only(flights_test))

R squared: -0.0232
Median absolute error: 21.7
R squared (no early): -0.0232
Median absolute error (no early): 21.7

In [None]:
# Produce the submission file from the trained model.
# NOTE(review): presumably the arguments are (input CSV, output file name,
# prediction column name) — confirm against harness.TrainedModel.submit.
rfr_model.submit(
    '../data/test.csv',
    'random_forest_no_early_submission.csv',
    'predicted_delay',
)