In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

import harness

np.warnings.filterwarnings('ignore', 'Ill-conditioned matrix')
pd.options.display.max_columns=None

In [2]:
# Read the training and test splits; column 0 of each CSV becomes the index.
flights_train = pd.read_csv(
    'data/flights_train.csv',
    index_col=0,
)
flights_test = pd.read_csv(
    'data/flights_test.csv',
    index_col=0,
)

In [3]:
# Pass the raw training frame through the harness's `clean_train` step; the
# result feeds both feature building and the transformer further down.
flights_clean = harness.clean_train(flights_train)

In [4]:
# Compose the feature-engineering steps into a single callable via
# harness.chain; the steps are passed in the order listed here.
add_features = harness.chain(
    harness.add_date_parts,
    harness.add_haul,
    harness.add_weather,
    harness.add_hour,
)

In [5]:
# Apply the composed feature steps to the cleaned training data. The result is
# handed to the grouped-stats step of `x_transform` in the next cell.
flights_with_features = add_features(flights_clean)

In [6]:
# Pipeline that turns a flights frame into model inputs. Steps are passed to
# harness.chain in this order.
# NOTE(review): the list-valued step presumably means "call
# add_all_grouped_stats with flights_with_features as an extra argument" —
# confirm against harness.chain before changing its form.
x_transform = harness.chain(
    harness.keep_only_test_columns,
    add_features,
    [harness.add_all_grouped_stats, flights_with_features],
    harness.only_numeric,
    harness.scale
)

In [7]:
# Bundle the input pipeline with the harness's normalize / unnormalize pair
# into one DataTransformer, which is reused for training and for the trained model.
transformer = harness.DataTransformer(
    x_transform,
    harness.normalize,
    harness.unnormalize,
)

In [8]:
# Produce the model inputs and the target from the cleaned training data.
# Both results are read (via `.values`) by train_model() below.
x_train, y_train = transformer.extract_transform(flights_clean)



In [9]:
def train_model(model_name='The_First_Forest', random_state=0):
    """Grid-search a small random forest on the training data and save the winner.

    Reads the notebook-level ``x_train`` and ``y_train``, searches over
    ``max_depth`` and ``max_leaf_nodes`` with ``GridSearchCV`` (default
    cross-validation and scoring), and stores the best estimator through
    ``harness.save``.

    Parameters
    ----------
    model_name : str, optional
        Name handed to ``harness.save``; defaults to the name the loading
        cell expects.
    random_state : int or None, optional
        Seed for the forest so repeated runs select and save the same model.
        Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    None
    """
    targets = y_train.values
    # A 2-D target with a single column makes scikit-learn warn on every fit
    # (presumably the warning recorded in this notebook's output); a flat
    # vector trains the same single-output model without the noise.
    if targets.ndim == 2 and targets.shape[1] == 1:
        targets = targets.ravel()

    forest = RandomForestRegressor(
        n_estimators=10, n_jobs=-1, random_state=random_state
    )
    param_grid = {
        'max_depth': [None, 2, 3, 5],
        'max_leaf_nodes': [None, 3, 5, 10],
    }
    grid_search = ms.GridSearchCV(forest, param_grid)
    grid_search.fit(x_train.values, targets)

    # best_estimator_ is the estimator GridSearchCV refits with the best
    # parameter combination.
    harness.save(grid_search.best_estimator_, model_name)

In [10]:
# Run the grid search and save the best forest.
# NOTE(review): the last recorded run of this cell ended in KeyboardInterrupt,
# so the saved model loaded further down may be missing or stale — re-run to
# completion before trusting later cells.
train_model()

  estimator.fit(X_train, y_train, **fit_params)


KeyboardInterrupt: 

In [None]:
# Load the estimator stored under the name train_model() saves to, then pair it
# with the same transformer that produced the training inputs.
RFR = harness.load('The_First_Forest')
RFR_model = harness.TrainedModel(RFR, transformer)

In [None]:
# Score the trained model on the held-out file, after passing it through the
# same clean_train step that was applied to the training data.
RFR_model.validate(
    harness.clean_train(flights_test)
)

In [None]:
# Hand the trained model's submit step the competition input path, the output
# file name, and the prediction column name, in that order.
RFR_model.submit(
    'data/test.csv',
    'time_of_day_submission.csv',
    'predicted_delay',
)