In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

import harness

import timeit

np.warnings.filterwarnings('ignore', 'Ill-conditioned matrix')
pd.options.display.max_columns=None

In [2]:
flights_train = pd.read_csv('data/flights_train.csv', index_col=0)
flights_test = pd.read_csv('data/flights_test.csv', index_col=0)

In [3]:
flights_clean = harness.clean_train(flights_train)


add_features = harness.chain(
    harness.add_date_parts, 
    harness.add_haul, 
    harness.add_weather,
    harness.add_hour,
#    harness.make_all_dummies
)

flights_with_features = add_features(flights_clean)


x_transform = harness.chain(
    harness.keep_only_test_columns,
    add_features,
    [harness.add_all_grouped_stats, flights_with_features],
    harness.only_numeric,
    harness.scale,
)


transformer = harness.DataTransformer(
    x_transform, harness.normalize, harness.unnormalize
)


x_train, y_train = transformer.extract_transform(flights_clean)



In [4]:
for col in x_train.columns:
    print(f"{col}: {np.corrcoef(x_train[col], y_train.arr_delay)[0, 1]}")

mkt_carrier_fl_num: 0.012862292432801372
op_carrier_fl_num: 0.01287439929672372
origin_airport_id: -0.009438814869731969
dest_airport_id: 0.0030847547005330104
crs_dep_time: 0.09771383315876599
crs_arr_time: 0.08795351101131096
crs_elapsed_time: -0.08631898665835405
distance: -0.06560419674987938
month: -0.00398405787624075
day: -0.0035219871001030594
hour: 0.0973902908650512
conditions_origin_delay_mean: 0.004634543843910926
conditions_origin_delay_median: 0.0039170851277519755
conditions_origin_delay_std: 0.002623735069785848
conditions_origin_delay_min: -0.007552485780817887
conditions_origin_delay_max: 0.00951781981384199
conditions_dest_delay_mean: 0.004659182443629278
conditions_dest_delay_median: 0.008772759166266979
conditions_dest_delay_std: 0.0009512283715517388
conditions_dest_delay_min: -0.009349939769215831
conditions_dest_delay_max: 0.009145557551466572
weather_origin_delay_mean: 0.0022455829633361577
weather_origin_delay_median: 0.006136518666994105
weather_origin_delay_

In [5]:
def select_columns(df):
    return df[[
        'crs_dep_time',
        'crs_arr_time',
        'crs_elapsed_time',
        'distance',
        'origin_city_name_delay_mean',
        'dest_city_name_delay_mean',
        'origin_airport_id_delay_mean',
        'dest_airport_id_delay_mean',
        'day_delay_mean',
        'month_delay_mean',
        'op_unique_carrier_delay_mean',
        'haul_delay_mean',
        'tail_num_delay_mean',
        'hour_delay_mean',
    ]]





x_transform = harness.chain(x_transform, select_columns)
transformer = harness.DataTransformer(
    x_transform
)
x_train, y_train = transformer.extract_transform(flights_clean)

In [6]:
def train_model():
    RFR = RandomForestRegressor(n_estimators=10, n_jobs=-1)
    grid_search = ms.GridSearchCV(
        RFR, {'max_depth' : [None, 2,3,5],'max_leaf_nodes' : [None, 3,5,10]}
    )
    grid_search.fit(x_train.values, y_train.values)
    harness.save(grid_search.best_estimator_, 'The_Selective_Forest_2')

In [7]:
train_model()

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [8]:
RFR = harness.load('The_Selective_Forest_2')
RFR_model = harness.TrainedModel(RFR, transformer)


RFR_model.validate(harness.clean_train(flights_test))

R squared: -0.0281
Median absolute error: 15.7

In [9]:
RFR_model.submit(
    'data/test.csv', 'time_of_day_submission.csv', 'predicted_delay'
)