In [None]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import matplotlib.pyplot as plt
import seaborn as sns

import harness

import sklearn.model_selection as ms
import plotly.express as px
from sklearn.decomposition import PCA

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

In [None]:
# Render every column when a DataFrame is displayed (None removes the column cap).
pd.set_option('display.max_columns', None)

In [None]:
# Load the train and test flight tables; the first CSV column is used as the index.
flights_train, flights_test = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/flights_train.csv', 'data/flights_test.csv')
]

In [None]:
# Clean the raw test split, then build a separate frame carrying three derived
# columns (month, day, haul) so that `flights_test_clean` itself is left unmodified.
# NOTE(review): the test split is cleaned with harness.clean_train — confirm this is intended.
flights_test_clean = harness.clean_train(flights_test)
flights_test_clean_copy = flights_test_clean.assign(
    # fl_date is sliced as text: characters 5-6 give the month, 8 onward the day
    # (presumably 'YYYY-MM-DD' strings — TODO confirm against the cleaned data).
    month=lambda frame: frame.fl_date.map(lambda stamp: int(stamp[5:7])),
    day=lambda frame: frame.fl_date.map(lambda stamp: int(stamp[8:])),
    # crs_elapsed_time / 60 (presumably minutes -> hours) is bucketed into the
    # right-closed bins (0, 3], (3, 6], (6, 12]; values outside them become NaN.
    haul=lambda frame: pd.cut(
        frame.crs_elapsed_time / 60,
        bins=[0, 3, 6, 12],
        labels=['Short', 'Medium', 'Long'],
    ),
)

In [None]:
# Clean the raw training split, then build a separate frame carrying three derived
# columns (month, day, haul) so that `flights_clean` itself is left unmodified.
flights_clean = harness.clean_train(flights_train)
flights_clean_copy = flights_clean.assign(
    # fl_date is sliced as text: characters 5-6 give the month, 8 onward the day
    # (presumably 'YYYY-MM-DD' strings — TODO confirm against the cleaned data).
    month=lambda frame: frame.fl_date.map(lambda stamp: int(stamp[5:7])),
    day=lambda frame: frame.fl_date.map(lambda stamp: int(stamp[8:])),
    # crs_elapsed_time / 60 (presumably minutes -> hours) is bucketed into the
    # right-closed bins (0, 3], (3, 6], (6, 12]; values outside them become NaN.
    haul=lambda frame: pd.cut(
        frame.crs_elapsed_time / 60,
        bins=[0, 3, 6, 12],
        labels=['Short', 'Medium', 'Long'],
    ),
)

In [None]:
# Apply harness.add_grouped_stats to the test frame once per grouping column,
# in this exact order, feeding each call the result of the previous one.
# NOTE(review): if add_grouped_stats derives its statistics from arr_delay,
# computing them on the test frame itself would leak the target — confirm in harness.
for group_key in (
    'op_unique_carrier',
    'fl_date',
    'dest_city_name',
    'origin_airport_id',
    'origin_city_name',
    'dest_airport_id',
    'day',
    'month',
    'haul',
    'tail_num',
):
    flights_test_clean_copy = harness.add_grouped_stats(flights_test_clean_copy, group_key)

In [None]:
# Apply harness.add_grouped_stats to the training frame once per grouping column,
# in this exact order, feeding each call the result of the previous one.
for group_key in (
    'op_unique_carrier',
    'fl_date',
    'dest_city_name',
    'origin_airport_id',
    'origin_city_name',
    'dest_airport_id',
    'day',
    'month',
    'haul',
    'tail_num',
):
    flights_clean_copy = harness.add_grouped_stats(flights_clean_copy, group_key)

In [None]:
# Remove the columns that were used as grouping keys for the grouped statistics,
# together with the remaining carrier / airport code columns and 'dup'.
flights_test_clean_copy = flights_test_clean_copy.drop(
    columns=[
        'dup',
        'op_unique_carrier',
        'fl_date',
        'dest_city_name',
        'origin_airport_id',
        'origin_city_name',
        'dest_airport_id',
        'day',
        'month',
        'haul',
        'branded_code_share',
        'mkt_carrier',
        'origin',
        'dest',
        'tail_num',
        'mkt_unique_carrier',
    ]
)

In [None]:
# Remove the columns that were used as grouping keys for the grouped statistics,
# together with the remaining carrier / airport code columns and 'dup'.
flights_clean_copy = flights_clean_copy.drop(
    columns=[
        'dup',
        'op_unique_carrier',
        'fl_date',
        'dest_city_name',
        'origin_airport_id',
        'origin_city_name',
        'dest_airport_id',
        'day',
        'month',
        'haul',
        'branded_code_share',
        'mkt_carrier',
        'origin',
        'dest',
        'tail_num',
        'mkt_unique_carrier',
    ]
)

In [None]:
# Replace missing values with 0 in five of the per-group delay-std columns of the
# test frame (presumably NaN where a group has a single row — TODO confirm).
for std_col in (
    'tail_num_delay_std',
    'dest_airport_id_delay_std',
    'origin_city_name_delay_std',
    'origin_airport_id_delay_std',
    'dest_city_name_delay_std',
):
    flights_test_clean_copy[std_col] = flights_test_clean_copy[std_col].fillna(0)


In [None]:
# Replace missing values with 0 in five of the per-group delay-std columns of the
# training frame (presumably NaN where a group has a single row — TODO confirm).
for std_col in (
    'tail_num_delay_std',
    'dest_airport_id_delay_std',
    'origin_city_name_delay_std',
    'origin_airport_id_delay_std',
    'dest_city_name_delay_std',
):
    flights_clean_copy[std_col] = flights_clean_copy[std_col].fillna(0)

In [None]:
# Split each frame into its feature matrix (every column except arr_delay)
# and its target vector (the arr_delay column).
X_train, y_train = (
    flights_clean_copy.drop(columns='arr_delay'),
    flights_clean_copy['arr_delay'],
)
X_test, y_test = (
    flights_test_clean_copy.drop(columns='arr_delay'),
    flights_test_clean_copy['arr_delay'],
)

In [None]:
# Fit a gradient-boosted regressor on the training features and predict arr_delay
# for the test features.
# random_state is pinned: the estimator uses it to seed each tree and to permute
# the features at every split, so without it a Restart-and-Run-All can produce a
# different model and a different R^2.
GBR = GradientBoostingRegressor(random_state=42)
GBR = GBR.fit(X_train, y_train)  # fit() returns the fitted estimator itself
y_pred = GBR.predict(X_test)

In [None]:
# Coefficient of determination of the test-set predictions; shown as the cell output.
r2_score(y_true=y_test, y_pred=y_pred)