In [96]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [97]:
##### LOAD IN FROM HERE TO SAVE TIME (FILES CREATED IN EDA notebook)

df = pd.read_csv('flights1500.csv')

weather_df = pd.read_csv('weatherframeTrunc.csv')
weather_dfALT = pd.read_csv('weatherframeTruncALT.csv')

test = pd.read_csv('flights_test.csv')

In [119]:
X = weather_df[['mkt_carrier','distance','origin','dest','crs_dep_time','dep_weather']]
y = weather_df['arr_delay']

X = X.astype({'mkt_carrier': 'category', 'origin': 'category', 'dest': 'category', 'dep_weather': 'category' })

dummies = pd.get_dummies(X['dep_weather'], drop_first=True)
X = pd.merge(X,dummies, left_index=True, right_index=True)

X['carrier_cat'] = X['mkt_carrier'].cat.codes
X['origin_cat'] = X['origin'].cat.codes
X['dest_cat'] = X['dest'].cat.codes
X = X.drop(['mkt_carrier','origin','dest','crs_dep_time','dep_weather'], axis=1)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x = X['distance'].to_numpy().reshape(-1,1)

scaler.fit(x)

x = scaler.transform(x)

X['distance'] = x

X

Unnamed: 0,distance,Rain,Snow,Sunny,carrier_cat,origin_cat,dest_cat
0,1.532705,0,0,1,2,179,13
1,-0.009199,0,0,0,8,23,124
2,-0.861260,0,0,1,8,44,124
3,0.415977,0,0,1,0,136,175
4,-0.217519,0,0,1,0,51,183
...,...,...,...,...,...,...,...
1476,-0.622205,0,0,1,10,183,166
1477,-0.043350,0,0,1,3,8,50
1478,2.949960,0,1,0,1,22,156
1479,0.641372,0,0,1,5,149,157


In [120]:
X2 = weather_dfALT[['mkt_carrier','distance','origin','dest','crs_dep_time','dep_weather']]
y2 = weather_dfALT['arr_delay']

X2 = X2.astype({'mkt_carrier': 'category', 'origin': 'category', 'dest': 'category', 'dep_weather': 'category' })

dummies = pd.get_dummies(X2['dep_weather'], drop_first=True)
X2 = pd.merge(X2,dummies, left_index=True, right_index=True)

X2['carrier_cat'] = X2['mkt_carrier'].cat.codes
X2['origin_cat'] = X2['origin'].cat.codes
X2['dest_cat'] = X2['dest'].cat.codes
X2 = X2.drop(['mkt_carrier','origin','dest','crs_dep_time','dep_weather'], axis=1)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x = X2['distance'].to_numpy().reshape(-1,1)

scaler.fit(x)

x = scaler.transform(x)

X2['distance'] = x

X2

Unnamed: 0,distance,Rain,Snow,Sunny,carrier_cat,origin_cat,dest_cat
0,-0.316546,1,0,0,3,171,115
1,-1.020650,0,0,1,0,41,31
2,-0.968750,0,0,1,0,139,152
3,0.257809,0,0,1,3,124,8
4,0.380638,0,0,0,1,173,165
...,...,...,...,...,...,...,...
1464,-0.664273,1,0,0,8,105,87
1465,-0.188527,0,0,0,8,51,19
1466,0.583046,0,0,1,0,158,36
1467,0.062321,0,0,1,3,179,8


In [121]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20)

In [122]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2, test_size=0.20)

In [123]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10, num_parallel_tree=10)

data_dmatrix = xgb.DMatrix(data=X,label=y, enable_categorical=True)

xg_reg.fit(X,y)

y_pred = xg_reg.predict(X_test)

r2_score(y_test, y_pred)

0.08743216914434115

In [124]:
########################## USING ALTERNATIVE DATASET ###############################
y2_pred = xg_reg.predict(X2)

r2_score(y2, y2_pred)

-0.008365927991909938

In [105]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10, 'num_parallel_tree': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=5,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True)

cv_results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,50.803926,2.801615,49.9619,11.256005
1,50.441279,2.818949,49.908072,11.268166
2,50.067065,2.847242,49.878081,11.274499
3,49.740167,2.862554,49.835502,11.30003
4,49.390332,2.831378,49.80251,11.313263


In [125]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree=1.0, learning_rate=0.1,
                          subsample=0.8, min_child_weight=5, max_depth=20, gamma=0.5, alpha=10, n_estimators=100)

data_dmatrix = xgb.DMatrix(data=X,label=y, enable_categorical=True)

xg_reg.fit(X,y)

y_pred = xg_reg.predict(X_test)

r2_score(y_test, y_pred)

0.9239558941989927

In [126]:
########################## USING ALTERNATIVE DATASET ###############################
y2_pred = xg_reg.predict(X2)

r2_score(y2, y2_pred)

-0.17023106385995468

In [127]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.2, learning_rate = 0.1,
                max_depth = 9, alpha = 10, n_estimators = 150, num_parallel_tree=10, subsample=0.8)

data_dmatrix = xgb.DMatrix(data=X,label=y, enable_categorical=True)

xg_reg.fit(X,y)

y_pred = xg_reg.predict(X_test)

r2_score(y_test, y_pred)

0.4918773067664507

In [128]:
########################## USING ALTERNATIVE DATASET ###############################
y2_pred = xg_reg.predict(X2)

r2_score(y2, y2_pred)

-0.5163264465746273

In [129]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.2, gamma=1, learning_rate = 0.1,
                max_depth = 2, alpha = 10, n_estimators = 110, num_parallel_tree=22, subsample=0.8)

data_dmatrix = xgb.DMatrix(data=X,label=y, enable_categorical=True)

xg_reg.fit(X,y)

y_pred = xg_reg.predict(X_test)

r2_score(y_test, y_pred)

0.04857336542736679

In [130]:
########################## USING ALTERNATIVE DATASET ###############################
y2_pred = xg_reg.predict(X2)

r2_score(y2, y2_pred)

-0.015866290410343042

((1469,), (1469,))

In [133]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror')

xg_reg.fit(X,y)

y_pred = xg_reg.predict(X_test)

r2_score(y_test, y_pred)

0.9365416256752392

In [134]:
########################## USING ALTERNATIVE DATASET ###############################
y2_pred = xg_reg.predict(X2)

r2_score(y2, y2_pred)

-0.3212078202920472

In [116]:
test

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,AA,AA,AA,1149,AA,N150NN,1149,11540,ELP,"El Paso, TX",11298,DFW,"Dallas/Fort Worth, TX",611,858,N,107,1,551
1,2020-01-07,UA,UA_CODESHARE,UA,3472,YX,N724YX,3472,11618,EWR,"Newark, NJ",11986,GRR,"Grand Rapids, MI",2129,2349,N,140,1,605
2,2020-01-01,AA,AA_CODESHARE,AA,3465,MQ,N922AE,3465,11298,DFW,"Dallas/Fort Worth, TX",11982,GRK,"Killeen, TX",1849,1952,N,63,1,134
3,2020-01-02,WN,WN,WN,2519,WN,N446WN,2519,10423,AUS,"Austin, TX",11259,DAL,"Dallas, TX",830,930,N,60,1,189
4,2020-01-03,AA,AA,AA,2124,AA,N951UW,2124,12953,LGA,"New York, NY",11278,DCA,"Washington, DC",1300,1428,N,88,1,214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-01-02,F9,F9,F9,404,F9,N232FR,404,12892,LAX,"Los Angeles, CA",11292,DEN,"Denver, CO",1206,1544,N,158,1,862
96,2020-01-01,B6,B6,B6,2534,B6,N192JB,2534,12478,JFK,"New York, NY",10785,BTV,"Burlington, VT",735,850,N,75,1,266
97,2020-01-04,DL,DL,DL,1442,DL,N912DE,1442,10397,ATL,"Atlanta, GA",12264,IAD,"Washington, DC",1725,1903,N,98,1,534
98,2020-01-07,DL,DL,DL,1808,DL,N932AT,1808,12191,HOU,"Houston, TX",10397,ATL,"Atlanta, GA",1845,2145,N,120,1,696
