# Regression Testing

### Importing Modules

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as skmu
from math import sqrt
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

### Datasets

In [2]:
train = pd.read_csv("Data/TrainingData.csv", encoding = "utf-8")
test = pd.read_csv("Data/TestData.csv", encoding = "utf-8")
train_X = train[(train.ArrDel15 == 1)]
train_X = train[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()

In [3]:
regtrain_Y = train[['ArrDelayMinutes']].values.ravel()

### Regressors

In [4]:
regxgb = XGBRegressor(n_estimators=200, tree_method='gpu_hist', predictor='gpu_predictor')
regxgb.fit(train_X,regtrain_Y)
reglin = LinearRegression()
reglin.fit(train_X,regtrain_Y)
regtrees = DecisionTreeRegressor(random_state = 0)
regtrees.fit(train_X, regtrain_Y)
regada = AdaBoostRegressor(n_estimators = 100)
regada.fit(train_X, regtrain_Y)

AdaBoostRegressor(n_estimators=100)

In [5]:
def regressor(test_X, regactual_Y):
    # XGB Regressor
    regpred_Yxgb = regxgb.predict(test_X)
    # XGB Regressor Metrics
    rmsexgb = sqrt(skmu.mean_squared_error(regactual_Y,regpred_Yxgb))
    maexgb = skmu.mean_absolute_error(regactual_Y,regpred_Yxgb)
    r2xgb = skmu.r2_score(regactual_Y,regpred_Yxgb)
    print("XGBoost Regressor Metrics: \n RMSE: ", rmsexgb, "\n MAE: ", maexgb, "\n R2: ",r2xgb)
    # Linear Regressor
    regpred_Ylin = reglin.predict(test_X)
    # Linear Regressor Metrics
    rmselin = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ylin))
    maelin = skmu.mean_absolute_error(regactual_Y,regpred_Ylin)
    r2lin = skmu.r2_score(regactual_Y,regpred_Ylin)
    print("Linear Regressor Metrics: \n RMSE: ", rmselin, "\n MAE: ", maelin, "\n R2: ",r2lin)
    # Decision Trees Regressor
    regpred_Ytree = regtrees.predict(test_X)
    # Decision Trees Regressor Metrics
    rmsetree = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ytree))
    maetree = skmu.mean_absolute_error(regactual_Y,regpred_Ytree)
    r2tree = skmu.r2_score(regactual_Y,regpred_Ytree)
    print("Decision Tree Regressor Metrics: \n RMSE: ", rmsetree, "\n MAE: ", maetree, "\n R2: ",r2tree)
    # AdaBoost Regressor
    regpred_Yada = regada.predict(test_X)
    # AdaBoost Regressor Metrics
    rmseada = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Yada))
    maeada = skmu.mean_absolute_error(regactual_Y,regpred_Yada)
    r2ada = skmu.r2_score(regactual_Y,regpred_Yada)
    print("AdaBoost Regressor Metrics: \n RMSE: ", rmseada, "\n MAE: ", maeada, "\n R2: ",r2ada)

### Grouping Test Data

In [6]:
test1 = test[(test.ArrDelayMinutes >= 15 ) & (test.ArrDelayMinutes < 100)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

XGBoost Regressor Metrics: 
 RMSE:  31.531257732192447 
 MAE:  23.94673017562231 
 R2:  -1.0708817547811638
Linear Regressor Metrics: 
 RMSE:  31.887415866269865 
 MAE:  24.095272114028276 
 R2:  -1.1179288413889537
Decision Tree Regressor Metrics: 
 RMSE:  57.686806429606214 
 MAE:  37.445321767497035 
 R2:  -5.931475726800077
AdaBoost Regressor Metrics: 
 RMSE:  127.31046735478398 
 MAE:  112.15470511858071 
 R2:  -32.75988339046365


In [7]:
test1 = test[(test.ArrDelayMinutes >= 100 ) & (test.ArrDelayMinutes < 200)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

XGBoost Regressor Metrics: 
 RMSE:  108.15667112213885 
 MAE:  102.29133487879066 
 R2:  -15.30625774040858
Linear Regressor Metrics: 
 RMSE:  119.09982893081853 
 MAE:  115.82452502585483 
 R2:  -18.772881064340915
Decision Tree Regressor Metrics: 
 RMSE:  115.83929298917958 
 MAE:  101.78759942154736 
 R2:  -17.70507591259198
AdaBoost Regressor Metrics: 
 RMSE:  65.89313817097535 
 MAE:  47.406248177282116 
 R2:  -5.0524061158366225


In [8]:
test1 = test[(test.ArrDelayMinutes >= 200 ) & (test.ArrDelayMinutes < 500)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

XGBoost Regressor Metrics: 
 RMSE:  246.8514456091844 
 MAE:  232.69890831566568 
 R2:  -11.287099970752859
Linear Regressor Metrics: 
 RMSE:  265.765591088296 
 MAE:  256.010522856738 
 R2:  -13.242149673926301
Decision Tree Regressor Metrics: 
 RMSE:  242.81494510782687 
 MAE:  220.70812603648423 
 R2:  -10.888549479158597
AdaBoost Regressor Metrics: 
 RMSE:  149.13079795682233 
 MAE:  125.99853989608962 
 R2:  -3.484486545475982


In [9]:
test1 = test[(test.ArrDelayMinutes >= 500 ) & (test.ArrDelayMinutes < 1000)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

XGBoost Regressor Metrics: 
 RMSE:  702.0748447575919 
 MAE:  683.6350663105646 
 R2:  -19.127661077745678
Linear Regressor Metrics: 
 RMSE:  706.0105030186791 
 MAE:  688.8590312639049 
 R2:  -19.353954977112053
Decision Tree Regressor Metrics: 
 RMSE:  674.486028600873 
 MAE:  648.3055555555555 
 R2:  -17.576864006606105
AdaBoost Regressor Metrics: 
 RMSE:  574.5472631886174 
 MAE:  546.6214275768506 
 R2:  -12.47963107789405


In [10]:
test1 = test[(test.ArrDelayMinutes >= 1000 )]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

XGBoost Regressor Metrics: 
 RMSE:  1290.4224813911915 
 MAE:  1279.217201948166 
 R2:  -61.674175937363096
Linear Regressor Metrics: 
 RMSE:  1279.5664321638228 
 MAE:  1268.8920691555686 
 R2:  -60.62408274005251
Decision Tree Regressor Metrics: 
 RMSE:  1276.055053671275 
 MAE:  1262.5 
 R2:  -60.286329933381005
AdaBoost Regressor Metrics: 
 RMSE:  1099.508218217744 
 MAE:  1079.2816194292373 
 R2:  -44.50108479537648
