# Regression Testing

### Importing Modules

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as skmu
from math import sqrt
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

### Datasets

In [2]:
train = pd.read_csv("Data/TrainingData.csv", encoding = "utf-8")
test = pd.read_csv("Data/TestData.csv", encoding = "utf-8")
train_X = train[(train.ArrDel15 == 1)]
train_X = train[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()

In [3]:
regtrain_Y = train[['ArrDelayMinutes']].values.ravel()

### Regressors

In [5]:
#regxgb = XGBRegressor(n_estimators=200, tree_method='gpu_hist', predictor='gpu_predictor')
#regxgb.fit(train_X,regtrain_Y)
reglin = LinearRegression()
reglin.fit(train_X,regtrain_Y)
regtrees = DecisionTreeRegressor(random_state = 0)
regtrees.fit(train_X, regtrain_Y)
regada = AdaBoostRegressor(n_estimators = 100)
regada.fit(train_X, regtrain_Y)

AdaBoostRegressor(n_estimators=100)

In [7]:
def regressor(test_X, regactual_Y):
    # Linear Regressor
    regpred_Ylin = reglin.predict(test_X)
    # Linear Regressor Metrics
    rmselin = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ylin))
    maelin = skmu.mean_absolute_error(regactual_Y,regpred_Ylin)
    r2lin = skmu.r2_score(regactual_Y,regpred_Ylin)
    print("Linear Regressor Metrics: \n RMSE: ", rmselin, "\n MAE: ", maelin, "\n R2: ",r2lin)
    # Decision Trees Regressor
    regpred_Ytree = regtrees.predict(test_X)
    # Decision Trees Regressor Metrics
    rmsetree = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ytree))
    maetree = skmu.mean_absolute_error(regactual_Y,regpred_Ytree)
    r2tree = skmu.r2_score(regactual_Y,regpred_Ytree)
    print("Decision Tree Regressor Metrics: \n RMSE: ", rmsetree, "\n MAE: ", maetree, "\n R2: ",r2tree)
    # AdaBoost Regressor
    regpred_Yada = regada.predict(test_X)
    # AdaBoost Regressor Metrics
    rmseada = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Yada))
    maeada = skmu.mean_absolute_error(regactual_Y,regpred_Yada)
    r2ada = skmu.r2_score(regactual_Y,regpred_Yada)
    print("AdaBoost Regressor Metrics: \n RMSE: ", rmseada, "\n MAE: ", maeada, "\n R2: ",r2ada)

### Grouping Test Data

In [8]:
test1 = test[(test.ArrDelayMinutes >= 15 ) & (test.ArrDelayMinutes < 100)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

Linear Regressor Metrics: 
 RMSE:  19.058773564475565 
 MAE:  14.586950360718086 
 R2:  0.24340648665786857
Decision Tree Regressor Metrics: 
 RMSE:  23.88825592482409 
 MAE:  18.314297894424673 
 R2:  -0.188616074863303
AdaBoost Regressor Metrics: 
 RMSE:  18.903446622990188 
 MAE:  14.946146459278472 
 R2:  0.2556885436289035


In [11]:
test1 = test[(test.ArrDelayMinutes >= 100 ) & (test.ArrDelayMinutes < 200)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

Linear Regressor Metrics: 
 RMSE:  26.657509426485255 
 MAE:  16.748542477401337 
 R2:  0.009426235176119735
Decision Tree Regressor Metrics: 
 RMSE:  33.61397227822147 
 MAE:  23.717281272595805 
 R2:  -0.5750246452517571
AdaBoost Regressor Metrics: 
 RMSE:  30.0169056815398 
 MAE:  22.07450295951907 
 R2:  -0.2559707341848476


In [12]:
test1 = test[(test.ArrDelayMinutes >= 200 ) & (test.ArrDelayMinutes < 500)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

Linear Regressor Metrics: 
 RMSE:  32.9860852368298 
 MAE:  20.592734207308485 
 R2:  0.7805984982947547
Decision Tree Regressor Metrics: 
 RMSE:  38.94772616128351 
 MAE:  26.308457711442784 
 R2:  0.6941262146357356
AdaBoost Regressor Metrics: 
 RMSE:  36.17352140217232 
 MAE:  24.997709830188874 
 R2:  0.736148468905288


In [13]:
test1 = test[(test.ArrDelayMinutes >= 500 ) & (test.ArrDelayMinutes < 1000)]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

Linear Regressor Metrics: 
 RMSE:  30.96854280790588 
 MAE:  26.84833593761018 
 R2:  0.9608377147111434
Decision Tree Regressor Metrics: 
 RMSE:  29.371660566683058 
 MAE:  24.194444444444443 
 R2:  0.9647723650734056
AdaBoost Regressor Metrics: 
 RMSE:  57.458932967070936 
 MAE:  48.75010564235896 
 R2:  0.8651839498034655


In [14]:
test1 = test[(test.ArrDelayMinutes >= 1000 )]
test_X = test1[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = test1[['ArrDelayMinutes']].values.ravel()
regressor(test_X, regactual_Y)

Linear Regressor Metrics: 
 RMSE:  29.39956429337451 
 MAE:  26.517408137346138 
 R2:  0.9674683134239053
Decision Tree Regressor Metrics: 
 RMSE:  56.71860364994893 
 MAE:  56.0 
 R2:  0.8789190409876172
AdaBoost Regressor Metrics: 
 RMSE:  61.67152689005177 
 MAE:  54.73828171699074 
 R2:  0.8568490636098318
