# Regression

## Module 3

### Importing Modules

In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as skmu
from math import sqrt
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

### Dataframes

In [2]:
train = pd.read_csv("Data/TrainingData.csv", encoding = "utf-8")
test = pd.read_csv("Data/ClassifiedFlights.csv", encoding = "utf-8")
train_X = train[(train.ArrDel15 == 1)]
train_X = train[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
test_X = test[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()

In [3]:
regtrain_Y = train[['ArrDelayMinutes']].values.ravel()
regactual_Y = test[['ArrDelayMinutes']].values.ravel()

### Pipeline Regression

In [4]:
regxgb = XGBRegressor(n_estimators=200, tree_method='gpu_hist', predictor='gpu_predictor')
regxgb.fit(train_X,regtrain_Y)
regpred_Yxgb = regxgb.predict(test_X)

In [5]:
rmsexgb = sqrt(skmu.mean_squared_error(regactual_Y,regpred_Yxgb))
maexgb = skmu.mean_absolute_error(regactual_Y,regpred_Yxgb)
r2xgb = skmu.r2_score(regactual_Y,regpred_Yxgb)
print("XGBoost Regressor Metrics: \n RMSE: ", rmsexgb, "\n MAE: ", maexgb, "\n R2: ",r2xgb)

XGBoost Regressor Metrics: 
 RMSE:  31.789288422506043 
 MAE:  14.954194929839565 
 R2:  0.8409080392897866


In [6]:
reglin = LinearRegression()
reglin.fit(train_X,regtrain_Y)
regpred_Ylin = reglin.predict(test_X)

In [7]:
rmselin = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ylin))
maelin = skmu.mean_absolute_error(regactual_Y,regpred_Ylin)
r2lin = skmu.r2_score(regactual_Y,regpred_Ylin)
print("Linear Regressor Metrics: \n RMSE: ", rmselin, "\n MAE: ", maelin, "\n R2: ",r2lin)

Linear Regressor Metrics: 
 RMSE:  18.34550577358906 
 MAE:  13.517815141576698 
 R2:  0.9470158465478425


In [8]:
regtrees = DecisionTreeRegressor(random_state = 0)
regtrees.fit(train_X, regtrain_Y)
regpred_Ytree = regtrees.predict(test_X)

In [9]:
rmsetree = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ytree))
maetree = skmu.mean_absolute_error(regactual_Y,regpred_Ytree)
r2tree = skmu.r2_score(regactual_Y,regpred_Ytree)
print("Decision Tree Regressor Metrics: \n RMSE: ", rmsetree, "\n MAE: ", maetree, "\n R2: ",r2tree)

Decision Tree Regressor Metrics: 
 RMSE:  25.51054043452357 
 MAE:  18.370749911147968 
 R2:  0.8975467029123088


In [10]:
regada = AdaBoostRegressor(n_estimators = 100)
regada.fit(train_X, regtrain_Y)
regpred_Yada = regada.predict(test_X)

In [11]:
rmseada = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Yada))
maeada = skmu.mean_absolute_error(regactual_Y,regpred_Yada)
r2ada = skmu.r2_score(regactual_Y,regpred_Yada)
print("AdaBoost Regressor Metrics: \n RMSE: ", rmseada, "\n MAE: ", maeada, "\n R2: ",r2ada)

AdaBoost Regressor Metrics: 
 RMSE:  23.32483256914232 
 MAE:  17.834550546876326 
 R2:  0.9143507250573909


### Testing Original Test Dataset

In [12]:
testdata = pd.read_csv("Data/TestData.csv", encoding = 'utf-8')
testdata = testdata[(testdata.ArrDel15 == 1)]
test_X = testdata[[ 'Quarter' ,'Year' ,'Month' , 'DayofMonth','OriginAirportID','CRSDepTime', 'DestAirportID', 'DepDelayMinutes','CRSArrTime', 'windspeedKmph_x','winddirDegree_x','precipMM_x','visibility_x','pressure_x','cloudcover_x','DewPointF_x','WindGustKmph_x','humidity_x','windspeedKmph_y','winddirDegree_y','precipMM_y','visibility_y','pressure_y','cloudcover_y','DewPointF_y','WindGustKmph_y','humidity_y']].to_numpy()
regactual_Y = testdata[['ArrDelayMinutes']].values.ravel()

In [13]:
regpred_Yxgb = regxgb.predict(test_X)
rmsexgb = sqrt(skmu.mean_squared_error(regactual_Y,regpred_Yxgb))
maexgb = skmu.mean_absolute_error(regactual_Y,regpred_Yxgb)
r2xgb = skmu.r2_score(regactual_Y,regpred_Yxgb)
print("XGBoost Regressor Metrics: \n RMSE: ", rmsexgb, "\n MAE: ", maexgb, "\n R2: ",r2xgb)

XGBoost Regressor Metrics: 
 RMSE:  30.980396933189855 
 MAE:  16.45789282394679 
 R2:  0.8164368310493996


In [14]:
regpred_Ylin = reglin.predict(test_X)
rmselin = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ylin))
maelin = skmu.mean_absolute_error(regactual_Y,regpred_Ylin)
r2lin = skmu.r2_score(regactual_Y,regpred_Ylin)
print("Linear Regressor Metrics: \n RMSE: ", rmselin, "\n MAE: ", maelin, "\n R2: ",r2lin)

Linear Regressor Metrics: 
 RMSE:  20.896859369161824 
 MAE:  15.129629768949584 
 R2:  0.9164832412925407


In [15]:
regpred_Ytree = regtrees.predict(test_X)
rmsetree = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Ytree))
maetree = skmu.mean_absolute_error(regactual_Y,regpred_Ytree)
r2tree = skmu.r2_score(regactual_Y,regpred_Ytree)
print("Decision Tree Regressor Metrics: \n RMSE: ", rmsetree, "\n MAE: ", maetree, "\n R2: ",r2tree)

Decision Tree Regressor Metrics: 
 RMSE:  26.053785631334502 
 MAE:  19.328910463861924 
 R2:  0.8701765153978249


In [16]:
regpred_Yada = regada.predict(test_X)
rmseada = sqrt(skmu.mean_squared_error(regactual_Y, regpred_Yada))
maeada = skmu.mean_absolute_error(regactual_Y,regpred_Yada)
r2ada = skmu.r2_score(regactual_Y,regpred_Yada)
print("AdaBoost Regressor Metrics: \n RMSE: ", rmseada, "\n MAE: ", maeada, "\n R2: ",r2ada)

AdaBoost Regressor Metrics: 
 RMSE:  21.343414212609193 
 MAE:  15.184455596944632 
 R2:  0.9128756849923996


### Metrics

In [None]:
"""
From observing the metrics of the regressors and testing the test 
data model;
we can say that Linear Regressor is the suitable regressor model for the dataset
"""