# XGBoost

In [1]:
import pandas as pd
import numpy as np

## Baseline Model

In [2]:
# Location of the cleaned flights sample.
# NOTE(review): absolute, machine-specific path -- the notebook will not run on
# another machine without editing this line.
path = '/Users/reneehall/Documents/Learning/lighthouse_labs/coursework-lighthouse-labs/Mid-term project/colab_data/'

# Columns that are not known before a flight departs (delay breakdowns,
# departure delay, cancellation details) and so must not be used as features.
# NOTE(review): the head() output further down still lists arr_time,
# actual_elapsed_time and air_time -- confirm whether those belong here too.
to_drop = [
    'cancellation_code',
    'carrier_delay',
    'weather_delay',
    'nas_delay',
    'security_delay',
    'late_aircraft_delay',
    'first_dep_time',
    'total_add_gtime',
    'longest_add_gtime',
    'dep_delay',
    'cancelled',
]

# Read the CSV and strip the excluded columns in one pass.
fl_df = pd.read_csv(path + 'cleaned_flights_sample.csv').drop(columns=to_drop)

#### Data Transformation

In [7]:
# Turn every non-numeric column into a numeric one so the model can consume it.
import datetime as dt

from sklearn import preprocessing

# Flight date -> proleptic Gregorian ordinal (an integer day number, not a Unix
# timestamp). The result is stored in a new `timestamp` column and the original
# `fl_date` column is removed.
flight_dates = pd.to_datetime(fl_df['fl_date'])
fl_df = (
    fl_df
    .assign(timestamp=flight_dates.map(dt.datetime.toordinal))
    .drop(columns=['fl_date'])
)

# Integer-encode each remaining object-dtype column. The single encoder is
# re-fitted per column, so after the loop it only remembers the last column.
label_encoder = preprocessing.LabelEncoder()
cat_vars = list(fl_df.select_dtypes(include='object').columns)
for col in cat_vars:
    fl_df[col] = label_encoder.fit_transform(fl_df[col])

fl_df.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,...,arr_time,arr_delay,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,timestamp
0,8,15,10,4052,10,4925,2542,12954,206,283,...,1650.0,-16.0,0.0,0,84.0,90.0,67.0,1.0,528.0,736993
1,3,12,8,2733,8,5882,351,13930,257,314,...,1207.0,-19.0,0.0,0,215.0,148.0,116.0,1.0,404.0,736779
2,10,11,7,5531,19,1818,6266,10868,61,298,...,1853.0,14.0,0.0,0,114.0,81.0,67.0,1.0,93.0,737096
3,3,5,3,629,5,5499,4607,14869,332,17,...,613.0,-7.0,0.0,0,188.0,207.0,186.0,1.0,315.0,737374
4,10,15,10,296,14,2264,4535,14771,323,247,...,1138.0,25.0,0.0,0,70.0,118.0,92.0,1.0,388.0,737143


In [8]:
from sklearn.model_selection import train_test_split

# Target is the arrival delay; every other remaining column is a feature.
y = fl_df['arr_delay']
X = fl_df.drop(columns='arr_delay')

# 70/30 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=101,
)

In [11]:
import xgboost as xgb
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)

# Fit an XGBoost regressor with default hyper-parameters and predict the hold-out set.
xg = xgb.XGBRegressor()
xg.fit(X_train, y_train)
y_xg = xg.predict(X_test)

# Mean absolute error.
MAE = mean_absolute_error(y_test, y_xg)
print(f"MAE: {MAE}")

# Coefficient of determination.
R2 = r2_score(y_test, y_xg)
print(f"R2: {R2}")

# Adjusted R2: penalise R2 by the number of features relative to the number of
# test observations.
n_obs, n_features = len(y_test), X_test.shape[1]
adj_R2 = 1 - ((1 - R2) * (n_obs - 1) / (n_obs - n_features - 1))
print(f"adj_R2: {adj_R2}")

# Mean absolute percentage error.
# NOTE(review): the recorded value is ~6e14, presumably because y_test contains
# delays of exactly 0 -- MAPE is not meaningful for this target; confirm.
MAPE = mean_absolute_percentage_error(y_test, y_xg)
print(f"MAPE: {MAPE}")

MAE: 23.150331996660054
R2: 0.04716904789282683
adj_R2: 0.04701551064989129
MAPE: 609233712165259.8


## Baseline Scaled X

In [12]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column of X; the fitted scaler is kept in `scaler`
# and the scaled matrix in `Xsc`.
# NOTE(review): the scaler is fitted on the full X, i.e. before any train/test
# split -- confirm this is acceptable for the comparison being made.
scaler = StandardScaler()
scaler.fit(X)
Xsc = scaler.transform(X)

In [None]:
# Train/test split on the *scaled* features.
# The original cell split the unscaled X again, so this "scaled" experiment was
# an exact repeat of the baseline and Xsc was never used. Xsc is wrapped back
# into a DataFrame so the column names and row index of X are preserved.
Xsc_df = pd.DataFrame(Xsc, columns=X.columns, index=X.index)
X_train, X_test, y_train, y_test = train_test_split(
    Xsc_df, y, train_size=0.7, test_size=0.3, random_state=101
)

# Same default XGBoost regressor as the baseline, now fitted on scaled inputs.
# (`lr` / `y_lr` are historical names; the model is an XGBRegressor.)
lr = xgb.XGBRegressor()
lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)

# evaluate using MAE
MAE = mean_absolute_error(y_test, y_lr)
print(f"MAE: {MAE}")

# evaluate using R2
R2 = r2_score(y_test, y_lr)
print(f"R2: {R2}")

# evaluate with adjusted R2 (penalises R2 by feature count vs. test-set size)
adj_R2 = 1 - ((1 - R2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))
print(f"adj_R2: {adj_R2}")

# calculate MAPE
# NOTE(review): MAPE blows up when y_test contains zeros (the baseline run
# printed ~6e14) -- treat this number with caution.
MAPE = mean_absolute_percentage_error(y_test, y_lr)
print(f"MAPE: {MAPE}")