---
# Baseline model 
---

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm       
import copy

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
                     
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline, make_union

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor
from tpot.builtins import StackingEstimator

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, ElasticNet, Ridge, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor

# Regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

RANDOM_STATE=42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

---
# Read the Dataset

In [2]:
org_df = pd.read_csv('AirlineDelay_CleanDataset.csv', index_col=0) 
df = org_df.copy()

display(df.head())

Unnamed: 0,Month,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,TailNum,ActualElapsedTime,Origin,Dest,TaxiIn,TaxiOut,Delay
0,1,4,1829.0,1755,1959.0,1925,WN,N464WN,9.486833,IND,BWI,1.732051,3.162278,34.0
1,1,4,1937.0,1830,2037.0,1940,WN,N763SW,15.491933,IND,LAS,1.732051,2.645751,57.0
2,1,4,1644.0,1510,1845.0,1725,WN,N334SW,11.0,IND,MCO,2.44949,2.828427,80.0
3,1,4,1452.0,1425,1640.0,1625,WN,N286WN,15.099669,IND,PHX,2.645751,2.828427,15.0
4,1,4,1323.0,1255,1526.0,1510,WN,N674AA,11.090537,IND,TPA,2.0,3.0,16.0


---
# Encoding Categorical features

In [3]:
cat_cols = list(df.select_dtypes('object').columns)

for c in cat_cols:
    df[c] = LabelEncoder().fit_transform(df[c].values)

---
# Dividing the dataset: 
### X-Class/Target column and y-Features/Attributes 

In [4]:
X = df.drop(columns=['Delay'])                                 
y = df['Delay']

---
# Train/Test Stratified Split

In [5]:
bins = [15, 60, 120, 180, 240, 300, 2462] 

#bins = [np.log(x) for x in original_bins]
y_binned = np.digitize(y, bins=bins, right=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y_binned, random_state=RANDOM_STATE)

---
# Scaling the dataset using MinMaxScaler: 

In [6]:
mms = MinMaxScaler()
X_train = pd.DataFrame(mms.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(mms.fit_transform(X_test), columns=X_test.columns)

---
# Train and evaluate the model with Baseline Regressors

In [7]:
dummy_regr = DummyRegressor(strategy="mean")
lin = LinearRegression()
dt = DecisionTreeRegressor(random_state=RANDOM_STATE)
rf_rg = RandomForestRegressor(random_state=RANDOM_STATE)

dummy_regr.fit(X_train, y_train.to_numpy().flatten())
lin.fit(X_train, y_train.to_numpy().flatten())
dt.fit(X_train, y_train.to_numpy().flatten())
rf_rg.fit(X_train, y_train.to_numpy().flatten())

y_dm = dummy_regr.predict(X_test)
y_lin = lin.predict(X_test)
y_dt = dt.predict(X_test)
y_rf = rf_rg.predict(X_test)

In [8]:
mae_dm = mean_absolute_error(y_test, y_dm)
rmse_dm = np.sqrt(mean_squared_error(y_test, y_dm))
r2_dm = r2_score(y_test, y_dm)

mae_lin = mean_absolute_error(y_test, y_lin)
rmse_lin = np.sqrt(mean_squared_error(y_test, y_lin))
r2_lin = r2_score(y_test, y_lin)

mae_dt = mean_absolute_error(y_test, y_dt)
rmse_dt = np.sqrt(mean_squared_error(y_test, y_dt))
r2_dt = r2_score(y_test, y_dt)

mae_rf = mean_absolute_error(y_test, y_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_rf))
r2_rf = r2_score(y_test, y_rf)

data = {'Metric': ['MAE', 'RMSE', 'R2'],
        'Dummy': [mae_dm, rmse_dm, r2_dm],
        'Linear': [mae_lin, rmse_lin, r2_lin],
        'DecisionTree': [mae_dt, rmse_dt, r2_dt],
        'RandomForest': [mae_rf, rmse_rf, r2_rf] 
       }
comp_df = pd.DataFrame(data, columns = ['Metric', 'Dummy', 'Linear', 'DecisionTree', 'RandomForest'])
comp_df

Unnamed: 0,Metric,Dummy,Linear,DecisionTree,RandomForest
0,MAE,40.24038,37.569098,0.632176,0.3812
1,RMSE,60.60367,58.07301,7.507603,5.777435
2,R2,-3.283945e-08,0.081771,0.984654,0.990912
