In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from math import sqrt

# Load the raw training sheet, remove one row by label, renumber the index,
# and discard the 'Additional_Info' column.
# NOTE(review): row label 9039 is dropped with no explanation in the notebook —
# presumably a bad/incomplete record; confirm it is still the right row if the
# spreadsheet is ever regenerated.
data = (
    pd.read_excel('/AirplaneDataset/Data_Train.xlsx')
    .drop(index=9039)
    .reset_index(drop=True)
    .drop('Additional_Info', axis=1)
)

# Replace the textual stop labels in 'Total_Stops' with their integer counts.
stop_counts = {
    'non-stop': 0,
    '1 stop': 1,
    '2 stops': 2,
    '3 stops': 3,
    '4 stops': 4,
}
data['Total_Stops'] = data['Total_Stops'].map(stop_counts)

# Split 'Date_of_Journey' (parsed day-first) into day-of-month and month
# columns; the original date column is not kept.
journey_date = pd.to_datetime(data['Date_of_Journey'], dayfirst=True)
data['Journey_Day'] = journey_date.dt.day
data['Journey_Month'] = journey_date.dt.month

data = data.drop(columns="Date_of_Journey")

#Change duration into minutes only

def convert_duration(x):
    """Convert a duration string such as '2h 50m', '19h' or '45m' to minutes.

    Parameters
    ----------
    x : str
        Text holding an integer before an 'h' and/or an integer before an 'm'.
        Surrounding whitespace is ignored.

    Returns
    -------
    int
        Hours * 60 + minutes; a missing component counts as 0.

    Raises
    ------
    ValueError
        If the text before 'h' or 'm' is not an integer.
    """
    total_minutes = 0
    rest = x.strip()

    if 'h' in rest:
        # Keep only the piece directly after the first 'h' for the minutes step.
        hour_text, rest = rest.split('h')[:2]
        total_minutes += 60 * int(hour_text.strip())
    if 'm' in rest:
        total_minutes += int(rest.split('m')[0].strip())

    return total_minutes

# Express the flight duration in whole minutes and rename the column to say so.
data['Duration'] = data['Duration'].map(convert_duration)
data = data.rename(columns={'Duration': 'Duration (min)'})

# Split the 'HH:MM' departure time into departure hour and departure minute;
# the raw text column is not kept.
dep_clock = pd.to_datetime(data['Dep_Time'], format='%H:%M')
data['Dep_Hour'] = dep_clock.dt.hour
data['Dep_Minute'] = dep_clock.dt.minute
data = data.drop(columns='Dep_Time')

def process_arrival_time(row):
    """Parse one arrival-time cell into (hour, minute, next_day).

    Parameters
    ----------
    row : str
        Cell text whose first space-separated token is 'HH:MM'. Further tokens
        may follow (presumably a date such as '22 Mar' — confirm against the data).

    Returns
    -------
    tuple[int, int, int]
        Hour, minute, and a flag that is 1 when the cell has three or more
        space-separated tokens and 0 otherwise.

    Raises
    ------
    ValueError
        If the first token is not two integers separated by ':'.
    """
    tokens = row.split(' ')
    clock = tokens[0].strip()

    hour, minute = (int(part) for part in clock.split(':'))
    next_day = 1 if len(tokens) >= 3 else 0

    return hour, minute, next_day
   
# Derive the arrival features from the raw 'Arrival_Time' text in one pass.
# process_arrival_time returns (hour, minute, next_day) per row; zip(*...)
# transposes those tuples into one sequence per output column.
arrival_parts = data['Arrival_Time'].apply(process_arrival_time)
data['Arrival_Hour'], data['Arrival_Minute'], data['Arrive_Next_Day'] = zip(*arrival_parts)

# Range check: plain int() parsing would silently accept a clock such as '25:99'.
assert data['Arrival_Hour'].between(0, 23).all(), "Arrival hour outside 0-23"
assert data['Arrival_Minute'].between(0, 59).all(), "Arrival minute outside 0-59"

# The raw text is no longer needed once the three columns exist.
data.drop(columns='Arrival_Time', inplace=True)

# Cyclical encoding of the arrival and departure clock times: convert each to
# minutes since midnight, then map that onto a 1440-minute circle with cos/sin.
for minutes_col, hour_col, minute_col, cos_col, sin_col in (
    ('Arrival_Since_Midnight', 'Arrival_Hour', 'Arrival_Minute', 'Arr_Cos', 'Arr_Sin'),
    ('Dep_Time_Since_Midnight', 'Dep_Hour', 'Dep_Minute', 'Dep_Cos', 'Dep_Sin'),
):
    data[minutes_col] = data[hour_col]*60 + data[minute_col]
    day_angle = 2*np.pi * data[minutes_col]/1440
    data[cos_col] = np.cos(day_angle)
    data[sin_col] = np.sin(day_angle)

# Feature columns, grouped by the preprocessing they receive.
# NOTE(review): 'Dep_Hour' and 'Dep_Minute' are in neither list, so they are
# handled by ColumnTransformer's default `remainder` setting — confirm intended.
categorical_cols = ['Route', 'Destination', 'Source', 'Airline']
numerical_cols = [
    'Total_Stops', 'Journey_Day', 'Journey_Month',
    'Duration (min)', 'Arrival_Hour', 'Arrival_Minute',
    'Dep_Time_Since_Midnight', 'Dep_Cos', 'Dep_Sin', 'Arrival_Since_Midnight',
    'Arr_Cos', 'Arr_Sin', 'Arrive_Next_Day',
]

# One-hot encode categoricals; categories unseen at fit time are ignored.
cat_pipeline = Pipeline(steps=[
    ('onehotencoder', OneHotEncoder(handle_unknown="ignore")),
])

# Standardise the numeric columns.
num_pipeline = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
])

preprocesser = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols),
])

# Full model: preprocessing followed by a seeded random-forest regressor.
# The step names are what the 'regressor__...' grid-search keys refer to.
model = Pipeline(steps=[
    ('preprocessing', preprocesser),
    ('regressor', RandomForestRegressor(random_state=42)),
])

         
# Target is the natural log of 'Price'; features are every remaining column.
y = np.log(data['Price'].values)
X = data.drop(columns='Price')

KeyboardInterrupt: 

In [None]:
# Grid search over the random-forest hyperparameters.
# Target/features are rebuilt here so the cell does not depend on the previous
# cell having finished.
y = np.log(data['Price'].values)
X = data.drop(columns='Price')

# Hyperparameter values to try; keys address the pipeline's 'regressor' step.
search_space = {
    "regressor__n_estimators": [100, 200, 300],
    "regressor__max_depth": [10, 20, None],
}

# Scores are negated RMSE on the log-price scale (higher is better).
# NOTE(review): cv=5 is an integer here while the later evaluation cell uses a
# shuffled, seeded KFold — confirm the two CV schemes are meant to differ.
GS = GridSearchCV(
    model,
    search_space,
    scoring="neg_root_mean_squared_error",
    cv=5,
    verbose=1,
)
GS.fit(X, y)

print("Best parameters:", GS.best_params_)
print("Best Estimator: ", GS.best_estimator_)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters: {'regressor__max_depth': 10, 'regressor__n_estimators': 300}
Best Estimator:  Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Total_Stops', 'Journey_Day',
                                                   'Journey_Month',
                                                   'Duration (min)',
                                                   'Arrival_Hour',
                                                   'Arrival_Minute',
                                                   'Dep_Time_Since_Midnight',
                                                   'Dep_Cos', 'Dep_Sin',
                                                   'Arrival_Since_Midnight',

In [14]:
# Every grid candidate, best first. 'mean_test_score' is the negated RMSE on
# the log-price scale, so larger (closer to zero) is better.
df = pd.DataFrame(GS.cv_results_)
display(df.sort_values('mean_test_score', ascending=False))

best_model = GS.best_estimator_

# Cross-validated score of the winning candidate; sign flipped back to an RMSE.
print(f"Best CV RMSE (log price): {-GS.best_score_:.4f}")

# In-sample check only: these predictions are for the same X that GS.fit was
# given, so the RMSE below is a training-set figure and will understate the
# error on unseen data. Predictions are exponentiated back to the price scale.
y_pred_log = best_model.predict(X)
y_pred = np.exp(y_pred_log)

rmse = sqrt(mean_squared_error(np.exp(y), y_pred))
print(f"Training-set RMSE (price, optimistic): {rmse:.2f}")


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__max_depth,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,12.262789,0.280002,0.052011,0.002082,10.0,300,"{'regressor__max_depth': 10, 'regressor__n_est...",-0.175454,-0.175455,-0.172871,-0.184925,-0.184157,-0.178572,0.00497,1
1,8.174731,0.166429,0.035001,0.000426,10.0,200,"{'regressor__max_depth': 10, 'regressor__n_est...",-0.175419,-0.175778,-0.173096,-0.185014,-0.184012,-0.178664,0.004874,2
0,4.04294,0.059433,0.020057,0.000385,10.0,100,"{'regressor__max_depth': 10, 'regressor__n_est...",-0.175602,-0.175815,-0.173942,-0.185097,-0.185119,-0.179115,0.004936,3
5,32.01846,0.37495,0.096742,0.01337,20.0,300,"{'regressor__max_depth': 20, 'regressor__n_est...",-0.177072,-0.179196,-0.17393,-0.190461,-0.191126,-0.182357,0.007092,4
4,21.600833,0.25415,0.073981,0.024343,20.0,200,"{'regressor__max_depth': 20, 'regressor__n_est...",-0.177454,-0.179606,-0.17379,-0.190512,-0.191311,-0.182535,0.007092,5
3,10.723567,0.131838,0.033227,0.001136,20.0,100,"{'regressor__max_depth': 20, 'regressor__n_est...",-0.178064,-0.179559,-0.174993,-0.1905,-0.19214,-0.183051,0.00693,6
8,35.180718,0.391336,0.095166,0.001387,,300,"{'regressor__max_depth': None, 'regressor__n_e...",-0.1784,-0.180173,-0.175124,-0.191423,-0.19261,-0.183546,0.007113,7
7,23.687135,0.227778,0.073821,0.007514,,200,"{'regressor__max_depth': None, 'regressor__n_e...",-0.178915,-0.180539,-0.175128,-0.191415,-0.19262,-0.183723,0.007006,8
6,14.238988,5.500833,0.040105,0.007089,,100,"{'regressor__max_depth': None, 'regressor__n_e...",-0.17958,-0.18053,-0.176088,-0.191381,-0.193656,-0.184247,0.006951,9


1596.6119368758393


In [None]:
# Out-of-fold evaluation of the pipeline on the original price scale.
y = np.log(data['Price'].values)
X = data.drop(columns='Price')

# Shuffled, seeded 5-fold split; each row is predicted by a model that did not
# see it during fitting.
# NOTE(review): `model` carries the default RandomForestRegressor settings
# (only random_state is set), not the grid-search winner — confirm that the
# untuned pipeline is what should be evaluated here.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
log_preds = cross_val_predict(model, X, y, cv=cv)

# Fit the pipeline on all rows; the metrics below do not use this fit.
model.fit(X, y)

# Mean squared out-of-fold residual on the log scale.
resid_var_log = np.mean((y - log_preds)**2)

# Bias-corrected back-transform: add half the log-scale residual variance
# before exponentiating, then compare against the true prices.
price_preds = np.exp(log_preds + resid_var_log/2)
y_true = np.exp(y)

# Final metrics on the original scale.
rmse = sqrt(mean_squared_error(y_true, price_preds))
r2   = r2_score(y_true, price_preds)

print(f"Cross-validated residual variance (log): {resid_var_log:.4f}")
print(f"Cross-validated RMSE (price): {rmse:.2f}")
print(f"Cross-validated R² (price):   {r2:.4f}")

# Preview only; rendering the full frame floods the notebook output.
display(data.head())

# Output recorded from an earlier run:
#Cross-validated residual variance (log): 0.0594
#Cross-validated RMSE (price): 2628.71
#Cross-validated R² (price):   0.6750