# Import Libraries

In [40]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Load Dataset

In [41]:
df = pd.read_excel("Flight-price-predication.xlsx")

In [42]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


# Feature Engineering

In [43]:
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')
df['Day'] = df['Date_of_Journey'].dt.day_name()

In [44]:
def weekdayornot(x):
    return "Weekend" if x in ["Sunday", "Saturday"] else "Weekday"
df['weekdayornot'] = df['Day'].apply(weekdayornot)


In [45]:
df.drop(['Date_of_Journey', 'Day', 'Dep_Time', 'Arrival_Time', 'Route'], axis=1, inplace=True)

df.dropna(inplace=True)

In [46]:
def con_to_mins(z):
    hrs = 0
    mins = 0
    if 'h' in z:
        hrs = int(z.split('h')[0])
    if 'm' in z:
        mins = int(z.split('m')[0].split()[-1])
    return hrs * 60 + mins

df['Duration'] = df['Duration'].apply(con_to_mins)

In [47]:
df_encoded = pd.get_dummies(df, columns=['Airline', 'Source', 'Destination', 'Total_Stops', 'Additional_Info', 'weekdayornot'], drop_first=True)

In [48]:
df_encoded.head()

Unnamed: 0,Duration,Price,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,...,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No Info,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight,weekdayornot_Weekend
0,170,3897,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
1,445,7662,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,1140,13882,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
3,325,6218,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
4,285,13302,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


# Features and Target split with Train Test Split

In [49]:
X = df_encoded.drop("Price", axis=1)
y = df_encoded["Price"]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23062005)

# Scaling

In [50]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

# Model Training [Linear Regression]

In [51]:
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

# Model Evaluation [Linear Regression]

In [52]:
r2_lr = r2_score(y_test, y_pred)
mae_lr = mean_absolute_error(y_test, y_pred)
mse_lr = mean_squared_error(y_test, y_pred)
rmse_lr = np.sqrt(mse_lr)
mape_lr = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("R² Score:", r2_lr*100)
print("MAE:", mae_lr)
print("MSE:", mse_lr)
print("RMSE:", rmse_lr)
print("MAPE:", mape_lr, "%")

R² Score: 69.93291480306496
MAE: 1708.2443404301255
MSE: 5547602.328401972
RMSE: 2355.3348654494907
MAPE: 20.886876497152343 %


# Model Training [Decision Tree]

In [53]:
dt = DecisionTreeRegressor(random_state=23062005)

param_grid = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

print(f"Best Parameters: {grid_search.best_params_}")

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
Best Parameters: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10}


# Model Evaluation [Decision Tree]

In [54]:
r2_dt = r2_score(y_test, y_pred)
mae_dt = mean_absolute_error(y_test, y_pred)
mse_dt = mean_squared_error(y_test, y_pred)
rmse_dt = np.sqrt(mse_dt)
mape_dt = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"R² Score: {r2_dt * 100:.2f}")
print(f"MAE: {mae_dt:.2f}")
print(f"MSE: {mse_dt:.2f}")
print(f"RMSE: {rmse_dt:.2f}")
print(f"MAPE: {mape_dt:.2f} %")

R² Score: 73.00
MAE: 1442.08
MSE: 4982188.27
RMSE: 2232.08
MAPE: 16.94 %
