In [49]:
import numpy as np 
import pandas as pd

In [50]:
import seaborn as sns
import matplotlib.pyplot as plt

In [88]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning Libraries
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Evaluation Libraries
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score

# Deep Learning Libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.optimizers import Adam

# Hyperparameter Optimization Libraries
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import keras_tuner as kt

In [52]:
df=pd.read_excel("Flight_Fare.xlsx")

In [53]:
pd.set_option("display.max_columns",None)

In [54]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [56]:
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')

In [57]:
df['Journey_day'] = df['Date_of_Journey'].dt.day
df['Journey_month'] = df['Date_of_Journey'].dt.month
df['Journey_year'] = df['Date_of_Journey'].dt.year

In [58]:
df['Journey_year'].unique()

array([2019], dtype=int32)

In [59]:
df.drop(columns=['Date_of_Journey','Journey_year'], inplace=True)

In [60]:
df

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_month
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107,9,4
10679,Air India,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145,27,4
10680,Jet Airways,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229,27,4
10681,Vistara,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648,1,3


In [61]:
df['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
      dtype=object)

In [62]:
df['Total_Stops'].isna().sum()

np.int64(1)

In [63]:
df['Total_Stops'].fillna(df['Total_Stops'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Total_Stops'].fillna(df['Total_Stops'].mode()[0], inplace=True)


In [64]:
df['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [65]:
df['Total_Stops'] = df['Total_Stops'].map({'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4})

In [66]:
df

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_month
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,3897,24,3
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,1,5
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No info,13882,9,6
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,12,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,0,No info,4107,9,4
10679,Air India,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,0,No info,4145,27,4
10680,Jet Airways,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,0,No info,7229,27,4
10681,Vistara,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,0,No info,12648,1,3


In [67]:
# fill missing values just in case
df["Duration"] = df["Duration"].fillna("0h 0m")

# extract hours and minutes
df["Duration_hours"] = df["Duration"].str.extract(r"(\d+)h").fillna(0).astype(int)
df["Duration_minutes"] = df["Duration"].str.extract(r"(\d+)m").fillna(0).astype(int)

# convert to total minutes
df["Duration_Mins"] = df["Duration_hours"] * 60 + df["Duration_minutes"]

# drop helper columns
df.drop(["Duration_hours", "Duration_minutes", "Duration"], axis=1, inplace=True)


In [68]:
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Total_Stops,Additional_Info,Price,Journey_day,Journey_month,Duration_Mins
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,0,No info,3897,24,3,170
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,2,No info,7662,1,5,445
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,2,No info,13882,9,6,1140
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,1,No info,6218,12,5,325
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,1,No info,13302,1,3,285


In [69]:
df['Dep_Time'] = pd.to_datetime(df['Dep_Time'], format='%H:%M')

In [70]:
df['Dep_hour']=df['Dep_Time'].dt.hour
df['Dep_min']=df['Dep_Time'].dt.minute

df.drop(['Dep_Time'],axis=1,inplace=True)

In [71]:
df["Arrival_Time"] = pd.to_datetime(df["Arrival_Time"],errors="coerce",infer_datetime_format=True)


  df["Arrival_Time"] = pd.to_datetime(df["Arrival_Time"],errors="coerce",infer_datetime_format=True)
  df["Arrival_Time"] = pd.to_datetime(df["Arrival_Time"],errors="coerce",infer_datetime_format=True)


In [72]:
df['Arrival_hour']=df['Arrival_Time'].dt.hour
df['Arrival_min']=df['Arrival_Time'].dt.minute
df["Arrival_Day"] = df["Arrival_Time"].dt.day

df["Arrival_Day_Diff"] = df['Duration_Mins'] // (24*60)


df.drop(['Arrival_Time'],axis=1,inplace=True)


In [73]:
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Journey_day,Journey_month,Duration_Mins,Dep_hour,Dep_min,Arrival_hour,Arrival_min,Arrival_Day,Arrival_Day_Diff
0,IndiGo,Banglore,New Delhi,BLR → DEL,0,No info,3897,24,3,170,22,20,1,10,22,0
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2,No info,7662,1,5,445,5,50,13,15,15,0
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2,No info,13882,9,6,1140,9,25,4,25,10,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1,No info,6218,12,5,325,18,5,23,30,15,0
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1,No info,13302,1,3,285,16,50,21,35,15,0


In [74]:
df["Route_Legs"] = df["Route"].str.count("→") + 1
df["Route_Legs"] = df["Route_Legs"].fillna(1).astype(int)

df["Main_Layover"] = (df["Route"].fillna("Direct").str.split("→").apply(lambda x: x[1] if len(x) > 2 else "Direct"))


df.drop(['Route'], axis=1, inplace=True)


In [75]:
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Journey_day,Journey_month,Duration_Mins,Dep_hour,Dep_min,Arrival_hour,Arrival_min,Arrival_Day,Arrival_Day_Diff,Route_Legs,Main_Layover
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,170,22,20,1,10,22,0,2,Direct
1,Air India,Kolkata,Banglore,2,No info,7662,1,5,445,5,50,13,15,15,0,4,IXR
2,Jet Airways,Delhi,Cochin,2,No info,13882,9,6,1140,9,25,4,25,10,0,4,LKO
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,325,18,5,23,30,15,0,3,NAG
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,285,16,50,21,35,15,0,3,NAG


In [76]:
top_layovers = df["Main_Layover"].value_counts().nlargest(5).index

df["Main_Layover"] = df["Main_Layover"].where(
    df["Main_Layover"].isin(top_layovers),
    "Other"
)


In [77]:
cat_cols = ['Airline', 'Source', 'Destination', 'Main_Layover']

df = pd.get_dummies(df, columns=cat_cols, drop_first=True,dtype=int)

In [78]:
df.head()

Unnamed: 0,Total_Stops,Additional_Info,Price,Journey_day,Journey_month,Duration_Mins,Dep_hour,Dep_min,Arrival_hour,Arrival_min,Arrival_Day,Arrival_Day_Diff,Route_Legs,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi,Main_Layover_ BOM,Main_Layover_ DEL,Main_Layover_ HYD,Main_Layover_Direct,Main_Layover_Other
0,0,No info,3897,24,3,170,22,20,1,10,22,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,2,No info,7662,1,5,445,5,50,13,15,15,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
2,2,No info,13882,9,6,1140,9,25,4,25,10,0,4,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1
3,1,No info,6218,12,5,325,18,5,23,30,15,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,1,No info,13302,1,3,285,16,50,21,35,15,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [79]:
df['Additional_Info'].value_counts()

Additional_Info
No info                         8345
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
1 Short layover                    1
Red-eye flight                     1
2 Long layover                     1
Name: count, dtype: int64

In [80]:
df.drop(['Additional_Info'], axis=1, inplace=True)

In [81]:
num_cols = df.select_dtypes(exclude=["object", "category"]).columns
print(num_cols)

Index(['Total_Stops', 'Price', 'Journey_day', 'Journey_month', 'Duration_Mins',
       'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min', 'Arrival_Day',
       'Arrival_Day_Diff', 'Route_Legs', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata', 'Destination_New Delhi', 'Main_Layover_ BOM ',
       'Main_Layover_ DEL ', 'Main_Layover_ HYD ', 'Main_Layover_Direct',
       'Main_Layover_Other'],
      dtype='object')


In [82]:
num_cols = [
    "Duration_Mins",
    "Journey_day",
    "Journey_month",
    "Dep_hour",
    "Dep_min",
    "Arrival_hour",
    "Arrival_min",
    "Arrival_Day"
]

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [83]:
x = df.drop('Price', axis=1)
y = df['Price']

In [84]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
def adjusted_r2(r2,n,p):
    return 1-((1-r2)*(n-1)/(n-p-1))

In [91]:
n = x_test.shape[0]      # number of rows
p = x_test.shape[1]      # number of features

In [None]:
lr = LinearRegression()

lr.fit(x_train,y_train)

y_pred=lr.predict(x_test)

print("MAE:",mean_absolute_error(y_test,y_pred))
print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pred)))
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)
print("Adjusted R2:", adjusted_r2(r2, n, p))


MAE: 1924.265632493961
RMSE: 2760.637232128373
R2 Score: 0.6399012402796771
Adjusted R2: 0.6337281186844717


In [95]:
svr = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)

svr.fit(x_train, y_train)

y_pred = svr.predict(x_test)


print("MAE:",mean_absolute_error(y_test,y_pred))
print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pred)))
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)
print("Adjusted R2:", adjusted_r2(r2, n, p))

MAE: 1817.286253988839
RMSE: 2978.315526298861
R2 Score: 0.5808742392207626
Adjusted R2: 0.5736892261788329


In [96]:
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1
)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

print("MAE:",mean_absolute_error(y_test,y_pred))
print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pred)))
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)
print("Adjusted R2:", adjusted_r2(r2, n, p))

MAE: 1245.9492095281837
RMSE: 1962.1544042571186
R2 Score: 0.8180846682509058
Adjusted R2: 0.8149661197066356


In [97]:
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

xgb.fit(x_train, y_train)

y_pred = xgb.predict(x_test)

print("MAE:",mean_absolute_error(y_test,y_pred))
print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pred)))
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)
print("Adjusted R2:", adjusted_r2(r2, n, p))

MAE: 1148.09521484375
RMSE: 1739.551738236032
R2 Score: 0.8570192456245422
Adjusted R2: 0.8545681469781058


In [98]:
catboost = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    loss_function="RMSE",
    random_seed=42,
    verbose=0
)

catboost.fit(x_train, y_train)

y_pred = catboost.predict(x_test)

print("MAE:",mean_absolute_error(y_test,y_pred))
print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pred)))
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)
print("Adjusted R2:", adjusted_r2(r2, n, p))

MAE: 1248.4819047772571
RMSE: 1807.4850895524817
R2 Score: 0.845633737062353
Adjusted R2: 0.8429874582691361


In [99]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)

model.fit(x_train, y_train, epochs=100, validation_data=(x_test, y_test),verbose=0)

y_pred = model.predict(x_test)

print("MAE:",mean_absolute_error(y_test,y_pred))
print("RMSE:",np.sqrt(mean_squared_error(y_test,y_pred)))
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)
print("Adjusted R2:", adjusted_r2(r2, n, p))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
MAE: 1326.706787109375
RMSE: 1915.0477800827844
R2 Score: 0.8267145156860352
Adjusted R2: 0.82374390738351
