In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil import parser
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:

# Load the economy-class flight listings into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute ('/content/...', which looks like a Colab working
# directory) — adjust it when running the notebook anywhere else.
economy_df = pd.read_csv('/content/economy.csv')
economy_df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price
0,11-02-2022,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953
1,11-02-2022,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953
2,11-02-2022,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956
3,11-02-2022,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955
4,11-02-2022,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955


In [None]:

economy_df.info()
economy_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206774 entries, 0 to 206773
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        206774 non-null  object
 1   airline     206774 non-null  object
 2   ch_code     206774 non-null  object
 3   num_code    206774 non-null  int64 
 4   dep_time    206774 non-null  object
 5   from        206774 non-null  object
 6   time_taken  206774 non-null  object
 7   stop        206774 non-null  object
 8   arr_time    206774 non-null  object
 9   to          206774 non-null  object
 10  price       206774 non-null  object
dtypes: int64(1), object(10)
memory usage: 17.4+ MB


Unnamed: 0,0
date,0
airline,0
ch_code,0
num_code,0
dep_time,0
from,0
time_taken,0
stop,0
arr_time,0
to,0


In [None]:
# Parse the 'date' column and extract day, month, weekday
def clean_and_parse_date(date_str):
    """Parse one raw date string into a datetime, or return ``pd.NaT`` if it cannot be parsed.

    Parameters
    ----------
    date_str : object
        Raw cell value. Anything that is not a ``str`` (``None``, NaN, numbers, ...)
        is treated as missing.

    Returns
    -------
    datetime.datetime or pd.NaT
        The parsed date, interpreted day-first (``11-02-2022`` is 11 February 2022),
        or ``pd.NaT`` when the value is missing or unparseable.
    """
    # A missing value is never a str, so this single check covers None/NaN as well.
    if not isinstance(date_str, str):
        return pd.NaT
    # Normalise separators: drop commas and turn dots into slashes before parsing.
    cleaned = date_str.strip().replace(',', '').replace('.', '/')
    try:
        return parser.parse(cleaned, dayfirst=True)
    except (ValueError, OverflowError):
        # Only parse failures are mapped to NaT; unrelated errors (and Ctrl-C)
        # are no longer swallowed the way a bare ``except:`` would.
        return pd.NaT

# Parse every raw date string once, then derive the calendar features from the parsed values.
parsed_dates = economy_df['date'].apply(clean_and_parse_date)
economy_df['date'] = parsed_dates
for part in ('day', 'month', 'weekday'):
    # .dt.day / .dt.month / .dt.weekday (weekday: Monday=0 ... Sunday=6)
    economy_df[part] = getattr(parsed_dates.dt, part)
economy_df[['date', 'day', 'month', 'weekday']].head()

Unnamed: 0,date,day,month,weekday
0,2022-02-11,11,2,4
1,2022-02-11,11,2,4
2,2022-02-11,11,2,4
3,2022-02-11,11,2,4
4,2022-02-11,11,2,4


In [None]:
# Convert 'time_taken' into total minutes only
def convert_time_taken(x):
    """Convert a duration string such as ``'02h 10m'`` into total minutes.

    Parameters
    ----------
    x : str
        Duration text. Recognised shapes are ``'<H>h <M>m'``, ``'<H>h'`` and ``'<M>m'``.

    Returns
    -------
    int or float
        ``hours * 60 + minutes`` as an int. A string containing neither ``'h'`` nor
        ``'m'`` yields ``0``. ``np.nan`` is returned when a numeric part cannot be
        converted (e.g. ``'1.03h m'``) or when ``x`` is not a string.
    """
    try:
        has_hours = 'h' in x
        has_minutes = 'm' in x
        if has_hours and has_minutes:
            # Exactly one 'h' is expected; more than one makes the unpacking fail (-> NaN).
            hours_part, minutes_part = x.split('h')
            hours = int(hours_part.strip())
            minutes = int(minutes_part.replace('m', '').strip())
        elif has_hours:
            hours = int(x.replace('h', '').strip())
            minutes = 0
        elif has_minutes:
            hours = 0
            minutes = int(x.replace('m', '').strip())
        else:
            hours = 0
            minutes = 0
        return hours * 60 + minutes
    except (ValueError, TypeError, AttributeError):
        # ValueError: non-integer text or unexpected number of parts;
        # TypeError/AttributeError: x is not a string. Anything else propagates.
        return np.nan

# Add the flight duration in minutes. Values are cast to str first so every entry can be
# searched for 'h'/'m'; note that a missing value would become the text 'nan', which contains
# neither marker and is therefore converted to 0 minutes rather than NaN.
economy_df['duration_mins'] = economy_df['time_taken'].astype(str).apply(convert_time_taken)
economy_df[['time_taken', 'duration_mins']].head()

Unnamed: 0,time_taken,duration_mins
0,02h 10m,130.0
1,02h 20m,140.0
2,02h 10m,130.0
3,02h 15m,135.0
4,02h 20m,140.0


In [None]:
# 'price' is loaded as text (info() reports dtype object), but the regressors below need a
# numeric target. Convert it here so the notebook runs top-to-bottom on a fresh kernel.
# NOTE(review): assumes the only non-numeric characters are thousands separators (',') —
# confirm against the raw file. Anything still unparseable becomes NaN and is removed by
# the later dropna.
economy_df['price'] = pd.to_numeric(
    economy_df['price'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
).astype('float64')
economy_df

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,day,month,weekday,duration_mins
0,2022-02-11,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953.0,11,2,4,130.0
1,2022-02-11,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953.0,11,2,4,140.0
2,2022-02-11,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956.0,11,2,4,130.0
3,2022-02-11,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955.0,11,2,4,135.0
4,2022-02-11,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955.0,11,2,4,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,UK,832,07:05,Chennai,13h 50m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:55,Hyderabad,7697.0,31,3,3,830.0
206770,2022-03-31,Vistara,UK,832,07:05,Chennai,13h 50m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:55,Hyderabad,7709.0,31,3,3,830.0
206771,2022-03-31,Vistara,UK,826,12:30,Chennai,20h 35m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,09:05,Hyderabad,8640.0,31,3,3,1235.0
206772,2022-03-31,Vistara,UK,822,09:45,Chennai,23h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,09:05,Hyderabad,8640.0,31,3,3,1400.0


In [None]:
# Remove the raw 'stop' column in place. Because of inplace=True, running this cell a second
# time on the same kernel raises KeyError (the column is already gone).
economy_df.drop(['stop'], axis=1, inplace=True)

In [None]:
# Remove the carrier-code column 'ch_code' in place (the airline name column is kept).
economy_df.drop(['ch_code'], axis=1, inplace=True)

In [None]:
economy_df

Unnamed: 0,date,airline,num_code,dep_time,from,time_taken,arr_time,to,price,day,month,weekday,duration_mins
0,2022-02-11,SpiceJet,8709,18:55,Delhi,02h 10m,21:05,Mumbai,5953.0,11,2,4,130.0
1,2022-02-11,SpiceJet,8157,06:20,Delhi,02h 20m,08:40,Mumbai,5953.0,11,2,4,140.0
2,2022-02-11,AirAsia,764,04:25,Delhi,02h 10m,06:35,Mumbai,5956.0,11,2,4,130.0
3,2022-02-11,Vistara,995,10:20,Delhi,02h 15m,12:35,Mumbai,5955.0,11,2,4,135.0
4,2022-02-11,Vistara,963,08:50,Delhi,02h 20m,11:10,Mumbai,5955.0,11,2,4,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,832,07:05,Chennai,13h 50m,20:55,Hyderabad,7697.0,31,3,3,830.0
206770,2022-03-31,Vistara,832,07:05,Chennai,13h 50m,20:55,Hyderabad,7709.0,31,3,3,830.0
206771,2022-03-31,Vistara,826,12:30,Chennai,20h 35m,09:05,Hyderabad,8640.0,31,3,3,1235.0
206772,2022-03-31,Vistara,822,09:45,Chennai,23h 20m,09:05,Hyderabad,8640.0,31,3,3,1400.0


In [None]:
economy_df.isnull().sum()

Unnamed: 0,0
date,0
airline,0
num_code,0
dep_time,0
from,0
time_taken,0
arr_time,0
to,0
price,0
day,0


In [None]:
# Preview only: dropna() returns a new frame and the result is not assigned,
# so economy_df itself is unchanged by this cell.
economy_df.dropna()

Unnamed: 0,date,airline,num_code,dep_time,from,time_taken,arr_time,to,price,day,month,weekday,duration_mins
0,2022-02-11,SpiceJet,8709,18:55,Delhi,02h 10m,21:05,Mumbai,5953.0,11,2,4,130.0
1,2022-02-11,SpiceJet,8157,06:20,Delhi,02h 20m,08:40,Mumbai,5953.0,11,2,4,140.0
2,2022-02-11,AirAsia,764,04:25,Delhi,02h 10m,06:35,Mumbai,5956.0,11,2,4,130.0
3,2022-02-11,Vistara,995,10:20,Delhi,02h 15m,12:35,Mumbai,5955.0,11,2,4,135.0
4,2022-02-11,Vistara,963,08:50,Delhi,02h 20m,11:10,Mumbai,5955.0,11,2,4,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,832,07:05,Chennai,13h 50m,20:55,Hyderabad,7697.0,31,3,3,830.0
206770,2022-03-31,Vistara,832,07:05,Chennai,13h 50m,20:55,Hyderabad,7709.0,31,3,3,830.0
206771,2022-03-31,Vistara,826,12:30,Chennai,20h 35m,09:05,Hyderabad,8640.0,31,3,3,1235.0
206772,2022-03-31,Vistara,822,09:45,Chennai,23h 20m,09:05,Hyderabad,8640.0,31,3,3,1400.0


In [None]:
# Plain assignment: edf is a second name for the same DataFrame object as economy_df
# (no copy is made), so in-place changes through either name are visible through both.
edf=economy_df

In [None]:
edf.isnull().sum()

Unnamed: 0,0
date,0
airline,0
num_code,0
dep_time,0
from,0
time_taken,0
arr_time,0
to,0
price,0
day,0


In [None]:
# Drop, in place, every row that has a missing value in any column.
edf.dropna(inplace=True)

In [None]:
edf.isnull().sum()

Unnamed: 0,0
date,0
airline,0
num_code,0
dep_time,0
from,0
time_taken,0
arr_time,0
to,0
price,0
day,0


In [None]:
# Remove the flight-number column 'num_code' in place.
edf.drop(['num_code'], axis=1, inplace=True)

In [None]:
edf

Unnamed: 0,date,airline,dep_time,from,time_taken,arr_time,to,price,day,month,weekday,duration_mins
0,2022-02-11,SpiceJet,18:55,Delhi,02h 10m,21:05,Mumbai,5953.0,11,2,4,130.0
1,2022-02-11,SpiceJet,06:20,Delhi,02h 20m,08:40,Mumbai,5953.0,11,2,4,140.0
2,2022-02-11,AirAsia,04:25,Delhi,02h 10m,06:35,Mumbai,5956.0,11,2,4,130.0
3,2022-02-11,Vistara,10:20,Delhi,02h 15m,12:35,Mumbai,5955.0,11,2,4,135.0
4,2022-02-11,Vistara,08:50,Delhi,02h 20m,11:10,Mumbai,5955.0,11,2,4,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,07:05,Chennai,13h 50m,20:55,Hyderabad,7697.0,31,3,3,830.0
206770,2022-03-31,Vistara,07:05,Chennai,13h 50m,20:55,Hyderabad,7709.0,31,3,3,830.0
206771,2022-03-31,Vistara,12:30,Chennai,20h 35m,09:05,Hyderabad,8640.0,31,3,3,1235.0
206772,2022-03-31,Vistara,09:45,Chennai,23h 20m,09:05,Hyderabad,8640.0,31,3,3,1400.0


In [None]:
edf.dtypes

Unnamed: 0,0
date,datetime64[ns]
airline,object
dep_time,object
from,object
time_taken,object
arr_time,object
to,object
price,float64
day,int32
month,int32


In [None]:
# Another name for the same DataFrame object as edf; this is not a copy.
edd=edf

In [None]:
# Function to extract hour and minute from time strings
def extract_hour_minute(time_str):
    """Split an ``'HH:MM'`` string into integer hour and minute.

    Parameters
    ----------
    time_str : object
        Raw time value; surrounding whitespace is ignored.

    Returns
    -------
    tuple
        ``(hour, minute)`` as ints, or ``(np.nan, np.nan)`` when the value is not a
        string, has no ``':'``, does not have exactly two parts, or a part is not an
        integer.
    """
    missing = (np.nan, np.nan)
    if not isinstance(time_str, str) or ':' not in time_str:
        return missing
    try:
        # Unpacking forces exactly two parts; int() rejects non-numeric text.
        hour, minute = map(int, time_str.strip().split(':'))
    except ValueError:
        return missing
    return hour, minute


# apply() yields one (hour, minute) tuple per row; zip(*...) transposes those tuples into
# two sequences (all hours, all minutes), which are assigned as two new columns.
edd['dep_hour'], edd['dep_min'] = zip(*edd['dep_time'].apply(extract_hour_minute))


# Same split for the arrival time.
edd['arr_hour'], edd['arr_min'] = zip(*edd['arr_time'].apply(extract_hour_minute))




In [None]:
edd

Unnamed: 0,date,airline,dep_time,from,time_taken,arr_time,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min
0,2022-02-11,SpiceJet,18:55,Delhi,02h 10m,21:05,Mumbai,5953.0,11,2,4,130.0,18,55,21,5
1,2022-02-11,SpiceJet,06:20,Delhi,02h 20m,08:40,Mumbai,5953.0,11,2,4,140.0,6,20,8,40
2,2022-02-11,AirAsia,04:25,Delhi,02h 10m,06:35,Mumbai,5956.0,11,2,4,130.0,4,25,6,35
3,2022-02-11,Vistara,10:20,Delhi,02h 15m,12:35,Mumbai,5955.0,11,2,4,135.0,10,20,12,35
4,2022-02-11,Vistara,08:50,Delhi,02h 20m,11:10,Mumbai,5955.0,11,2,4,140.0,8,50,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,07:05,Chennai,13h 50m,20:55,Hyderabad,7697.0,31,3,3,830.0,7,5,20,55
206770,2022-03-31,Vistara,07:05,Chennai,13h 50m,20:55,Hyderabad,7709.0,31,3,3,830.0,7,5,20,55
206771,2022-03-31,Vistara,12:30,Chennai,20h 35m,09:05,Hyderabad,8640.0,31,3,3,1235.0,12,30,9,5
206772,2022-03-31,Vistara,09:45,Chennai,23h 20m,09:05,Hyderabad,8640.0,31,3,3,1400.0,9,45,9,5


In [None]:


# Drop the raw time strings now that hour/minute columns exist. edd was assigned from edf,
# which was assigned from economy_df, so all three names refer to this one object and the
# columns disappear from edd as well.
economy_df.drop(['dep_time', 'arr_time'], axis=1, inplace=True)


economy_df.head()

Unnamed: 0,date,airline,from,time_taken,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min
0,2022-02-11,SpiceJet,Delhi,02h 10m,Mumbai,5953.0,11,2,4,130.0,18,55,21,5
1,2022-02-11,SpiceJet,Delhi,02h 20m,Mumbai,5953.0,11,2,4,140.0,6,20,8,40
2,2022-02-11,AirAsia,Delhi,02h 10m,Mumbai,5956.0,11,2,4,130.0,4,25,6,35
3,2022-02-11,Vistara,Delhi,02h 15m,Mumbai,5955.0,11,2,4,135.0,10,20,12,35
4,2022-02-11,Vistara,Delhi,02h 20m,Mumbai,5955.0,11,2,4,140.0,8,50,11,10


In [None]:
edd

Unnamed: 0,date,airline,from,time_taken,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min
0,2022-02-11,SpiceJet,Delhi,02h 10m,Mumbai,5953.0,11,2,4,130.0,18,55,21,5
1,2022-02-11,SpiceJet,Delhi,02h 20m,Mumbai,5953.0,11,2,4,140.0,6,20,8,40
2,2022-02-11,AirAsia,Delhi,02h 10m,Mumbai,5956.0,11,2,4,130.0,4,25,6,35
3,2022-02-11,Vistara,Delhi,02h 15m,Mumbai,5955.0,11,2,4,135.0,10,20,12,35
4,2022-02-11,Vistara,Delhi,02h 20m,Mumbai,5955.0,11,2,4,140.0,8,50,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,Chennai,13h 50m,Hyderabad,7697.0,31,3,3,830.0,7,5,20,55
206770,2022-03-31,Vistara,Chennai,13h 50m,Hyderabad,7709.0,31,3,3,830.0,7,5,20,55
206771,2022-03-31,Vistara,Chennai,20h 35m,Hyderabad,8640.0,31,3,3,1235.0,12,30,9,5
206772,2022-03-31,Vistara,Chennai,23h 20m,Hyderabad,8640.0,31,3,3,1400.0,9,45,9,5


In [None]:
# Despite the name, this is NOT a copy: edd_copy is another reference to the same DataFrame
# as edd, so in-place edits made through edd_copy also change edd.
edd_copy=edd

In [None]:
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Re-fitting a single encoder on 'to' would discard the
# mapping learned for 'from', leaving no way to encode/decode origin cities later.
from_le = LabelEncoder()
to_le = LabelEncoder()

# Replace the city names with integer codes (classes are sorted, codes start at 0).
edd_copy['from'] = from_le.fit_transform(edd_copy['from'])
edd_copy['to'] = to_le.fit_transform(edd_copy['to'])

# Keep the original name bound to the last-fitted ('to') encoder, as before,
# for any later cell that still refers to `le`.
le = to_le


In [None]:
edd_copy

Unnamed: 0,date,airline,from,time_taken,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min
0,2022-02-11,SpiceJet,2,02h 10m,5,5953.0,11,2,4,130.0,18,55,21,5
1,2022-02-11,SpiceJet,2,02h 20m,5,5953.0,11,2,4,140.0,6,20,8,40
2,2022-02-11,AirAsia,2,02h 10m,5,5956.0,11,2,4,130.0,4,25,6,35
3,2022-02-11,Vistara,2,02h 15m,5,5955.0,11,2,4,135.0,10,20,12,35
4,2022-02-11,Vistara,2,02h 20m,5,5955.0,11,2,4,140.0,8,50,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,1,13h 50m,3,7697.0,31,3,3,830.0,7,5,20,55
206770,2022-03-31,Vistara,1,13h 50m,3,7709.0,31,3,3,830.0,7,5,20,55
206771,2022-03-31,Vistara,1,20h 35m,3,8640.0,31,3,3,1235.0,12,30,9,5
206772,2022-03-31,Vistara,1,23h 20m,3,8640.0,31,3,3,1400.0,9,45,9,5


In [None]:
edd_copy.isnull().sum()

Unnamed: 0,0
date,0
airline,0
from,0
time_taken,0
to,0
price,0
day,0
month,0
weekday,0
duration_mins,0


In [None]:

import joblib


# One-hot encode the airline name: one boolean 'airline_<name>' column per airline.
airline_dummies = pd.get_dummies(edd_copy['airline'], prefix='airline')


# Persist the dummy column names so the same columns can be rebuilt later.
airline_columns = airline_dummies.columns.tolist()
joblib.dump(airline_columns, 'airline_onehot_columns.pkl')


# Drop any dummy columns left over from a previous run of this cell before concatenating;
# otherwise re-running appends a second, duplicated set of airline_* columns.
edd_copy = pd.concat(
    [edd_copy.drop(columns=airline_columns, errors='ignore'), airline_dummies],
    axis=1,
)


In [None]:
edd_copy

Unnamed: 0,date,airline,from,time_taken,to,price,day,month,weekday,duration_mins,...,airline_Trujet,airline_Vistara,airline_Air India,airline_AirAsia,airline_GO FIRST,airline_Indigo,airline_SpiceJet,airline_StarAir,airline_Trujet.1,airline_Vistara.1
0,2022-02-11,SpiceJet,2,02h 10m,5,5953.0,11,2,4,130.0,...,False,False,False,False,False,False,True,False,False,False
1,2022-02-11,SpiceJet,2,02h 20m,5,5953.0,11,2,4,140.0,...,False,False,False,False,False,False,True,False,False,False
2,2022-02-11,AirAsia,2,02h 10m,5,5956.0,11,2,4,130.0,...,False,False,False,True,False,False,False,False,False,False
3,2022-02-11,Vistara,2,02h 15m,5,5955.0,11,2,4,135.0,...,False,True,False,False,False,False,False,False,False,True
4,2022-02-11,Vistara,2,02h 20m,5,5955.0,11,2,4,140.0,...,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,1,13h 50m,3,7697.0,31,3,3,830.0,...,False,True,False,False,False,False,False,False,False,True
206770,2022-03-31,Vistara,1,13h 50m,3,7709.0,31,3,3,830.0,...,False,True,False,False,False,False,False,False,False,True
206771,2022-03-31,Vistara,1,20h 35m,3,8640.0,31,3,3,1235.0,...,False,True,False,False,False,False,False,False,False,True
206772,2022-03-31,Vistara,1,23h 20m,3,8640.0,31,3,3,1400.0,...,False,True,False,False,False,False,False,False,False,True


In [None]:
# eca is another name for the frame edd refers to (no copy). The one-hot cell above rebound
# edd_copy to a new concatenated frame, so eca/edd do not contain the airline_* dummy columns.
eca=edd

In [None]:
eca

Unnamed: 0,date,airline,from,time_taken,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min
0,2022-02-11,SpiceJet,2,02h 10m,5,5953.0,11,2,4,130.0,18,55,21,5
1,2022-02-11,SpiceJet,2,02h 20m,5,5953.0,11,2,4,140.0,6,20,8,40
2,2022-02-11,AirAsia,2,02h 10m,5,5956.0,11,2,4,130.0,4,25,6,35
3,2022-02-11,Vistara,2,02h 15m,5,5955.0,11,2,4,135.0,10,20,12,35
4,2022-02-11,Vistara,2,02h 20m,5,5955.0,11,2,4,140.0,8,50,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,1,13h 50m,3,7697.0,31,3,3,830.0,7,5,20,55
206770,2022-03-31,Vistara,1,13h 50m,3,7709.0,31,3,3,830.0,7,5,20,55
206771,2022-03-31,Vistara,1,20h 35m,3,8640.0,31,3,3,1235.0,12,30,9,5
206772,2022-03-31,Vistara,1,23h 20m,3,8640.0,31,3,3,1400.0,9,45,9,5


In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Dedicated encoder for the airline name, kept separate so it can be saved and reused.
airline_le = LabelEncoder()


# Add an integer-coded airline column next to the original text column.
eca['airline_encoded'] = airline_le.fit_transform(eca['airline'])


# Persist the fitted encoder to disk.
joblib.dump(airline_le, 'airline_label_encoder.pkl')




['airline_label_encoder.pkl']

In [None]:
eca

Unnamed: 0,date,airline,from,time_taken,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min,airline_encoded
0,2022-02-11,SpiceJet,2,02h 10m,5,5953.0,11,2,4,130.0,18,55,21,5,4
1,2022-02-11,SpiceJet,2,02h 20m,5,5953.0,11,2,4,140.0,6,20,8,40,4
2,2022-02-11,AirAsia,2,02h 10m,5,5956.0,11,2,4,130.0,4,25,6,35,1
3,2022-02-11,Vistara,2,02h 15m,5,5955.0,11,2,4,135.0,10,20,12,35,7
4,2022-02-11,Vistara,2,02h 20m,5,5955.0,11,2,4,140.0,8,50,11,10,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,1,13h 50m,3,7697.0,31,3,3,830.0,7,5,20,55,7
206770,2022-03-31,Vistara,1,13h 50m,3,7709.0,31,3,3,830.0,7,5,20,55,7
206771,2022-03-31,Vistara,1,20h 35m,3,8640.0,31,3,3,1235.0,12,30,9,5,7
206772,2022-03-31,Vistara,1,23h 20m,3,8640.0,31,3,3,1400.0,9,45,9,5,7


In [None]:
eca

Unnamed: 0,date,from,time_taken,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min,airline_encoded
0,2022-02-11,2,02h 10m,5,5953.0,11,2,4,130.0,18,55,21,5,4
1,2022-02-11,2,02h 20m,5,5953.0,11,2,4,140.0,6,20,8,40,4
2,2022-02-11,2,02h 10m,5,5956.0,11,2,4,130.0,4,25,6,35,1
3,2022-02-11,2,02h 15m,5,5955.0,11,2,4,135.0,10,20,12,35,7
4,2022-02-11,2,02h 20m,5,5955.0,11,2,4,140.0,8,50,11,10,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,1,13h 50m,3,7697.0,31,3,3,830.0,7,5,20,55,7
206770,2022-03-31,1,13h 50m,3,7709.0,31,3,3,830.0,7,5,20,55,7
206771,2022-03-31,1,20h 35m,3,8640.0,31,3,3,1235.0,12,30,9,5,7
206772,2022-03-31,1,23h 20m,3,8640.0,31,3,3,1400.0,9,45,9,5,7


In [None]:
# Remove every column that is not a numeric model input: the raw date, the original
# duration text, and the airline name (now represented by 'airline_encoded'). Leaving the
# string 'airline' column in would make the regressors fail on a fresh run.
# errors='ignore' keeps the cell re-runnable after the columns are already gone.
eca.drop(columns=['date', 'time_taken', 'airline'], inplace=True, errors='ignore')

In [None]:
eca

Unnamed: 0,from,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min,airline_encoded
0,2,5,5953.0,11,2,4,130.0,18,55,21,5,4
1,2,5,5953.0,11,2,4,140.0,6,20,8,40,4
2,2,5,5956.0,11,2,4,130.0,4,25,6,35,1
3,2,5,5955.0,11,2,4,135.0,10,20,12,35,7
4,2,5,5955.0,11,2,4,140.0,8,50,11,10,7
...,...,...,...,...,...,...,...,...,...,...,...,...
206769,1,3,7697.0,31,3,3,830.0,7,5,20,55,7
206770,1,3,7709.0,31,3,3,830.0,7,5,20,55,7
206771,1,3,8640.0,31,3,3,1235.0,12,30,9,5,7
206772,1,3,8640.0,31,3,3,1400.0,9,45,9,5,7


In [None]:
eca.dtypes

Unnamed: 0,0
from,int64
to,int64
price,float64
day,int32
month,int32
weekday,int32
duration_mins,float64
dep_hour,int64
dep_min,int64
arr_hour,int64


In [None]:

from sklearn.model_selection import train_test_split


# Target is the ticket price; every remaining column of eca is used as a feature.
y = eca['price']
X = eca.drop(['price'], axis=1)


# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


# Fit a single decision tree on the training split (fixed seed for repeatability).
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)


# Evaluate on the held-out split.
dt_pred = dt_model.predict(X_test)
print("Decision Tree Results")
print("R² Score:", r2_score(y_test, dt_pred))
print("MAE:", mean_absolute_error(y_test, dt_pred))
# mean_squared_error returns the MSE; take the square root so the value matches its "RMSE" label.
print("RMSE:", mean_squared_error(y_test, dt_pred) ** 0.5)


Decision Tree Results
R² Score: 0.7767164292043668
MAE: 632.3396809659686
RMSE: 3150041.4467208604


In [None]:
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor


# Fit a 100-tree random forest on the training split (fixed seed for repeatability).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


# Evaluate on the held-out split.
rf_pred = rf_model.predict(X_test)
print("Random Forest Results")
print("R² Score:", r2_score(y_test, rf_pred))
print("MAE:", mean_absolute_error(y_test, rf_pred))
# mean_squared_error returns the MSE; take the square root so the value matches its "RMSE" label.
print("RMSE:", mean_squared_error(y_test, rf_pred) ** 0.5)



Random Forest Results
R² Score: 0.8813855468344876
MAE: 564.3439582390064
RMSE: 1673389.772118432


NameError: name 'pickle' is not defined

In [None]:
import pickle
# Serialise the fitted random forest to rf_model.pkl in the current working directory.
# The file is opened in binary write mode and closed automatically by the with-block.
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

In [None]:
rf_model

In [None]:
#XGBoost Regressor
from xgboost import XGBRegressor


# Fit a 100-round gradient-boosted model on the training split; verbosity=0 silences logging.
xgb_model = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
xgb_model.fit(X_train, y_train)


# Evaluate on the held-out split.
xgb_pred = xgb_model.predict(X_test)
print("XGBoost Results")
print("R² Score:", r2_score(y_test, xgb_pred))
print("MAE:", mean_absolute_error(y_test, xgb_pred))
# mean_squared_error returns the MSE; take the square root so the value matches its "RMSE" label.
print("RMSE:", mean_squared_error(y_test, xgb_pred) ** 0.5)


XGBoost Results
R² Score: 0.8076093756284022
MAE: 1006.1484801215902
RMSE: 2714209.7314707143


In [None]:
#Gradient Boost
from sklearn.ensemble import GradientBoostingRegressor


# Fit a 100-stage gradient boosting regressor on the training split (fixed seed).
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)


# Evaluate on the held-out split.
gb_pred = gb_model.predict(X_test)
print("Gradient Boosting Results")
print("R² Score:", r2_score(y_test, gb_pred))
print("MAE:", mean_absolute_error(y_test, gb_pred))
# mean_squared_error returns the MSE; take the square root so the value matches its "RMSE" label.
print("RMSE:", mean_squared_error(y_test, gb_pred) ** 0.5)


Gradient Boosting Results
R² Score: 0.6751185827694112
MAE: 1422.6325772744815
RMSE: 4583364.221107226


In [None]:
edd_copy

Unnamed: 0,date,airline,from,time_taken,to,price,day,month,weekday,duration_mins,...,airline_Trujet,airline_Vistara,airline_Air India,airline_AirAsia,airline_GO FIRST,airline_Indigo,airline_SpiceJet,airline_StarAir,airline_Trujet.1,airline_Vistara.1
0,2022-02-11,SpiceJet,2,02h 10m,5,5953.0,11,2,4,130.0,...,False,False,False,False,False,False,True,False,False,False
1,2022-02-11,SpiceJet,2,02h 20m,5,5953.0,11,2,4,140.0,...,False,False,False,False,False,False,True,False,False,False
2,2022-02-11,AirAsia,2,02h 10m,5,5956.0,11,2,4,130.0,...,False,False,False,True,False,False,False,False,False,False
3,2022-02-11,Vistara,2,02h 15m,5,5955.0,11,2,4,135.0,...,False,True,False,False,False,False,False,False,False,True
4,2022-02-11,Vistara,2,02h 20m,5,5955.0,11,2,4,140.0,...,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,2022-03-31,Vistara,1,13h 50m,3,7697.0,31,3,3,830.0,...,False,True,False,False,False,False,False,False,False,True
206770,2022-03-31,Vistara,1,13h 50m,3,7709.0,31,3,3,830.0,...,False,True,False,False,False,False,False,False,False,True
206771,2022-03-31,Vistara,1,20h 35m,3,8640.0,31,3,3,1235.0,...,False,True,False,False,False,False,False,False,False,True
206772,2022-03-31,Vistara,1,23h 20m,3,8640.0,31,3,3,1400.0,...,False,True,False,False,False,False,False,False,False,True


In [None]:
# Another reference to the one-hot frame (not a copy); the in-place drop in the next cell
# therefore affects edd_c as well.
edd_c=edd_copy

In [None]:
# Remove, in place, the columns that are not numeric model inputs: the raw date, the original
# duration text, and the airline name (represented by the airline_* dummy columns).
edd_copy.drop(['date', 'time_taken','airline'], axis=1, inplace=True)

In [None]:
edd_copy


Unnamed: 0,from,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,...,airline_Trujet,airline_Vistara,airline_Air India,airline_AirAsia,airline_GO FIRST,airline_Indigo,airline_SpiceJet,airline_StarAir,airline_Trujet.1,airline_Vistara.1
0,2,5,5953.0,11,2,4,130.0,18,55,21,...,False,False,False,False,False,False,True,False,False,False
1,2,5,5953.0,11,2,4,140.0,6,20,8,...,False,False,False,False,False,False,True,False,False,False
2,2,5,5956.0,11,2,4,130.0,4,25,6,...,False,False,False,True,False,False,False,False,False,False
3,2,5,5955.0,11,2,4,135.0,10,20,12,...,False,True,False,False,False,False,False,False,False,True
4,2,5,5955.0,11,2,4,140.0,8,50,11,...,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,1,3,7697.0,31,3,3,830.0,7,5,20,...,False,True,False,False,False,False,False,False,False,True
206770,1,3,7709.0,31,3,3,830.0,7,5,20,...,False,True,False,False,False,False,False,False,False,True
206771,1,3,8640.0,31,3,3,1235.0,12,30,9,...,False,True,False,False,False,False,False,False,False,True
206772,1,3,8640.0,31,3,3,1400.0,9,45,9,...,False,True,False,False,False,False,False,False,False,True


In [None]:
edd_copy.dtypes


Unnamed: 0,0
from,int64
to,int64
price,float64
day,int32
month,int32
weekday,int32
duration_mins,float64
dep_hour,int64
dep_min,int64
arr_hour,int64


In [None]:
# eoh is another name for the one-hot encoded frame edd_copy (no copy is made).
eoh=edd_copy

In [None]:
eoh

Unnamed: 0,from,to,price,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,...,airline_Trujet,airline_Vistara,airline_Air India,airline_AirAsia,airline_GO FIRST,airline_Indigo,airline_SpiceJet,airline_StarAir,airline_Trujet.1,airline_Vistara.1
0,2,5,5953.0,11,2,4,130.0,18,55,21,...,False,False,False,False,False,False,True,False,False,False
1,2,5,5953.0,11,2,4,140.0,6,20,8,...,False,False,False,False,False,False,True,False,False,False
2,2,5,5956.0,11,2,4,130.0,4,25,6,...,False,False,False,True,False,False,False,False,False,False
3,2,5,5955.0,11,2,4,135.0,10,20,12,...,False,True,False,False,False,False,False,False,False,True
4,2,5,5955.0,11,2,4,140.0,8,50,11,...,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,1,3,7697.0,31,3,3,830.0,7,5,20,...,False,True,False,False,False,False,False,False,False,True
206770,1,3,7709.0,31,3,3,830.0,7,5,20,...,False,True,False,False,False,False,False,False,False,True
206771,1,3,8640.0,31,3,3,1235.0,12,30,9,...,False,True,False,False,False,False,False,False,False,True
206772,1,3,8640.0,31,3,3,1400.0,9,45,9,...,False,True,False,False,False,False,False,False,False,True


In [None]:
from sklearn.model_selection import train_test_split


# Target is the ticket price; every remaining column of the one-hot frame is a feature.
y_eoh = eoh['price']
X_eoh = eoh.drop(['price'], axis=1)


# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_eoh_train, X_eoh_test, y_eoh_train, y_eoh_test = train_test_split(X_eoh, y_eoh, test_size=0.2, random_state=42)

In [None]:
X_eoh


Unnamed: 0,from,to,day,month,weekday,duration_mins,dep_hour,dep_min,arr_hour,arr_min,...,airline_Trujet,airline_Vistara,airline_Air India,airline_AirAsia,airline_GO FIRST,airline_Indigo,airline_SpiceJet,airline_StarAir,airline_Trujet.1,airline_Vistara.1
0,2,5,11,2,4,130.0,18,55,21,5,...,False,False,False,False,False,False,True,False,False,False
1,2,5,11,2,4,140.0,6,20,8,40,...,False,False,False,False,False,False,True,False,False,False
2,2,5,11,2,4,130.0,4,25,6,35,...,False,False,False,True,False,False,False,False,False,False
3,2,5,11,2,4,135.0,10,20,12,35,...,False,True,False,False,False,False,False,False,False,True
4,2,5,11,2,4,140.0,8,50,11,10,...,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206769,1,3,31,3,3,830.0,7,5,20,55,...,False,True,False,False,False,False,False,False,False,True
206770,1,3,31,3,3,830.0,7,5,20,55,...,False,True,False,False,False,False,False,False,False,True
206771,1,3,31,3,3,1235.0,12,30,9,5,...,False,True,False,False,False,False,False,False,False,True
206772,1,3,31,3,3,1400.0,9,45,9,5,...,False,True,False,False,False,False,False,False,False,True


In [None]:
y_eoh

Unnamed: 0,price
0,5953.0
1,5953.0
2,5956.0
3,5955.0
4,5955.0
...,...
206769,7697.0
206770,7709.0
206771,8640.0
206772,8640.0


In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


# Fit an ordinary least-squares model on the one-hot encoded training split.
lr_model = LinearRegression()
lr_model.fit(X_eoh_train, y_eoh_train)


# Evaluate on the held-out split.
lr_pred = lr_model.predict(X_eoh_test)


print(" Linear Regression Results")
print("R² Score:", r2_score(y_eoh_test, lr_pred))
print("MAE:", mean_absolute_error(y_eoh_test, lr_pred))
# mean_squared_error returns the MSE; take the square root so the value matches its "RMSE" label.
print("RMSE:", mean_squared_error(y_eoh_test, lr_pred) ** 0.5)


 Linear Regression Results
R² Score: 0.48495817785698725
MAE: 1881.1315222060011
RMSE: 7266110.447642722


In [None]:
from sklearn.linear_model import Ridge

# Fit an L2-regularised linear model (alpha=1.0) on the one-hot encoded training split.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_eoh_train, y_eoh_train)

# Evaluate on the held-out split.
ridge_pred = ridge_model.predict(X_eoh_test)

print(" Ridge Regression Results")
print("R² Score:", r2_score(y_eoh_test, ridge_pred))
print("MAE:", mean_absolute_error(y_eoh_test, ridge_pred))
# mean_squared_error returns the MSE; take the square root so the value matches its "RMSE" label.
print("RMSE:", mean_squared_error(y_eoh_test, ridge_pred) ** 0.5)

 Ridge Regression Results
R² Score: 0.4849572392371737
MAE: 1881.127600118116
RMSE: 7266123.689509577


In [None]:
#Finalizing Random Forest Regressor