In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings 
warnings.filterwarnings('ignore')

In [2]:
import pickle
import mlflow
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
price=pd.read_csv("Flight_Price.csv")
price.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [5]:
price.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [6]:
#feature engineering
price['Date_of_Journey'].str
price['Date_of_Journey'].str.split('/')

0        [24, 03, 2019]
1         [1, 05, 2019]
2         [9, 06, 2019]
3        [12, 05, 2019]
4        [01, 03, 2019]
              ...      
10678     [9, 04, 2019]
10679    [27, 04, 2019]
10680    [27, 04, 2019]
10681    [01, 03, 2019]
10682     [9, 05, 2019]
Name: Date_of_Journey, Length: 10683, dtype: object

In [7]:
price['Date_of_Journey'].str.split('/').str[0]

0        24
1         1
2         9
3        12
4        01
         ..
10678     9
10679    27
10680    27
10681    01
10682     9
Name: Date_of_Journey, Length: 10683, dtype: object

In [8]:
price['Date_of_Journey'].str.split('/').str[1]

0        03
1        05
2        06
3        05
4        03
         ..
10678    04
10679    04
10680    04
10681    03
10682    05
Name: Date_of_Journey, Length: 10683, dtype: object

In [9]:
price['Date_of_Journey'].str.split('/').str[2]

0        2019
1        2019
2        2019
3        2019
4        2019
         ... 
10678    2019
10679    2019
10680    2019
10681    2019
10682    2019
Name: Date_of_Journey, Length: 10683, dtype: object

In [10]:
(price['Date_of_Journey'].str.split('/').str[0].value_counts(),
price['Date_of_Journey'].str.split('/').str[1].value_counts(),
price['Date_of_Journey'].str.split('/').str[2].value_counts())

(Date_of_Journey
 9     1304
 6     1193
 27    1130
 21    1111
 24    1052
 15     984
 12     957
 1      923
 18     832
 3      751
 01     152
 09     102
 03      97
 06      95
 Name: count, dtype: int64,
 Date_of_Journey
 05    3466
 06    3414
 03    2724
 04    1079
 Name: count, dtype: int64,
 Date_of_Journey
 2019    10683
 Name: count, dtype: int64)

In [11]:
price['Date']=price['Date_of_Journey'].str.split('/').str[0]
price['Month']=price['Date_of_Journey'].str.split('/').str[1]
price['Year']=price['Date_of_Journey'].str.split('/').str[2]

In [12]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
 11  Date             10683 non-null  object
 12  Month            10683 non-null  object
 13  Year             10683 non-null  object
dtypes: int64(1), object(13)
memory usage: 1.1+ MB


In [13]:
price['Date']=price['Date'].astype(int)
price['Month']=price['Month'].astype(int)
price['Year']=price['Year'].astype(int)

price.drop(columns='Date_of_Journey',inplace=True)

In [14]:
price['Arrival_Time'].str.split(':').str[0]

0        01
1        13
2        04
3        23
4        21
         ..
10678    22
10679    23
10680    11
10681    14
10682    19
Name: Arrival_Time, Length: 10683, dtype: object

In [15]:
price.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019
1,Air India,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3,2019


In [16]:
price['Arrival_Hour']=price['Arrival_Time'].str.split(':').str[0]
price['Arrival_Minute']=price['Arrival_Time'].str.split(':').str[1]

In [17]:
price.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minute
0,IndiGo,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019,1,10 22 Mar
1,Air India,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019,13,15


In [18]:
price['Arrival_Time']=price['Arrival_Time'].apply(lambda x:x.split(' ')[0])
price['Arrival_Time']

0        01:10
1        13:15
2        04:25
3        23:30
4        21:35
         ...  
10678    22:25
10679    23:20
10680    11:20
10681    14:10
10682    19:15
Name: Arrival_Time, Length: 10683, dtype: object

In [19]:
price['Arrival_Hour']=price['Arrival_Time'].str.split(':').str[0]
price['Arrival_Minute']=price['Arrival_Time'].str.split(':').str[1]

In [20]:
price.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minute
0,IndiGo,Banglore,New Delhi,BLR ? DEL,22:20,01:10,2h 50m,non-stop,No info,3897,24,3,2019,1,10
1,Air India,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019,13,15


In [21]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Source           10683 non-null  object
 2   Destination      10683 non-null  object
 3   Route            10682 non-null  object
 4   Dep_Time         10683 non-null  object
 5   Arrival_Time     10683 non-null  object
 6   Duration         10683 non-null  object
 7   Total_Stops      10682 non-null  object
 8   Additional_Info  10683 non-null  object
 9   Price            10683 non-null  int64 
 10  Date             10683 non-null  int32 
 11  Month            10683 non-null  int32 
 12  Year             10683 non-null  int32 
 13  Arrival_Hour     10683 non-null  object
 14  Arrival_Minute   10683 non-null  object
dtypes: int32(3), int64(1), object(11)
memory usage: 1.1+ MB


In [22]:
price['Arrival_Hour']=price['Arrival_Hour'].astype(int)
price['Arrival_Minute']=price['Arrival_Minute'].astype(int)
price.drop(columns='Arrival_Time',inplace=True)

In [23]:
price['Dep_Hour']=price['Dep_Time'].str.split(':').str[0]
price['Dep_Minute']=price['Dep_Time'].str.split(':').str[1]

In [24]:
price['Dep_Hour']=price['Dep_Hour'].astype(int)
price['Dep_Minute']=price['Dep_Minute'].astype(int)
price.drop(columns='Dep_Time',inplace=True)

In [25]:
price['Total_Stops'].value_counts

<bound method IndexOpsMixin.value_counts of 0        non-stop
1         2 stops
2         2 stops
3          1 stop
4          1 stop
           ...   
10678    non-stop
10679    non-stop
10680    non-stop
10681    non-stop
10682     2 stops
Name: Total_Stops, Length: 10683, dtype: object>

In [26]:
price['Total_Stops']=price['Total_Stops'].map({
    'non-stop':0,
    '1 stop':1,
    '2 stops':2,
    '3 stops':3,
    '4 stops':4,
    np.nan:1
})

In [27]:
price.drop(columns='Route',inplace=True)

In [28]:
price.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minute,Dep_Hour,Dep_Minute
0,IndiGo,Banglore,New Delhi,2h 50m,0,No info,3897,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,7h 25m,2,No info,7662,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,19h,2,No info,13882,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,5h 25m,1,No info,6218,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,4h 45m,1,No info,13302,1,3,2019,21,35,16,50


In [29]:
price['Duration'].str.split('h').str[0]

0         2
1         7
2        19
3         5
4         4
         ..
10678     2
10679     2
10680     3
10681     2
10682     8
Name: Duration, Length: 10683, dtype: object

In [30]:
price['Duration'].str.split('h').str[1].str.split('m').str[0]

0         50
1         25
2           
3         25
4         45
        ... 
10678     30
10679     35
10680       
10681     40
10682     20
Name: Duration, Length: 10683, dtype: object

In [31]:
price['Duration_Hour']=price['Duration'].str.split('h').str[0]
price['Duration_Minute']=price['Duration'].str.split('h').str[1].str.split('m').str[0]

In [32]:
price.head(2)

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minute,Dep_Hour,Dep_Minute,Duration_Hour,Duration_Minute
0,IndiGo,Banglore,New Delhi,2h 50m,0,No info,3897,24,3,2019,1,10,22,20,2,50
1,Air India,Kolkata,Banglore,7h 25m,2,No info,7662,1,5,2019,13,15,5,50,7,25


In [33]:
price.isnull().sum()

Airline            0
Source             0
Destination        0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
Date               0
Month              0
Year               0
Arrival_Hour       0
Arrival_Minute     0
Dep_Hour           0
Dep_Minute         0
Duration_Hour      0
Duration_Minute    1
dtype: int64

In [34]:
price['Duration_Minute'].replace(np.nan,5,inplace=True)

In [35]:
price.isnull().sum()

Airline            0
Source             0
Destination        0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
Date               0
Month              0
Year               0
Arrival_Hour       0
Arrival_Minute     0
Dep_Hour           0
Dep_Minute         0
Duration_Hour      0
Duration_Minute    0
dtype: int64

In [36]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Source           10683 non-null  object
 2   Destination      10683 non-null  object
 3   Duration         10683 non-null  object
 4   Total_Stops      10683 non-null  int64 
 5   Additional_Info  10683 non-null  object
 6   Price            10683 non-null  int64 
 7   Date             10683 non-null  int32 
 8   Month            10683 non-null  int32 
 9   Year             10683 non-null  int32 
 10  Arrival_Hour     10683 non-null  int32 
 11  Arrival_Minute   10683 non-null  int32 
 12  Dep_Hour         10683 non-null  int32 
 13  Dep_Minute       10683 non-null  int32 
 14  Duration_Hour    10683 non-null  object
 15  Duration_Minute  10683 non-null  object
dtypes: int32(7), int64(2), object(7)
memory usage: 1.0+ MB


In [37]:
price['Duration_Hour'] = price['Duration_Hour'].str.replace(r'\D', '', regex=True)  # Remove non-numeric characters
price['Duration_Hour'] = price['Duration_Hour'].astype(int)  # Convert to integer

In [38]:
price['Duration_Hour']=price['Duration_Hour'].astype(int)

In [39]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Source           10683 non-null  object
 2   Destination      10683 non-null  object
 3   Duration         10683 non-null  object
 4   Total_Stops      10683 non-null  int64 
 5   Additional_Info  10683 non-null  object
 6   Price            10683 non-null  int64 
 7   Date             10683 non-null  int32 
 8   Month            10683 non-null  int32 
 9   Year             10683 non-null  int32 
 10  Arrival_Hour     10683 non-null  int32 
 11  Arrival_Minute   10683 non-null  int32 
 12  Dep_Hour         10683 non-null  int32 
 13  Dep_Minute       10683 non-null  int32 
 14  Duration_Hour    10683 non-null  int32 
 15  Duration_Minute  10683 non-null  object
dtypes: int32(8), int64(2), object(6)
memory usage: 1001.7+ KB


In [40]:
price['Duration_Minute'].replace('', np.nan , inplace=True)
price['Duration_Minute'].fillna(0,inplace=True)

In [41]:
price['Duration_Minute']=price['Duration_Minute'].astype(int)

In [42]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Source           10683 non-null  object
 2   Destination      10683 non-null  object
 3   Duration         10683 non-null  object
 4   Total_Stops      10683 non-null  int64 
 5   Additional_Info  10683 non-null  object
 6   Price            10683 non-null  int64 
 7   Date             10683 non-null  int32 
 8   Month            10683 non-null  int32 
 9   Year             10683 non-null  int32 
 10  Arrival_Hour     10683 non-null  int32 
 11  Arrival_Minute   10683 non-null  int32 
 12  Dep_Hour         10683 non-null  int32 
 13  Dep_Minute       10683 non-null  int32 
 14  Duration_Hour    10683 non-null  int32 
 15  Duration_Minute  10683 non-null  int32 
dtypes: int32(9), int64(2), object(5)
memory usage: 959.9+ KB


In [43]:
price.drop(columns='Duration',inplace=True)
price.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minute,Dep_Hour,Dep_Minute,Duration_Hour,Duration_Minute
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,2019,1,10,22,20,2,50
1,Air India,Kolkata,Banglore,2,No info,7662,1,5,2019,13,15,5,50,7,25
2,Jet Airways,Delhi,Cochin,2,No info,13882,9,6,2019,4,25,9,25,19,0
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,2019,23,30,18,5,5,25
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,2019,21,35,16,50,4,45


In [44]:
price.to_csv("Flight_Data.csv")

In [45]:
price['Airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [46]:
price['Source'].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [47]:
price['Destination'].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

In [48]:
price['Additional_Info'].unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [52]:
price_new

Unnamed: 0,Airline,Source,Destination,Additional_Info
0,3,0,5,8
1,1,3,0,8
2,4,2,1,8
3,3,3,0,8
4,3,0,5,8
...,...,...,...,...
10678,0,3,0,8
10679,1,3,0,8
10680,4,0,2,8
10681,10,0,5,8


In [53]:
price.drop(columns=['Airline','Destination','Source','Additional_Info'],inplace=True)

In [54]:
price_final=pd.concat([price_new,price],axis=1)

In [55]:
price_final.head()

Unnamed: 0,Airline,Source,Destination,Additional_Info,Total_Stops,Price,Date,Month,Year,Arrival_Hour,Arrival_Minute,Dep_Hour,Dep_Minute,Duration_Hour,Duration_Minute
0,3,0,5,8,0,3897,24,3,2019,1,10,22,20,2,50
1,1,3,0,8,2,7662,1,5,2019,13,15,5,50,7,25
2,4,2,1,8,2,13882,9,6,2019,4,25,9,25,19,0
3,3,3,0,8,1,6218,12,5,2019,23,30,18,5,5,25
4,3,0,5,8,1,13302,1,3,2019,21,35,16,50,4,45


In [56]:
price_final.isnull().sum()

Airline            0
Source             0
Destination        0
Additional_Info    0
Total_Stops        0
Price              0
Date               0
Month              0
Year               0
Arrival_Hour       0
Arrival_Minute     0
Dep_Hour           0
Dep_Minute         0
Duration_Hour      0
Duration_Minute    0
dtype: int64

In [57]:
price_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Airline          10683 non-null  int32
 1   Source           10683 non-null  int32
 2   Destination      10683 non-null  int32
 3   Additional_Info  10683 non-null  int32
 4   Total_Stops      10683 non-null  int64
 5   Price            10683 non-null  int64
 6   Date             10683 non-null  int32
 7   Month            10683 non-null  int32
 8   Year             10683 non-null  int32
 9   Arrival_Hour     10683 non-null  int32
 10  Arrival_Minute   10683 non-null  int32
 11  Dep_Hour         10683 non-null  int32
 12  Dep_Minute       10683 non-null  int32
 13  Duration_Hour    10683 non-null  int32
 14  Duration_Minute  10683 non-null  int32
dtypes: int32(13), int64(2)
memory usage: 709.5 KB


In [58]:
price_final

Unnamed: 0,Airline,Source,Destination,Additional_Info,Total_Stops,Price,Date,Month,Year,Arrival_Hour,Arrival_Minute,Dep_Hour,Dep_Minute,Duration_Hour,Duration_Minute
0,3,0,5,8,0,3897,24,3,2019,1,10,22,20,2,50
1,1,3,0,8,2,7662,1,5,2019,13,15,5,50,7,25
2,4,2,1,8,2,13882,9,6,2019,4,25,9,25,19,0
3,3,3,0,8,1,6218,12,5,2019,23,30,18,5,5,25
4,3,0,5,8,1,13302,1,3,2019,21,35,16,50,4,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,0,3,0,8,0,4107,9,4,2019,22,25,19,55,2,30
10679,1,3,0,8,0,4145,27,4,2019,23,20,20,45,2,35
10680,4,0,2,8,0,7229,27,4,2019,11,20,8,20,3,0
10681,10,0,5,8,0,12648,1,3,2019,14,10,11,30,2,40


In [59]:
tar=price_final['Price']

In [60]:
feature_classes=price_final[['Airline','Date','Month','Year','Source','Destination','Total_Stops','Additional_Info']]

In [61]:
feature_classes

Unnamed: 0,Airline,Date,Month,Year,Source,Destination,Total_Stops,Additional_Info
0,3,24,3,2019,0,5,0,8
1,1,1,5,2019,3,0,2,8
2,4,9,6,2019,2,1,2,8
3,3,12,5,2019,3,0,1,8
4,3,1,3,2019,0,5,1,8
...,...,...,...,...,...,...,...,...
10678,0,9,4,2019,3,0,0,8
10679,1,27,4,2019,3,0,0,8
10680,4,27,4,2019,0,2,0,8
10681,10,1,3,2019,0,5,0,8


In [64]:
Price_Final=pd.concat([feature_classes,tar],axis=1)
Price_Final.head()

Unnamed: 0,Airline,Date,Month,Year,Source,Destination,Total_Stops,Additional_Info,Price
0,3,24,3,2019,0,5,0,8,3897
1,1,1,5,2019,3,0,2,8,7662
2,4,9,6,2019,2,1,2,8,13882
3,3,12,5,2019,3,0,1,8,6218
4,3,1,3,2019,0,5,1,8,13302


In [65]:
# data
X = Price_Final.drop(columns='Price', axis=1)  # Features (all columns except 'Price')
y = Price_Final['Price']  # Target variable (Price)

In [66]:
# model=RandomForestRegressor(n_estimators=200,random_state=42)
# model.fit(X_train,y_train)
# feature_importances=model.feature_importances_
# feature_importance_df=pd.DataFrame({
#     'Feature':X_train.columns,
#     'Importance':feature_importances
# })
# feature_importance_df=feature_importance_df.sort_values(by='Importance',ascending=False)
# print(feature_importance_df)

In [67]:
#             Feature  Importance
# 12    Duration_Hour    0.440951
# 0           Airline    0.145050
# 5              Date    0.096217
# 3   Additional_Info    0.070320
# 4       Total_Stops    0.049683
# 6             Month    0.043784
# 2       Destination    0.036310
# 10         Dep_Hour    0.028307
# 8      Arrival_Hour    0.025701
# 11       Dep_Minute    0.019944
# 13  Duration_Minute    0.018665
# 9    Arrival_Minute    0.017589
# 1            Source    0.007479
# 7              Year    0.000000

In [68]:
#hyperparameter tuning using gridsearch
lr=LinearRegression()
rf=RandomForestRegressor(random_state=40)
xg=XGBRegressor(random_state=40)

In [None]:
# rf_param={
#     'n_estimators':[50,100,150,200],
#     'max_depth':[None,10,20,30],
#     'min_samples_split':[2,3,4,5],
#     'min_samples_leaf':[2,3,4,5],
# }
# xg_param={
#     'n_estimators':[50,100,150,200],
#     'max_depth':[None,10,20,30],
#     'learning_rate':[0.01,0.1,0,2],
#     'sub_sample':[0.6,0.7,1.2,1.3],

# }

In [None]:
# rf_gridSearch=GridSearchCV(rf,rf_param,cv=5,n_jobs=-1,verbose=1)
# rf_gridSearch.fit(X_train,y_train)
# xg_gridSearch=GridSearchCV(xg,xg_param,cv=5,n_jobs=-1,verbose=1)
# xg_gridSearch.fit(X_train,y_train)

# print("The Parameters for Random Forest Regressor are",rf_gridSearch.best_params_)
# print("The Score for Random Forest Regressor are",rf_gridSearch.best_score_)

# print("The Parameters for Xgboost Regressor are",xg_gridSearch.best_params_)
# print("The Score for Xgboost Regressor are",xg_gridSearch.best_score_)

In [69]:
categorical_cols = ['Airline', 'Source', 'Destination', 'Additional_Info']
numerical_cols = ['Date', 'Month', 'Year', 'Total_Stops']

# Define preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

In [70]:
preprocessor

In [71]:
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=200, min_samples_split=2, min_samples_leaf=2),
    "XGBoost": XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, subsample=0.6)
}

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        pipeline = Pipeline(steps=[
            ('preprocessing', preprocessor),
            ('regressor', model)
        ])
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Log metrics and params
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)
        mlflow.log_params(model.get_params())

        # Save pipeline
        model_path = f"{name}_pipeline.pkl"
        with open(model_path, "wb") as f:
            pickle.dump(pipeline, f)
        mlflow.log_artifact(model_path)

        # Also log model to MLflow model registry
        mlflow.sklearn.log_model(pipeline, artifact_path="model")

        print(f"\n{name} Model Results:")
        print(f"RMSE: {rmse:.2f} | MAE: {mae:.2f} | R²: {r2:.2f}")
        print(f"✅ Logged and saved: {model_path}")


LinearRegression Model
RMSE: 2579.07
MAE : 1791.51
R²  : 0.69
✅ Pipeline saved as: LinearRegression_pipeline.pkl

RandomForest Model
RMSE: 1752.18
MAE : 1014.61
R²  : 0.85
✅ Pipeline saved as: RandomForest_pipeline.pkl

XGBoost Model
RMSE: 1925.01
MAE : 1277.66
R²  : 0.82
✅ Pipeline saved as: XGBoost_pipeline.pkl


In [75]:
type(model)

sklearn.ensemble._forest.RandomForestRegressor

In [78]:
prediction = model.predict(X_test)
print(f"💰 Predicted Flight Price: **₹{prediction[100]:,.2f}**")

💰 Predicted Flight Price: **₹8,658.47**
