# Flight Price Prediction

In [1]:
# importing libraries :

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# reading datasets :
data_train = pd.read_excel(r"Data_Train.xlsx")

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# top 5 records :
data_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [5]:
data_train.shape

(10683, 11)

In [6]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [7]:
# cheaking for NAN Values present or not :
data_train.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [8]:
# droping the NAN values :
data_train.dropna(inplace=True)

In [9]:
data_train.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

## - Exploratory Data Analysis (EDA)

In [10]:
# we extrat day and month from "Date_of_Journey" : 

# extract day :
data_train["Journey_date"] = pd.to_datetime(data_train["Date_of_Journey"], format="%d/%m/%Y").dt.day

# extract month :
data_train["Journey_month"] = pd.to_datetime(data_train["Date_of_Journey"], format="%d/%m/%Y").dt.month

In [11]:
data_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3


In [12]:
# now we remove the feature "Date_of_Journey", as we convert it into integer feature :
data_train.drop(["Date_of_Journey"], axis=1, inplace=True)

In [13]:
# simillary we can extract hour and minute from "Dep_Time" :
# departure time is when the flight leave :

# extract hour :
data_train["Dep_hour"] = pd.to_datetime(data_train["Dep_Time"]).dt.hour

# extract minute :
data_train["Dep_minute"] = pd.to_datetime(data_train["Dep_Time"]).dt.minute

# now we remove the feature "Dep_Time", as we convert it into integer feature :
data_train.drop(["Dep_Time"], axis=1, inplace=True)

In [14]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Dep_hour,Dep_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2 stops,No info,7662,1,5,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,04:25 10 Jun,19h,2 stops,No info,13882,9,6,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,23:30,5h 25m,1 stop,No info,6218,12,5,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,21:35,4h 45m,1 stop,No info,13302,1,3,16,50


In [15]:
# simillary we can extract hour and minute from "Arrival_Time" :
# departure time is when the flight leave :

# extract hour :
data_train["Arrival_hour"] = pd.to_datetime(data_train["Arrival_Time"]).dt.hour

# extract minute :
data_train["Arrival_minute"] = pd.to_datetime(data_train["Arrival_Time"]).dt.minute

# now we remove the feature "Dep_Time", as we convert it into integer feature :
data_train.drop(["Arrival_Time"], axis=1, inplace=True)

In [16]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,12,5,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,1,3,16,50,21,35


In [17]:
# Duration :
# time taken by plane to reach destination or 
# difference between deparature time and arrival time

# coverting duration column into list :
duration = list(data_train["Duration"])

for i in range(len(duration)):
    if len(duration[i].split()) != 2:     # cheacking duration contain only hour or minute 
        if "h" in duration[i]:
            duration[i] = duration[i] + " 0m"     # adding 0 minute
        else:
            duration[i] = "0h " + duration[i]     # adding 0 hour

Duration_hour = []
Duration_minute = []
for i in range(len(duration)):
    Duration_hour.append(int(duration[i].split(sep = "h")[0]))      # Extract hours from duration
    Duration_minute.append(int(duration[i].split(sep = "m")[0].split()[-1]))      # Extracts only minutes from duration

In [18]:
# adding Duration_hour and Duration_minute to the data_train :
data_train["Duration_hour"] = Duration_hour
data_train["Duration_minute"] = Duration_minute

In [19]:
# removing the "Duration" feature :
data_train.drop(["Duration"], axis=1, inplace=True)

In [20]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7662,1,5,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,No info,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1 stop,No info,6218,12,5,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1 stop,No info,13302,1,3,16,50,21,35,4,45


## - Handling Categorical Feature :

1. Nominal Data -- data are not in order --> OneHotEncoding
2. Ordinal Data -- data are in order (Have some rank) --> Label Encoding

In [21]:
# "Airline" is Nominal categorical feature, so we perform OneHotEncoding :
data_train["Airline"].value_counts()

Jet Airways                          3849
IndiGo                               2053
Air India                            1751
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64

In [22]:
# OneHotEncoding :
Airline = data_train[["Airline"]]
Airline = pd.get_dummies(Airline, drop_first=True)
Airline.head()

Unnamed: 0,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy
0,0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0


In [23]:
# also "Source" is Nominal categorical feature, so we perform OneHotEncoding :
data_train["Source"].value_counts()

Delhi       4536
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: Source, dtype: int64

In [24]:
# OneHotEncoding :
Source = data_train[["Source"]]
Source = pd.get_dummies(Source, drop_first=True)
Source.head()

Unnamed: 0,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai
0,0,0,0,0
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,0,0


In [25]:
# also "Destination" is Nominal categorical feature, so we perform OneHotEncoding :
data_train["Destination"].value_counts()

Cochin       4536
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: Destination, dtype: int64

In [26]:
# OneHotEncoding :
Destination = data_train[["Destination"]]
Destination = pd.get_dummies(Destination, drop_first=True)
Destination.head()

Unnamed: 0,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,0,0,0,1
1,0,0,0,0,0
2,1,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,1


In [27]:
# As we see that "Route" and "Total_Stops" are related to each other  
data_train["Route"].value_counts()

DEL → BOM → COK                      2376
BLR → DEL                            1552
CCU → BOM → BLR                       979
CCU → BLR                             724
BOM → HYD                             621
                                     ... 
BOM → BBI → HYD                         1
CCU → VTZ → BLR                         1
BOM → VNS → DEL → HYD                   1
BOM → NDC → HYD                         1
BLR → CCU → BBI → HYD → VGA → DEL       1
Name: Route, Length: 128, dtype: int64

In [28]:
data_train["Total_Stops"].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [29]:
# droping "Route" feature :
# also dropping  "Additional_Info" feature beacause it has no such information contain :
data_train.drop(["Route", "Additional_Info"], axis=1, inplace=True)

In [30]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute
0,IndiGo,Banglore,New Delhi,non-stop,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,2 stops,7662,1,5,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,2 stops,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,1 stop,6218,12,5,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,1 stop,13302,1,3,16,50,21,35,4,45


In [31]:
# "Total_Stops" is Ordinal categorical feature, so we perform LabelEncoding :
data_train["Total_Stops"].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [32]:
data_train.replace({"non-stop" : 0, "1 stop" : 1, "2 stops" : 2, "3 stops" : 3, "4 stops" : 4}, inplace=True)

In [33]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute
0,IndiGo,Banglore,New Delhi,0,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,2,7662,1,5,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,2,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,1,6218,12,5,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,1,13302,1,3,16,50,21,35,4,45


In [34]:
# concatenating the dataset : [data_train + Airline + Source + Destination] 
data_train = pd.concat([data_train, Airline, Source, Destination], axis=1)

In [35]:
data_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,IndiGo,Banglore,New Delhi,0,3897,24,3,22,20,1,10,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,Air India,Kolkata,Banglore,2,7662,1,5,5,50,13,15,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,Jet Airways,Delhi,Cochin,2,13882,9,6,9,25,4,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,IndiGo,Kolkata,Banglore,1,6218,12,5,18,5,23,30,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,IndiGo,Banglore,New Delhi,1,13302,1,3,16,50,21,35,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [36]:
# now removing this columns : [Airline, Source, Destination]
data_train.drop(["Airline", "Source", "Destination"], axis=1, inplace=True)

In [37]:
data_train.head()

Unnamed: 0,Total_Stops,Price,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,3897,24,3,22,20,1,10,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2,7662,1,5,5,50,13,15,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,13882,9,6,9,25,4,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,1,6218,12,5,18,5,23,30,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,13302,1,3,16,50,21,35,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# Test Data :

In [38]:
# reading Test datasets :
data_test = pd.read_excel(r"Data_Train.xlsx")

In [39]:
pd.set_option('display.max_columns', None)

In [40]:
# top 5 records :
data_test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [41]:
# cheaking for NAN Values present or not :
data_test.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [42]:
# droping the NAN values :
data_test.dropna(inplace=True)

In [43]:
data_test.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

### - Exploratory Data Analysis (EDA)

In [44]:
# we extrat day and month from "Date_of_Journey" : 

# extract day :
data_test["Journey_date"] = pd.to_datetime(data_test["Date_of_Journey"], format="%d/%m/%Y").dt.day

# extract month :
data_test["Journey_month"] = pd.to_datetime(data_test["Date_of_Journey"], format="%d/%m/%Y").dt.month

# now we remove the feature "Date_of_Journey", as we convert it into integer feature :
data_test.drop(["Date_of_Journey"], axis=1, inplace=True)

In [45]:
# simillary we can extract hour and minute from "Dep_Time" :
# departure time is when the flight leave :

# extract hour :
data_test["Dep_hour"] = pd.to_datetime(data_test["Dep_Time"]).dt.hour

# extract minute :
data_test["Dep_minute"] = pd.to_datetime(data_test["Dep_Time"]).dt.minute

# now we remove the feature "Dep_Time", as we convert it into integer feature :
data_test.drop(["Dep_Time"], axis=1, inplace=True)

In [46]:
# simillary we can extract hour and minute from "Arrival_Time" :
# departure time is when the flight leave :

# extract hour :
data_test["Arrival_hour"] = pd.to_datetime(data_test["Arrival_Time"]).dt.hour

# extract minute :
data_test["Arrival_minute"] = pd.to_datetime(data_test["Arrival_Time"]).dt.minute

# now we remove the feature "Dep_Time", as we convert it into integer feature :
data_test.drop(["Arrival_Time"], axis=1, inplace=True)

In [47]:
# Duration :
# time taken by plane to reach destination or 
# difference between deparature time and arrival time

# coverting duration column into list :
duration = list(data_test["Duration"])

for i in range(len(duration)):
    if len(duration[i].split()) != 2:     # cheacking duration contain only hour or minute 
        if "h" in duration[i]:
            duration[i] = duration[i] + " 0m"     # adding 0 minute
        else:
            duration[i] = "0h " + duration[i]     # adding 0 hour

Duration_hour = []
Duration_minute = []
for i in range(len(duration)):
    Duration_hour.append(int(duration[i].split(sep = "h")[0]))      # Extract hours from duration
    Duration_minute.append(int(duration[i].split(sep = "m")[0].split()[-1]))      # Extracts only minutes from duration
    
# adding Duration_hour and Duration_minute to the data_train :
data_test["Duration_hour"] = Duration_hour
data_test["Duration_minute"] = Duration_minute

# removing the "Duration" feature :
data_test.drop(["Duration"], axis=1, inplace=True)

### - Handling Categorical Feature :

In [48]:
# "Airline" is Nominal categorical feature, so we perform OneHotEncoding :
# OneHotEncoding :
Airline = data_test[["Airline"]]
Airline = pd.get_dummies(Airline, drop_first=True)

In [49]:
# also "Source" is Nominal categorical feature, so we perform OneHotEncoding :
# OneHotEncoding :
Source = data_test[["Source"]]
Source = pd.get_dummies(Source, drop_first=True)

In [50]:
# also "Destination" is Nominal categorical feature, so we perform OneHotEncoding :
# OneHotEncoding :
Destination = data_test[["Destination"]]
Destination = pd.get_dummies(Destination, drop_first=True)

In [51]:
# droping "Route" and "Additional_Info" feature :
data_test.drop(["Route", "Additional_Info"], axis=1, inplace=True)

In [52]:
# "Total_Stops" is Ordinal categorical feature, so we perform LabelEncoding :
data_test.replace({"non-stop" : 0, "1 stop" : 1, "2 stops" : 2, "3 stops" : 3, "4 stops" : 4}, inplace=True)

In [53]:
# concatenating the dataset : [data_train + Airline + Source + Destination] 
data_test = pd.concat([data_test, Airline, Source, Destination], axis=1)

In [54]:
# now removing this columns : [Airline, Source, Destination]
data_test.drop(["Airline", "Source", "Destination"], axis=1, inplace=True)

In [55]:
data_test.head()

Unnamed: 0,Total_Stops,Price,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,3897,24,3,22,20,1,10,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2,7662,1,5,5,50,13,15,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,13882,9,6,9,25,4,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,1,6218,12,5,18,5,23,30,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,13302,1,3,16,50,21,35,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [56]:
data_train.columns

Index(['Total_Stops', 'Price', 'Journey_date', 'Journey_month', 'Dep_hour',
       'Dep_minute', 'Arrival_hour', 'Arrival_minute', 'Duration_hour',
       'Duration_minute', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata', 'Destination_New Delhi'],
      dtype='object')

In [57]:
X = data_train.loc[:, ['Total_Stops', 'Journey_date', 'Journey_month', 'Dep_hour',
       'Dep_minute', 'Arrival_hour', 'Arrival_minute', 'Duration_hour',
       'Duration_minute', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata', 'Destination_New Delhi']]

X.head()

Unnamed: 0,Total_Stops,Journey_date,Journey_month,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_hour,Duration_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,0,24,3,22,20,1,10,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2,1,5,5,50,13,15,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,2,9,6,9,25,4,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
3,1,12,5,18,5,23,30,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,1,3,16,50,21,35,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [58]:
y = data_train.iloc[:, 1]
y.head()

0     3897
1     7662
2    13882
3     6218
4    13302
Name: Price, dtype: int64

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Fitting Random Forest Model :

In [60]:
from sklearn.ensemble import RandomForestRegressor
random = RandomForestRegressor()
random.fit(X_train, y_train)

RandomForestRegressor()

In [61]:
y_pred = random.predict(X_test)

In [62]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_error, r2_score

print('MAE :', mean_absolute_error(y_test, y_pred))
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2_score :', r2_score(y_test, y_pred))

MAE : 1176.637250239881
MSE : 4359447.399168197
RMSE : 2087.928973688568
R2_score : 0.7978185691740758


In [64]:
data_test.columns

Index(['Total_Stops', 'Price', 'Journey_date', 'Journey_month', 'Dep_hour',
       'Dep_minute', 'Arrival_hour', 'Arrival_minute', 'Duration_hour',
       'Duration_minute', 'Airline_Air India', 'Airline_GoAir',
       'Airline_IndiGo', 'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata', 'Destination_New Delhi'],
      dtype='object')

In [63]:
y_predd = random.predict(data_test)

ValueError: Number of features of the model must match the input. Model n_features is 29 and input n_features is 30 