# Flight Price Prediction

In [41]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

## Data Ingestion

In [42]:
# Training workbook: includes the Price target column
train_path = 'source/data/Data_Train.xlsx'
df_train = pd.read_excel(io=train_path)
df_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [43]:
df_train.shape

(10683, 11)

In [44]:
# Test workbook: same columns as the training file, minus Price
test_path = 'source/data/Test_set.xlsx'
df_test = pd.read_excel(io=test_path)
df_test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [45]:
df_test.shape

(2671, 10)

In [46]:
# Concatenate the training and test frames into one table so the feature
# engineering below is applied to both; test rows have no Price, so it is NaN there.
frames = [df_train, df_test]
data = pd.concat(frames, axis=0, ignore_index=True)
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0


In [47]:
data.shape

(13354, 11)

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13354 entries, 0 to 13353
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Date_of_Journey  13354 non-null  object 
 2   Source           13354 non-null  object 
 3   Destination      13354 non-null  object 
 4   Route            13353 non-null  object 
 5   Dep_Time         13354 non-null  object 
 6   Arrival_Time     13354 non-null  object 
 7   Duration         13354 non-null  object 
 8   Total_Stops      13353 non-null  object 
 9   Additional_Info  13354 non-null  object 
 10  Price            10683 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.1+ MB


In [49]:
# Count missing values per column
data.isna().sum()

Airline               0
Date_of_Journey       0
Source                0
Destination           0
Route                 1
Dep_Time              0
Arrival_Time          0
Duration              0
Total_Stops           1
Additional_Info       0
Price              2671
dtype: int64

In [50]:
# Drop every row that is missing Route OR Total_Stops.
# how='any' (instead of 'all') also removes rows where only one of the two is
# missing; otherwise a missing Total_Stops would survive and become NaN when the
# column is mapped to stop counts later on.
# The index is deliberately not reset: later cells refer to original row labels.
data.dropna(subset=['Route', 'Total_Stops'], how='any', inplace=True)

## Feature Engineering

***Date_of_Journey Column***

In [51]:
# Derive Date (day of month), Month and Year from Date_of_Journey ('DD/MM/YYYY').
# The string is split once and each piece is stored, still as text, in its own column.
journey_parts = data['Date_of_Journey'].str.split('/')
for position, column in enumerate(('Date', 'Month', 'Year')):
    data[column] = journey_parts.str[position]

In [52]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0,24,3,2019
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,1,5,2019
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0,9,6,2019
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0,12,5,2019
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0,1,3,2019


In [53]:
# Convert the date, month and year columns from text to integers
for column in ('Date', 'Month', 'Year'):
    data[column] = data[column].astype(int)

In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13353 entries, 0 to 13353
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13353 non-null  object 
 1   Date_of_Journey  13353 non-null  object 
 2   Source           13353 non-null  object 
 3   Destination      13353 non-null  object 
 4   Route            13353 non-null  object 
 5   Dep_Time         13353 non-null  object 
 6   Arrival_Time     13353 non-null  object 
 7   Duration         13353 non-null  object 
 8   Total_Stops      13353 non-null  object 
 9   Additional_Info  13353 non-null  object 
 10  Price            10682 non-null  float64
 11  Date             13353 non-null  int64  
 12  Month            13353 non-null  int64  
 13  Year             13353 non-null  int64  
dtypes: float64(1), int64(3), object(10)
memory usage: 1.5+ MB


In [55]:
# The raw date string is now redundant: Date, Month and Year carry its content
data.drop(columns=['Date_of_Journey'], inplace=True)

In [56]:
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0,1,3,2019


***Arrival time Column***

In [57]:
# Keep only the clock part of Arrival_Time. Some values carry a trailing
# arrival date (e.g. '01:10 22 Mar'); everything after the first space is discarded.
# Uses the vectorised .str accessor, like the Date_of_Journey split above,
# instead of a per-row Python lambda.
data['Arrival_Time'] = data['Arrival_Time'].str.split(' ').str[0]
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,non-stop,No info,3897.0,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19h,2 stops,No info,13882.0,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0,1,3,2019


In [58]:
# Separate the arrival hour and minutes ('HH:MM').
# The column is split once with the vectorised .str accessor and both pieces are
# read from that single result; values stay text here and are cast in the next cell.
arrival_clock = data['Arrival_Time'].str.split(':')
data['Arrival_Hour'] = arrival_clock.str[0]
data['Arrival_Minutes'] = arrival_clock.str[1]
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,non-stop,No info,3897.0,24,3,2019,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,1,5,2019,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19h,2 stops,No info,13882.0,9,6,2019,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0,12,5,2019,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0,1,3,2019,21,35


In [59]:
# Convert the arrival hour and minute columns from text to integers
for column in ('Arrival_Hour', 'Arrival_Minutes'):
    data[column] = data[column].astype(int)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13353 entries, 0 to 13353
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13353 non-null  object 
 1   Source           13353 non-null  object 
 2   Destination      13353 non-null  object 
 3   Route            13353 non-null  object 
 4   Dep_Time         13353 non-null  object 
 5   Arrival_Time     13353 non-null  object 
 6   Duration         13353 non-null  object 
 7   Total_Stops      13353 non-null  object 
 8   Additional_Info  13353 non-null  object 
 9   Price            10682 non-null  float64
 10  Date             13353 non-null  int64  
 11  Month            13353 non-null  int64  
 12  Year             13353 non-null  int64  
 13  Arrival_Hour     13353 non-null  int64  
 14  Arrival_Minutes  13353 non-null  int64  
dtypes: float64(1), int64(5), object(9)
memory usage: 1.6+ MB


In [60]:
# Arrival_Time has been replaced by Arrival_Hour / Arrival_Minutes
data.drop(columns=['Arrival_Time'], inplace=True)
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,2h 50m,non-stop,No info,3897.0,24,3,2019,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,7h 25m,2 stops,No info,7662.0,1,5,2019,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,19h,2 stops,No info,13882.0,9,6,2019,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,5h 25m,1 stop,No info,6218.0,12,5,2019,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,4h 45m,1 stop,No info,13302.0,1,3,2019,21,35


***Dep_Time Column***

In [61]:
# Separate the departure hour and minutes ('HH:MM').
# The column is split once with the vectorised .str accessor and both pieces are
# read from that single result; values stay text here and are cast in the next cell.
departure_clock = data['Dep_Time'].str.split(':')
data['Departure_Hour'] = departure_clock.str[0]
data['Departure_Minutes'] = departure_clock.str[1]
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,2h 50m,non-stop,No info,3897.0,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,7h 25m,2 stops,No info,7662.0,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,19h,2 stops,No info,13882.0,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,5h 25m,1 stop,No info,6218.0,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,4h 45m,1 stop,No info,13302.0,1,3,2019,21,35,16,50


In [62]:
# Convert the departure hour and minute columns from text to integers
for column in ('Departure_Hour', 'Departure_Minutes'):
    data[column] = data[column].astype(int)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13353 entries, 0 to 13353
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Airline            13353 non-null  object 
 1   Source             13353 non-null  object 
 2   Destination        13353 non-null  object 
 3   Route              13353 non-null  object 
 4   Dep_Time           13353 non-null  object 
 5   Duration           13353 non-null  object 
 6   Total_Stops        13353 non-null  object 
 7   Additional_Info    13353 non-null  object 
 8   Price              10682 non-null  float64
 9   Date               13353 non-null  int64  
 10  Month              13353 non-null  int64  
 11  Year               13353 non-null  int64  
 12  Arrival_Hour       13353 non-null  int64  
 13  Arrival_Minutes    13353 non-null  int64  
 14  Departure_Hour     13353 non-null  int64  
 15  Departure_Minutes  13353 non-null  int64  
dtypes: float64(1), int64(7), ob

In [63]:
# Dep_Time has been replaced by Departure_Hour / Departure_Minutes
data.drop(columns=['Dep_Time'], inplace=True)
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897.0,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662.0,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882.0,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218.0,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302.0,1,3,2019,21,35,16,50


***Total_Stops Column***

In [64]:
# Unique values in Total_stops column
data['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [65]:
# Encode each stop label as the number of stops it describes.
# Any label not listed here would become NaN.
stop_counts = {
    'non-stop': 0,
    '1 stop': 1,
    '2 stops': 2,
    '3 stops': 3,
    '4 stops': 4,
}
data['Total_Stops'] = data['Total_Stops'].map(stop_counts)
data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,0,No info,3897.0,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2,No info,7662.0,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2,No info,13882.0,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1,No info,6218.0,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1,No info,13302.0,1,3,2019,21,35,16,50


In [66]:
# Remove the Route column
data.drop(columns=['Route'], inplace=True)
data.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes
0,IndiGo,Banglore,New Delhi,2h 50m,0,No info,3897.0,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,7h 25m,2,No info,7662.0,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,19h,2,No info,13882.0,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,5h 25m,1,No info,6218.0,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,4h 45m,1,No info,13302.0,1,3,2019,21,35,16,50


***Duration Column***

In [67]:
# Extract the number of hours from Duration (e.g. '2h 50m' -> '2', '19h' -> '19').
# The first space-separated token is taken and cut at 'h'; a duration given only
# in minutes (e.g. '5m') passes through unchanged and is handled in the next cells.
leading_token = data['Duration'].str.split(' ').str[0]
data['Duration_Hour'] = leading_token.str.split('h').str[0]
data.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes,Duration_Hour
0,IndiGo,Banglore,New Delhi,2h 50m,0,No info,3897.0,24,3,2019,1,10,22,20,2
1,Air India,Kolkata,Banglore,7h 25m,2,No info,7662.0,1,5,2019,13,15,5,50,7
2,Jet Airways,Delhi,Cochin,19h,2,No info,13882.0,9,6,2019,4,25,9,25,19
3,IndiGo,Kolkata,Banglore,5h 25m,1,No info,6218.0,12,5,2019,23,30,18,5,5
4,IndiGo,Banglore,New Delhi,4h 45m,1,No info,13302.0,1,3,2019,21,35,16,50,4


In [68]:
# Check for erroneous entries: durations recorded as '5m' with no hour part
minutes_only = data['Duration_Hour'] == '5m'
data.loc[minutes_only]

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes,Duration_Hour
6474,Air India,Mumbai,Hyderabad,5m,2,No info,17327.0,6,3,2019,16,55,16,50,5m
13343,Air India,Mumbai,Hyderabad,5m,2,No info,,12,3,2019,16,55,16,50,5m


In [69]:
# Drop erroneous entries: rows whose hour token is not a number (e.g. '5m').
# The rows are selected with a mask rather than by hard-coded index labels, so the
# cell stays correct if the data or index changes and can be re-run safely
# (a second run simply finds nothing left to drop).
# NOTE(review): one of the affected rows has no Price, i.e. it comes from the
# unlabeled test file; dropping it shrinks the test set by one row. Confirm that is acceptable.
non_numeric_hours = pd.to_numeric(data['Duration_Hour'], errors='coerce').isna()
data.drop(index=data.index[non_numeric_hours], inplace=True)

In [70]:
# Cast the hour token from text to int; this only succeeds once the
# non-numeric '5m' rows have been removed in the previous cell.
data['Duration_Hour'] = data['Duration_Hour'].astype(int)

In [71]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13351 entries, 0 to 13353
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Airline            13351 non-null  object 
 1   Source             13351 non-null  object 
 2   Destination        13351 non-null  object 
 3   Duration           13351 non-null  object 
 4   Total_Stops        13351 non-null  int64  
 5   Additional_Info    13351 non-null  object 
 6   Price              10681 non-null  float64
 7   Date               13351 non-null  int64  
 8   Month              13351 non-null  int64  
 9   Year               13351 non-null  int64  
 10  Arrival_Hour       13351 non-null  int64  
 11  Arrival_Minutes    13351 non-null  int64  
 12  Departure_Hour     13351 non-null  int64  
 13  Departure_Minutes  13351 non-null  int64  
 14  Duration_Hour      13351 non-null  int64  
dtypes: float64(1), int64(9), object(5)
memory usage: 1.6+ MB


In [72]:
# Split the Duration column to get the minutes (e.g. '2h 50m' -> '50').
# Durations with no minutes token (e.g. '19h') yield NaN here.
minutes_token = data['Duration'].str.split(' ').str[1]
data['Duration_Minutes'] = minutes_token.str.split('m').str[0]
data.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes,Duration_Hour,Duration_Minutes
0,IndiGo,Banglore,New Delhi,2h 50m,0,No info,3897.0,24,3,2019,1,10,22,20,2,50.0
1,Air India,Kolkata,Banglore,7h 25m,2,No info,7662.0,1,5,2019,13,15,5,50,7,25.0
2,Jet Airways,Delhi,Cochin,19h,2,No info,13882.0,9,6,2019,4,25,9,25,19,
3,IndiGo,Kolkata,Banglore,5h 25m,1,No info,6218.0,12,5,2019,23,30,18,5,5,25.0
4,IndiGo,Banglore,New Delhi,4h 45m,1,No info,13302.0,1,3,2019,21,35,16,50,4,45.0


In [75]:
# NaN value check: durations without a minutes part (e.g. '19h') leave
# Duration_Minutes as NaN. The count is only non-zero when this cell runs
# before the fill cell below; once the fill has run it reports 0.
data['Duration_Minutes'].isna().sum()

np.int64(0)

In [74]:
# Replace missing minutes with the text '0' (whole-hour durations have no minutes part)
missing_minutes = data['Duration_Minutes'].isna()
data.loc[missing_minutes, 'Duration_Minutes'] = '0'

In [77]:
# Convert both duration parts to integers
for column in ('Duration_Hour', 'Duration_Minutes'):
    data[column] = data[column].astype(int)

In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13351 entries, 0 to 13353
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Airline            13351 non-null  object 
 1   Source             13351 non-null  object 
 2   Destination        13351 non-null  object 
 3   Duration           13351 non-null  object 
 4   Total_Stops        13351 non-null  int64  
 5   Additional_Info    13351 non-null  object 
 6   Price              10681 non-null  float64
 7   Date               13351 non-null  int64  
 8   Month              13351 non-null  int64  
 9   Year               13351 non-null  int64  
 10  Arrival_Hour       13351 non-null  int64  
 11  Arrival_Minutes    13351 non-null  int64  
 12  Departure_Hour     13351 non-null  int64  
 13  Departure_Minutes  13351 non-null  int64  
 14  Duration_Hour      13351 non-null  int64  
 15  Duration_Minutes   13351 non-null  int64  
dtypes: float64(1), int64(10), o

In [81]:
# Express the hour part of the duration in minutes
MINUTES_PER_HOUR = 60
data['Hours_in_Minutes'] = MINUTES_PER_HOUR * data['Duration_Hour']

In [82]:
data.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes,Duration_Hour,Duration_Minutes,Hours_in_Minutes
0,IndiGo,Banglore,New Delhi,2h 50m,0,No info,3897.0,24,3,2019,1,10,22,20,2,50,120
1,Air India,Kolkata,Banglore,7h 25m,2,No info,7662.0,1,5,2019,13,15,5,50,7,25,420
2,Jet Airways,Delhi,Cochin,19h,2,No info,13882.0,9,6,2019,4,25,9,25,19,0,1140
3,IndiGo,Kolkata,Banglore,5h 25m,1,No info,6218.0,12,5,2019,23,30,18,5,5,25,300
4,IndiGo,Banglore,New Delhi,4h 45m,1,No info,13302.0,1,3,2019,21,35,16,50,4,45,240


In [83]:
# Total duration in minutes; overwrites the original text Duration column
whole_hours = data['Hours_in_Minutes']
leftover_minutes = data['Duration_Minutes']
data['Duration'] = whole_hours + leftover_minutes

In [87]:
data.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes,Duration_Hour,Duration_Minutes,Hours_in_Minutes
0,IndiGo,Banglore,New Delhi,170,0,No info,3897.0,24,3,2019,1,10,22,20,2,50,120
1,Air India,Kolkata,Banglore,445,2,No info,7662.0,1,5,2019,13,15,5,50,7,25,420
2,Jet Airways,Delhi,Cochin,1140,2,No info,13882.0,9,6,2019,4,25,9,25,19,0,1140
3,IndiGo,Kolkata,Banglore,325,1,No info,6218.0,12,5,2019,23,30,18,5,5,25,300
4,IndiGo,Banglore,New Delhi,285,1,No info,13302.0,1,3,2019,21,35,16,50,4,45,240


In [88]:
# The helper columns are no longer needed now that Duration holds total minutes
duration_helpers = ['Duration_Hour', 'Duration_Minutes', 'Hours_in_Minutes']
data.drop(columns=duration_helpers, inplace=True)

In [89]:
data.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes
0,IndiGo,Banglore,New Delhi,170,0,No info,3897.0,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,445,2,No info,7662.0,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,1140,2,No info,13882.0,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,325,1,No info,6218.0,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,285,1,No info,13302.0,1,3,2019,21,35,16,50


***Categorical Features***

- Airline
- Source
- Destination
- Additional_Info

In [90]:
# Encoder instance for the categorical features.
# LabelEncoder is imported in the import cell at the top of the notebook so that
# all dependencies are declared in one place.
label_encoder = LabelEncoder()

In [91]:
# Label encoding the categorical features.
# Each column gets its own fitted encoder, kept in `label_encoders`, so the
# category <-> code mapping of every column stays available afterwards
# (e.g. for inverse_transform or for encoding new data). Refitting one shared
# encoder for every column would keep only the mapping of the last column.
# The resulting codes are the same as with a single shared encoder.
categorical_columns = ['Airline', 'Source', 'Destination', 'Additional_Info']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Backward compatibility: `label_encoder` used to end up fitted on the last
# column (Additional_Info); keep that name bound to an equivalent encoder.
label_encoder = label_encoders['Additional_Info']


In [92]:
data.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_Hour,Arrival_Minutes,Departure_Hour,Departure_Minutes
0,3,0,5,170,0,8,3897.0,24,3,2019,1,10,22,20
1,1,3,0,445,2,8,7662.0,1,5,2019,13,15,5,50
2,4,2,1,1140,2,8,13882.0,9,6,2019,4,25,9,25
3,3,3,0,325,1,8,6218.0,12,5,2019,23,30,18,5
4,3,0,5,285,1,8,13302.0,1,3,2019,21,35,16,50
