## Importing libraries for Data importing and Visualisation

In [1]:
import pandas as pd
import numpy as np
import sweetviz
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

### Importing Training Dataset

In [2]:
data = pd.read_excel("Data_Train.xlsx")

In [3]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
data.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [5]:
data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

## Data Analyzing using sweetviz library

In [6]:
my_report = sweetviz.analyze([data, "Train"],target_feat="Price")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=12.0), HTML(value='')), l…




In [7]:
my_report.show_html("Report.html")

Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Exploratory Data Analysis

In [8]:
## Dropping Route feature because it's highly correlated with Total_stops feature
## Dropping Additional_info beacuse its 80% data contains no_info

data.drop(['Route', 'Additional_Info'], axis = 1)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,22:20,01:10 22 Mar,2h 50m,non-stop,3897
1,Air India,1/05/2019,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,09:25,04:25 10 Jun,19h,2 stops,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302
...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,19:55,22:25,2h 30m,non-stop,4107
10679,Air India,27/04/2019,Kolkata,Banglore,20:45,23:20,2h 35m,non-stop,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,08:20,11:20,3h,non-stop,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,11:30,14:10,2h 40m,non-stop,12648


#### Date of journey is in object datatype. Convertng to date and time type using pandas to_datetime function

In [9]:
data["Journey_day"] = pd.to_datetime(data.Date_of_Journey,errors='ignore', format='%d/%m/%Y').dt.day

In [10]:
data["Journey_Month"] = pd.to_datetime(data.Date_of_Journey,errors='coerce', format='%d/%m/%Y').dt.month

In [11]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_Month
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3


In [12]:
## We have converted Data of journey feature to Journey date and Month. We can drop Date of Journey feature

data.drop(["Date_of_Journey","Route","Additional_Info"],axis=1)

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price,Journey_day,Journey_Month
0,IndiGo,Banglore,New Delhi,22:20,01:10 22 Mar,2h 50m,non-stop,3897,24,3
1,Air India,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662,1,5
2,Jet Airways,Delhi,Cochin,09:25,04:25 10 Jun,19h,2 stops,13882,9,6
3,IndiGo,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218,12,5
4,IndiGo,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302,1,3
...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,19:55,22:25,2h 30m,non-stop,4107,9,4
10679,Air India,Kolkata,Banglore,20:45,23:20,2h 35m,non-stop,4145,27,4
10680,Jet Airways,Banglore,Delhi,08:20,11:20,3h,non-stop,7229,27,4
10681,Vistara,Banglore,New Delhi,11:30,14:10,2h 40m,non-stop,12648,1,3


In [13]:
##Converting Departure time feature time to Depature hour and minutes

data["Depature_Hour"] = pd.to_datetime(data["Dep_Time"]).dt.hour
data["Depature_minutes"] = pd.to_datetime(data["Dep_Time"]).dt.minute

In [14]:
data.drop(["Dep_Time"], axis =1, inplace=True)

In [15]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_Month,Depature_Hour,Depature_minutes
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,22,20
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2 stops,No info,7662,1,5,5,50
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,04:25 10 Jun,19h,2 stops,No info,13882,9,6,9,25
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,23:30,5h 25m,1 stop,No info,6218,12,5,18,5
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,21:35,4h 45m,1 stop,No info,13302,1,3,16,50


In [16]:
##Converting Arrival time feature time to Arival hour and minutes

data["Arrival_Hour"] = pd.to_datetime(data["Arrival_Time"]).dt.hour
data["Arrival_minutes"] = pd.to_datetime(data["Arrival_Time"]).dt.minute

In [17]:
data.drop(["Arrival_Time"], axis =1)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_Month,Depature_Hour,Depature_minutes,Arrival_Hour,Arrival_minutes
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,5,50,13,15
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,9,25,4,25
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,12,5,18,5,23,30
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,1,3,16,50,21,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,2h 30m,non-stop,No info,4107,9,4,19,55,22,25
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,2h 35m,non-stop,No info,4145,27,4,20,45,23,20
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,3h,non-stop,No info,7229,27,4,8,20,11,20
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,2h 40m,non-stop,No info,12648,1,3,11,30,14,10


In [18]:
## Converting Duration into minutes using timedelta function
#Dropping Converted elements from dataset

data["Duration in mins"] = (pd.to_timedelta(data["Duration"]).dt.seconds // 60).astype(int) 
data.drop(['Additional_Info','Date_of_Journey','Arrival_Time','Duration','Route'], axis=1)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Journey_day,Journey_Month,Depature_Hour,Depature_minutes,Arrival_Hour,Arrival_minutes,Duration in mins
0,IndiGo,Banglore,New Delhi,non-stop,3897,24,3,22,20,1,10,170
1,Air India,Kolkata,Banglore,2 stops,7662,1,5,5,50,13,15,445
2,Jet Airways,Delhi,Cochin,2 stops,13882,9,6,9,25,4,25,1140
3,IndiGo,Kolkata,Banglore,1 stop,6218,12,5,18,5,23,30,325
4,IndiGo,Banglore,New Delhi,1 stop,13302,1,3,16,50,21,35,285
...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,non-stop,4107,9,4,19,55,22,25,150
10679,Air India,Kolkata,Banglore,non-stop,4145,27,4,20,45,23,20,155
10680,Jet Airways,Banglore,Delhi,non-stop,7229,27,4,8,20,11,20,180
10681,Vistara,Banglore,New Delhi,non-stop,12648,1,3,11,30,14,10,160


## Handling categorical Data

There are 2 types of categorical data

1. Nominal Data - data are not in any order - Handling nominal datatype using <span style="color: green;">**OneHotEncoder**</span> 
2. Ordinal Data - data are in order - Handling nominal datatype using <span style="color: green;">**LabelEncoder**</span> 

In [19]:
## Airline, Source and Destination are Nominal datatype. So we are using OneHotEncoder

Airlines = data["Airline"]
Airlines = pd.get_dummies(Airlines, drop_first=True)

Source = pd.get_dummies(data['Source'], drop_first=True)
Destination = pd.get_dummies(data['Destination'], drop_first=True)

In [20]:
data["Total_Stops"].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [21]:
## Total_Stops are ordinal data so we cconverting as LabelEncoder

data.replace({'non-stop' : 0, '1 stop' : 1, '2 stops' : 2,'3 stops': 3, "4 stops":4}, inplace =True)

In [22]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_Month,Depature_Hour,Depature_minutes,Arrival_Hour,Arrival_minutes,Duration in mins
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,0.0,No info,3897,24,3,22,20,1,10,170
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2.0,No info,7662,1,5,5,50,13,15,445
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,04:25 10 Jun,19h,2.0,No info,13882,9,6,9,25,4,25,1140
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,23:30,5h 25m,1.0,No info,6218,12,5,18,5,23,30,325
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,21:35,4h 45m,1.0,No info,13302,1,3,16,50,21,35,285


In [23]:
data.drop(['Additional_Info','Date_of_Journey','Arrival_Time','Duration','Route','Airline','Source','Destination'], axis=1)

Unnamed: 0,Total_Stops,Price,Journey_day,Journey_Month,Depature_Hour,Depature_minutes,Arrival_Hour,Arrival_minutes,Duration in mins
0,0.0,3897,24,3,22,20,1,10,170
1,2.0,7662,1,5,5,50,13,15,445
2,2.0,13882,9,6,9,25,4,25,1140
3,1.0,6218,12,5,18,5,23,30,325
4,1.0,13302,1,3,16,50,21,35,285
...,...,...,...,...,...,...,...,...,...
10678,0.0,4107,9,4,19,55,22,25,150
10679,0.0,4145,27,4,20,45,23,20,155
10680,0.0,7229,27,4,8,20,11,20,180
10681,0.0,12648,1,3,11,30,14,10,160


In [24]:
train_data = pd.concat([data,Airlines,Source,Destination], axis =1)

In [25]:
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,...,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,0.0,No info,3897,...,0,0,0,0,0,0,0,0,0,1
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2.0,No info,7662,...,0,0,0,1,0,0,0,0,0,0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,04:25 10 Jun,19h,2.0,No info,13882,...,0,0,1,0,0,1,0,0,0,0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,23:30,5h 25m,1.0,No info,6218,...,0,0,0,1,0,0,0,0,0,0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,21:35,4h 45m,1.0,No info,13302,...,0,0,0,0,0,0,0,0,0,1


In [32]:
final_train_data = train_data.drop(['Additional_Info','Date_of_Journey','Arrival_Time','Duration','Route','Airline','Source','Destination'], axis=1)

In [35]:
final_train_data.to_csv("Final_train_data.csv", index=False)