In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [177]:
df = pd.read_csv("/content/flight_price.csv")
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [179]:
df.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [180]:
# There is a feature which is present as data but it is object data type so now we will seperate it into
# months year and day

In [181]:
df["Date"] = df["Date_of_Journey"].str.split("/").str[0]

In [182]:
df["Month"] = df["Date_of_Journey"].str.split("/").str[1]
df["Year"] = df["Date_of_Journey"].str.split("/").str[2]

In [183]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
 11  Date             10683 non-null  object
 12  Month            10683 non-null  object
 13  Year             10683 non-null  object
dtypes: int64(1), object(13)
memory usage: 1.1+ MB


In [184]:
# We can clearly see that the date month and year are still objects , to perform any prediction algo we need to
# convert it into numericals

In [185]:
df["Date"] = df["Date"].astype(int)
df["Month"] = df["Month"].astype(int)
df["Year"] = df["Year"].astype(int)

In [186]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
 11  Date             10683 non-null  int64 
 12  Month            10683 non-null  int64 
 13  Year             10683 non-null  int64 
dtypes: int64(4), object(10)
memory usage: 1.1+ MB


In [187]:
df.drop(["Date_of_Journey"],axis = 1,inplace = True)

In [188]:
df["Arrival_hour"] = df["Arrival_Time"].str.split(":").str[0]
df["Arrival_min"] = df["Arrival_Time"].str.split(":").str[1]

In [189]:
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019,1,10 22 Mar
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6,2019,4,25 10 Jun
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5,2019,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3,2019,21,35


In [190]:
# here we are getting the arrival minutes with some dates also we need to modify it to get the correct time

In [191]:
df["Arrival_Time"] = df["Arrival_Time"].apply(lambda x:x.split(" ")[0])

In [192]:
df["Arrival_hour"] = df["Arrival_Time"].str.split(":").str[0]
df["Arrival_min"] = df["Arrival_Time"].str.split(":").str[1]

In [193]:
df["Arrival_hour"] = df["Arrival_hour"].astype(int)
df["Arrival_min"] = df["Arrival_min"].astype(int)

In [194]:
df.drop(["Arrival_Time"],axis = 1,inplace = True)

In [195]:
df.head(1)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,2h 50m,non-stop,No info,3897,24,3,2019,1,10


In [196]:
df["Dep_hour"] = df["Dep_Time"].str.split(":").str[0]
df["Dep_min"] = df["Dep_Time"].str.split(":").str[1]

In [197]:
df["Dep_hour"] = df["Dep_hour"].astype(int)
df["Dep_min"] = df["Dep_min"].astype(int)


In [198]:
df.drop(["Dep_Time"],axis =1,inplace = True)

In [199]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Source           10683 non-null  object
 2   Destination      10683 non-null  object
 3   Route            10682 non-null  object
 4   Duration         10683 non-null  object
 5   Total_Stops      10682 non-null  object
 6   Additional_Info  10683 non-null  object
 7   Price            10683 non-null  int64 
 8   Date             10683 non-null  int64 
 9   Month            10683 non-null  int64 
 10  Year             10683 non-null  int64 
 11  Arrival_hour     10683 non-null  int64 
 12  Arrival_min      10683 non-null  int64 
 13  Dep_hour         10683 non-null  int64 
 14  Dep_min          10683 non-null  int64 
dtypes: int64(8), object(7)
memory usage: 1.2+ MB


In [200]:
df["Duration_hours"]  = df["Duration"].str.split(" ").str[0].str.split("h").str[0]
df["Duration_minutes"]  = df["Duration"].str.split(" ").str[1].str.split("m").str[0]
# df["Duration_minutes"] = df["Duration_minutes"].str.split("m").str[0]

In [203]:
df["Duration_minutes"].isnull().sum()

1032

In [209]:
df["Indexes"]= df["Duration_hours"].str.find("m")

In [223]:
for i in range(1,10683):
  if(df["Duration_hours"][i]=="5m"):
    print(i)

6474


In [227]:
df.iloc[6474]

Airline                         Air India
Source                             Mumbai
Destination                     Hyderabad
Route               BOM → GOI → PNQ → HYD
Duration                               5m
Total_Stops                       2 stops
Additional_Info                   No info
Price                               17327
Date                                    6
Month                                   3
Year                                 2019
Arrival_hour                           16
Arrival_min                            55
Dep_hour                               16
Dep_min                                50
Duration_hours                         5m
Duration_minutes                        0
Indexes                                 1
Name: 6474, dtype: object

In [230]:
df.drop(6474,inplace = True)

In [231]:
# df.drop(["Duration"],axis = 1,inplace = True)

df["Duration_minutes"].fillna("0",inplace = True)
df["Duration_minutes"] = df["Duration_minutes"].astype(int)
df["Duration_hours"] = df["Duration_hours"].astype(int)
df.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dep_hour,Dep_min,Duration_hours,Duration_minutes,Indexes
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,2019,1,10,22,20,2,50,-1
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,2019,13,15,5,50,7,25,-1
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,2019,4,25,9,25,19,0,-1
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,12,5,2019,23,30,18,5,5,25,-1
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,1,3,2019,21,35,16,50,4,45,-1


In [232]:
df["Additional_Info"].unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [233]:
df["Total_Stops"].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
      dtype=object)

In [234]:
df[df["Total_Stops"].isnull()]

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dep_hour,Dep_min,Duration_hours,Duration_minutes,Indexes
9039,Air India,Delhi,Cochin,,23h 40m,,No info,7480,6,5,2019,9,25,9,45,23,40,-1


In [235]:
df["Total_Stops"].mode()

0    1 stop
Name: Total_Stops, dtype: object

In [236]:
df["Total_Stops"].fillna("1 stop",inplace = True)

In [237]:

# Import label encoder
from sklearn import preprocessing

# label_encoder object knows
# how to understand word labels.
label_encoder = preprocessing.LabelEncoder()


df['Total_Stops']= label_encoder.fit_transform(df['Total_Stops'])

In [238]:
df.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dep_hour,Dep_min,Duration_hours,Duration_minutes,Indexes
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,4,No info,3897,24,3,2019,1,10,22,20,2,50,-1
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,1,No info,7662,1,5,2019,13,15,5,50,7,25,-1


In [239]:
df["Total_Stops"].unique()

array([4, 1, 0, 2, 3])

In [240]:
df["Total_Stops"] = df["Total_Stops"].map({4:0,0:1,1:2,2:3,3:4})

In [241]:
df["Airline"]= label_encoder.fit_transform(df["Airline"])
df["Source"]= label_encoder.fit_transform(df["Source"])
df["Destination"] = label_encoder.fit_transform(df["Destination"])

In [242]:
df.drop(["Route"],axis = 1,inplace = True)
df.head(10)

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dep_hour,Dep_min,Duration_hours,Duration_minutes,Indexes
0,3,0,5,2h 50m,0,No info,3897,24,3,2019,1,10,22,20,2,50,-1
1,1,3,0,7h 25m,2,No info,7662,1,5,2019,13,15,5,50,7,25,-1
2,4,2,1,19h,2,No info,13882,9,6,2019,4,25,9,25,19,0,-1
3,3,3,0,5h 25m,1,No info,6218,12,5,2019,23,30,18,5,5,25,-1
4,3,0,5,4h 45m,1,No info,13302,1,3,2019,21,35,16,50,4,45,-1
5,8,3,0,2h 25m,0,No info,3873,24,6,2019,11,25,9,0,2,25,-1
6,4,0,5,15h 30m,1,In-flight meal not included,11087,12,3,2019,10,25,18,55,15,30,-1
7,4,0,5,21h 5m,1,No info,22270,1,3,2019,5,5,8,0,21,5,-1
8,4,0,5,25h 30m,1,In-flight meal not included,11087,12,3,2019,10,25,8,55,25,30,-1
9,6,2,1,7h 50m,1,No info,8625,27,5,2019,19,15,11,25,7,50,-1


In [243]:
from sklearn.linear_model import Lasso

In [244]:
LassoRegression = Lasso()

In [245]:
from sklearn.model_selection import train_test_split

In [246]:
x = df.drop(["Price"],axis = 1)
y = df.Price

In [247]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

In [248]:
x_train.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Date,Month,Year,Arrival_hour,Arrival_min,Dep_hour,Dep_min,Duration_hours,Duration_minutes,Indexes
7123,6,2,1,11h,1,No info,12,6,2019,21,0,10,0,11,0,-1
9862,4,3,0,22h 15m,1,No info,1,4,2019,18,15,20,0,22,15,-1
8934,4,0,5,18h 10m,1,No info,18,3,2019,8,15,14,5,18,10,-1
2933,2,0,2,2h 50m,0,No info,6,5,2019,14,30,11,40,2,50,-1
5761,4,2,1,3h 15m,0,No info,9,3,2019,14,15,11,0,3,15,-1


In [249]:
df["Additional_Info"] = label_encoder.fit_transform(df["Additional_Info"])

In [261]:
df.drop(["Duration"],axis = 1,inplace = True)

In [263]:
df.head(10)

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dep_hour,Dep_min,Duration_hours,Duration_minutes
0,3,0,5,0,8,3897,24,3,2019,1,10,22,20,2,50
1,1,3,0,2,8,7662,1,5,2019,13,15,5,50,7,25
2,4,2,1,2,8,13882,9,6,2019,4,25,9,25,19,0
3,3,3,0,1,8,6218,12,5,2019,23,30,18,5,5,25
4,3,0,5,1,8,13302,1,3,2019,21,35,16,50,4,45
5,8,3,0,0,8,3873,24,6,2019,11,25,9,0,2,25
6,4,0,5,1,5,11087,12,3,2019,10,25,18,55,15,30
7,4,0,5,1,8,22270,1,3,2019,5,5,8,0,21,5
8,4,0,5,1,5,11087,12,3,2019,10,25,8,55,25,30
9,6,2,1,1,8,8625,27,5,2019,19,15,11,25,7,50
