In [530]:
import pandas as pd
import numpy as np
import re
import os
from feature_engine.outliers import Winsorizer
from sklearn.model_selection import train_test_split

In [478]:
PROJECT_DIR = r"C:\Users\yashg\OneDrive\Desktop\flight-sagemaker"
DATA_DIR = r"data"
file_name = r"flight_price.csv"

In [479]:
def get_data(file_name):
    flights_df = pd.read_csv(os.path.join(PROJECT_DIR, DATA_DIR,file_name)) 
    return flights_df

flights_df = get_data(file_name)

In [480]:
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [481]:
flights_df.shape

(10683, 11)

In [482]:
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [483]:
flights_df.columns.to_list()

['Airline',
 'Date_of_Journey',
 'Source',
 'Destination',
 'Route',
 'Dep_Time',
 'Arrival_Time',
 'Duration',
 'Total_Stops',
 'Additional_Info',
 'Price']

In [484]:
flights_df.columns = [i.lower() for i in flights_df.columns.to_list()]

In [485]:
flights_df.columns

Index(['airline', 'date_of_journey', 'source', 'destination', 'route',
       'dep_time', 'arrival_time', 'duration', 'total_stops',
       'additional_info', 'price'],
      dtype='object')

## Data Cleaning

#### Now lets focus on Null and Duplicate values

In [486]:
flights_df.isnull().sum()

airline            0
date_of_journey    0
source             0
destination        0
route              1
dep_time           0
arrival_time       0
duration           0
total_stops        1
additional_info    0
price              0
dtype: int64

In [487]:
## Null_values

flights_df[flights_df.isnull().any(axis=1)]

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


In [488]:
flights_df.dropna(axis=0, inplace=True)

In [489]:
flights_df.isnull().sum()

airline            0
date_of_journey    0
source             0
destination        0
route              0
dep_time           0
arrival_time       0
duration           0
total_stops        0
additional_info    0
price              0
dtype: int64

In [490]:
flights_df.head()

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [491]:
## Duplicate rows

flights_df[flights_df.duplicated(keep=False)].sort_values(["airline", "date_of_journey", "source", "destination"])

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1495,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
...,...,...,...,...,...,...,...,...,...,...,...
2692,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
2870,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
3711,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,20:30,23:20,2h 50m,non-stop,No check-in baggage included,3873
2634,Vistara,24/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,5403


In [492]:
flights_df.drop_duplicates(inplace=True)

In [493]:
flights_df

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


#### Airline column clean

In [494]:
flights_df["airline"].value_counts(normalize=True)

airline
Jet Airways                          0.353661
IndiGo                               0.195278
Air India                            0.161919
Multiple carriers                    0.114318
SpiceJet                             0.077901
Vistara                              0.045689
Air Asia                             0.030491
GoAir                                0.018543
Multiple carriers Premium economy    0.001243
Jet Airways Business                 0.000574
Vistara Premium economy              0.000287
Trujet                               0.000096
Name: proportion, dtype: float64

In [499]:
flights_df.groupby("airline")["price"].max()

airline
Air Asia                             13774
Air India                            31945
GoAir                                22794
IndiGo                               22153
Jet Airways                          54826
Jet Airways Business                 79512
Multiple carriers                    36983
Multiple carriers Premium economy    14629
SpiceJet                             23267
Trujet                                4140
Vistara                              21730
Vistara Premium economy              11793
Name: price, dtype: int64

In [500]:
flights_df.groupby("airline")["price"].min()

airline
Air Asia                              3383
Air India                             2050
GoAir                                 3398
IndiGo                                2227
Jet Airways                           1840
Jet Airways Business                 46490
Multiple carriers                     5797
Multiple carriers Premium economy     9845
SpiceJet                              1759
Trujet                                4140
Vistara                               3687
Vistara Premium economy               5969
Name: price, dtype: int64

In [426]:
# Using vectorized operations with str.replace
flights_df["airline"] = flights_df["airline"].str.replace(" Premium economy", "", regex=False)
flights_df["airline"] = flights_df["airline"].str.replace(" Business", "", regex=False)



In [503]:
flights_df[flights_df["price"] > 50000]

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
657,Jet Airways Business,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,05:45,10:45,5h,1 stop,No info,52229
1478,Jet Airways,18/03/2019,Banglore,New Delhi,BLR → BOM → DEL,18:40,00:45 16 Mar,6h 5m,1 stop,No info,54826
2618,Jet Airways,18/03/2019,Banglore,New Delhi,BLR → BOM → DEL,22:50,05:05 16 Mar,6h 15m,1 stop,No info,54826
2924,Jet Airways Business,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,05:45,11:25,5h 40m,1 stop,Business class,79512
5372,Jet Airways Business,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,05:45,12:25,6h 40m,1 stop,Business class,62427
5439,Jet Airways,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,16:55,23:00,6h 5m,1 stop,No info,54826
9715,Jet Airways Business,6/03/2019,Delhi,Cochin,DEL → ATQ → BOM → COK,20:05,04:25 07 Mar,8h 20m,2 stops,No info,52285
10364,Jet Airways Business,01/03/2019,Banglore,New Delhi,BLR → MAA → DEL,09:45,14:25,4h 40m,1 stop,Business class,57209


#### Datetime cloumns

In [428]:
flights_df[["date_of_journey", "dep_time", "arrival_time", "duration"]]

Unnamed: 0,date_of_journey,dep_time,arrival_time,duration
0,24/03/2019,22:20,01:10 22 Mar,2h 50m
1,1/05/2019,05:50,13:15,7h 25m
2,9/06/2019,09:25,04:25 10 Jun,19h
3,12/05/2019,18:05,23:30,5h 25m
4,01/03/2019,16:50,21:35,4h 45m
...,...,...,...,...
10678,9/04/2019,19:55,22:25,2h 30m
10679,27/04/2019,20:45,23:20,2h 35m
10680,27/04/2019,08:20,11:20,3h
10681,01/03/2019,11:30,14:10,2h 40m


In [429]:
# Date of journey
flights_df["date_of_journey"] = pd.to_datetime(flights_df["date_of_journey"], format= "%d/%m/%Y")
flights_df.head()

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [430]:
flights_df.sort_values("date_of_journey", ascending=True, inplace=True)

In [431]:
# Assuming dep_time is in "HH:MM" format
flights_df["dep_time"] =  pd.to_datetime(flights_df["dep_time"], format='%H:%M').dt.time
flights_df.head()

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
8957,Jet Airways,2019-03-01,Banglore,New Delhi,BLR → BOM → DEL,08:55:00,05:05 02 Mar,20h 10m,1 stop,1 Long layover,26890
7137,IndiGo,2019-03-01,Banglore,New Delhi,BLR → DEL,21:20:00,00:15 02 Mar,2h 55m,non-stop,No info,12649
1353,Air India,2019-03-01,Banglore,New Delhi,BLR → BOM → IDR → DEL,06:45:00,11:10 02 Mar,28h 25m,2 stops,No info,20999
8940,Air India,2019-03-01,Banglore,New Delhi,BLR → DEL,21:10:00,23:55,2h 45m,non-stop,No info,25703
8937,IndiGo,2019-03-01,Banglore,New Delhi,BLR → DEL,13:00:00,15:50,2h 50m,non-stop,No info,19685


In [432]:
# Assuming arrival_time is in "HH:MM" format

flights_df["arrival_time"]  = pd.to_datetime(flights_df["arrival_time"].str.split(" ", expand=True)[0]).dt.time
flights_df.head()

  flights_df["arrival_time"]  = pd.to_datetime(flights_df["arrival_time"].str.split(" ", expand=True)[0]).dt.time


Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
8957,Jet Airways,2019-03-01,Banglore,New Delhi,BLR → BOM → DEL,08:55:00,05:05:00,20h 10m,1 stop,1 Long layover,26890
7137,IndiGo,2019-03-01,Banglore,New Delhi,BLR → DEL,21:20:00,00:15:00,2h 55m,non-stop,No info,12649
1353,Air India,2019-03-01,Banglore,New Delhi,BLR → BOM → IDR → DEL,06:45:00,11:10:00,28h 25m,2 stops,No info,20999
8940,Air India,2019-03-01,Banglore,New Delhi,BLR → DEL,21:10:00,23:55:00,2h 45m,non-stop,No info,25703
8937,IndiGo,2019-03-01,Banglore,New Delhi,BLR → DEL,13:00:00,15:50:00,2h 50m,non-stop,No info,19685


In [433]:
## Fixing duration 

In [434]:
flights_df["duration"].str.split(" ", expand=True)

Unnamed: 0,0,1
8957,20h,10m
7137,2h,55m
1353,28h,25m
8940,2h,45m
8937,2h,50m
...,...,...
5740,8h,
10367,1h,30m
1986,2h,25m
6089,30h,45m


In [435]:
flights_df.duration.loc[lambda ser: ~ser.str.contains("m")].unique()

array(['20h', '13h', '5h', '27h', '3h', '9h', '26h', '7h', '14h', '47h',
       '11h', '8h', '4h', '21h', '6h', '23h', '12h', '24h', '16h', '15h',
       '25h', '10h', '22h', '38h', '30h', '34h', '19h', '18h', '28h'],
      dtype=object)

In [436]:
flights_df.duration.loc[lambda ser: ~ser.str.contains("h")].unique()

array(['5m'], dtype=object)

In [437]:
flights_df.duration.loc[lambda ser: ~ser.str.contains("h")]

6474    5m
Name: duration, dtype: object

In [438]:
## lets drop 6474 column, becoz a flight can't have 5 minute duration

flights_df.drop(6474, inplace=True)

In [439]:
duration = flights_df["duration"].str.split(" ", expand=True).set_axis(["hour", "minute"], axis=1)

In [440]:
flights_df["duration_in_minute"] = duration["hour"].str.replace("h","").astype(int)*60 + duration["minute"].str.replace("m","").fillna(0).astype(int)

In [441]:
flights_df.drop("duration", axis=1, inplace=True)

In [442]:
flights_df.head()


Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,total_stops,additional_info,price,duration_in_minute
8957,Jet Airways,2019-03-01,Banglore,New Delhi,BLR → BOM → DEL,08:55:00,05:05:00,1 stop,1 Long layover,26890,1210
7137,IndiGo,2019-03-01,Banglore,New Delhi,BLR → DEL,21:20:00,00:15:00,non-stop,No info,12649,175
1353,Air India,2019-03-01,Banglore,New Delhi,BLR → BOM → IDR → DEL,06:45:00,11:10:00,2 stops,No info,20999,1705
8940,Air India,2019-03-01,Banglore,New Delhi,BLR → DEL,21:10:00,23:55:00,non-stop,No info,25703,165
8937,IndiGo,2019-03-01,Banglore,New Delhi,BLR → DEL,13:00:00,15:50:00,non-stop,No info,19685,170


#### source column

In [443]:
flights_df["source"].value_counts(normalize=True)

source
Delhi       0.415352
Kolkata     0.273396
Banglore    0.208297
Mumbai      0.066533
Chennai     0.036421
Name: proportion, dtype: float64

#### destination column

In [444]:
flights_df["destination"].value_counts(normalize=True)

destination
Cochin       0.415352
Banglore     0.273396
Delhi        0.120925
New Delhi    0.087372
Hyderabad    0.066533
Kolkata      0.036421
Name: proportion, dtype: float64

In [445]:
flights_df["destination"] = flights_df["destination"].str.replace("New Delhi", "Delhi")

#### route

In [446]:
flights_df.head()

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,total_stops,additional_info,price,duration_in_minute
8957,Jet Airways,2019-03-01,Banglore,Delhi,BLR → BOM → DEL,08:55:00,05:05:00,1 stop,1 Long layover,26890,1210
7137,IndiGo,2019-03-01,Banglore,Delhi,BLR → DEL,21:20:00,00:15:00,non-stop,No info,12649,175
1353,Air India,2019-03-01,Banglore,Delhi,BLR → BOM → IDR → DEL,06:45:00,11:10:00,2 stops,No info,20999,1705
8940,Air India,2019-03-01,Banglore,Delhi,BLR → DEL,21:10:00,23:55:00,non-stop,No info,25703,165
8937,IndiGo,2019-03-01,Banglore,Delhi,BLR → DEL,13:00:00,15:50:00,non-stop,No info,19685,170


#### total_stops

In [447]:
flights_df["total_stops"].value_counts(normalize=True)

total_stops
1 stop      0.537711
non-stop    0.332186
2 stops     0.125896
3 stops     0.004111
4 stops     0.000096
Name: proportion, dtype: float64

In [448]:
flights_df["total_stops"] = flights_df["total_stops"].str.split(" ", expand=True)[0].replace("non-stop", 0).astype(int)

In [449]:
flights_df["total_stops"].value_counts(normalize=True)

total_stops
1    0.537711
0    0.332186
2    0.125896
3    0.004111
4    0.000096
Name: proportion, dtype: float64

#### additional_info

In [450]:
flights_df["additional_info"].value_counts(normalize=True)

additional_info
No info                         0.782048
In-flight meal not included     0.184112
No check-in baggage included    0.030399
1 Long layover                  0.001816
Change airports                 0.000669
Business class                  0.000382
No Info                         0.000287
1 Short layover                 0.000096
Red-eye flight                  0.000096
2 Long layover                  0.000096
Name: proportion, dtype: float64

In [564]:
## Optimized fucntion code
def clean_data(df):
    # Drop missing and duplicate rows
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df.columns = [i.lower() for i in df.columns.to_list()]


    # Using vectorized operations with str.replace
    df["airline"] = df["airline"].str.replace(" Premium economy", "", regex=False)
    df["airline"] = df["airline"].str.replace(" Business", "", regex=False)


    # Convert date and time columns
    df["date_of_journey"] = pd.to_datetime(df["date_of_journey"], format="%d/%m/%Y")
    df["dep_time"] = pd.to_datetime(df["dep_time"], format='%H:%M').dt.time
    df["arrival_time"] = pd.to_datetime(df["arrival_time"].str.split(" ", n=1).str[0], dayfirst=True, format="mixed").dt.time

    # Parse and convert 'duration' to minutes
    df.drop(6474, inplace=True)
    duration = df["duration"].str.split(" ", expand=True).set_axis(["hour", "minute"], axis=1)
    df["duration"] = duration["hour"].str.replace("h","").astype(int)*60 + duration["minute"].str.replace("m","").fillna(0).astype(int)

    # Standardize 'destination' column
    df["destination"] = df["destination"].replace("New Delhi", "Delhi")

    # Imouting numeric in Total stops
    df["total_stops"] = df["total_stops"].str.split(" ", expand=True)[0].replace("non-stop", 0).astype(int)

    df.drop(["route","additional_info"], axis=1, inplace=True)
    
    
    return df



In [565]:
flights = pd.read_csv("C:/Users/yashg/OneDrive/Desktop/flight-sagemaker/data/flight_price.csv")

In [566]:
flights_cleaned = clean_data(flights)

In [567]:
flights_cleaned.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
0,IndiGo,2019-03-24,Banglore,Delhi,22:20:00,01:10:00,170,0,3897
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,445,2,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,1140,2,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,325,1,6218
4,IndiGo,2019-03-01,Banglore,Delhi,16:50:00,21:35:00,285,1,13302


In [568]:
flights_cleaned.shape

(10461, 9)

## Split Data 

In [569]:
X = flights_cleaned.drop("price", axis=1)
y = flights_cleaned["price"]

In [570]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2)

In [571]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, random_state=42, test_size=0.2)

In [572]:
print("train data shape" , (X_train.shape , y_train.shape))
print("val data shape" , (X_val.shape , y_val.shape))
print("test data shape" , (X_test.shape , y_test.shape))

train data shape ((6694, 8), (6694,))
val data shape ((1674, 8), (1674,))
test data shape ((2093, 8), (2093,))


In [573]:
win = Winsorizer(capping_method="iqr", fold=1.5)
X_train["price"] = win.fit_transform(pd.DataFrame(y_train, columns = ["price"]))

In [574]:
def export_data_to_csv(X,y,name):
    X["price"]  = y
    file_name = f"{name}.csv"
    X.to_csv(os.path.join(PROJECT_DIR, DATA_DIR,file_name), index=False)
    return X.head()

    
    

In [575]:
## Trian dataset export
export_data_to_csv(X_train,y_train,"train")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
3832,Jet Airways,2019-05-27,Delhi,Cochin,20:55:00,12:35:00,940,1,12898
3095,Jet Airways,2019-06-12,Kolkata,Banglore,18:55:00,16:20:00,1285,1,13044
9717,Air India,2019-05-18,Delhi,Cochin,09:45:00,09:25:00,1420,2,10975
3262,IndiGo,2019-06-03,Mumbai,Hyderabad,21:20:00,22:50:00,90,0,2227
6502,Jet Airways,2019-04-01,Mumbai,Hyderabad,02:55:00,04:20:00,85,0,5678


In [576]:
## Test dataset export
export_data_to_csv(X_test,y_test,"test")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
2150,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,08:15:00,1455,1,17996
3784,SpiceJet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0,3873
714,IndiGo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0,4462
7558,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0,2228
7413,SpiceJet,2019-04-27,Banglore,Delhi,09:30:00,12:20:00,170,0,4991


In [577]:
## Val dataset export
export_data_to_csv(X_val,y_val,"val")

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,price
4171,Jet Airways,2019-05-27,Delhi,Cochin,09:00:00,19:00:00,600,1,10675
9069,Jet Airways,2019-05-24,Kolkata,Banglore,18:55:00,10:05:00,910,1,8586
2010,Jet Airways,2019-03-18,Banglore,Delhi,21:25:00,09:30:00,725,1,13555
5454,SpiceJet,2019-06-27,Chennai,Kolkata,17:45:00,20:05:00,140,0,3543
6302,Air Asia,2019-05-15,Kolkata,Banglore,07:35:00,19:25:00,710,1,5192
