In [2]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.model_selection import train_test_split

In [3]:
PROJECT_DIR = r"C:\Users\yashg\OneDrive\Desktop\flight-sagemaker"
DATA_DIR = "data"
file_name = "flight_price.csv"

In [4]:
def get_data(file_name):
    flights_df = pd.read_csv(os.path.join(PROJECT_DIR, DATA_DIR,file_name)) 
    return flights_df

flights_df = get_data(file_name)

In [5]:
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [6]:
flights_df.shape

(10683, 11)

In [7]:
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [8]:
flights_df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price'],
      dtype='object')

## Data Cleaning

#### Now lets focus on Null and Duplicate values

In [11]:
flights_df.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [12]:
## Null_values

flights_df[flights_df.isnull().any(axis=1)]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


In [13]:
flights_df.dropna(axis=0, inplace=True)

In [14]:
flights_df.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [15]:
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [16]:
## Duplicate rows

flights_df[flights_df.duplicated(keep=False)].sort_values(["Airline", "Date_of_Journey", "Source", "Destination"])

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6321,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
9848,Air India,01/03/2019,Banglore,New Delhi,BLR → BOM → AMD → DEL,08:50,23:55 02 Mar,39h 5m,2 stops,No info,17135
572,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
8168,Air India,03/03/2019,Banglore,New Delhi,BLR → DEL,21:10,23:55,2h 45m,non-stop,No info,7591
1495,Air India,1/04/2019,Kolkata,Banglore,CCU → DEL → COK → BLR,10:00,01:20 02 Apr,15h 20m,2 stops,No info,10408
...,...,...,...,...,...,...,...,...,...,...,...
2692,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
2870,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,05:45,08:35,2h 50m,non-stop,No check-in baggage included,4273
3711,SpiceJet,24/03/2019,Banglore,New Delhi,BLR → DEL,20:30,23:20,2h 50m,non-stop,No check-in baggage included,3873
2634,Vistara,24/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,5403


In [17]:
flights_df.drop_duplicates(inplace=True)

In [18]:
flights_df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


#### Airline column clean

In [20]:
flights_df["Airline"].value_counts(normalize=True)

Airline
Jet Airways                          0.353661
IndiGo                               0.195278
Air India                            0.161919
Multiple carriers                    0.114318
SpiceJet                             0.077901
Vistara                              0.045689
Air Asia                             0.030491
GoAir                                0.018543
Multiple carriers Premium economy    0.001243
Jet Airways Business                 0.000574
Vistara Premium economy              0.000287
Trujet                               0.000096
Name: proportion, dtype: float64

In [21]:
# Using vectorized operations with str.replace
flights_df["Airline"] = flights_df["Airline"].str.replace(" Premium economy", "", regex=False)
flights_df["Airline"] = flights_df["Airline"].str.replace(" Business", "", regex=False)



In [22]:
flights_df["Airline"].value_counts()

Airline
Jet Airways          3706
IndiGo               2043
Air India            1694
Multiple carriers    1209
SpiceJet              815
Vistara               481
Air Asia              319
GoAir                 194
Trujet                  1
Name: count, dtype: int64

#### Datetime cloumns

In [24]:
flights_df[["Date_of_Journey", "Dep_Time", "Arrival_Time", "Duration"]]

Unnamed: 0,Date_of_Journey,Dep_Time,Arrival_Time,Duration
0,24/03/2019,22:20,01:10 22 Mar,2h 50m
1,1/05/2019,05:50,13:15,7h 25m
2,9/06/2019,09:25,04:25 10 Jun,19h
3,12/05/2019,18:05,23:30,5h 25m
4,01/03/2019,16:50,21:35,4h 45m
...,...,...,...,...
10678,9/04/2019,19:55,22:25,2h 30m
10679,27/04/2019,20:45,23:20,2h 35m
10680,27/04/2019,08:20,11:20,3h
10681,01/03/2019,11:30,14:10,2h 40m


In [25]:
# Date of journey
flights_df["Date_of_Journey"] = pd.to_datetime(flights_df["Date_of_Journey"], format= "%d/%m/%Y")
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [26]:
# Assuming Dep_Time is in "HH:MM" format
flights_df["Dep_Time"] =  pd.to_datetime(flights_df["Dep_Time"], format='%H:%M').dt.time
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20:00,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50:00,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25:00,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05:00,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50:00,21:35,4h 45m,1 stop,No info,13302


In [27]:
# Assuming Arrival_time is in "HH:MM" format

flights_df["Arrival_Time"]  = pd.to_datetime(flights_df["Arrival_Time"].str.split(" ", expand=True)[0]).dt.time
flights_df.head()

  flights_df["Arrival_Time"]  = pd.to_datetime(flights_df["Arrival_Time"].str.split(" ", expand=True)[0]).dt.time


Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20:00,01:10:00,2h 50m,non-stop,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50:00,13:15:00,7h 25m,2 stops,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25:00,04:25:00,19h,2 stops,No info,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05:00,23:30:00,5h 25m,1 stop,No info,6218
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50:00,21:35:00,4h 45m,1 stop,No info,13302


In [28]:
## Fixing Duration 

In [29]:
flights_df["Duration"].str.split(" ", expand=True)

Unnamed: 0,0,1
0,2h,50m
1,7h,25m
2,19h,
3,5h,25m
4,4h,45m
...,...,...
10678,2h,30m
10679,2h,35m
10680,3h,
10681,2h,40m


In [30]:
flights_df.Duration.loc[lambda ser: ~ser.str.contains("m")].unique()

array(['19h', '23h', '22h', '12h', '3h', '5h', '10h', '18h', '24h', '15h',
       '16h', '8h', '14h', '20h', '13h', '11h', '9h', '27h', '26h', '4h',
       '7h', '30h', '21h', '28h', '47h', '6h', '25h', '38h', '34h'],
      dtype=object)

In [31]:
flights_df.Duration.loc[lambda ser: ~ser.str.contains("h")].unique()

array(['5m'], dtype=object)

In [32]:
flights_df.Duration.loc[lambda ser: ~ser.str.contains("h")]

6474    5m
Name: Duration, dtype: object

In [33]:
## lets drop 6474 column, becoz a flight can't have 5 minute duration

flights_df.drop(6474, inplace=True)

In [34]:
duration = flights_df["Duration"].str.split(" ", expand=True).set_axis(["hour", "minute"], axis=1)

In [35]:
flights_df["Duration_in_minute"] = duration["hour"].str.replace("h","").astype(int)*60 + duration["minute"].str.replace("m","").fillna(0).astype(int)

In [36]:
flights_df.drop("Duration", axis=1, inplace=True)

In [37]:
flights_df.head()


Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Total_Stops,Additional_Info,Price,Duration_in_minute
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20:00,01:10:00,non-stop,No info,3897,170
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50:00,13:15:00,2 stops,No info,7662,445
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25:00,04:25:00,2 stops,No info,13882,1140
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05:00,23:30:00,1 stop,No info,6218,325
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50:00,21:35:00,1 stop,No info,13302,285


#### Source column

In [39]:
flights_df["Source"].value_counts(normalize=True)

Source
Delhi       0.415352
Kolkata     0.273396
Banglore    0.208297
Mumbai      0.066533
Chennai     0.036421
Name: proportion, dtype: float64

#### Destination column

In [41]:
flights_df["Destination"].value_counts(normalize=True)

Destination
Cochin       0.415352
Banglore     0.273396
Delhi        0.120925
New Delhi    0.087372
Hyderabad    0.066533
Kolkata      0.036421
Name: proportion, dtype: float64

In [42]:
flights_df["Destination"] = flights_df["Destination"].str.replace("New Delhi", "Delhi")

#### Route

In [44]:
flights_df.drop("Route", axis=1, inplace=True)

In [45]:
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Total_Stops,Additional_Info,Price,Duration_in_minute
0,IndiGo,2019-03-24,Banglore,Delhi,22:20:00,01:10:00,non-stop,No info,3897,170
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,2 stops,No info,7662,445
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,2 stops,No info,13882,1140
3,IndiGo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,1 stop,No info,6218,325
4,IndiGo,2019-03-01,Banglore,Delhi,16:50:00,21:35:00,1 stop,No info,13302,285


#### Total_Stops

In [47]:
flights_df["Total_Stops"].value_counts(normalize=True)

Total_Stops
1 stop      0.537711
non-stop    0.332186
2 stops     0.125896
3 stops     0.004111
4 stops     0.000096
Name: proportion, dtype: float64

In [48]:
flights_df["Total_Stops"] = flights_df["Total_Stops"].str.split(" ", expand=True)[0].replace("non-stop", 0).astype(int)

#### Additional_Info

In [50]:
flights_df["Additional_Info"].value_counts(normalize=True)

Additional_Info
No info                         0.782048
In-flight meal not included     0.184112
No check-in baggage included    0.030399
1 Long layover                  0.001816
Change airports                 0.000669
Business class                  0.000382
No Info                         0.000287
1 Short layover                 0.000096
Red-eye flight                  0.000096
2 Long layover                  0.000096
Name: proportion, dtype: float64

In [51]:
flights_df.drop("Additional_Info", axis=1, inplace=True)

In [52]:
flights_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Total_Stops,Price,Duration_in_minute
0,IndiGo,2019-03-24,Banglore,Delhi,22:20:00,01:10:00,0,3897,170
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,2,7662,445
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,2,13882,1140
3,IndiGo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,1,6218,325
4,IndiGo,2019-03-01,Banglore,Delhi,16:50:00,21:35:00,1,13302,285


In [188]:
## Final function for data clean

# def clean_data(df):
#     df.dropna(axis=0, inplace=True)
#     df.drop_duplicates(inplace=True)
#     df["Airline"] = df["Airline"].str.replace(" Premium economy", "", regex=False)
#     df["Airline"] = df["Airline"].str.replace(" Business", "", regex=False)
#     df["Date_of_Journey"] = pd.to_datetime(df["Date_of_Journey"], format= "%d/%m/%Y")
#     df["Dep_Time"] = pd.to_datetime(df["Dep_Time"], format='%H:%M').dt.time
#     df["Arrival_Time"] = pd.to_datetime(df["Arrival_Time"].str.split(" ", expand=True)[0]).dt.time
#     duration = df["Duration"].str.split(" ", expand=True).set_axis(["hour", "minute"], axis=1)
#     df["Duration_in_minute"] = duration["hour"].str.replace("h","").astype(int)*60 + duration["minute"].str.replace("m","").fillna(0).astype(int)
#     df["Destination"] = df["Destination"].str.replace("New Delhi", "Delhi")
#     df.drop("Additional_Info", axis=1, inplace=True)

#     return df


## Optimized fucntion code
def clean_data(df):
    # Drop missing and duplicate rows
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)

    # Clean 'Airline' column in one step using regex for both replacements
    df["Airline"] = df["Airline"].str.replace(r" Premium economy| Business", "", regex=True)

    # Convert date and time columns
    df["Date_of_Journey"] = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
    df["Dep_Time"] = pd.to_datetime(df["Dep_Time"], format='%H:%M').dt.time
    df["Arrival_Time"] = pd.to_datetime(df["Arrival_Time"].str.split(" ", n=1).str[0], dayfirst=True, format="mixed").dt.time

    # Parse and convert 'Duration' to minutes
    df.drop(6474, inplace=True)
    duration = df["Duration"].str.split(" ", expand=True).set_axis(["hour", "minute"], axis=1)
    df["Duration_in_minute"] = duration["hour"].str.replace("h","").astype(int)*60 + duration["minute"].str.replace("m","").fillna(0).astype(int)

    # Standardize 'Destination' column
    df["Destination"] = df["Destination"].replace("New Delhi", "Delhi")

    # Imouting numeric in Total stops
    df["Total_Stops"] = df["Total_Stops"].str.split(" ", expand=True)[0].replace("non-stop", 0).astype(int)
    
    # Drop unnecessary column
    df.drop("Additional_Info", axis=1, inplace=True)
    df.drop("Duration",axis=1, inplace=True)
    df.drop("Route",axis=1, inplace=True)

    return df



In [190]:
flights = pd.read_csv("C:/Users/yashg/OneDrive/Desktop/flight-sagemaker/data/flight_price.csv")

In [192]:
flights_cleaned = clean_data(flights)

In [194]:
flights_cleaned.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Total_Stops,Price,Duration_in_minute
0,IndiGo,2019-03-24,Banglore,Delhi,22:20:00,01:10:00,0,3897,170
1,Air India,2019-05-01,Kolkata,Banglore,05:50:00,13:15:00,2,7662,445
2,Jet Airways,2019-06-09,Delhi,Cochin,09:25:00,04:25:00,2,13882,1140
3,IndiGo,2019-05-12,Kolkata,Banglore,18:05:00,23:30:00,1,6218,325
4,IndiGo,2019-03-01,Banglore,Delhi,16:50:00,21:35:00,1,13302,285


In [195]:
flights_cleaned.shape

(10461, 9)

## Split Data 

In [200]:
flights_final = flights_cleaned.sample(9000, random_state=42)

In [201]:
flights_final.shape

(9000, 9)

In [204]:
leftout_data =  flights_cleaned.loc[~flights_cleaned.index.isin(flights_final.index)]

In [206]:
leftout_data.to_csv(os.path.join(PROJECT_DIR, DATA_DIR,"leftout_data.csv"), index=False)

In [208]:
flights_final.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Total_Stops,Price,Duration_in_minute
2150,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,08:15:00,1,17996,1455
3784,SpiceJet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,0,3873,140
714,IndiGo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,0,4462,170
7558,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,0,2228,90
7413,SpiceJet,2019-04-27,Banglore,Delhi,09:30:00,12:20:00,0,4991,170


In [210]:
X = flights_final.drop("Price", axis=1)
y = flights_final["Price"]

In [212]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2)

In [214]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, random_state=42, test_size=0.2)

In [217]:
def export_data_to_csv(X,y,name):
    X["Price"]  = y
    file_name = f"{name}.csv"
    X.to_csv(os.path.join(PROJECT_DIR, DATA_DIR,file_name), index=False)
    return X.head()

    
    

In [219]:
## Trian dataset export
export_data_to_csv(X_train,y_train,"train")

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Total_Stops,Duration_in_minute,Price
5574,IndiGo,2019-06-24,Delhi,Cochin,05:05:00,16:10:00,1,665,6442
8675,Vistara,2019-06-06,Chennai,Kolkata,07:05:00,09:20:00,0,135,3687
6730,Air India,2019-05-06,Kolkata,Banglore,12:00:00,07:55:00,2,1195,10151
1443,Air India,2019-06-24,Delhi,Cochin,17:15:00,19:15:00,2,1560,11989
9273,IndiGo,2019-04-24,Kolkata,Banglore,11:30:00,14:05:00,0,155,4804


In [220]:
## Test dataset export
export_data_to_csv(X_test,y_test,"test")

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Total_Stops,Duration_in_minute,Price
10415,Jet Airways,2019-04-03,Banglore,Delhi,18:55:00,22:00:00,0,185,4544
7381,IndiGo,2019-03-06,Delhi,Cochin,07:30:00,13:20:00,1,350,13628
542,Air India,2019-03-06,Delhi,Cochin,08:00:00,19:15:00,1,675,15284
7291,Multiple carriers,2019-06-06,Delhi,Cochin,07:05:00,19:15:00,1,730,14848
3934,SpiceJet,2019-06-21,Mumbai,Hyderabad,22:45:00,00:15:00,0,90,2017


In [221]:
## Val dataset export
export_data_to_csv(X_val,y_val,"val")

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Total_Stops,Duration_in_minute,Price
4980,Jet Airways,2019-05-01,Kolkata,Banglore,06:30:00,18:15:00,1,705,14781
6250,SpiceJet,2019-05-27,Mumbai,Hyderabad,22:45:00,00:15:00,0,90,2017
6667,Air India,2019-05-18,Delhi,Cochin,17:20:00,09:25:00,1,965,7690
4501,Jet Airways,2019-03-24,Kolkata,Banglore,06:30:00,16:20:00,1,590,8824
9445,Multiple carriers,2019-06-06,Delhi,Cochin,11:40:00,01:30:00,1,830,6795
