In [313]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## FEATURE ENGINERRING
The first step is to import the cleaned dataset and run a couple checks to make sure everything is in order!

In [315]:
df = pd.read_excel('flight_price.xlsx')
df_copy = pd.read_parquet('cleaned dataset.parquet')

In [316]:
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,Price,Arrival_Time_Hour,Dep_Time_Dummy,Dep_Time_Hour,Arrival_Time_Dummy
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2:50,0,No info,3897,01:10:00,1900-01-01 22:20:00,22:20:00,1900-01-01 01:10:00
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,7662,13:15:00,1900-01-01 05:50:00,05:50:00,1900-01-01 13:15:00


In [317]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10462 entries, 0 to 10682
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Airline             10462 non-null  object        
 1   Date of Journey     10462 non-null  datetime64[ns]
 2   Source              10462 non-null  object        
 3   Destination         10462 non-null  object        
 4   Route               10462 non-null  object        
 5   Dep Time            10462 non-null  object        
 6   Arr Time            10462 non-null  object        
 7   Duration            10462 non-null  object        
 8   Total Stops         10462 non-null  object        
 9   Additional Info     10462 non-null  object        
 10  Price               10462 non-null  int64         
 11  Arrival_Time_Hour   10462 non-null  object        
 12  Dep_Time_Dummy      10462 non-null  datetime64[ns]
 13  Dep_Time_Hour       10462 non-null  object        


### 1. EXTRACT INFO
This section of Feature Engineering is typically intended for machine learning purposes, where the training process requires clearly separated and numerical values.

However, the existing features have already been cleaned and formatted appropriately for Exploratory Data Analysis (EDA) — including time, date, and duration variables, which are already structured for statistical evaluations.

Therefore, the steps in this section should be considered an additional component beyond the main scope of this project, aimed at showcasing familiarity with preprocessing techniques used in machine learning workflows

#### 1.1. extracting date, month and year from Date of Journey 

In [339]:
# Separating the component of Date of Journey into numerical (integer) values
df_copy['Date'] = df_copy['Date of Journey'].dt.day
df_copy['Month'] = df_copy['Date of Journey'].dt.month
df_copy['Year'] = df_copy['Date of Journey'].dt.year
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Arrival_Time_Dummy,Date,Month,Year,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,1900-01-01 01:10:00,24,3,2019,22,20,1,10,2,50
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,1900-01-01 13:15:00,1,5,2019,5,50,13,15,7,25


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 1.2 Extracting hour and minutes from Dep Time

In [342]:
# Separating the component of Dep Time into numerical (integer) values 
df_copy['Dep Time Hour'] = df_copy['Dep Time'].str.split(':').str[0].astype(int)
df_copy['Dep Time Min'] = df_copy['Dep Time'].str.split(':').str[1].astype(int)
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Arrival_Time_Dummy,Date,Month,Year,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,1900-01-01 01:10:00,24,3,2019,22,20,1,10,2,50
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,1900-01-01 13:15:00,1,5,2019,5,50,13,15,7,25


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 1.3 Exctracting hours and minutes from Arr Time

In [345]:
#Separating the component of Arr Time into numerical (integer) values
df_copy['Arr Time'] = df_copy['Arr Time'].str.split(' ').str[0]
df_copy['Arr Time Hour'] = df_copy['Arr Time'].str.split(':').str[0].astype(int)
df_copy['Arr Time Min'] = df_copy['Arr Time'].str.split(':').str[1].astype(int)
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Arrival_Time_Dummy,Date,Month,Year,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,1900-01-01 01:10:00,24,3,2019,22,20,1,10,2,50
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,1900-01-01 13:15:00,1,5,2019,5,50,13,15,7,25


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 1.4 Extracting hours and minutes from Duration

In [337]:
# Separating the component of Duration into numerical (integer) values
df_copy['Duration_Hour']=df_copy['Duration'].str.split(':').str[0].astype(int)
df_copy['Duration_min']=df_copy['Duration'].str.split(':').str[1]
df_copy['Duration_min'] = df_copy['Duration_min'].fillna(0).astype(int)

In [209]:
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Arrival_Time_Dummy,Arr Time +1,Inverse Dep/Arr,Date,Month,Year,Dep Time Hour,Dep Time Min,Duration_Hour,Duration_min
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2:50,0,No info,...,1900-01-01 01:10:00,2019-03-22,X,24,3,2019,22,20,2,50
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,1900-01-01 13:15:00,NaT,O,1,5,2019,5,50,7,25


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
### 2. CREATING NEW FEATURES
This section contributes to both machine learning workflows—by transforming key features into model-friendly numerical formats—and exploratory data analysis, by enabling deeper statistical insights through well-structured variables.

#### 2.1. Creating new feature for the total Duration in minutes

In [349]:
df_copy['Duration_Mins']=df_copy['Duration_Hour']*60+df_copy['Duration_min']

In [351]:
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Date,Month,Year,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min,Duration_Mins
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,24,3,2019,22,20,1,10,2,50,170
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,1,5,2019,5,50,13,15,7,25,445


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 2.2. Creating a new feature from Arrival_Time indicating a change in the date of arrival

In [354]:
# Using np.where() to add the year and converting the column to timestamp
df_copy['Arr Time +1'] = df['Arrival_Time'].str.split(' ', n=1).str[1].fillna('0')
df_copy['Arr Time +1'] = np.where(df_copy['Arr Time +1'] != '0', df_copy['Arr Time +1'] + ' 2019', np.nan)
df_copy['Arr Time +1'] = pd.to_datetime(df_copy['Arr Time +1'])

In [356]:
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Month,Year,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min,Duration_Mins,Arr Time +1
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,3,2019,22,20,1,10,2,50,170,2019-03-22
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,5,2019,5,50,13,15,7,25,445,NaT


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 2.3. Creation of a Column to flag the records where date of departure is after date of arrival
##### 2.3.1. consistency check on date of Journey and Arr Time +1 
The objective is to check if there are records of flights where the date of departure is after the date of arrival

In [359]:
#Let's now compare Date of Journey and Arr Time+1
df_copy[df_copy['Date of Journey']<df_copy['Arr Time +1']].shape

(4053, 26)

In [361]:
#Let's now compare Date of Journey and Arr Time+1
df_copy[df_copy['Date of Journey']>df_copy['Arr Time +1']].shape

(89, 26)

In [363]:
#percentage of flights where Date of Journey is after Arr Time
(89/(89+4053)*100, 89/10462*100)

(2.1487204249154996, 0.8506977633339706)

#### Insights - records showing departure after arrival

The percentage of records where the departure date is after the arrival date is 2.15% among flights whose arrival occurs at least one day later.
This percentage drops to 0.8% when considering all flights in the dataset.

Although this is a relatively small portion of the data, such inconsistencies cannot be resolved through basic imputation or correction methods — it is not possible to determine whether the error lies in the departure or arrival date.

Therefore, the best approach for now is to flag these records by creating a new column dedicated to this inconsistency. In this column, any entry where the journey date and arrival time are logically incompatible will be marked with an 'X'.

🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
##### 2.3.2 Creation of a Column to flag the records where date of departure is after date of arrival

In [367]:
# Using np.where() to flag flights where departure date is after the arrival with an X
df_copy['Inverse Dep/Arr'] = np.where(df_copy['Date of Journey']>df_copy['Arr Time +1'], 'X', 'O')

In [369]:
df_copy.head(2)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Year,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min,Duration_Mins,Arr Time +1,Inverse Dep/Arr
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,2019,22,20,1,10,2,50,170,2019-03-22,X
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,2019,5,50,13,15,7,25,445,NaT,O


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 2.4. Creation of a column with the count of days of delay in arrival

In [372]:
# Creation of a column with the difference in days or departure and arrival
df_copy['Days of Travelling'] = df_copy['Arr Time +1']-df_copy['Date of Journey']

In [374]:
# Implementation of the previous column with NaN when the day of arrival is prior to the date of departure
df_copy['Days of Travelling'] = np.where(
    df_copy['Date of Journey']<df_copy['Arr Time +1'], (df_copy['Arr Time +1']-df_copy['Date of Journey']).dt.days,
np.where(df_copy['Date of Journey']>df_copy['Arr Time +1'], np.nan, 0)
)

🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
##### 2.4.1. 
In order to improve clarity of the features created previously, a quick renaming of said columns will be performed

In [377]:
# Changing name of the columns for improved clarity
df_copy.rename({'Days of Travelling':'Arrival day Offset', 'Arr Time +1':'Date of Arrival (+)'}, axis=1, inplace= True)

In [379]:
df_copy.head(3)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min,Duration_Mins,Date of Arrival (+),Inverse Dep/Arr,Arrival day Offset
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,22,20,1,10,2,50,170,2019-03-22,X,
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,5,50,13,15,7,25,445,NaT,O,0.0
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19:00,2,No info,...,9,25,4,25,19,0,1140,2019-06-10,O,1.0


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
##### 2.4.2 Estimation of the percentage of flights with date of departure after date of arrival

In [382]:
#Number of flights where arrival time is more than 1 day after
df_copy[(df_copy['Date of Arrival (+)']-df_copy['Date of Journey']).dt.days>1].shape

(86, 28)

In [384]:
#Number of total flights where arrival time is aqt least 1 day after (only flights with a shift in the date)
df_copy[(df_copy['Date of Arrival (+)']-df_copy['Date of Journey']).dt.days>=0].shape

(4053, 28)

In [386]:
# Total flights
df_copy.shape

(10462, 28)

In [388]:
#Estimation in percentage of flights arriving more than 1 day after the departure on total flights and flights with any shift in the day
#of arrival
(86/(86+4053)*100, 86/(10462)*100)

(2.0777965692196183, 0.8220225578283311)

##### INSIGHTS - percentage of records with departure after arrival
Approximately 2.1% of flights with a date shift in the arrival time show arrival more than one day after the departure date.
This percentage drops to 0.8% when compared to the entire dataset — suggesting the issue is limited in scope and not significant at first glance.

##### NOTE: Arrival time over one day after the departure
Further analysis of the Duration column revealed several instances of flights showing an arrival time up to three days after departure. Such durations are unrealistic for commercial flights, especially considering that some of these records are marked as direct flights (i.e., with no stopovers).

This strongly suggests data entry errors rather than valid long-haul flights.

However, without additional information, it's not possible to determine:

- Whether the departure or arrival time is incorrect
- Or whether the flight duration itself is plausible

Moreover, imputation methods would be purely speculative in this case. Therefore, the most appropriate approach is to:

- Flag these entries in a new column
- Mark durations that appear inconsistent given the number of stops or the time difference between departure and arrival

This allows for further filtering or downstream treatment of these anomalies without introducing noise through uncertain corrections.

In [391]:
df_copy.head(10)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Dep Time Hour,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min,Duration_Mins,Date of Arrival (+),Inverse Dep/Arr,Arrival day Offset
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,22,20,1,10,2,50,170,2019-03-22,X,
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,5,50,13,15,7,25,445,NaT,O,0.0
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19:00,2,No info,...,9,25,4,25,19,0,1140,2019-06-10,O,1.0
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5:25,1,No info,...,18,5,23,30,5,25,325,NaT,O,0.0
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4:45,1,No info,...,16,50,21,35,4,45,285,NaT,O,0.0
5,SpiceJet,2019-06-24,Kolkata,Banglore,CCU → BLR,09:00,11:25,2:25,0,No info,...,9,0,11,25,2,25,145,NaT,O,0.0
6,Jet Airways,2019-03-12,Banglore,New Delhi,BLR → BOM → DEL,18:55,10:25,15:30,1,In-flight meal not included,...,18,55,10,25,15,30,930,2019-03-13,O,1.0
7,Jet Airways,2019-03-01,Banglore,New Delhi,BLR → BOM → DEL,08:00,05:05,21:5,1,No info,...,8,0,5,5,21,5,1265,2019-03-02,O,1.0
8,Jet Airways,2019-03-12,Banglore,New Delhi,BLR → BOM → DEL,08:55,10:25,25:30,1,In-flight meal not included,...,8,55,10,25,25,30,1530,2019-03-13,O,1.0
9,Multiple carriers,2019-05-27,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7:50,1,No info,...,11,25,19,15,7,50,470,NaT,O,0.0


🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 2.7. Merging Inverse Dep/Arr and Arrival day Offset

In [394]:
# Checking if every X in Inverse Dep/Arr corresponds to a np.nan in Arrival day Offset
df_copy[(df_copy['Arrival day Offset'].isna()) & (df_copy['Inverse Dep/Arr']=='X')].shape

(89, 28)

NOTE on previous check:  
the previous check is not strictly necessary since the np.nan(s) in Arrival day Offset is implicitly derived from the X in Inverse Dep/Arr.

In [397]:
# Merging said two columns into Arrival Information
df_copy['Date of Arrival Info'] = np.where(
    df_copy['Arrival day Offset'].isna(),np.nan, df_copy['Arrival day Offset'])

In [399]:
df_copy.head(3)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Dep Time Min,Arr Time Hour,Arr Time Min,Duration_Hour,Duration_min,Duration_Mins,Date of Arrival (+),Inverse Dep/Arr,Arrival day Offset,Date of Arrival Info
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2:50,0,No info,...,20,1,10,2,50,170,2019-03-22,X,,
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7:25,2,No info,...,50,13,15,7,25,445,NaT,O,0.0,0.0
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19:00,2,No info,...,25,4,25,19,0,1140,2019-06-10,O,1.0,1.0


#### NOTE - Date of Arrival Info

This new column provides info on the relationship between departure and arrival dates in the following manner:
- np.nan for records where the arrival occured before to the departure
- 0.0 when the departure and arrival occurred on the same day and
- float values representing how many days after the the date of departure the flight has arrived
  
This column is useful for identifying long-duration or inconsistent flight records.

🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 2.8 Encoding (mostly for practicing and showcasing porpouses)
This step is mainly intended for cases where the dataset would be used in a machine learning model.
It is not strictly necessary for exploratory data analysis, but it was intentionally included here for practicing and showcasing purposes.

##### 2.8.1 Encoding Total Stops

In [404]:
df_copy['Total Stops'].unique()

array(['0', '2', '1', '3', '4'], dtype=object)

In [406]:
df_copy['Total Stops'] = df['Total_Stops'].map({'non_stop':0, '1 stop':1, '2 stops':2, '3 stops':3, '4 stops':4})

#### NOTE - encoding Total Stops:
An informal process of encoding was already performed during the data celaning process (step 5.4. in the data cleaning section).  
The present encoding process is more formal and also results in a less verbose approach for turning Total Stops from string to single integeres

🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
##### 2.8.2 Encoding Airline

In [410]:
#Importing the Encoder module
from sklearn.preprocessing import OneHotEncoder

In [412]:
#Creation of an instance of the encoder
encoder = OneHotEncoder()

In [414]:
encoded = encoder.fit_transform(df_copy[['Airline']]).toarray()
encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [416]:
encoded_df= pd.DataFrame(encoded, columns = encoder.get_feature_names_out())
encoded_df

Unnamed: 0,Airline_Air Asia,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10457,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10458,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10459,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


This encoded dataframe for airline could be easiliy concatenated to the analyzed Dataframe (df_copy).  
The code would be:
pd.concat([encoded_df,df_copy], axis = 1)

🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷🔷
#### 2.8. Organizing Dataset: dropping Unnecessary columns, change name, rearrange order

##### 2.8.1 Dropping columns

In [427]:
#Dropping unnecessary columns
df_copy.drop(['Dep Time','Arr Time',
              'Duration',
              'Dep_Time_Hour','Arrival_Time_Hour',
              'Inverse Dep/Arr', 'Arrival day Offset',
              'Duration_todrop', 
              'Stops by Route'], 
             axis =1, 
             inplace=True)

- Dep Time, Arr Time: these two columns are no longer needed as they have been rearranged into timestamp format (Dep_Time_Dummy and Arr_Time_Dummy) and into integer (Dep Hour/Min and Arr Hour/Min) for modelling purposes
  
- Duration: same as the previous point

  
- Dep_Time_Hour, Arrival_Time_Hour: these columns can easily be retrieved from Dep_Time_Dummy and Arr_Time_Dummy, so they are not strictly necessary

  
- Inverse Dep/Arr, Arrival day Offset: we merged these two column into

  
- Duration_todrop: this column was created just to perform a check on the minimum and maximum duration of flights and is no longer needed
- Stops by Route: just like the previous point it was created to run a consistency check between the stops portrayed in Route and the number of stops in Total Stops: since the check was showed perfect consistency, this column is no longer needed.

##### 2.8.2 Renaming columns for improved clarity

In [435]:
#Renaming columns
df_copy.rename({'Dep_Time_Dummy':'Dep Time (Dummy Date)',
                'Arrival_Time_Dummy':'Arr Time (Dummy Date)', 
                'Duration_Hour':'Duration Hour', 'Duration_min':'Duration Min',
               'Duration_Mins': 'Total Duration (Min)',
               'Date':'Day of Journey',
               'Month':'Month of journey',
               'Year':'Year of Journey',
               'Arrival Information':'Date of Arrival Info'}, 
               axis =1, 
               inplace=True)
df_copy.head(0)

Unnamed: 0,Airline,Date of Journey,Source,Destination,Route,Dep Time,Arr Time,Duration,Total Stops,Additional Info,...,Dep Time Min,Arr Time Hour,Arr Time Min,Duration Hour,Duration Min,Total Duration (Min),Date of Arrival (+),Inverse Dep/Arr,Arrival day Offset,Date of Arrival Info


##### 2.8.3 Rearringing columns

In [438]:
#Rearranging order of columns
new_order = ['Airline', 'Source','Destination', 'Route', 'Total Stops', 
             'Date of Journey',
             'Day of Journey', 'Month of journey', 'Year of Journey',
             'Dep Time (Dummy Date)','Dep Time Hour', 'Dep Time Min',
             'Arr Time (Dummy Date)', 'Date of Arrival (+)', 'Arr Time Hour', 'Arr Time Min',
             'Date of Arrival Info',
             'Duration Hour', 'Duration Min', 'Total Duration (Min)',
            ]
df_complete = df_copy[new_order]
df_complete.head(2)

Unnamed: 0,Airline,Source,Destination,Route,Total Stops,Date of Journey,Day of Journey,Month of journey,Year of Journey,Dep Time (Dummy Date),Dep Time Hour,Dep Time Min,Arr Time (Dummy Date),Date of Arrival (+),Arr Time Hour,Arr Time Min,Date of Arrival Info,Duration Hour,Duration Min,Total Duration (Min)
0,IndiGo,Banglore,New Delhi,BLR → DEL,,2019-03-24,24,3,2019,1900-01-01 22:20:00,22,20,1900-01-01 01:10:00,2019-03-22,1,10,,2,50,170
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2.0,2019-05-01,1,5,2019,1900-01-01 05:50:00,5,50,1900-01-01 13:15:00,NaT,13,15,0.0,7,25,445


In [440]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10462 entries, 0 to 10682
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Airline                10462 non-null  object        
 1   Date of Journey        10462 non-null  datetime64[ns]
 2   Source                 10462 non-null  object        
 3   Destination            10462 non-null  object        
 4   Route                  10462 non-null  object        
 5   Dep Time               10462 non-null  object        
 6   Arr Time               10462 non-null  object        
 7   Duration               10462 non-null  object        
 8   Total Stops            6987 non-null   float64       
 9   Additional Info        10462 non-null  object        
 10  Price                  10462 non-null  int64         
 11  Arrival_Time_Hour      10462 non-null  object        
 12  Dep Time (Dummy Date)  10462 non-null  datetime64[ns]
 13  Dep_Ti

In [442]:
df_complete.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10462 entries, 0 to 10682
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Airline                10462 non-null  object        
 1   Source                 10462 non-null  object        
 2   Destination            10462 non-null  object        
 3   Route                  10462 non-null  object        
 4   Total Stops            6987 non-null   float64       
 5   Date of Journey        10462 non-null  datetime64[ns]
 6   Day of Journey         10462 non-null  int32         
 7   Month of journey       10462 non-null  int32         
 8   Year of Journey        10462 non-null  int32         
 9   Dep Time (Dummy Date)  10462 non-null  datetime64[ns]
 10  Dep Time Hour          10462 non-null  int64         
 11  Dep Time Min           10462 non-null  int64         
 12  Arr Time (Dummy Date)  10462 non-null  datetime64[ns]
 13  Date o

##### NOTE - wrong dtype for Date of Arrival Info
After running a final .info() check on the dataset, it became clear that the Date of Arrival column was stored with the wrong data type.
Several attempts to convert the data type directly in df_complete were unsuccessful, so the more efficient solution was to go back to df_copy and correct it there.

**NOTE**:
This issue occurred during a previous execution of the full script but did not appear in the current run. Nevertheless, it has been retained here for educational purposes, as it may be useful to understand and troubleshoot similar situations.

In [447]:
df_complete['Date of Arrival Info'].apply(type).value_counts()

Date of Arrival Info
<class 'float'>    10462
Name: count, dtype: int64

In [451]:
df_complete['Date of Arrival Info'].unique() #The floating values here are stored as string

array([nan,  0.,  1.,  4.,  2.])

In [None]:
df_complete['Date of Arrival info'] = df_complete['Date of Arrival info'].astype(float) 

In [None]:
I tried to convert the data type but it resulted in the entire dataset being converted to NaN

In [None]:
df_complete['Date of Arrival info'].unique()

In [None]:
df_complete.loc[:, 'Date of Arrival info'] = df_complete['Date of Arrival info'].astype(float)

Here I tried to convert the entire column to float using loc (as suggested in the warning) but it didn't work

In [None]:
df_copy.info()

Then I went back to df_copy

In [None]:
df_copy['Date of Arrival Info'].apply(type).value_counts()

In [None]:
df_copy['Date of Arrival Info'].unique()

In [None]:
df_complete.head(3)

In [None]:
df_copy.info()

In [None]:
#forcing the column to numbers
df_copy['Date of Arrival Info'] = pd.to_numeric(df_copy5['Date of Arrival Info'], errors = 'coerce')

#### It finally worked!!!

In [None]:
#Rearranging order of columns
new_order = ['Airline', 'Source','Destination', 'Route', 'Total Stops', 
             'Date of Journey',
             'Day of Journey', 'Month of journey', 'Year of Journey',
             'Dep Time (Dummy Date)','Dep Time Hour', 'Dep Time Min',
             'Arr Time (Dummy Date)', 'Arr Date +', 'Arr Time Hour', 'Arr Time Min',
             'Date of Arrival Info',
             'Duration Hour', 'Duration Min', 'Total Duration (Min)',
            ]
df_complete = df_copy[new_order]    

In [None]:
df_complete.info()

In [None]:
df_complete.head(5)

### Function for Feature engineering