In [1]:
# import the relevant libraries
import pandas as pd
from ydata_profiling import ProfileReport

In [2]:
# Let pandas display up to 200 columns so wide frames are not truncated
pd.options.display.max_columns = 200
# Location of the raw Tunisair flights CSV, relative to the notebook
file_path = './Tunisair_flights_dataset.csv'
# Read the CSV into a dataframe; low_memory=False parses the file in a single
# pass instead of chunks, which avoids mixed-type inference within a column
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

In [3]:
# Exploration of the dataset
# Preview the first 5 rows of the dataframe
df.head(n=5)

Unnamed: 0,Filght_date,Flight_ID,Departure point,Arrival point,Scheduled_departure_time,Scheduled_arrival_time,STATUS,Aircraft_code,Arrival delay
0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0


In [4]:
# Print a concise summary of the dataframe: number of rows, column names,
# non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107833 entries, 0 to 107832
Data columns (total 9 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Filght_date               107833 non-null  object 
 1   Flight_ID                 107833 non-null  object 
 2   Departure point           107833 non-null  object 
 3   Arrival point             107833 non-null  object 
 4   Scheduled_departure_time  107833 non-null  object 
 5   Scheduled_arrival_time    107833 non-null  object 
 6   STATUS                    107833 non-null  object 
 7   Aircraft_code             107833 non-null  object 
 8   Arrival delay             107833 non-null  float64
dtypes: float64(1), object(8)
memory usage: 7.4+ MB


In [5]:
# The source file misspells the date column as 'Filght_date'; rename it to 'Flight_date'
df = df.rename({'Filght_date': 'Flight_date'}, axis='columns')

In [11]:
# List the column labels to confirm that 'Filght_date' is now 'Flight_date'
df.columns

Index(['Flight_date', 'Flight_ID', 'Departure point', 'Arrival point',
       'Scheduled_departure_time', 'Scheduled_arrival_time', 'STATUS',
       'Aircraft_code', 'Arrival delay'],
      dtype='object')

In [8]:
# Show every row that is an exact repeat of an earlier row;
# the displayed frame is empty, so the dataset has no duplicates
df[df.duplicated(keep='first')]

Unnamed: 0,Flight_date,Flight_ID,Departure point,Arrival point,Scheduled_departure_time,Scheduled_arrival_time,STATUS,Aircraft_code,Arrival delay


In [9]:
# Count the missing (null) values in each column of the dataset
df.isna().sum()

Flight_date                 0
Flight_ID                   0
Departure point             0
Arrival point               0
Scheduled_departure_time    0
Scheduled_arrival_time      0
STATUS                      0
Aircraft_code               0
Arrival delay               0
dtype: int64

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
# 'Arrival delay' is the only numeric column in this dataset
df.describe()

Unnamed: 0,Arrival delay
count,107833.0
mean,48.733013
std,117.135562
min,0.0
25%,0.0
50%,14.0
75%,43.0
max,3451.0


In [13]:
# Destination HTML file for the rendered profiling report
profile_path = 'Tunisair_flight_delays_report.html'
# Build the profiling report for the flights dataframe with explorative mode enabled
profile = ProfileReport(
    df,
    title='Tunisair Flight Delays Analysis',
    explorative=True,
)
# Write the report to disk, then confirm where it was saved
profile.to_file(profile_path)
print(f"\nProfiling report saved to {profile_path}")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


Profiling report saved to Tunisair_flight_delays_report.html


In [14]:
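# There are no missing values in the dataset.
# The only correlation in the dataset is between STATUS and Arrival delay, and it is positive.
# A high percentage of the flights have STATUS "ATA", so Arrival delay is a significant predictor of the flight STATUS.
# NOTE: "ATA" should not be read as "arrived on time" -- the first row shown by df.head() is ATA with a 260-minute arrival delay
# (ATA presumably stands for "Actual Time of Arrival"; confirm against the data dictionary).
# There is a high number of zeros in the Arrival delay column (the 25th percentile is 0), so many flights arrive on time,
# but the median delay is 14 minutes, so on-time arrivals are not the majority.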

# Summary
<p>The dataset includes columns related to flight details, status, and delays.
Key variables likely include:<br>
Status: categorical data indicating the state of a flight, such as "Arrival On Time," "Delayed," or "Cancelled."<br>
Arrival Delay: numerical data indicating the delay duration in minutes.<br>
Other attributes: information about the flight date, scheduled departure and arrival times, routes (departure and arrival points), and the aircraft code.</p>

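* High correlation between variables like Departure Delay and Arrival Delay would suggest a relationship, but correlation alone does not establish causality. Note that the dataset loaded here has no Departure Delay column, so it would first have to be derived or obtained from another source.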
* Preliminary findings indicate that "Delayed" statuses strongly correlate with positive values of Arrival Delay.
 <p>Box plots and statistical measures highlight outliers in Arrival Delay (for example, the maximum is 3451 minutes while the 75th percentile is 43).
Certain times of day, routes, and operational factors likely influence delays.
 </p>
 
* Most flights are "On Time"; this class imbalance might impact predictive models.
 
**Recommendations**
  * Provide aircraft types and crew schedules to reveal patterns contributing to delays or cancellations.
  * Create a new variable, Delay Category, to group delays into ranges.