<a href="https://colab.research.google.com/github/Sts87/FlightOnTime/blob/feature%2Fmodel-training/FlightOnTime-Analisis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### CARGA DE DATOS

In [53]:
# DATASET
# Airline:  aerolínea
# Flight:   tipo de aeronave
# Time:     hora de salida en minutos (desde medianoche)
# Length:   tiempo de vuelo en minutos
url = 'https://raw.githubusercontent.com/Sts87/FlightOnTime/refs/heads/main/data/Airlines.csv'
df = pd.read_csv(url)

### INSPECCIÓN INICIAL

In [54]:
df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0


In [55]:
df.shape

(539383, 9)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539383 entries, 0 to 539382
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           539383 non-null  int64 
 1   Airline      539383 non-null  object
 2   Flight       539383 non-null  int64 
 3   AirportFrom  539383 non-null  object
 4   AirportTo    539383 non-null  object
 5   DayOfWeek    539383 non-null  int64 
 6   Time         539383 non-null  int64 
 7   Length       539383 non-null  int64 
 8   Delay        539383 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 37.0+ MB


### LIMPIEZA DE DATOS

In [57]:
# ELIMINAR COLUMNA CODIGO
df.drop('id', axis=1, inplace=True)

In [58]:
(df == '').sum()

Unnamed: 0,0
Airline,0
Flight,0
AirportFrom,0
AirportTo,0
DayOfWeek,0
Time,0
Length,0
Delay,0


In [59]:
#Esta línea tiene una advertencia por no haber valores nulos
#Esto no debiera afectar al dataset ni el analisis ni el machinelearning

df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
df.replace('', np.nan, inplace=True)
df.isnull().sum()

#Alternativa para evitar adventerncia
#df.replace('', np.nan, inplace=True)

Unnamed: 0,0
Airline,0
Flight,0
AirportFrom,0
AirportTo,0
DayOfWeek,0
Time,0
Length,0
Delay,0


### ANÁLISIS ESTADÍSTICO

In [60]:
# VARIABLES NÚMERICAS
df.describe()

Unnamed: 0,Flight,DayOfWeek,Time,Length,Delay
count,539383.0,539383.0,539383.0,539383.0,539383.0
mean,2427.92863,3.929668,802.728963,132.202007,0.445442
std,2067.429837,1.914664,278.045911,70.117016,0.497015
min,1.0,1.0,10.0,0.0,0.0
25%,712.0,2.0,565.0,81.0,0.0
50%,1809.0,4.0,795.0,115.0,0.0
75%,3745.0,5.0,1035.0,162.0,1.0
max,7814.0,7.0,1439.0,655.0,1.0


In [61]:
# VARIABLES CATEGÓRICAS
df.describe(include='object')

Unnamed: 0,Airline,AirportFrom,AirportTo
count,539383,539383,539383
unique,18,293,293
top,WN,ATL,ATL
freq,94097,34449,34440


### PROPORCIÓN DE CLASES DE LA VARIABLE OBJETIVO

In [62]:
# IDENTIFICAR SI HAY DESBALANCEO
# No hay desbalanceo: proporcion 55-45%
df_cnt = df['Delay'].value_counts()
df_pct = df['Delay'].value_counts(normalize=True).round(2) * 100

df_delay = pd.concat([df_cnt, df_pct], axis=1)
df_delay

Unnamed: 0_level_0,count,proportion
Delay,Unnamed: 1_level_1,Unnamed: 2_level_1
0,299119,55.0
1,240264,45.0


### NUEVAS VARIABLES

In [63]:
df['Length'].describe().round(1)

Unnamed: 0,Length
count,539383.0
mean,132.2
std,70.1
min,0.0
25%,81.0
50%,115.0
75%,162.0
max,655.0


In [64]:
# Nuevas variables candidatas:
# Time of day
clases = [0, 360, 720, 1080, 1440]
labels = ['Early morning', 'Morning', 'Afternoon', 'Night']
df['Time_day'] = pd.cut(x=df['Time'], bins=clases, labels=labels, include_lowest=True)

# Duration
# : short (<3 hrs), medium (3-6 hrs), long (6-16 hrs), and ultra-long (>16 hrs)
clases = [0, 90, 180, 960]
labels = ['Short', 'Medium', 'Long']
df['Duration'] = pd.cut(x=df['Length'], bins=clases, labels=labels, include_lowest=True)

In [65]:
# TODO: Value counts en variables, graficos basicos, distribuciones
# length -> distribucion normal

df.sample(5)

Unnamed: 0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,Time_day,Duration
130683,US,90,PHX,PDX,3,885,165,0,Afternoon,Medium
269663,CO,337,CLT,EWR,4,915,130,0,Afternoon,Medium
327736,AA,2001,DFW,OKC,7,1250,55,1,Night,Short
198823,XE,2923,CLE,ATL,7,920,112,0,Afternoon,Medium
500915,WN,775,MSY,BNA,3,1045,85,1,Afternoon,Short


In [66]:
df['Duration'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Duration,Unnamed: 1_level_1
Medium,0.49054
Short,0.330463
Long,0.178997
