In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

##### Read CSV

In [3]:
airlines = pd.read_csv("data/Airlines.csv", index_col="id")

##### Data treatment

Observations

In [4]:
airlines.shape

(539383, 8)

In [5]:
airlines.dtypes

Airline        object
Flight          int64
AirportFrom    object
AirportTo      object
DayOfWeek       int64
Time            int64
Length          int64
Delay           int64
dtype: object

In [6]:
airlines["Airline"].value_counts()

WN    94097
DL    60940
OO    50254
AA    45656
MQ    36605
US    34500
XE    31126
EV    27983
UA    27619
CO    21118
FL    20827
9E    20686
B6    18112
YV    13725
OH    12630
AS    11471
F9     6456
HA     5578
Name: Airline, dtype: int64

In [7]:
import matplotlib.pyplot as plt

corr = airlines.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Flight,DayOfWeek,Time,Length,Delay
Flight,1.0,0.000416,-0.00575,-0.341481,-0.046175
DayOfWeek,0.000416,1.0,0.001273,0.013397,-0.026199
Time,-0.00575,0.001273,1.0,-0.020612,0.150454
Length,-0.341481,0.013397,-0.020612,1.0,0.040489
Delay,-0.046175,-0.026199,0.150454,0.040489,1.0


Get delay column

In [8]:
delay_column = airlines["Delay"].values
delay_column

array([1, 1, 1, ..., 0, 1, 1])

Check NaN and Inf values

In [9]:
def drop_inf_and_na_values(df, column_names):
    df_cleaned = df.copy()
    for column_name in column_names:
        df_cleaned[column_name] = df_cleaned[column_name].replace([np.inf, -np.inf], np.nan)
    
    return df_cleaned.dropna()
    
airlines = drop_inf_and_na_values(airlines, ["Time", "Length"])

Drop duplicates

In [10]:
airlines.drop_duplicates()

Unnamed: 0_level_0,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,CO,269,SFO,IAH,3,15,205,1
2,US,1558,PHX,CLT,3,15,222,1
3,AA,2400,LAX,DFW,3,20,165,1
4,AA,2466,SFO,DFW,3,20,195,1
5,AS,108,ANC,SEA,3,30,202,0
...,...,...,...,...,...,...,...,...
539364,DL,1002,SLC,JFK,5,1425,264,1
539366,US,119,KOA,PHX,5,1425,349,1
539368,UA,86,HNL,LAX,5,1428,333,0
539375,DL,2354,LAX,ATL,5,1435,255,0


Drop Flight column (too many values)

In [11]:
column_to_drop = ["Airline", "Flight", "Delay"]
airlines = airlines.drop(column_to_drop, axis=1)
airlines

Unnamed: 0_level_0,AirportFrom,AirportTo,DayOfWeek,Time,Length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,SFO,IAH,3,15,205
2,PHX,CLT,3,15,222
3,LAX,DFW,3,20,165
4,SFO,DFW,3,20,195
5,ANC,SEA,3,30,202
...,...,...,...,...,...
539379,OGG,SNA,5,1439,326
539380,SEA,ATL,5,1439,305
539381,SFO,MKE,5,1439,255
539382,HNL,SFO,5,1439,313


Normalize data

In [12]:
from sklearn.preprocessing import MinMaxScaler

def normalize(df, features):
    min_max_scaler = MinMaxScaler()
    new_df = df.copy()
    for feature in features:
        new_df[feature] = min_max_scaler.fit_transform(new_df[[feature]])
    return new_df

normalized_airlines = normalize(airlines, ["Time", "Length"])
normalized_airlines

Unnamed: 0_level_0,AirportFrom,AirportTo,DayOfWeek,Time,Length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,SFO,IAH,3,0.003499,0.312977
2,PHX,CLT,3,0.003499,0.338931
3,LAX,DFW,3,0.006998,0.251908
4,SFO,DFW,3,0.006998,0.297710
5,ANC,SEA,3,0.013996,0.308397
...,...,...,...,...,...
539379,OGG,SNA,5,1.000000,0.497710
539380,SEA,ATL,5,1.000000,0.465649
539381,SFO,MKE,5,1.000000,0.389313
539382,HNL,SFO,5,1.000000,0.477863


OneHotEncoded

In [13]:
airlines_numerical = pd.get_dummies(normalized_airlines, columns=["DayOfWeek", "AirportFrom", "AirportTo"])

Split data

In [14]:
y = delay_column
X = airlines_numerical
X

Unnamed: 0_level_0,Time,Length,DayOfWeek_1,DayOfWeek_2,DayOfWeek_3,DayOfWeek_4,DayOfWeek_5,DayOfWeek_6,DayOfWeek_7,AirportFrom_ABE,...,AirportTo_TXK,AirportTo_TYR,AirportTo_TYS,AirportTo_UTM,AirportTo_VLD,AirportTo_VPS,AirportTo_WRG,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.003499,0.312977,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.003499,0.338931,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.006998,0.251908,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.006998,0.297710,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.013996,0.308397,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539379,1.000000,0.497710,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539380,1.000000,0.465649,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539381,1.000000,0.389313,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
539382,1.000000,0.477863,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### Entrainement

In [16]:
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_train)

##### Résultats

In [17]:
from sklearn.metrics import accuracy_score

print(f"Training accuracy score : {accuracy_score(kmeans.labels_, y_train)}")
print(f"Test accuracy score: {accuracy_score(kmeans.predict(X_test), y_test)}")

Training accuracy score : 0.5379206778121277
Test accuracy score: 0.5380757714804824
