In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [3]:
dataframes = []

for i in range(1, 13):
    file_number = f'{i:02}'
    file_name = f'data/2016_{file_number}.csv'
    try:
        # Lire le fichier CSV en ignorant les lignes malformées
        df = pd.read_csv(file_name, on_bad_lines='skip', low_memory=False)
        dataframes.append(df)
        print(f'Successfully read {file_name}')
    except Exception as e:
        print(f'Error reading {file_name}: {e}')

if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
else:
    print("No valid dataframes to concatenate.")


Successfully read data/2016_01.csv
Successfully read data/2016_02.csv
Successfully read data/2016_03.csv
Successfully read data/2016_04.csv
Successfully read data/2016_05.csv
Successfully read data/2016_06.csv
Successfully read data/2016_07.csv
Successfully read data/2016_08.csv
Successfully read data/2016_09.csv
Successfully read data/2016_10.csv
Successfully read data/2016_11.csv
Successfully read data/2016_12.csv


In [4]:
selection=['MONTH','DAY_OF_MONTH','DAY_OF_WEEK','CARRIER','ORIGIN','DEST','CRS_DEP_TIME','CRS_ARR_TIME','DISTANCE','ARR_DELAY_NEW']
df2=df[selection]
df2.dropna

<bound method DataFrame.dropna of          MONTH DAY_OF_MONTH DAY_OF_WEEK CARRIER ORIGIN DEST  CRS_DEP_TIME  \
0            1            6           3      AA    DFW  DTW        1100.0   
1            1            7           4      AA    DFW  DTW        1100.0   
2            1            8           5      AA    DFW  DTW        1100.0   
3            1            9           6      AA    DFW  DTW        1100.0   
4            1           10           7      AA    DFW  DTW        1100.0   
...        ...          ...         ...     ...    ...  ...           ...   
5635973     12           31           6      WN    TUS  LAX         755.0   
5635974     12           31           6      WN    TUS  LAX        1320.0   
5635975     12           31           6      WN    TUS  MDW         705.0   
5635976     12           31           6      WN    TUS  SAN        1220.0   
5635977     12           31           6      WN    TUS  SAN         800.0   

         CRS_ARR_TIME  DISTANCE  ARR_DELA

In [5]:
ev_indices = df2[df2['DAY_OF_MONTH'] == 'EV'].index

# Supprimer les lignes où 'DAY_OF_MONTH' est 'EV'
df2 = df2.drop(ev_indices)

In [9]:
df

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,TAIL_NUM,...,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,Unnamed: 64
0,2016,1,1,6,3,2016-01-06,AA,19805,AA,N4YBAA,...,4.0,,,,,,,,,
1,2016,1,1,7,4,2016-01-07,AA,19805,AA,N434AA,...,4.0,,,,,,,,,
2,2016,1,1,8,5,2016-01-08,AA,19805,AA,N541AA,...,4.0,,,,,,,,,
3,2016,1,1,9,6,2016-01-09,AA,19805,AA,N489AA,...,4.0,,,,,,,,,
4,2016,1,1,10,7,2016-01-10,AA,19805,AA,N439AA,...,4.0,0.0,0.0,47.0,0.0,66.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5635973,2016,4,12,31,6,2016-12-31,WN,19393,WN,N7703A,...,2.0,,,,,,,,,
5635974,2016,4,12,31,6,2016-12-31,WN,19393,WN,N7815L,...,2.0,,,,,,,,,
5635975,2016,4,12,31,6,2016-12-31,WN,19393,WN,N967WN,...,6.0,,,,,,,,,
5635976,2016,4,12,31,6,2016-12-31,WN,19393,WN,N271LV,...,2.0,,,,,,,,,


In [7]:
df2['DAY_OF_WEEK'] = pd.to_numeric(df2['DAY_OF_WEEK'], errors='coerce')

# Supprimer les lignes avec des NaN dans 'DAY_OF_MONTH'
df2 = df2.dropna(subset=['DAY_OF_WEEK'])

df2['CRS_ARR_TIME'] = pd.to_numeric(df2['CRS_ARR_TIME'], errors='coerce')

# Supprimer les lignes avec des NaN dans 'DAY_OF_MONTH'
df2 = df2.dropna(subset=['CRS_ARR_TIME'])

In [8]:
df2['DELAY'] = df2['ARR_DELAY_NEW'].apply(lambda x: 1 if x > 0 else 0)


In [None]:
print(df2.dtypes)

MONTH              int64
DAY_OF_MONTH      object
DAY_OF_WEEK      float64
CARRIER           object
ORIGIN            object
DEST              object
CRS_DEP_TIME     float64
ARR_TIME         float64
DISTANCE         float64
ARR_DELAY_NEW    float64
DELAY              int64
dtype: object


In [10]:

def convert_column_dtype(df, column_name, dtype):
    try:
        df[column_name] = df[column_name].astype(dtype)
    except ValueError as e:
        print(f"Error converting column {column_name} to {dtype}: {e}")

# List of columns and their desired data types
columns_to_convert = {
    'DAY_OF_MONTH': 'int64',
    'DAY_OF_WEEK': 'int64',
    'CRS_DEP_TIME': 'int64',
    'CRS_ARR_TIME': 'int64',
    'DISTANCE': 'float64',
    'ARR_DELAY_NEW': 'float64'
    # Add other columns and their desired data types as needed
}

# Convert the columns
for column, dtype in columns_to_convert.items():
    convert_column_dtype(df2, column, dtype)

# Verify the data type conversion
print(df2.dtypes)

# Now, save the DataFrame to a Parquet file
df2.to_parquet('data.parquet')

MONTH              int64
DAY_OF_MONTH       int64
DAY_OF_WEEK        int64
CARRIER           object
ORIGIN            object
DEST              object
CRS_DEP_TIME       int64
CRS_ARR_TIME       int64
DISTANCE         float64
ARR_DELAY_NEW    float64
DELAY              int64
dtype: object


In [8]:
df2.shape

(5567993, 11)

In [None]:
numeric_columns = [ 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'DISTANCE', 'ARR_DELAY_NEW']
#'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
df2[numeric_columns] = df2[numeric_columns].apply(pd.to_numeric, errors='coerce')

df2 = df2.dropna(subset=numeric_columns)

df2['DELAY'] = df2['ARR_DELAY_NEW'].apply(lambda x: 1 if x > 0 else 0)

label_encoders = {}
categorical_columns = ['CARRIER', 'ORIGIN', 'DEST']
for col in categorical_columns:
    le = LabelEncoder()
    df2[col] = le.fit_transform(df2[col])
    label_encoders[col] = le

X = df2.drop(['ARR_DELAY_NEW', 'DELAY'], axis=1)
y = df2['DELAY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(random_state=1)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['On-time', 'Delayed'], yticklabels=['On-time', 'Delayed'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()