In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm



In [2]:

# load dataset

data = pd.read_csv('T_ONTIME_REPORTING.csv')

data.head()


Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY
0,2024,1,1,1,ABE,ORD,626,621.0,-5.0,805,726.0,-39.0
1,2024,1,1,1,ABE,ORD,1730,1723.0,-7.0,1900,1836.0,-24.0
2,2024,1,1,1,ABQ,MDW,600,600.0,0.0,940,944.0,4.0
3,2024,1,1,1,ABQ,ORD,1258,1250.0,-8.0,1648,1631.0,-17.0
4,2024,1,1,1,ABQ,ORD,1358,1349.0,-9.0,1745,1736.0,-9.0


In [3]:
# filter the dataset for the chosen airport ord
chosen_airport = 'ORD'
filtered_data = data[data['ORIGIN'] == chosen_airport]

filtered_data.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY
885,2024,1,1,1,ORD,ABE,1350,1350.0,0.0,1655,1649.0,-6.0
886,2024,1,1,1,ORD,ABE,1800,1806.0,6.0,2100,2106.0,6.0
887,2024,1,1,1,ORD,ABQ,920,916.0,-4.0,1148,1106.0,-42.0
888,2024,1,1,1,ORD,ABQ,1047,1039.0,-8.0,1307,1230.0,-37.0
889,2024,1,1,1,ORD,ALB,1150,1145.0,-5.0,1449,1439.0,-10.0


In [4]:
filtered_data.shape

(20321, 12)

In [5]:
#check for missding values

missing_values = filtered_data.isnull().sum()
missing_values

YEAR               0
MONTH              0
DAY_OF_MONTH       0
DAY_OF_WEEK        0
ORIGIN             0
DEST               0
CRS_DEP_TIME       0
DEP_TIME        1384
DEP_DELAY       1397
CRS_ARR_TIME       0
ARR_TIME        1441
ARR_DELAY       1505
dtype: int64

In [6]:
# remove missing values
cleaned_data = filtered_data.dropna()
# check missing values are gone
missing_values = cleaned_data.isnull().sum()
missing_values

YEAR            0
MONTH           0
DAY_OF_MONTH    0
DAY_OF_WEEK     0
ORIGIN          0
DEST            0
CRS_DEP_TIME    0
DEP_TIME        0
DEP_DELAY       0
CRS_ARR_TIME    0
ARR_TIME        0
ARR_DELAY       0
dtype: int64

In [7]:
# Rename columns to match the expected format in poly_regressor_Python_1.0.0.py
column_mapping = {
    "YEAR": "YEAR",
    "MONTH": "MONTH",
    "DAY_OF_MONTH": "DAY",
    "DAY_OF_WEEK": "DAY_OF_WEEK",
    "ORIGIN": "ORG_AIRPORT",
    "DEST": "DEST_AIRPORT",
    "CRS_DEP_TIME": "SCHEDULED_DEPARTURE",
    "DEP_TIME": "DEPARTURE_TIME",
    "DEP_DELAY": "DEPARTURE_DELAY",
    "CRS_ARR_TIME": "SCHEDULED_ARRIVAL",
    "ARR_TIME": "ARRIVAL_TIME",
    "ARR_DELAY": "ARRIVAL_DELAY"
}

# Rename columns
data = data.rename(columns=column_mapping)

# Display the updated column names
data.head()


Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,ORG_AIRPORT,DEST_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY
0,2024,1,1,1,ABE,ORD,626,621.0,-5.0,805,726.0,-39.0
1,2024,1,1,1,ABE,ORD,1730,1723.0,-7.0,1900,1836.0,-24.0
2,2024,1,1,1,ABQ,MDW,600,600.0,0.0,940,944.0,4.0
3,2024,1,1,1,ABQ,ORD,1258,1250.0,-8.0,1648,1631.0,-17.0
4,2024,1,1,1,ABQ,ORD,1358,1349.0,-9.0,1745,1736.0,-9.0


In [8]:
#looking at cleaned data
cleaned_data.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY
885,2024,1,1,1,ORD,ABE,1350,1350.0,0.0,1655,1649.0,-6.0
886,2024,1,1,1,ORD,ABE,1800,1806.0,6.0,2100,2106.0,6.0
887,2024,1,1,1,ORD,ABQ,920,916.0,-4.0,1148,1106.0,-42.0
888,2024,1,1,1,ORD,ABQ,1047,1039.0,-8.0,1307,1230.0,-37.0
889,2024,1,1,1,ORD,ALB,1150,1145.0,-5.0,1449,1439.0,-10.0


In [9]:
# save the dataset
cleaned_data.to_csv("cleaned_data.csv", index=False)

print("Dataset saved successfully as 'cleaned_data.csv'")

Dataset saved successfully as 'cleaned_data.csv'
