In [301]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML
from datetime import datetime


In [302]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [303]:
arrivals_syr = pd.read_csv('arrivals.csv', date_parser=lambda x: datetime.strptime(x, '%m/%d/%y'))
arrivals_syr.head()


The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.



Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),Wheels-on Time,Taxi-In time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
0,UA,01/01/2022,1282.0,N4901U,IAD,23:10,00:01,70.0,76.0,51.0,23:55,6.0,23.0,0.0,6.0,0.0,22.0
1,UA,01/01/2023,604.0,N814UA,DEN,14:58,14:52,193.0,177.0,-6.0,14:48,4.0,0.0,0.0,0.0,0.0,0.0
2,UA,01/01/2023,2488.0,N38458,EWR,23:14,23:15,75.0,62.0,1.0,23:10,5.0,0.0,0.0,0.0,0.0,0.0
3,UA,01/01/2023,2645.0,N23721,ORD,23:57,23:47,107.0,100.0,-10.0,23:41,6.0,0.0,0.0,0.0,0.0,0.0
4,UA,01/02/2022,1282.0,N4901U,IAD,23:10,23:27,70.0,64.0,17.0,23:19,8.0,17.0,0.0,0.0,0.0,0.0


In [304]:
len(arrivals_syr)
arrivals_syr.columns
arrivals_syr.dtypes

69525

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)', 'Delay Weather (Minutes)',
       'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)',
       'Delay Late Aircraft Arrival (Minutes)'],
      dtype='object')

Carrier Code                                 object
Date (MM/DD/YYYY)                            object
Flight Number                               float64
Tail Number                                  object
Origin Airport                               object
Scheduled Arrival Time                       object
Actual Arrival Time                          object
Scheduled Elapsed Time (Minutes)            float64
Actual Elapsed Time (Minutes)               float64
Arrival Delay (Minutes)                     float64
Wheels-on Time                               object
Taxi-In time (Minutes)                      float64
Delay Carrier (Minutes)                     float64
Delay Weather (Minutes)                     float64
Delay National Aviation System (Minutes)    float64
Delay Security (Minutes)                    float64
Delay Late Aircraft Arrival (Minutes)       float64
dtype: object

In [305]:
arrivals_syr.drop(columns=['Tail Number', 'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)', 'Delay Carrier (Minutes)', 'Delay Weather (Minutes)', 'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)', 'Delay Late Aircraft Arrival (Minutes)'], inplace=True)


In [306]:
len(arrivals_syr)
arrivals_syr.columns
arrivals_syr.dtypes

69525

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Origin Airport',
       'Scheduled Arrival Time', 'Actual Arrival Time',
       'Arrival Delay (Minutes)'],
      dtype='object')

Carrier Code                object
Date (MM/DD/YYYY)           object
Flight Number              float64
Origin Airport              object
Scheduled Arrival Time      object
Actual Arrival Time         object
Arrival Delay (Minutes)    float64
dtype: object

In [307]:
departures_syr = pd.read_csv('departures.csv', date_parser=lambda x: datetime.strptime(x, '%m/%d/%y'))
departures_syr.head()


The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.



Unnamed: 0.1,Unnamed: 0,index,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes),Origin Airport
0,0,53,G4,01/02/2020,1737,241NV,SYR,06:30,06:29,176,167,-1,06:42,13,0,0,0,0,0,FLL
1,1,102,G4,01/03/2019,1720,226NV,SYR,06:50,06:49,182,167,-1,07:01,12,0,0,0,0,0,FLL
2,2,121,G4,01/03/2021,1744,245NV,SYR,16:25,17:40,178,168,75,17:53,13,0,0,0,0,65,FLL
3,3,137,G4,01/03/2022,962,222NV,SYR,06:15,06:15,176,175,0,06:30,15,0,0,0,0,0,FLL
4,4,152,G4,01/03/2023,523,234NV,SYR,06:30,10:34,182,170,244,10:51,17,232,0,0,0,0,FLL


In [308]:
len(departures_syr)
departures_syr.columns
departures_syr.dtypes

56542

Index(['Unnamed: 0', 'index', 'Carrier Code', 'Date (MM/DD/YYYY)',
       'Flight Number', 'Tail Number', 'Destination Airport',
       'Scheduled departure time', 'Actual departure time',
       'Scheduled elapsed time (Minutes)', 'Actual elapsed time (Minutes)',
       'Departure delay (Minutes)', 'Wheels-off time',
       'Taxi-Out time (Minutes)', 'Delay Carrier (Minutes)',
       'Delay Weather (Minutes)', 'Delay National Aviation System (Minutes)',
       'Delay Security (Minutes)', 'Delay Late Aircraft Arrival (Minutes)',
       'Origin Airport'],
      dtype='object')

Unnamed: 0                                   int64
index                                        int64
Carrier Code                                object
Date (MM/DD/YYYY)                           object
Flight Number                                int64
Tail Number                                 object
Destination Airport                         object
Scheduled departure time                    object
Actual departure time                       object
Scheduled elapsed time (Minutes)             int64
Actual elapsed time (Minutes)                int64
Departure delay (Minutes)                    int64
Wheels-off time                             object
Taxi-Out time (Minutes)                      int64
Delay Carrier (Minutes)                      int64
Delay Weather (Minutes)                      int64
Delay National Aviation System (Minutes)     int64
Delay Security (Minutes)                     int64
Delay Late Aircraft Arrival (Minutes)        int64
Origin Airport                 

In [309]:
departures_syr.drop(columns=['Tail Number', 'Scheduled Elapsed Time (Minutes)', 'Actual elapsed time (Minutes)', 'Wheels-off time', 'Taxi-Out time (Minutes)', 'Delay Carrier (Minutes)', 'Delay Weather (Minutes)', 'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)', 'Delay Late Aircraft Arrival (Minutes)'], inplace=True, errors='ignore')


In [310]:
len(departures_syr)
departures_syr.columns
departures_syr.dtypes

56542

Index(['Unnamed: 0', 'index', 'Carrier Code', 'Date (MM/DD/YYYY)',
       'Flight Number', 'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Departure delay (Minutes)', 'Origin Airport'],
      dtype='object')

Unnamed: 0                           int64
index                                int64
Carrier Code                        object
Date (MM/DD/YYYY)                   object
Flight Number                        int64
Destination Airport                 object
Scheduled departure time            object
Actual departure time               object
Scheduled elapsed time (Minutes)     int64
Departure delay (Minutes)            int64
Origin Airport                      object
dtype: object

In [311]:
# Assuming departures_syr and arrivals_syr are pandas DataFrames
import pandas as pd

# Renaming columns to clarify data after merge (if needed)
departures_syr.rename(columns={
    'Scheduled departure time': 'Scheduled departure time',
    'Actual departure time': 'Actual departure time',
    'Scheduled elapsed time (Minutes)': 'Scheduled departure elapsed time (Minutes)',
    'Departure delay (Minutes)': 'Departure delay (Minutes)',
    'Origin Airport': 'Departure Airport'
}, inplace=True)

arrivals_syr.rename(columns={
    'Scheduled Arrival Time': 'Scheduled arrival time',
    'Actual Arrival Time': 'Actual arrival time',
    'Arrival Delay (Minutes)': 'Arrival delay (Minutes)',
    'Origin Airport': 'Arrival Origin Airport'
}, inplace=True)

# Merging datasets on 'Carrier Code', 'Date (MM/DD/YYYY)', and 'Flight Number'
merged_flights = pd.merge(departures_syr, arrivals_syr, 
                          on=['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number'], 
                          how='outer', 
                          suffixes=('_depart', '_arrive'))

# The result is a DataFrame with both departure and arrival information for flights.


In [312]:
merged_flights.head()

Unnamed: 0.1,Unnamed: 0,index,Carrier Code,Date (MM/DD/YYYY),Flight Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled departure elapsed time (Minutes),Departure delay (Minutes),Departure Airport,Arrival Origin Airport,Scheduled arrival time,Actual arrival time,Arrival delay (Minutes)
0,0.0,53.0,G4,01/02/2020,1737.0,SYR,06:30,06:29,176.0,-1.0,FLL,FLL,09:26,09:16,-10.0
1,1.0,102.0,G4,01/03/2019,1720.0,SYR,06:50,06:49,182.0,-1.0,FLL,FLL,09:52,09:36,-16.0
2,2.0,121.0,G4,01/03/2021,1744.0,SYR,16:25,17:40,178.0,75.0,FLL,FLL,19:23,20:28,65.0
3,3.0,137.0,G4,01/03/2022,962.0,SYR,06:15,06:15,176.0,0.0,FLL,FLL,09:11,09:10,-1.0
4,4.0,152.0,G4,01/03/2023,523.0,SYR,06:30,10:34,182.0,244.0,FLL,FLL,09:32,13:24,232.0


In [313]:
# Dropping the 'Destination Airport' column from the merged DataFrame
merged_flights = merged_flights.drop(columns=['Destination Airport'])


In [314]:

merged_flights.head()

Unnamed: 0.1,Unnamed: 0,index,Carrier Code,Date (MM/DD/YYYY),Flight Number,Scheduled departure time,Actual departure time,Scheduled departure elapsed time (Minutes),Departure delay (Minutes),Departure Airport,Arrival Origin Airport,Scheduled arrival time,Actual arrival time,Arrival delay (Minutes)
0,0.0,53.0,G4,01/02/2020,1737.0,06:30,06:29,176.0,-1.0,FLL,FLL,09:26,09:16,-10.0
1,1.0,102.0,G4,01/03/2019,1720.0,06:50,06:49,182.0,-1.0,FLL,FLL,09:52,09:36,-16.0
2,2.0,121.0,G4,01/03/2021,1744.0,16:25,17:40,178.0,75.0,FLL,FLL,19:23,20:28,65.0
3,3.0,137.0,G4,01/03/2022,962.0,06:15,06:15,176.0,0.0,FLL,FLL,09:11,09:10,-1.0
4,4.0,152.0,G4,01/03/2023,523.0,06:30,10:34,182.0,244.0,FLL,FLL,09:32,13:24,232.0


In [315]:
merged_flights.dtypes

Unnamed: 0                                    float64
index                                         float64
Carrier Code                                   object
Date (MM/DD/YYYY)                              object
Flight Number                                 float64
Scheduled departure time                       object
Actual departure time                          object
Scheduled departure elapsed time (Minutes)    float64
Departure delay (Minutes)                     float64
Departure Airport                              object
Arrival Origin Airport                         object
Scheduled arrival time                         object
Actual arrival time                            object
Arrival delay (Minutes)                       float64
dtype: object

In [316]:
def determine_arrival_status(delay):
    if delay < -5:
        return 'early'
    elif delay == 0:
        return 'on-time'
    else:
        return 'late'

# Apply the function to the 'Arrival delay (Minutes)' column to create the 'ARRIVAL_STATUS' column
merged_flights['ARRIVAL_STATUS'] = merged_flights['Arrival delay (Minutes)'].apply(determine_arrival_status)
merged_flights.drop('Arrival delay (Minutes)', axis=1, inplace=True)


In [317]:
# One-hot encoding categorical variables
merged_flights.drop(['Unnamed: 0', 'index'], axis=1, inplace=True)


merged_flights['Date (MM/DD/YYYY)'] = pd.to_datetime(merged_flights['Date (MM/DD/YYYY)'])

# Assuming 'Date (MM/DD/YYYY)' is already in datetime format
# Extracting day of week, month, and day from the date
merged_flights['DayOfWeek'] = merged_flights['Date (MM/DD/YYYY)'].dt.dayofweek
merged_flights['Month'] = merged_flights['Date (MM/DD/YYYY)'].dt.month
merged_flights['Day'] = merged_flights['Date (MM/DD/YYYY)'].dt.day
merged_flights.drop('Date (MM/DD/YYYY)', axis=1, inplace=True)

# For time columns, assuming they're in HH:MM format and converting them to minutes
def time_to_minutes(time_str):
    if pd.isnull(time_str):
        return np.nan  # Returning NaN for missing values
    hours, minutes = map(int, time_str.split(':'))
    return hours * 60 + minutes

time_columns = ['Scheduled departure time', 'Actual departure time', 'Scheduled arrival time', 'Actual arrival time']
for col in time_columns:
    merged_flights[col + ' (Minutes)'] = merged_flights[col].apply(time_to_minutes)
    
merged_flights_encoded = pd.get_dummies(merged_flights, columns=['Carrier Code', 'Departure Airport', 'Arrival Origin Airport'])


from sklearn.impute import SimpleImputer

# Imputing missing values with median for numerical columns
numerical_cols = merged_flights_encoded.select_dtypes(include=['float64']).columns
imputer = SimpleImputer(strategy='median')
merged_flights_encoded[numerical_cols] = imputer.fit_transform(merged_flights_encoded[numerical_cols])



In [318]:
merged_flights_encoded.columns


Index(['Flight Number', 'Scheduled departure time', 'Actual departure time',
       'Scheduled departure elapsed time (Minutes)',
       'Departure delay (Minutes)', 'Scheduled arrival time',
       'Actual arrival time', 'ARRIVAL_STATUS', 'DayOfWeek', 'Month', 'Day',
       'Scheduled departure time (Minutes)', 'Actual departure time (Minutes)',
       'Scheduled arrival time (Minutes)', 'Actual arrival time (Minutes)',
       'Carrier Code_ SOURCE: Bureau of Transportation Statistics',
       'Carrier Code_9E', 'Carrier Code_AA', 'Carrier Code_B6',
       'Carrier Code_DL', 'Carrier Code_EV', 'Carrier Code_F9',
       'Carrier Code_G4', 'Carrier Code_MQ', 'Carrier Code_OH',
       'Carrier Code_OO', 'Carrier Code_UA', 'Carrier Code_WN',
       'Carrier Code_YV', 'Carrier Code_YX', 'Departure Airport_ALB',
       'Departure Airport_ATL', 'Departure Airport_BNA',
       'Departure Airport_BOS', 'Departure Airport_BWI',
       'Departure Airport_CLT', 'Departure Airport_DCA',
       'De

In [324]:
for column, dtype in merged_flights_encoded.dtypes.items():
    print(f"{column}: {dtype}")


Flight Number: float64
Scheduled departure time: object
Actual departure time: object
Scheduled departure elapsed time (Minutes): float64
Departure delay (Minutes): float64
Scheduled arrival time: object
Actual arrival time: object
ARRIVAL_STATUS: object
DayOfWeek: float64
Month: float64
Day: float64
Scheduled departure time (Minutes): float64
Actual departure time (Minutes): float64
Scheduled arrival time (Minutes): float64
Actual arrival time (Minutes): float64
Carrier Code_ SOURCE: Bureau of Transportation Statistics: bool
Carrier Code_9E: bool
Carrier Code_AA: bool
Carrier Code_B6: bool
Carrier Code_DL: bool
Carrier Code_EV: bool
Carrier Code_F9: bool
Carrier Code_G4: bool
Carrier Code_MQ: bool
Carrier Code_OH: bool
Carrier Code_OO: bool
Carrier Code_UA: bool
Carrier Code_WN: bool
Carrier Code_YV: bool
Carrier Code_YX: bool
Departure Airport_ALB: bool
Departure Airport_ATL: bool
Departure Airport_BNA: bool
Departure Airport_BOS: bool
Departure Airport_BWI: bool
Departure Airport_CL

In [319]:
merged_flights_encoded.head()

Unnamed: 0,Flight Number,Scheduled departure time,Actual departure time,Scheduled departure elapsed time (Minutes),Departure delay (Minutes),Scheduled arrival time,Actual arrival time,ARRIVAL_STATUS,DayOfWeek,Month,...,Arrival Origin Airport_MYR,Arrival Origin Airport_ORD,Arrival Origin Airport_PGD,Arrival Origin Airport_PHL,Arrival Origin Airport_PIE,Arrival Origin Airport_RDU,Arrival Origin Airport_RSW,Arrival Origin Airport_SFB,Arrival Origin Airport_SRQ,Arrival Origin Airport_TPA
0,1737.0,06:30,06:29,176.0,-1.0,09:26,09:16,early,3.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,1720.0,06:50,06:49,182.0,-1.0,09:52,09:36,early,3.0,1.0,...,False,False,False,False,False,False,False,False,False,False
2,1744.0,16:25,17:40,178.0,75.0,19:23,20:28,late,6.0,1.0,...,False,False,False,False,False,False,False,False,False,False
3,962.0,06:15,06:15,176.0,0.0,09:11,09:10,late,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
4,523.0,06:30,10:34,182.0,244.0,09:32,13:24,late,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [320]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Assuming merged_flights_encoded is your DataFrame and is ready for model training

# 1. Prepare the Data
# Selecting features - dropping the target variable 'ARRIVAL_STATUS'
X = merged_flights_encoded.drop('ARRIVAL_STATUS', axis=1)
# Target variable
y = merged_flights_encoded['ARRIVAL_STATUS']

# 2. Split the Data
# Splitting dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 70% training and 30% testing

# 3. Train the Model
# Create a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Training the classifier
clf.fit(X_train, y_train)

# 4. Evaluate the Model
# Predicting the Test set results
y_pred = clf.predict(X_test)

# Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')

# Printing the classification report
print(classification_report(y_test, y_pred))


ValueError: could not convert string to float: '10:50'