input: merged CSVs with only Origin based delay and weather data

In [43]:
import pandas as pd
import numpy as np
import seaborn as sns

In [44]:
dtypes = {
    'DayOfWeek':                       int,
    'FlightDate':                      'string',
    'IATA_CODE_Reporting_Airline':     'string',
    'Tail_Number':                     'string',
    'Flight_Number_Reporting_Airline': int,
    'OriginAirportID':                 int,
    'Origin':                          'string',
    'OriginState':                     'string',
    'DestAirportID':                   int,
    'Dest':                            'string',
    'DestState':                       'string',
    'CRSDepTime':                      int,
    'DepTime':                         np.float64,
    'DepDelayMinutes':                 np.float64,
    'CRSArrTime':                      int,
    'ArrTime':                         np.float64,
    'ArrDelayMinutes':                 np.float64,
    'Cancelled':                       int,
    'CancellationCode':                'string',
    'Diverted':                        int,
    'CarrierDelay':                    np.float64,
    'WeatherDelay':                    np.float64,
    'NASDelay':                        np.float64,
    'SecurityDelay':                   np.float64,
    'LateAircraftDelay':               np.float64
}

In [74]:
# read data
df1 = pd.read_csv('data/weather-joined/w-2017.csv')
df2 = pd.read_csv('data/weather-joined/w-2018.csv')
df3 = pd.read_csv('data/weather-joined/w-2019.csv')
frames = [df1,df2]
df = pd.concat(frames)

In [70]:
df['wind_gusts_10m'].describe()

count    70551.000000
mean        17.845734
std          8.339332
min          0.894800
25%         11.856100
50%         16.553800
75%         22.593700
max         64.425600
Name: wind_gusts_10m, dtype: float64

In [71]:
df.count()

Unnamed: 0                         70646
DayOfWeek                          70646
FlightDate                         70646
IATA_CODE_Reporting_Airline        70646
Tail_Number                        70646
Flight_Number_Reporting_Airline    70646
OriginAirportID                    70646
Origin                             70646
OriginState                        70646
DestAirportID                      70646
Dest                               70646
DestState                          70646
CRSDepTime                         70646
DepTime                            70646
DepDelayMinutes                    70646
CRSArrTime                         70646
ArrTime                            70646
ArrDelayMinutes                    70646
Cancelled                          70646
CancellationCode                       0
Diverted                           70646
CarrierDelay                       70646
WeatherDelay                       70646
NASDelay                           70646
SecurityDelay   

In [76]:
df = df.drop(['CancellationCode'], axis = 1)

In [77]:
df.isna().sum()

Unnamed: 0                            0
DayOfWeek                             0
FlightDate                            0
IATA_CODE_Reporting_Airline           0
Tail_Number                           0
Flight_Number_Reporting_Airline       0
OriginAirportID                       0
Origin                                0
OriginState                           0
DestAirportID                         0
Dest                                  0
DestState                             0
CRSDepTime                            0
DepTime                               0
DepDelayMinutes                       0
CRSArrTime                            0
ArrTime                               0
ArrDelayMinutes                       0
Cancelled                             0
Diverted                              0
CarrierDelay                          0
WeatherDelay                          0
NASDelay                              0
SecurityDelay                         0
LateAircraftDelay                     0


In [78]:
df['snow_depth'].describe()

count    68626.000000
mean         0.059591
std          0.266063
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          6.692914
Name: snow_depth, dtype: float64

In [79]:
df_dropped = df.dropna()

In [80]:
df_dropped.isna().sum()

Unnamed: 0                         0
DayOfWeek                          0
FlightDate                         0
IATA_CODE_Reporting_Airline        0
Tail_Number                        0
Flight_Number_Reporting_Airline    0
OriginAirportID                    0
Origin                             0
OriginState                        0
DestAirportID                      0
Dest                               0
DestState                          0
CRSDepTime                         0
DepTime                            0
DepDelayMinutes                    0
CRSArrTime                         0
ArrTime                            0
ArrDelayMinutes                    0
Cancelled                          0
Diverted                           0
CarrierDelay                       0
WeatherDelay                       0
NASDelay                           0
SecurityDelay                      0
LateAircraftDelay                  0
CRSDepHour                         0
CRSArrHour                         0
t

In [101]:
df_dropped = df_dropped[(df_dropped['WeatherDelay'] > 0) & (df_dropped['NASDelay'] > 0)]

# df3 = df3.drop(['CancellationCode'], axis = 1)

In [103]:
df3 = df3.dropna()

In [104]:
df3.count()

Unnamed: 0                         42684
DayOfWeek                          42684
FlightDate                         42684
IATA_CODE_Reporting_Airline        42684
Tail_Number                        42684
Flight_Number_Reporting_Airline    42684
OriginAirportID                    42684
Origin                             42684
OriginState                        42684
DestAirportID                      42684
Dest                               42684
DestState                          42684
CRSDepTime                         42684
DepTime                            42684
DepDelayMinutes                    42684
CRSArrTime                         42684
ArrTime                            42684
ArrDelayMinutes                    42684
Cancelled                          42684
Diverted                           42684
CarrierDelay                       42684
WeatherDelay                       42684
NASDelay                           42684
SecurityDelay                      42684
LateAircraftDela

In [82]:
feature_columns = df.columns.tolist()
print(feature_columns)

['Unnamed: 0', 'DayOfWeek', 'FlightDate', 'IATA_CODE_Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline', 'OriginAirportID', 'Origin', 'OriginState', 'DestAirportID', 'Dest', 'DestState', 'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'CRSArrTime', 'ArrTime', 'ArrDelayMinutes', 'Cancelled', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'CRSDepHour', 'CRSArrHour', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code', 'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'wind_speed_10m', 'wind_speed_100m', 'wind_gusts_10m', 'shortwave_radiation', 'direct_radiation', 'diffuse_radiation', 'direct_normal_irradiance', 'terrestrial_radiation', 'airport', 'day', 'hour']


In [83]:
non_numeric_columns = df.select_dtypes(exclude=['number']).columns.tolist()
print(non_numeric_columns)

['FlightDate', 'IATA_CODE_Reporting_Airline', 'Tail_Number', 'Origin', 'OriginState', 'Dest', 'DestState', 'airport', 'day']


In [87]:

#encoding string data to numeric
label_encoder = LabelEncoder()
feature_columns_1 = ['CRSDepTime', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code', 'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'wind_speed_10m', 'wind_speed_100m', 'wind_gusts_10m', 'shortwave_radiation', 'direct_radiation', 'diffuse_radiation', 'direct_normal_irradiance', 'terrestrial_radiation']


In [111]:
df3 = df3[(df3['WeatherDelay'] > 0) & (df3['NASDelay'] > 0)]

In [112]:
df3.loc[:, 'target_variable'] = df3['WeatherDelay'] + df3['NASDelay']

In [113]:
df_dropped.count()

Unnamed: 0                         68626
DayOfWeek                          68626
FlightDate                         68626
IATA_CODE_Reporting_Airline        68626
Tail_Number                        68626
Flight_Number_Reporting_Airline    68626
OriginAirportID                    68626
Origin                             68626
OriginState                        68626
DestAirportID                      68626
Dest                               68626
DestState                          68626
CRSDepTime                         68626
DepTime                            68626
DepDelayMinutes                    68626
CRSArrTime                         68626
ArrTime                            68626
ArrDelayMinutes                    68626
Cancelled                          68626
Diverted                           68626
CarrierDelay                       68626
WeatherDelay                       68626
NASDelay                           68626
SecurityDelay                      68626
LateAircraftDela

In [114]:
df_dropped.loc[:, 'target_variable'] = df_dropped['WeatherDelay'] + df_dropped['NASDelay']

In [137]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing  import LabelEncoder
from math import sqrt

X_train = df_dropped[feature_columns_1]
y_train = df_dropped['target_variable']
model = LinearRegression()
model.fit(X_train, y_train)
X_test = df3[feature_columns_1]
Y_test = df3['target_variable']
y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
print(f'Root Mean Squared Error on Training Set: {rmse}')
print(f'Mean Absolute Error on Training Set: {mae}')

Root Mean Squared Error on Training Set: 110.11591695528243
Mean Absolute Error on Training Set: 59.47666594202245


In [129]:
r2 = r2_score(Y_test, y_pred)
print(f'R2 Score on Test Set: {r2}')

R2 Score on Test Set: -0.002149821949724018


In [131]:
predictions_df = pd.DataFrame({'Actual': Y_test, 'Predicted': y_pred})
print(predictions_df)

       Actual  Predicted
0       794.0  74.511154
1        72.0  81.373309
2       673.0  77.200609
3        90.0  89.759914
4       202.0  73.026855
...       ...        ...
44085    19.0  74.394165
44086    27.0  80.648429
44087    69.0  70.913791
44088   127.0  74.571042
44089    37.0  80.417124

[42684 rows x 2 columns]


In [132]:
print(y_pred.mean())

77.34689657501727


In [135]:
print(df_dropped['target_variable'].mean())

76.46327922361787


In [138]:
X_train = df_dropped[feature_columns_1]
y_train = df_dropped['DepDelayMinutes']
model = LinearRegression()
model.fit(X_train, y_train)
X_test = df3[feature_columns_1]
Y_test = df3['DepDelayMinutes']
y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(Y_test, y_pred)
print(f'Root Mean Squared Error on Training Set: {rmse}')
print(f'Mean Absolute Error on Training Set: {mae}')

Root Mean Squared Error on Training Set: 120.42796120832135
Mean Absolute Error on Training Set: 67.45991254268975


In [152]:

X_train = df_dropped[feature_columns_1]
y_train = df_dropped['target_variable']
df_dropped_mean_wd = df_dropped['WeatherDelay'].mean()
df_dropped_mean_nd = df_dropped['NASDelay'].mean()
y_pred_train = df_dropped_mean_wd + df_dropped_mean_nd

mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = sqrt(mse_train)
mae_train = mean_absolute_error(y_train, y_pred_train)

print(f'Root Mean Squared Error on Training Set: {rmse_train}')
print(f'Mean Absolute Error on Training Set: {mae_train}')

X_test = df3[feature_columns_1]
y_test = df3['target_variable']
df3_mean_wd = df3['WeatherDelay'].mean()
df3_mean_nd = df3['NASDelay'].mean()
y_pred_test = df3_mean_wd + df3_mean_nd
rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))
mae_test = mean_absolute_error(y_test, y_pred_test)

print(f'Root Mean Squared Error on Test Set: {rmse_test}')
print(f'Mean Absolute Error on Test Set: {mae_test}')


InvalidParameterError: The 'y_pred' parameter of mean_squared_error must be an array-like. Got 76.46327922361787 instead.