In [None]:
import pandas as pd
df = pd.read_csv('2018.csv')
top_10_groups = df.groupby(['ORIGIN', 'DEST', 'OP_CARRIER']).size().sort_values(ascending=False).head(50)
print(top_10_groups)

In [272]:
import pandas as pd
SELECTED_ORIGIN = 'LGA'
SELECTED_DEST = 'ATL'
SELECTED_OP_CARRIER = 'DL'
start_year = 2018
end_year = 2018

def get_df(origin, dest, carrier, start_year, end_year, exclude=None):
    ans = None
    for year in range(start_year, end_year+1):
        df = pd.read_csv(f"{year}.csv")
        if ans is None:
            ans = df[(df['OP_CARRIER'] == carrier) & (df['ORIGIN'] == origin) & (df['DEST'] == dest)]
            continue
        ans = pd.concat([ans, df[(df['OP_CARRIER'] == carrier) & (df['ORIGIN'] == origin) & (df['DEST'] == dest)]])
    
    return ans


df = get_df(SELECTED_ORIGIN, SELECTED_DEST, SELECTED_OP_CARRIER, start_year, end_year)

In [273]:
filtered_df = df.reset_index()

filtered_df['DEP_DELAY'] = filtered_df['DEP_DELAY'].apply(lambda x: max(0,x))

# Convert the time column to a string and format it
filtered_df['CRS_DEP_TIME'] = filtered_df['CRS_DEP_TIME'].apply(lambda x: '{:04.0f}'.format(x))
filtered_df['CRS_DEP_TIME'] = filtered_df['CRS_DEP_TIME'].str[:2] + ':' + filtered_df['CRS_DEP_TIME'].str[2:]

# Combine date and time columns
filtered_df['datetime'] = pd.to_datetime(filtered_df['FL_DATE'] + ' ' + filtered_df['CRS_DEP_TIME'])
DROPPED_COLUMNS = ['CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 27', 'NAS_DELAY', 'CANCELLED', 'DIVERTED']
filtered_df = filtered_df.drop(columns=DROPPED_COLUMNS)
filtered_df.dropna(inplace=True)

data = filtered_df.copy()
# Convert the time column to a string and format it
data['CRS_DEP_TIME'] = data['CRS_DEP_TIME'].apply(lambda x: x.replace(":","")).astype(int).apply(lambda x: '{:04.0f}'.format(x))
data['CRS_DEP_TIME'] = data['CRS_DEP_TIME'].str[:2] + ':' + data['CRS_DEP_TIME'].str[2:]

# Combine date and time columns
data['datetime'] = pd.to_datetime(data['FL_DATE'] + ' ' + data['CRS_DEP_TIME'])
data = data.sort_values(by='datetime')

In [274]:
import requests
lat = 28.43
long = -81.31
response = requests.get(f'https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={long}&start_date={start_year}-01-01&end_date={end_year}-12-31&hourly=temperature_2m,rain,snowfall,cloudcover,windspeed_100m')
weather = response.json()

weather_dict = {}
from datetime import datetime
for i, date in enumerate(weather['hourly']['time']):
    hour  = datetime.strptime(date, "%Y-%m-%dT%H:%M")
    weather_dict[hour] = {'temperature_2m': weather['hourly']['temperature_2m'][i],'rain': weather['hourly']['rain'][i], 'snowfall': weather['hourly']['snowfall'][i], 'cloudcover': weather['hourly']['cloudcover'][i], 'windspeed_100m': weather['hourly']['windspeed_100m'][i] }

# Floor the datetime to the nearest hour
data["floored_datetime"] = data["datetime"].dt.floor("H")

# Look up the weather data and add new columns to the DataFrame
for feature in ['temperature_2m', 'rain', 'snowfall', 'cloudcover', 'windspeed_100m']:
    data[feature] = data["floored_datetime"].map(lambda x: weather_dict.get(x, {}).get(feature, None))

# Drop the 'floored_datetime' column if not needed
data.drop(columns=["floored_datetime"], inplace=True)


In [275]:
data['DEP_DELAY'].describe()

count    5400.000000
mean       12.977407
std        47.526788
min         0.000000
25%         0.000000
50%         0.000000
75%         3.000000
max      1038.000000
Name: DEP_DELAY, dtype: float64

In [276]:
data

Unnamed: 0,index,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,datetime,temperature_2m,rain,snowfall,cloudcover,windspeed_100m
3,16914,2018-01-01,DL,1447,LGA,ATL,06:00,606.0,6.0,16.0,...,160.0,148.0,125.0,762.0,2018-01-01 06:00:00,13.7,0.0,0.0,64,14.6
1,16479,2018-01-01,DL,904,LGA,ATL,07:00,700.0,0.0,16.0,...,162.0,157.0,125.0,762.0,2018-01-01 07:00:00,13.9,0.0,0.0,74,12.0
6,17248,2018-01-01,DL,1842,LGA,ATL,07:59,757.0,0.0,15.0,...,155.0,144.0,123.0,762.0,2018-01-01 07:59:00,13.9,0.0,0.0,74,12.0
4,16994,2018-01-01,DL,1539,LGA,ATL,10:00,1012.0,12.0,24.0,...,170.0,152.0,122.0,762.0,2018-01-01 10:00:00,14.6,0.2,0.0,100,6.5
0,16284,2018-01-01,DL,658,LGA,ATL,13:00,1542.0,162.0,18.0,...,164.0,150.0,125.0,762.0,2018-01-01 13:00:00,14.3,1.7,0.0,100,15.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5466,7199300,2018-12-31,DL,821,LGA,ATL,13:00,1255.0,0.0,12.0,...,161.0,134.0,117.0,762.0,2018-12-31 13:00:00,20.0,0.0,0.0,55,19.1
5463,7196561,2018-12-31,DL,1877,LGA,ATL,14:00,1356.0,0.0,14.0,...,161.0,139.0,119.0,762.0,2018-12-31 14:00:00,21.4,0.0,0.0,52,18.4
5465,7199187,2018-12-31,DL,645,LGA,ATL,15:00,1454.0,0.0,14.0,...,152.0,140.0,118.0,762.0,2018-12-31 15:00:00,22.9,0.0,0.0,65,16.8
5468,7199728,2018-12-31,DL,1346,LGA,ATL,17:00,1658.0,0.0,12.0,...,168.0,137.0,119.0,762.0,2018-12-31 17:00:00,25.0,0.0,0.0,42,23.6


In [278]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def create_sequences(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

# Using past 5 hours to predict the next hour's delay
window_size = 8

# Convert datetime to its components
data['year'] = data['datetime'].dt.year
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['hour'] = data['datetime'].dt.hour
data['dayofweek'] = data['datetime'].dt.dayofweek  # Monday=0, Sunday=6

# One-hot encode the time variables
data = pd.get_dummies(data, columns=['month', 'day', 'hour', 'dayofweek'])

features = data[['rain', 'snowfall', 'windspeed_100m'] + [col for col in data.columns if 'month_' in col or 'day_' in col or 'hour_' in col or 'dayofweek_' in col]]
X, y = create_sequences(features.values, window_size)
y = data['DEP_DELAY'].values[window_size:]

# Reshape X for LSTM [samples, timesteps, features]
X = X.reshape(X.shape[0], X.shape[1], features.shape[1])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.3))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), shuffle=False)

# Predict
y_pred = model.predict(X_test)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [240]:
from sklearn.metrics import mean_squared_error
from math import sqrt


# Compute RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Square Error (RMSE): {rmse}")

Root Mean Square Error (RMSE): 27.679236873767657


In [253]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Conv1D
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


def create_sequences(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

# Using past 5 hours to predict the next hour's delay
window_size = 8

# Convert datetime to its components
data['year'] = data['datetime'].dt.year
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['hour'] = data['datetime'].dt.hour
data['dayofweek'] = data['datetime'].dt.dayofweek  # Monday=0, Sunday=6

features = data[['month', 'day', 'hour', 'dayofweek', 'rain', 'snowfall', 'windspeed_100m']]
X, y = create_sequences(features.values, window_size)
y = data['DEP_DELAY'].values[window_size:]

# Reshape X for LSTM [samples, timesteps, features]
X = X.reshape(X.shape[0], X.shape[1], features.shape[1])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model with Stacked layers
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, strides=1, activation='relu', padding='causal', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(32, return_sequences=True)) # Add return_sequences=True for stacking
#model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=False))  # Additional LSTM layer
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), shuffle=False)

# Predict
y_pred = model.predict(X_test)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [251]:
from sklearn.metrics import mean_squared_error
from math import sqrt


# Compute RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Square Error (RMSE): {rmse}")

Root Mean Square Error (RMSE): 27.640011489633636


In [254]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def create_sequences(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

# Sample data loading (please ensure you have loaded your data into the 'data' variable)
# data = pd.read_csv('your_data_path.csv')

# Using past 8 hours to predict the next hour's delay
window_size = 8

# Convert datetime to its components
data['year'] = data['datetime'].dt.year
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['hour'] = data['datetime'].dt.hour
data['dayofweek'] = data['datetime'].dt.dayofweek  # Monday=0, Sunday=6

features = data[['month', 'day', 'hour', 'dayofweek', 'rain', 'snowfall', 'windspeed_100m']]
X, y = create_sequences(features.values, window_size)
y = data['DEP_DELAY'].values[window_size:]

# Feature Standardization
scaler_X = MinMaxScaler()
X = scaler_X.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)
scaler_y = MinMaxScaler()
y = scaler_y.fit_transform(y.reshape(-1, 1))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model with Stacked layers
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) # Add return_sequences=True for stacking
model.add(Dropout(0.3))
model.add(LSTM(30, return_sequences=False))  # Additional LSTM layer
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)

# Train model with Callbacks
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), shuffle=False, callbacks=[early_stopping, reduce_lr])

# Predict
y_pred = model.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred)  # Inverse scaling for predictions
y_test = scaler_y.inverse_transform(y_test)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200

KeyboardInterrupt: 

In [231]:
from sklearn.metrics import mean_squared_error
from math import sqrt


# Compute RMSE
y_test = scaler_y.inverse_transform(y_test)

rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Square Error (RMSE): {rmse}")

Root Mean Square Error (RMSE): 42.26330723504036


In [244]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def create_sequences(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

# Sample data loading (please ensure you have loaded your data into the 'data' variable)
# data = pd.read_csv('your_data_path.csv')

# Using past 8 hours to predict the next hour's delay
window_size = 8

# Convert datetime to its components
data['year'] = data['datetime'].dt.year
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['hour'] = data['datetime'].dt.hour
data['dayofweek'] = data['datetime'].dt.dayofweek  # Monday=0, Sunday=6

features = data[['month', 'day', 'hour', 'dayofweek', 'rain', 'snowfall', 'windspeed_100m']]
X, y = create_sequences(features.values, window_size)
y = data['DEP_DELAY'].values[window_size:]

# Feature Standardization
scaler_X = MinMaxScaler()
X = scaler_X.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)
scaler_y = MinMaxScaler()
y = scaler_y.fit_transform(y.reshape(-1, 1))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Build LSTM model with Stacked layers
model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True)) # Add return_sequences=True for stacking
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=False))  # Additional LSTM layer
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)

# Train model with Callbacks
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), shuffle=False, callbacks=[early_stopping, reduce_lr])

# Predict
y_pred = model.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred)  # Inverse scaling for predictions
y_test = scaler_y.inverse_transform(y_test)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200

KeyboardInterrupt: 

In [221]:
from sklearn.metrics import mean_squared_error
from math import sqrt


# Compute RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Square Error (RMSE): {rmse}")


Root Mean Square Error (RMSE): 27.612384856338696


In [222]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Sample data
y_true = y_test

# Define a function to bin the values
def bin_values(values):
    bins = [0, 30, 60, float('inf')]
    labels = ['0-30', '30-60', '60+']
    return np.digitize(values, bins=bins, right=False).astype(str)

# Bin the true and predicted values
binned_y_true = bin_values(y_true)
binned_y_pred = bin_values(y_pred)

# Calculate classification metrics
print("Confusion Matrix:")
print(confusion_matrix(binned_y_true, binned_y_pred))
print("\nClassification Report:")
print(classification_report(binned_y_true, binned_y_pred))
print("\nAccuracy Score:")
print(accuracy_score(binned_y_true, binned_y_pred))


Confusion Matrix:
[[  0   0   0   0]
 [229 819  23   6]
 [ 10  30   4   1]
 [  3  27   5  14]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.93      0.76      0.84      1077
           2       0.12      0.09      0.10        45
           3       0.67      0.29      0.40        49

    accuracy                           0.71      1171
   macro avg       0.43      0.28      0.34      1171
weighted avg       0.89      0.71      0.79      1171


Accuracy Score:
0.7147736976942783


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [242]:
response2 = requests.get('https://archive-api.open-meteo.com/v1/archive?latitude=33.75&longitude=-84.39&start_date=2017-01-01&end_date=2018-12-31&hourly=temperature_2m,rain,snowfall,cloudcover,windspeed_100m')

In [243]:
weather2 = response2.json()

In [244]:
weather_dict = {}
from datetime import datetime
for i, date in enumerate(weather2['hourly']['time']):
    hour  = datetime.strptime(date, "%Y-%m-%dT%H:%M")
    weather_dict[hour] = {'temperature_2m': weather2['hourly']['temperature_2m'][i],'rain': weather2['hourly']['rain'][i], 'snowfall': weather2['hourly']['snowfall'][i], 'cloudcover': weather2['hourly']['cloudcover'][i], 'windspeed_100m': weather2['hourly']['windspeed_100m'][i] }

In [246]:
# Floor the datetime to the nearest hour
data["floored_datetime"] = data["datetime"].dt.floor("H") + pd.Timedelta(hours=2)

# Look up the weather data and add new columns to the DataFrame
for feature in ['temperature_2m', 'rain', 'snowfall', 'cloudcover', 'windspeed_100m','precipitation']:
    data[feature+"_arr" + "_2hr"] = data["floored_datetime"].map(lambda x: weather_dict.get(x, {}).get(feature, None))

# Drop the 'floored_datetime' column if not needed
data.drop(columns=["floored_datetime"], inplace=True)


In [247]:
data.columns

Index(['index', 'FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'tiCRS_DEP_TIMEme', 'datetime', 'temperature_2m', 'rain', 'snowfall',
       'cloudcover', 'windspeed_100m', 'precipitation',
       'temperature_2m_arr_2hr', 'rain_arr_2hr', 'snowfall_arr_2hr',
       'cloudcover_arr_2hr', 'windspeed_100m_arr_2hr', 'precipitation_arr_2hr',
       'year', 'month', 'day', 'hour', 'dayofweek'],
      dtype='object')

In [248]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


def create_sequences(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

# Using past 5 hours to predict the next hour's delay
window_size = 8

# Convert datetime to its components
data['year'] = data['datetime'].dt.year
data['month'] = data['datetime'].dt.month
data['day'] = data['datetime'].dt.day
data['hour'] = data['datetime'].dt.hour
data['dayofweek'] = data['datetime'].dt.dayofweek  # Monday=0, Sunday=6

features = data[['month', 'day', 'hour', 'dayofweek', 'rain', 'snowfall', 'windspeed_100m', 'rain_arr_2hr','snowfall_arr_2hr', 'windspeed_100m_arr_2hr']]
X, y = create_sequences(features.values, window_size)
y = data['DEP_DELAY'].values[window_size:]

# Reshape X for LSTM [samples, timesteps, features]
X = X.reshape(X.shape[0], X.shape[1], features.shape[1])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.3))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), shuffle=False)

# Predict
y_pred = model.predict(X_test)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [250]:
from sklearn.metrics import mean_squared_error
from math import sqrt


# Compute RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Square Error (RMSE): {rmse}")


Root Mean Square Error (RMSE): 33.847136609376165


In [249]:
data

Unnamed: 0,index,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,...,rain_arr_2hr,snowfall_arr_2hr,cloudcover_arr_2hr,windspeed_100m_arr_2hr,precipitation_arr_2hr,year,month,day,hour,dayofweek
6,8463,2017-01-01,DL,1827,MCO,ATL,06:30,631.0,1.0,8.0,...,1.2,0.0,100,29.5,,2017,1,1,6,6
3,7839,2017-01-01,DL,1109,MCO,ATL,08:00,800.0,0.0,14.0,...,0.4,0.0,100,12.6,,2017,1,1,8,6
13,8646,2017-01-01,DL,2032,MCO,ATL,08:30,827.0,0.0,11.0,...,0.4,0.0,100,12.6,,2017,1,1,8,6
4,8460,2017-01-01,DL,1824,MCO,ATL,09:20,919.0,0.0,19.0,...,0.2,0.0,100,13.6,,2017,1,1,9,6
5,8462,2017-01-01,DL,1826,MCO,ATL,10:25,1023.0,0.0,12.0,...,0.0,0.0,100,15.6,,2017,1,1,10,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5910,5662786,2017-12-31,DL,82,MCO,ATL,12:20,1212.0,0.0,12.0,...,0.0,0.0,100,12.6,,2017,12,31,12,6
5911,5663163,2017-12-31,DL,1058,MCO,ATL,14:20,1419.0,0.0,18.0,...,0.0,0.0,100,8.9,,2017,12,31,14,6
5909,5662782,2017-12-31,DL,72,MCO,ATL,15:20,1517.0,0.0,26.0,...,0.0,0.0,100,10.0,,2017,12,31,15,6
5917,5664379,2017-12-31,DL,2586,MCO,ATL,16:25,1623.0,0.0,14.0,...,0.1,0.0,100,16.9,,2017,12,31,16,6
