# In [None]:  (Jupyter cell prompt left over from the notebook export)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# 1. Load data
# The columns read below (arr_del15, arr_flights) must exist in this file.
df = pd.read_csv("fact_monthly_delay_weather_hurricane.csv")

# Target: delayed arrivals divided by total arrivals, per row. This is a
# fraction; it is only scaled to a percentage after prediction.
# NOTE(review): rows with arr_flights == 0 presumably produce inf/NaN here
# rather than raising — confirm the columns are numeric.
df['arrival_delay_rate'] = df['arr_del15'].div(df['arr_flights'])

# 2. Define features
# Weather columns; the same list is averaged per airport/month to build the
# inputs for the future predictions.
weather_features = [
    'avg_temp', 'total_precip', 'avg_wind_speed',
    'total_snow', 'n_storms', 'sshs_category',
]

# Model inputs: every weather column plus the calendar year (a new list, so
# weather_features itself is left untouched).
feature_cols = [*weather_features, 'year']

# Label-encode the hurricane category whenever it is not already numeric.
# Testing "not numeric" instead of `dtype == 'object'` also covers pandas'
# dedicated string dtypes; with the old check those columns skipped encoding,
# were coerced to NaN by pd.to_numeric below, and their rows were dropped.
# NOTE(review): astype(str) may turn missing categories into the literal label
# 'nan' (pandas-version dependent), giving them their own integer code instead
# of being dropped — confirm this is intended.
if not pd.api.types.is_numeric_dtype(df['sshs_category']):
    le = LabelEncoder()
    df['sshs_category'] = le.fit_transform(df['sshs_category'].astype(str))

# Replace infinities with NaN, then force every model column (features and
# target) to numeric; values that cannot be parsed become NaN.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
model_cols = feature_cols + ['arrival_delay_rate']
for col in model_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows where any feature or the target is missing
df = df.dropna(subset=model_cols)

# 3. Train Random Forest
# Feature matrix and target, both taken from the cleaned frame.
X, y = df[feature_cols], df['arrival_delay_rate']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 300-tree forest on the training portion only (fit() returns the
# fitted estimator, so it can be bound directly).
model = RandomForestRegressor(n_estimators=300, random_state=42).fit(X_train, y_train)

# 4. Define airports with station IDs
# Airport code -> "GHCND:"-prefixed station identifier.
airports_dict = {
    "ATL": "GHCND:USW00013874",
    "DFW": "GHCND:USW00003927",
    "DEN": "GHCND:USW00003017",
    "ORD": "GHCND:USW00094846",
    "LAX": "GHCND:USW00093134",
    "CLT": "GHCND:USW00013881",
    "LAS": "GHCND:USW00023169",
    "MCO": "GHCND:USW00012815",
    "PHX": "GHCND:USW00023183",
    "MIA": "GHCND:USW00012839",
    "SEA": "GHCND:USW00024233",
    "EWR": "GHCND:USW00014734",
    "JFK": "GHCND:USW00094789",
    "SFO": "GHCND:USW00023234",
    "BOS": "GHCND:USW00014739",
}

# Iterating the dict yields its keys in insertion order.
airports = list(airports_dict)
months = range(1, 13)            # 1..12
years_future = range(2025, 2036)  # 2025..2035 inclusive

# 5. Create future dataframe for all combinations
# One row per (airport, year, month): airport varies slowest, month fastest.
future_rows = [
    {'airport': code, 'year': yr, 'month': mo}
    for code in airports
    for yr in years_future
    for mo in months
]
future_df = pd.DataFrame(future_rows)

# 6. Merge historical averages for weather features
# Mean of every weather column per (airport, month) over the cleaned history.
weather_means = (
    df.groupby(['airport', 'month'])[weather_features]
    .mean()
    .reset_index()
)

# Left join keeps every future row; combinations with no historical match get
# NaN weather values.
future_df = pd.merge(future_df, weather_means, how='left', on=['airport', 'month'])

# Ensure year is an integer, then drop rows missing any model feature (this
# removes the unmatched combinations from the join above).
future_df = future_df.astype({'year': int}).dropna(subset=feature_cols)

# 7. Predict arrival delay rate
# The model outputs the delay rate as a fraction; the *_pct column is that
# same value scaled by 100. assign() evaluates its keyword arguments in order,
# so the second column can read the first.
future_df = future_df.assign(
    predicted_delay_rate=model.predict(future_df[feature_cols]),
    predicted_delay_rate_pct=lambda frame: frame['predicted_delay_rate'] * 100,
)

# 8. Apply yearly trend factor
# Each year after base_year scales the prediction by a further `annual_trend`
# (linear, not compounded): factor = 1 + (year - base_year) * annual_trend.
base_year = 2025
annual_trend = 0.01  # 1% per year

future_df['predicted_delay_rate_pct_trend'] = future_df['predicted_delay_rate_pct'] * (
    1 + (future_df['year'] - base_year) * annual_trend
)
# Same trended value expressed as a fraction rather than a percentage.
future_df['predicted_delay_rate_trend_frac'] = future_df['predicted_delay_rate_pct_trend'] / 100

# 9. Export to CSV
# Single write of the final frame; the row index is omitted from the file.
future_df.to_csv("delay_predictions_2025_2035_expanded_trend.csv", index=False)



