In [2]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

In [3]:
df1 = pd.read_csv('data/weather-joined/w-2017.csv')
df2 = pd.read_csv('data/weather-joined/w-2018.csv')
df3 = pd.read_csv('data/weather-joined/w-2019.csv')
delay_df = pd.concat([df1,df2,df3])

In [4]:
non_numeric_columns = delay_df.select_dtypes(exclude=['number']).columns.tolist()
print(non_numeric_columns)

['FlightDate', 'IATA_CODE_Reporting_Airline', 'Tail_Number', 'Origin', 'OriginState', 'Dest', 'DestState', 'airport', 'day']


In [5]:
cols_to_drop = ['Unnamed: 0','IATA_CODE_Reporting_Airline', 'Flight_Number_Reporting_Airline', 'Tail_Number', 'OriginState', 'Dest', 'DestState', 'FlightDate', 'OriginAirportID', 'DestAirportID', 'DepTime', 'CRSArrTime', 'ArrTime', 'ArrDelayMinutes', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'SecurityDelay', 'LateAircraftDelay', 'CRSArrHour', 'CRSDepTime', 'airport']
delay_df_dropped = delay_df.drop(cols_to_drop, axis=1)

In [6]:
delay_df_dropped.columns

Index(['DayOfWeek', 'Origin', 'DepDelayMinutes', 'WeatherDelay', 'NASDelay',
       'CRSDepHour', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'apparent_temperature', 'precipitation', 'rain', 'snowfall',
       'snow_depth', 'weather_code', 'surface_pressure', 'cloud_cover',
       'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
       'wind_speed_10m', 'wind_speed_100m', 'wind_gusts_10m',
       'shortwave_radiation', 'direct_radiation', 'diffuse_radiation',
       'direct_normal_irradiance', 'terrestrial_radiation', 'day', 'hour'],
      dtype='object')

In [7]:
import numpy as np 

def encode_cols(df):
    def procDate(day):
        return pd.Series([day.year, day.month])

    df_encoded = df.copy()
    df_encoded[['weekday', 'weekend']] = df_encoded['DayOfWeek'].apply(lambda x: pd.Series([x<6, x>=6]))
    df_encoded[['year', 'month']] = df_encoded['day'].astype('datetime64[ns]').apply(procDate)
    df_encoded['xhr'] = np.sin(np.pi*df_encoded['hour']/12)
    df_encoded['yhr'] = np.cos(np.pi*df_encoded['hour']/12)
    
    df_encoded = df_encoded.drop(['DayOfWeek', 'day', 'hour'], axis=1)

    return df_encoded

def one_hot_encode(df):
    one_hot_encoding_cols = ['weather_code']
    df_encoded = pd.get_dummies(df, columns=one_hot_encoding_cols)
    return df_encoded

delay_df_encoded = encode_cols(delay_df_dropped)

# combined = pd.concat([train_encoded, test_encoded])
delay_df_encoded = one_hot_encode(delay_df_encoded)

# train_encoded = combined.iloc[:len(train_encoded), :]
# test_encoded = combined.iloc[len(train_encoded):, :]

In [298]:
print(delay_df_encoded.columns.tolist())

['Origin', 'DepDelayMinutes', 'WeatherDelay', 'NASDelay', 'CRSDepHour', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'wind_speed_10m', 'wind_speed_100m', 'wind_gusts_10m', 'shortwave_radiation', 'direct_radiation', 'diffuse_radiation', 'direct_normal_irradiance', 'terrestrial_radiation', 'weekday', 'weekend', 'year', 'month', 'xhr', 'yhr', 'weather_code_0.0', 'weather_code_1.0', 'weather_code_2.0', 'weather_code_3.0', 'weather_code_51.0', 'weather_code_53.0', 'weather_code_55.0', 'weather_code_61.0', 'weather_code_63.0', 'weather_code_65.0', 'weather_code_71.0', 'weather_code_73.0', 'weather_code_75.0']


In [8]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

def scale_df(df):
    columns_to_scale = ['temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'wind_speed_10m', 'wind_speed_100m', 'wind_gusts_10m', 'shortwave_radiation', 'direct_radiation', 'diffuse_radiation', 'direct_normal_irradiance', 'terrestrial_radiation']

    scaler = StandardScaler()

    scaled_data = df.copy()
    scaled_data[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    return scaled_data

delay_df_scaled = scale_df(delay_df_encoded)

In [124]:
for weather_ft in ['temperature_2m',
       'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature',
       'precipitation', 'rain', 'snowfall', 'snow_depth', 
       'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid',
       'cloud_cover_high', 'wind_speed_10m', 'wind_speed_100m',
       'wind_gusts_10m', 'shortwave_radiation', 'direct_radiation',
       'diffuse_radiation', 'direct_normal_irradiance',
       'terrestrial_radiation']:
    
    # train_scaled.plot.scatter(x='WeatherDelay',
    #             y=weather_ft,
    #            s=.05);

IndentationError: expected an indented block (1985039446.py, line 12)

In [34]:
delay_df_model = delay_df_scaled.fillna(0)
delay_df_model.isna().sum().sum()

0

In [35]:
delay_df_model = delay_df_model[delay_df_model['Origin'] == 'ORD']

In [36]:
delay_df_model.describe()

Unnamed: 0,DepDelayMinutes,WeatherDelay,NASDelay,CRSDepHour,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,...,wind_gusts_10m,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,terrestrial_radiation,year,month,xhr,yhr
count,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,...,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0,14169.0
mean,68.546334,39.416473,23.143835,13.936058,-0.500201,0.095037,-0.447562,-0.49371,-0.074527,-0.108624,...,0.051452,-0.234333,-0.240478,-0.144238,-0.209213,-0.214245,2017.821371,5.996048,-0.212671,-0.3547475
std,93.085967,76.33851,24.909632,4.614364,1.022163,0.838946,1.052407,0.989545,0.831375,0.817666,...,0.935133,0.842638,0.80483,0.942547,0.886737,0.897034,29.373866,3.720032,0.731454,0.5419841
min,0.0,1.0,1.0,0.0,-3.506288,-3.1391,-3.503825,-3.085363,-0.376184,-0.319428,...,-1.945746,-0.854495,-0.662016,-0.956624,-0.72248,-1.08464,0.0,0.0,-1.0,-1.0
25%,18.0,9.0,8.0,10.0,-1.294035,-0.471377,-1.255188,-1.284559,-0.376184,-0.319428,...,-0.62653,-0.854495,-0.662016,-0.956624,-0.72248,-1.08464,2018.0,2.0,-0.866025,-0.8660254
50%,40.0,19.0,16.0,14.0,-0.776116,0.216194,-0.581354,-0.815762,-0.376184,-0.319428,...,-0.034229,-0.661981,-0.662016,-0.489817,-0.72248,-0.429071,2018.0,6.0,-0.5,-0.5
75%,84.0,41.0,31.0,18.0,0.489086,0.803918,0.577034,0.491303,-0.132798,-0.319428,...,0.584994,0.175809,-0.281739,0.520012,-0.116755,0.505563,2019.0,9.0,0.5,-1.83697e-16
max,1330.0,1316.0,956.0,23.0,1.395444,1.368364,1.186513,1.435767,16.579745,16.847184,...,5.161865,2.582228,3.340399,3.292276,2.794782,1.47988,2019.0,12.0,1.0,1.0


In [37]:
X = delay_df_model.drop(['DepDelayMinutes', 'NASDelay', 'WeatherDelay', 'Origin'], axis=1)
Y = delay_df_model['WeatherDelay'] + delay_df_model['NASDelay']

In [13]:
X = pd.concat([X_train, X_test], ignore_index=True, sort=False)
Y = pd.concat([Y_train, Y_test], ignore_index=True, sort=False)

NameError: name 'X_train' is not defined

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

In [40]:
X_train.describe()

Unnamed: 0,CRSDepHour,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,snow_depth,surface_pressure,...,wind_gusts_10m,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,terrestrial_radiation,year,month,xhr,yhr
count,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,...,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0,9493.0
mean,13.941957,-0.510675,0.085555,-0.460587,-0.503755,-0.074975,-0.108859,0.140461,0.084375,0.117937,...,0.047472,-0.234637,-0.242204,-0.141591,-0.208797,-0.214768,2017.602549,5.987359,-0.213649,-0.3522544
std,4.623351,1.025296,0.845271,1.05726,0.991185,0.82547,0.812042,1.134666,0.658779,0.164721,...,0.935001,0.839174,0.802213,0.942937,0.888068,0.893451,35.881529,3.734957,0.732707,0.5414638
min,0.0,-3.506288,-3.1391,-3.481111,-3.085363,-0.376184,-0.319428,-0.261663,-0.259668,-0.439865,...,-1.945746,-0.854495,-0.662016,-0.956624,-0.72248,-1.08464,0.0,0.0,-1.0,-1.0
25%,10.0,-1.297734,-0.481937,-1.266545,-1.291168,-0.376184,-0.319428,-0.261663,-0.259668,0.021346,...,-0.62653,-0.854495,-0.662016,-0.956624,-0.72248,-1.08464,2018.0,2.0,-0.866025,-0.8660254
50%,14.0,-0.790913,0.206553,-0.592711,-0.830018,-0.376184,-0.319428,-0.261663,-0.259668,0.113204,...,-0.034229,-0.654851,-0.662016,-0.48029,-0.72248,-0.412942,2018.0,6.0,-0.5,-0.5
75%,18.0,0.477988,0.801403,0.569463,0.477063,-0.132798,-0.319428,-0.261663,0.075892,0.230625,...,0.584994,0.175809,-0.281739,0.510485,-0.118294,0.499704,2019.0,9.0,0.5,-1.83697e-16
max,23.0,1.395444,1.368364,1.186513,1.391059,10.819596,11.015463,10.919365,3.095931,0.758479,...,5.161865,2.539447,3.235823,3.006476,2.794782,1.47988,2019.0,12.0,1.0,1.0


In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing  import LabelEncoder
from math import sqrt

model = LinearRegression()
model.fit(X_train, Y_train)

In [210]:
Y_train.describe()

count    76873.000000
mean        78.068255
std        100.976012
min          2.000000
25%         27.000000
50%         49.000000
75%         92.000000
max       1649.000000
dtype: float64

In [42]:
Y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error on Test Set: {rmse}')
print(f'Mean Absolute Error on Test Set: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 5715.525313451941
Root Mean Squared Error on Test Set: 75.60109333503016
Mean Absolute Error on Test Set: 40.23838353018249
R-squared: 0.025535111902293295


In [204]:
Y_pred.describe()

array([81.62887969, 79.30724412, 79.5292804 , ..., 82.49774883,
       80.77302675, 76.07979428])

In [50]:
Y_pred

array([72.82479339, 72.82479339, 53.06904488, ..., 50.73607967,
       99.5       , 72.82479339])

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

rf_model = DecisionTreeRegressor(max_depth=4) #RandomForestRegressor(n_estimators=300, random_state=24) #, random_state=42, n_estimators=100 , max_depth = 5
rf_model.fit(X_train, Y_train)
Y_pred = rf_model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error on Test Set: {rmse}')
print(f'Mean Absolute Error on Test Set: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 5607.147962371523
Root Mean Squared Error on Test Set: 74.88089183744758
Mean Absolute Error on Test Set: 39.853543479171776
R-squared: 0.04401284010766826


In [133]:
Y_pred

array([76.51854934, 72.85397858, 80.25345457, ..., 67.77980546,
       73.98599234, 84.11157044])