In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import FeatureHasher

In [7]:
dtypes = {
    'DayOfWeek':                       int,
    'FlightDate':                      'string',
    'IATA_CODE_Reporting_Airline':     'string',
    'Tail_Number':                     'string',
    'Flight_Number_Reporting_Airline': int,
    'OriginAirportID':                 int,
    'Origin':                          'string',
    'OriginState':                     'string',
    'DestAirportID':                   int,
    'Dest':                            'string',
    'DestState':                       'string',
    'CRSDepTime':                      int,
    'DepTime':                         np.float64,
    'DepDelayMinutes':                 np.float64,
    'CRSArrTime':                      int,
    'ArrTime':                         np.float64,
    'ArrDelayMinutes':                 np.float64,
    'CRSElapsedTime':                  np.float64,
    'ActualElapsedTime':               np.float64,
    'AirTime':                         np.float64,
    'Cancelled':                       int,
    'CancellationCode':                'string',
    'Diverted':                        int,
    'CarrierDelay':                    np.float64,
    'WeatherDelay':                    np.float64,
    'NASDelay':                        np.float64,
    'SecurityDelay':                   np.float64,
    'LateAircraftDelay':               np.float64
}

In [8]:
files = ['data/weather-joined/full-w-2017.csv', 'data/weather-joined/full-w-2018.csv']

df_all = (pd.read_csv(path, dtype=dtypes) for path in files)
df = pd.concat(df_all, ignore_index=True)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [4]:
non_numeric_columns = df.select_dtypes(exclude=['number']).columns.tolist()
print(non_numeric_columns)

['FlightDate', 'IATA_CODE_Reporting_Airline', 'Tail_Number', 'Origin', 'OriginState', 'Dest', 'DestState', 'airport', 'day']


In [5]:
df = df.drop(['OriginState'], axis=1)

In [51]:
df['TargetVariable'] = df['NASDelay']+df['WeatherDelay']
df3['TargetVariable'] = df3['NASDelay']+df['WeatherDelay']

In [52]:
feature_columns_1 = ['CRSDepTime', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'apparent_temperature', 'precipitation', 'rain', 'snowfall', 'snow_depth', 'weather_code', 'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'wind_speed_10m', 'wind_speed_100m', 'wind_gusts_10m', 'shortwave_radiation', 'direct_radiation', 'diffuse_radiation', 'direct_normal_irradiance', 'terrestrial_radiation','Origin','TargetVariable']
df_new = df[feature_columns_1]
# df_new.count()
df_test_new = df3[feature_columns_1]
df_test_new.count()

CRSDepTime                  44090
temperature_2m              44054
relative_humidity_2m        44054
dew_point_2m                44054
apparent_temperature        44054
precipitation               44054
rain                        44054
snowfall                    44054
snow_depth                  42684
weather_code                44054
surface_pressure            44054
cloud_cover                 44054
cloud_cover_low             44054
cloud_cover_mid             44054
cloud_cover_high            44054
wind_speed_10m              44054
wind_speed_100m             44054
wind_gusts_10m              44054
shortwave_radiation         44054
direct_radiation            44054
diffuse_radiation           44054
direct_normal_irradiance    44054
terrestrial_radiation       44054
Origin                      44090
TargetVariable              44090
dtype: int64

In [53]:
def encoder_(df_):
    n_features = 298  
    hasher = FeatureHasher(n_features=n_features, input_type='string')
    hashed_features = hasher.transform(df_[['Origin']].values.astype(str))
    hashed_df = pd.DataFrame(hashed_features.toarray(), columns=[f'Origin_hashed_{i}' for i in range(n_features)])
    df_.reset_index(drop=True, inplace=True)
    df_encoded_x = pd.concat([df_, hashed_df], axis=1)
    return df_encoded_x

In [54]:
df_encoded = encoder_(df_new)

In [61]:
df_encoded = df_encoded.drop(['Origin'], axis = 1)

In [62]:
df_encoded_test = encoder_(df_test_new)
df_encoded_test = df_encoded_test.drop(['Origin'], axis = 1)

In [63]:
df_fillna = df_encoded.fillna(value = 0)
df_fillna_test = df_encoded_test.fillna(value=0)

In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from math import sqrt

In [74]:
X_train = df_fillna.drop('TargetVariable', axis=1)  # Replace 'TargetVariable' with the actual target column name
Y_train = df_fillna['TargetVariable']
X_test = df_fillna_test.drop('TargetVariable',axis = 1)
Y_test = df_fillna_test['TargetVariable']

In [75]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
model = make_pipeline(SimpleImputer(strategy='mean'), rf_model)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2}')

R-squared: -0.14921371296598318


In [76]:
mae = mean_absolute_error(Y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 61.18232245695872


In [77]:
rmse = sqrt(mse)

In [78]:
print(rmse)

105.90392518656107
