Read in

In [75]:
import pandas as pd
import numpy as np

In [76]:
# Load the grid-level dataset: one row per grid cell and timestamp, carrying
# taxi density, PM2.5 AQI and the weather readings used as model features.
# NOTE(review): the file also contains a leftover 'Unnamed: 0' column (visible
# in the preview output); it is never selected as a model input.
final_gird_dataset = pd.read_csv('final_grid_dataset_final.csv')

In [77]:
# Preview the first five rows to check the column names and value formats.
final_gird_dataset.head(5)

Unnamed: 0,grid_code,time_stamp,taxi_density,pm2.5_aqi,humidity,wind_direction,temp,wind_speed,wind_gust,pressure,weather_id
0,0@7,1680310800,0.0,58.040796,62,S,287.594444,4.4704,0.0,1009.482859,804
1,0@8,1680310800,0.0,58.555084,62,S,287.594444,4.4704,0.0,1009.482859,804
2,0@9,1680310800,0.0,59.016016,62,S,287.594444,4.4704,0.0,1009.482859,804
3,1@9,1680310800,0.0,59.562896,62,S,287.594444,4.4704,0.0,1009.482859,804
4,3@6,1680310800,0.0,57.712418,62,S,287.594444,4.4704,0.0,1009.482859,804


Define the transformers

In [78]:
# List every distinct wind_direction label so the category list hard-coded in
# the wind-direction encoder can be checked against the data.
print(final_gird_dataset['wind_direction'].unique())

['S' 'SSW' 'SW' 'SSE' 'WNW' 'NNW' 'CALM' 'WSW' 'NW' 'N' 'NE' 'ENE' 'E'
 'ESE' 'VAR' 'SE' 'NNE' 'W' 'MISSING']


In [79]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomWindDirectionEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a ``wind_direction`` column into 16 compass-point indicators.

    The output always contains exactly the columns listed in
    ``directional_strings``, in that order, no matter which directions occur in
    the data being transformed. Any value that is not one of those compass
    points (for example 'CALM', 'VAR', 'MISSING' or a missing value) yields an
    all-zero row, i.e. those values share a single implicit reference category.
    """

    def __init__(self):
        self.directional_strings = ['S', 'SSW', 'SW', 'SSE', 'WNW', 'NNW', 'WSW', 'NW', 'N', 'NE', 'ENE', 'E', 'ESE', 'SE', 'NNE', 'W']
    
    def fit(self, X, y=None):
        """Do nothing: the category list is fixed, so there is no state to learn."""
        return self

    def transform(self, X):
        """Return the indicator columns for ``X['wind_direction']``.

        Parameters
        ----------
        X : pandas.DataFrame, pandas.Series or array-like
            A DataFrame must contain a ``wind_direction`` column. A Series, a
            1-D sequence or a single-column 2-D array is interpreted as that
            column.

        Returns
        -------
        pandas.DataFrame
            Integer 0/1 columns named after ``directional_strings``, carrying
            the same index as the input.
        """
        # A bare pd.DataFrame(X) would label the column 0 and make the lookup
        # below fail, so name the column explicitly for non-DataFrame input.
        if isinstance(X, pd.Series):
            X = X.to_frame(name='wind_direction')
        elif not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['wind_direction'])

        wind = X['wind_direction']

        # One vectorised comparison per compass point. Plain arrays are used so
        # the frame is assembled by position rather than by index alignment.
        indicator_columns = {
            direction: wind.eq(direction).to_numpy().astype(int)
            for direction in self.directional_strings
        }

        return pd.DataFrame(indicator_columns, index=X.index, columns=self.directional_strings)

In [80]:
# List every distinct weather_id code so the category list hard-coded in the
# weather-id encoder can be checked against the data.
print(final_gird_dataset['weather_id'].unique())

[804 500 741 803 801 800 200 501 721 300 211 502 711 212 701 600 616 612
 511 601 602 301]


In [81]:
class CustomWeatherIdEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a ``weather_id`` column against a fixed list of weather codes.

    The output always contains one column per entry of ``weather_ids`` (named
    with the code's string form, e.g. ``'804'``), in that order, no matter
    which codes occur in the data being transformed. Codes outside the list
    and missing values yield an all-zero row.
    """

    def __init__(self):
        self.weather_ids = [804, 500, 741, 803, 801, 800, 200, 501, 721, 300, 211, 502, 711, 212, 701, 600, 616, 612, 511, 601, 602, 301]

    def fit(self, X, y=None):
        """Do nothing: the code list is fixed, so there is no state to learn."""
        return self

    def transform(self, X):
        """Return the indicator columns for ``X['weather_id']``.

        Parameters
        ----------
        X : pandas.DataFrame, pandas.Series or array-like
            A DataFrame must contain a ``weather_id`` column. A Series, a 1-D
            sequence or a single-column 2-D array is interpreted as that
            column.

        Returns
        -------
        pandas.DataFrame
            Integer 0/1 columns named ``str(code)`` for each code in
            ``weather_ids``, carrying the same index as the input.
        """
        # A bare pd.DataFrame(X) would label the column 0 and make the lookup
        # below fail, so name the column explicitly for non-DataFrame input.
        if isinstance(X, pd.Series):
            X = X.to_frame(name='weather_id')
        elif not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['weather_id'])

        # Compare numerically rather than through string column names, so a
        # column stored as float (804.0) or as text ('804') still matches 804
        # instead of silently encoding to all zeros.
        codes = pd.to_numeric(X['weather_id'], errors='coerce')

        indicator_columns = {
            str(weather_id): codes.eq(weather_id).to_numpy().astype(int)
            for weather_id in self.weather_ids
        }

        return pd.DataFrame(
            indicator_columns,
            index=X.index,
            columns=[str(weather_id) for weather_id in self.weather_ids],
        )

Split the datasets

In [82]:
from sklearn.model_selection import train_test_split

# Hold out 20% of each grid cell's rows for testing. The split is done per
# grid_code so every cell gets its own train/test pair, keyed by grid_code.
# NOTE(review): the split is random rather than chronological, so test rows are
# interleaved in time with training rows — confirm this is intended.
train_sets, test_sets = {}, {}

for grid_code, grid_rows in final_gird_dataset.groupby('grid_code'):
    train_sets[grid_code], test_sets[grid_code] = train_test_split(
        grid_rows, test_size=0.2, random_state=42
    )

Select 3% of grid_codes as samples

In [83]:
# Work on a reproducible 3% sample of the grid cells (at least one cell) so the
# per-grid model comparison below stays quick.
candidate_grid_codes = list(train_sets)
total_grids = len(train_sets)
sample_size = max(1, int(0.03 * total_grids))

# Seed the global NumPy generator so the same cells are drawn on every run.
np.random.seed(42)
sampled_grid_codes = np.random.choice(candidate_grid_codes, sample_size, replace=False)

Models to predict AQI

In [84]:
# Feature columns for the AQI models: timestamp, taxi density and the weather
# readings. 'wind_direction' and 'weather_id' are categorical and are one-hot
# encoded before fitting.
aqi_model_inputs = [
    'time_stamp',
    'taxi_density',
    'humidity',
    'wind_direction',
    'temp',
    'wind_speed',
    'wind_gust',
    'pressure',
    'weather_id',
]

# Target column for the AQI models.
aqi_model_output = 'pm2.5_aqi'

Random Forest

In [85]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [86]:
# Train one random-forest AQI model per sampled grid cell and score it on that
# cell's held-out rows.
wind_direction_encoder = CustomWindDirectionEncoder()
weather_id_encoder = CustomWeatherIdEncoder()

rf_aqi_models = {}
rf_aqi_rmses = []

for grid_code in sampled_grid_codes:
    train_set, test_set = train_sets[grid_code], test_sets[grid_code]

    X_train, y_train = train_set[aqi_model_inputs], train_set[aqi_model_output]
    X_test, y_test = test_set[aqi_model_inputs], test_set[aqi_model_output]

    # One-hot encode the two categorical columns for both splits.
    X_train_wind_direction_encoded = wind_direction_encoder.fit_transform(X_train[['wind_direction']])
    X_train_weather_id_encoded = weather_id_encoder.fit_transform(X_train[['weather_id']])
    X_test_wind_direction_encoded = wind_direction_encoder.transform(X_test[['wind_direction']])
    X_test_weather_id_encoded = weather_id_encoder.transform(X_test[['weather_id']])

    # Replace the raw categorical columns with their indicator columns. Every
    # piece gets a fresh 0..n-1 index so the column-wise concat lines up by
    # position.
    train_parts = [
        X_train.drop(columns=['wind_direction', 'weather_id']),
        X_train_wind_direction_encoded,
        X_train_weather_id_encoded,
    ]
    test_parts = [
        X_test.drop(columns=['wind_direction', 'weather_id']),
        X_test_wind_direction_encoded,
        X_test_weather_id_encoded,
    ]
    X_train_encoded = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
    X_test_encoded = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train_encoded, y_train)
    rf_aqi_models[grid_code] = model

    # Score on the held-out rows of this grid cell.
    y_pred = model.predict(X_test_encoded)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rf_aqi_rmses.append(rmse)
    print(f'Grid Code: {grid_code}, RMSE: {rmse}')

average_rmse = np.mean(rf_aqi_rmses)
print(f'Average RMSE: {average_rmse}')

Grid Code: 22@25, RMSE: 9.658714165306263
Grid Code: 12@17, RMSE: 6.91271124987585
Grid Code: 25@27, RMSE: 10.855794188532194
Grid Code: 6@18, RMSE: 11.508451923249963
Grid Code: 13@13, RMSE: 15.17454778357028
Grid Code: 6@13, RMSE: 12.919954360199153
Grid Code: 10@23, RMSE: 13.084759813487482
Grid Code: 13@14, RMSE: 25.716592896465144
Grid Code: 23@41, RMSE: 15.994309650958447
Grid Code: 16@29, RMSE: 11.981178625656527
Average RMSE: 13.38070146573013


Linear Regression

In [87]:
from sklearn.linear_model import LinearRegression

In [88]:
# Train one linear-regression AQI model per sampled grid cell, on the same
# features and splits as the random forest, to serve as a comparison.
wind_direction_encoder = CustomWindDirectionEncoder()
weather_id_encoder = CustomWeatherIdEncoder()

lr_aqi_models = {}
lr_aqi_rmses = []

for grid_code in sampled_grid_codes:
    train_set, test_set = train_sets[grid_code], test_sets[grid_code]

    X_train, y_train = train_set[aqi_model_inputs], train_set[aqi_model_output]
    X_test, y_test = test_set[aqi_model_inputs], test_set[aqi_model_output]

    # One-hot encode the two categorical columns for both splits.
    X_train_wind_direction_encoded = wind_direction_encoder.fit_transform(X_train[['wind_direction']])
    X_train_weather_id_encoded = weather_id_encoder.fit_transform(X_train[['weather_id']])
    X_test_wind_direction_encoded = wind_direction_encoder.transform(X_test[['wind_direction']])
    X_test_weather_id_encoded = weather_id_encoder.transform(X_test[['weather_id']])

    # Replace the raw categorical columns with their indicator columns. Every
    # piece gets a fresh 0..n-1 index so the column-wise concat lines up by
    # position.
    train_parts = [
        X_train.drop(columns=['wind_direction', 'weather_id']),
        X_train_wind_direction_encoded,
        X_train_weather_id_encoded,
    ]
    test_parts = [
        X_test.drop(columns=['wind_direction', 'weather_id']),
        X_test_wind_direction_encoded,
        X_test_weather_id_encoded,
    ]
    X_train_encoded = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
    X_test_encoded = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)

    model = LinearRegression()
    model.fit(X_train_encoded, y_train)
    lr_aqi_models[grid_code] = model

    # Score on the held-out rows of this grid cell.
    y_pred = model.predict(X_test_encoded)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    lr_aqi_rmses.append(rmse)
    print(f'Grid Code: {grid_code}, RMSE: {rmse}')

average_rmse = np.mean(lr_aqi_rmses)
print(f'Average RMSE: {average_rmse}')

Grid Code: 22@25, RMSE: 26.245201767365735
Grid Code: 12@17, RMSE: 19.729409339722835
Grid Code: 25@27, RMSE: 21.599045293415376
Grid Code: 6@18, RMSE: 23.427375296369227
Grid Code: 13@13, RMSE: 29.49850078673895
Grid Code: 6@13, RMSE: 26.420300933979764
Grid Code: 10@23, RMSE: 24.717979773078454
Grid Code: 13@14, RMSE: 45.34018877473013
Grid Code: 23@41, RMSE: 31.81207144854486
Grid Code: 16@29, RMSE: 25.32207463240675
Average RMSE: 27.41121480463521


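Random forest clearly outperforms linear regression here (average RMSE 13.38 vs. 27.41), so we use random forest for the remaining models.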

Models to predict taxi_density

In [89]:
# Feature columns for the taxi-density models: the AQI feature list without
# 'taxi_density', which is the target here.
taxi_model_inputs = [
    'time_stamp',
    'humidity',
    'wind_direction',
    'temp',
    'wind_speed',
    'wind_gust',
    'pressure',
    'weather_id',
]

# Target column for the taxi-density models.
taxi_model_output = 'taxi_density'

In [90]:
# Train one random-forest taxi-density model per sampled grid cell. The
# wind-direction and weather-id encoders created in an earlier cell are reused.
rf_taxi_models = {}
rf_taxi_rmses = []

for grid_code in sampled_grid_codes:
    train_set, test_set = train_sets[grid_code], test_sets[grid_code]

    X_train, y_train = train_set[taxi_model_inputs], train_set[taxi_model_output]
    X_test, y_test = test_set[taxi_model_inputs], test_set[taxi_model_output]

    # One-hot encode the two categorical columns for both splits.
    X_train_wind_direction_encoded = wind_direction_encoder.fit_transform(X_train[['wind_direction']])
    X_train_weather_id_encoded = weather_id_encoder.fit_transform(X_train[['weather_id']])
    X_test_wind_direction_encoded = wind_direction_encoder.transform(X_test[['wind_direction']])
    X_test_weather_id_encoded = weather_id_encoder.transform(X_test[['weather_id']])

    # Replace the raw categorical columns with their indicator columns. Every
    # piece gets a fresh 0..n-1 index so the column-wise concat lines up by
    # position.
    train_parts = [
        X_train.drop(columns=['wind_direction', 'weather_id']),
        X_train_wind_direction_encoded,
        X_train_weather_id_encoded,
    ]
    test_parts = [
        X_test.drop(columns=['wind_direction', 'weather_id']),
        X_test_wind_direction_encoded,
        X_test_weather_id_encoded,
    ]
    X_train_encoded = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
    X_test_encoded = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train_encoded, y_train)
    rf_taxi_models[grid_code] = model

    # Score on the held-out rows of this grid cell.
    y_pred = model.predict(X_test_encoded)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rf_taxi_rmses.append(rmse)
    print(f'Grid Code: {grid_code}, RMSE: {rmse}')

average_taxi_rmse = np.mean(rf_taxi_rmses)
print(f'Average Taxi RMSE: {average_taxi_rmse}')

Grid Code: 22@25, RMSE: 2.0693280087603427
Grid Code: 12@17, RMSE: 257.30898689639696
Grid Code: 25@27, RMSE: 4.3794484379057925
Grid Code: 6@18, RMSE: 76.28458645531391
Grid Code: 13@13, RMSE: 91.0547488918708
Grid Code: 6@13, RMSE: 209.6403323334904
Grid Code: 10@23, RMSE: 153.6096526572783
Grid Code: 13@14, RMSE: 102.9333788248887
Grid Code: 23@41, RMSE: 29.35764774982325
Grid Code: 16@29, RMSE: 71.26066191624943
Average Taxi RMSE: 99.7898772171978


Considering that taxi density can range from 0 to 3000, an average RMSE of about 100 is acceptable.

Test cascaded models

In [91]:
# Evaluate the two-stage ("cascaded") pipeline on each sampled grid cell:
#   1. predict taxi density from the timestamp and weather features;
#   2. feed that prediction, in place of the observed taxi density, into the
#      AQI model and score the resulting AQI prediction against the truth.
cascaded_models = {}
cascaded_rmses = []

for grid_code in sampled_grid_codes:
    test_set = test_sets[grid_code]

    X_test_taxi = test_set[taxi_model_inputs]
    y_test_aqi = test_set[aqi_model_output]

    # Both stages see the same test rows, so the categorical columns are
    # encoded once here and the result is reused for the AQI stage.
    X_test_wind_direction_encoded = wind_direction_encoder.transform(X_test_taxi[['wind_direction']])
    X_test_weather_id_encoded = weather_id_encoder.transform(X_test_taxi[['weather_id']])

    # Stage 1: predict taxi density.
    X_test_taxi_encoded = pd.concat([X_test_taxi.drop(columns=['wind_direction', 'weather_id']).reset_index(drop=True), 
                                     X_test_wind_direction_encoded.reset_index(drop=True), 
                                     X_test_weather_id_encoded.reset_index(drop=True)], axis=1)

    taxi_model = rf_taxi_models[grid_code]
    predicted_taxi_density = taxi_model.predict(X_test_taxi_encoded)

    # Stage 2: overwrite the observed taxi density with the stage-1 prediction
    # (on a copy, so test_set itself is left untouched) and predict AQI.
    X_test_aqi = test_set[aqi_model_inputs].copy()
    X_test_aqi['taxi_density'] = predicted_taxi_density

    X_test_aqi_encoded = pd.concat([X_test_aqi.drop(columns=['wind_direction', 'weather_id']).reset_index(drop=True), 
                                    X_test_wind_direction_encoded.reset_index(drop=True), 
                                    X_test_weather_id_encoded.reset_index(drop=True)], axis=1)

    aqi_model = rf_aqi_models[grid_code]
    predicted_aqi = aqi_model.predict(X_test_aqi_encoded)

    rmse = np.sqrt(mean_squared_error(y_test_aqi, predicted_aqi))
    cascaded_rmses.append(rmse)
    print(f'Grid Code: {grid_code}, Cascaded RMSE: {rmse}')

    # Keep the two fitted stages together, keyed by grid cell.
    cascaded_models[grid_code] = {
        'taxi_model': taxi_model,
        'aqi_model': aqi_model
    }

average_cascaded_rmse = np.mean(cascaded_rmses)
print(f'Average Cascaded RMSE: {average_cascaded_rmse}')

Grid Code: 22@25, Cascaded RMSE: 9.81327302681314
Grid Code: 12@17, Cascaded RMSE: 7.063279872613036
Grid Code: 25@27, Cascaded RMSE: 10.954010144729002
Grid Code: 6@18, Cascaded RMSE: 11.887080433145057
Grid Code: 13@13, Cascaded RMSE: 15.66595141973561
Grid Code: 6@13, Cascaded RMSE: 13.444779264670538
Grid Code: 10@23, Cascaded RMSE: 13.377397623782878
Grid Code: 13@14, Cascaded RMSE: 26.190757814929295
Grid Code: 23@41, Cascaded RMSE: 16.359913744517872
Grid Code: 16@29, Cascaded RMSE: 12.479241628792803
Average Cascaded RMSE: 13.723568497372924


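The cascaded pipeline's average RMSE (13.72) is only slightly higher than that of the AQI model fed with the observed taxi density (13.38), so we decide to use the random forest models.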