The previous model is too large. Instead, let's treat `grid_code` as one of the input features.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Load the combined per-grid dataset from the working directory.
# NOTE(review): the name keeps its original spelling ("gird") because later
# cells reference it. The 'Unnamed: 0' column visible in the preview is
# presumably the row index written when the CSV was saved — confirm; it is
# not part of either model's input list.
final_gird_dataset = pd.read_csv(filepath_or_buffer='final_grid_dataset_final.csv')

# Preview the first rows (head() returns five rows by default).
final_gird_dataset.head()

Unnamed: 0,grid_code,time_stamp,taxi_density,pm2.5_aqi,humidity,wind_direction,temp,wind_speed,wind_gust,pressure,weather_id
0,0@7,1680310800,0.0,52.226051,62,S,287.594444,4.4704,0.0,1009.482859,804
1,0@8,1680310800,0.0,52.226051,62,S,287.594444,4.4704,0.0,1009.482859,804
2,0@9,1680310800,0.0,68.886052,62,S,287.594444,4.4704,0.0,1009.482859,804
3,1@9,1680310800,0.0,68.886052,62,S,287.594444,4.4704,0.0,1009.482859,804
4,3@6,1680310800,0.0,52.226051,62,S,287.594444,4.4704,0.0,1009.482859,804


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class WindDirectionEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a wind-direction column into a fixed set of compass columns.

    The output always contains one integer (0/1) column per entry of
    ``directional_strings``, in that order, no matter which directions occur
    in the data being transformed. Values that are not in that list
    (including missing values) yield an all-zero row.
    """

    def __init__(self):
        # Fixed vocabulary; its order defines the output column order.
        self.directional_strings = ['S', 'SSW', 'SW', 'SSE', 'WNW', 'NNW', 'WSW', 'NW', 'N', 'NE', 'ENE', 'E', 'ESE', 'SE', 'NNE', 'W']

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the one-hot encoding of the wind direction.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            A DataFrame must contain a ``'wind_direction'`` column (a
            ``KeyError`` is raised otherwise). Any other array-like has no
            column names, so its first column is used.

        Returns
        -------
        pandas.DataFrame
            Integer 0/1 columns named after ``directional_strings``, with the
            same index as the input column.
        """
        if isinstance(X, pd.DataFrame):
            directions = X['wind_direction']
        else:
            # A frame built from a plain array has integer column labels, so
            # looking up 'wind_direction' on it would raise KeyError.
            directions = pd.DataFrame(X).iloc[:, 0]

        # Direct membership tests give the same result as get_dummies followed
        # by adding the missing columns and dropping the unknown ones.
        encoded = {
            name: directions.isin([name]).to_numpy(dtype=int)
            for name in self.directional_strings
        }
        return pd.DataFrame(encoded, index=directions.index, columns=self.directional_strings)

In [4]:
class WeatherIdEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a weather-id column into a fixed set of columns.

    The output always contains one integer (0/1) column per entry of
    ``weather_ids``, named with the id as a string and in that order. Ids
    that are not in the list (including missing values) yield an all-zero row.
    """

    def __init__(self):
        # Fixed vocabulary; its order defines the output column order.
        self.weather_ids = [804, 500, 741, 803, 801, 800, 200, 501, 721, 300, 211, 502, 711, 212, 701, 600, 616, 612, 511, 601, 602, 301]

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the one-hot encoding of the weather id.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            A DataFrame must contain a ``'weather_id'`` column (a ``KeyError``
            is raised otherwise). Any other array-like has no column names,
            so its first column is used.

        Returns
        -------
        pandas.DataFrame
            Integer 0/1 columns named ``str(weather_id)`` for every known id,
            with the same index as the input column.
        """
        if isinstance(X, pd.DataFrame):
            weather = X['weather_id']
        else:
            # A frame built from a plain array has integer column labels, so
            # looking up 'weather_id' on it would raise KeyError.
            weather = pd.DataFrame(X).iloc[:, 0]

        # Compare numerically rather than through get_dummies' string labels:
        # a float column would produce labels such as '804.0', which never
        # match '804' and would silently encode every row as all zeros.
        codes = pd.to_numeric(weather, errors='coerce')

        column_names = [str(weather_id) for weather_id in self.weather_ids]
        encoded = {
            str(weather_id): codes.isin([weather_id]).to_numpy(dtype=int)
            for weather_id in self.weather_ids
        }
        return pd.DataFrame(encoded, index=weather.index, columns=column_names)

In [5]:
class GridCodeEncoder(BaseEstimator, TransformerMixin):
    """Split a ``'<x>@<y>'`` grid code into integer ``grid_x`` / ``grid_y`` columns."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Replace the ``grid_code`` column with ``grid_x`` and ``grid_y``.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            A DataFrame must contain a ``'grid_code'`` column (a ``KeyError``
            is raised otherwise). Any other array-like has no column names,
            so its first column is treated as the grid code.

        Returns
        -------
        pandas.DataFrame
            The input without ``grid_code`` and with two added integer
            columns, ``grid_x`` and ``grid_y``. The input is not modified.

        Raises
        ------
        ValueError
            If a part of a grid code cannot be converted to ``int``.
        """
        if isinstance(X, pd.DataFrame):
            X = X.copy()
        else:
            # A frame built from a plain array has integer column labels, so
            # looking up 'grid_code' on it would raise KeyError; name the
            # first column instead (rename returns a new frame).
            X = pd.DataFrame(X)
            X = X.rename(columns={X.columns[0]: 'grid_code'})

        X[['grid_x', 'grid_y']] = X['grid_code'].str.split('@', expand=True).astype(int)
        return X.drop(columns=['grid_code'])

Split the dataset into training and test sets, separately for each grid_code, as before

In [6]:
from sklearn.model_selection import train_test_split

# Split every grid cell's rows 80/20 on its own, so the training and the test
# set both receive rows from each grid_code.
train_parts = []
test_parts = []
for grid_code, group in final_gird_dataset.groupby('grid_code'):
    train_set_one_grid, test_set_one_grid = train_test_split(group, test_size=0.2, random_state=42)
    train_parts.append(train_set_one_grid)
    test_parts.append(test_set_one_grid)

# Concatenate once at the end; growing the frames inside the loop would
# re-copy all previously collected rows on every iteration.
train_set = pd.concat(train_parts, axis=0)
test_set = pd.concat(test_parts, axis=0)

count = len(train_parts)
print(f'Total: {count} grid_codes.')

Total: 358 grid_codes.


Defining inputs and outputs

In [7]:
# Weather columns used by both models, in the order they are fed to the pipelines.
shared_weather_inputs = ['humidity', 'wind_direction', 'temp', 'wind_speed', 'wind_gust', 'pressure', 'weather_id']

# Taxi-density model: location + time + weather -> taxi_density
taxi_model_inputs = ['grid_code', 'time_stamp'] + shared_weather_inputs
taxi_model_output = 'taxi_density'

# AQI model: same inputs plus taxi_density (placed right after time_stamp) -> pm2.5_aqi
aqi_model_inputs = ['grid_code', 'time_stamp', 'taxi_density'] + shared_weather_inputs
aqi_model_output = 'pm2.5_aqi'

Preprocessor

In [8]:
def build_preprocessor():
    """Return a new, unfitted ColumnTransformer shared by both models.

    ``grid_code``, ``wind_direction`` and ``weather_id`` are each handled by
    their custom encoder; every other input column is passed through
    unchanged.
    """
    return ColumnTransformer(
        transformers=[
            ('gridcode', GridCodeEncoder(), ['grid_code']),
            ('winddirection', WindDirectionEncoder(), ['wind_direction']),
            ('weatherid', WeatherIdEncoder(), ['weather_id'])
        ],
        remainder='passthrough'
    )


def build_forest_pipeline(preprocessor):
    """Return a new Pipeline: the given preprocessor followed by a random forest."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=50, max_depth=10, min_samples_split=20, min_samples_leaf=10, n_jobs=-1, random_state=42))
    ])


# Each model gets its own preprocessor and regressor instances, so fitting one
# pipeline never touches the other.
preprocessor_taxi = build_preprocessor()
preprocessor_aqi = build_preprocessor()

taxi_pipeline = build_forest_pipeline(preprocessor_taxi)
aqi_pipeline = build_forest_pipeline(preprocessor_aqi)

Train and test taxi model

In [9]:
# Features and target for the taxi-density model, taken from the per-grid split.
X_train_taxi, y_train_taxi = train_set[taxi_model_inputs], train_set[taxi_model_output]
X_test_taxi, y_test_taxi = test_set[taxi_model_inputs], test_set[taxi_model_output]

# Fit on the training rows, then predict the held-out rows.
y_pred_taxi = taxi_pipeline.fit(X_train_taxi, y_train_taxi).predict(X_test_taxi)

# Report the root of the mean squared error on the test rows.
mse_taxi = mean_squared_error(y_test_taxi, y_pred_taxi)
rmse_taxi = np.sqrt(mse_taxi)
print(f'Taxi Density Model RMSE: {rmse_taxi}')

Taxi Density Model RMSE: 297.22151100299067


Train and test the PM2.5 AQI model

In [10]:
# Features and target for the PM2.5 AQI model, taken from the per-grid split.
X_train_aqi, y_train_aqi = train_set[aqi_model_inputs], train_set[aqi_model_output]
X_test_aqi, y_test_aqi = test_set[aqi_model_inputs], test_set[aqi_model_output]

# Fit on the training rows, then predict the held-out rows.
y_pred_aqi = aqi_pipeline.fit(X_train_aqi, y_train_aqi).predict(X_test_aqi)

# Report the root of the mean squared error on the test rows.
mse_aqi = mean_squared_error(y_test_aqi, y_pred_aqi)
rmse_aqi = np.sqrt(mse_aqi)
print(f'AQI Model RMSE: {rmse_aqi}')

AQI Model RMSE: 25.23263059575479


Train models on the entire dataset and save them

In [9]:
import joblib
from sklearn.base import clone

# Fit unfitted copies of the pipelines on the whole dataset and save those.
# Refitting taxi_pipeline / aqi_pipeline in place would overwrite the models
# that produced the RMSE figures above, so re-running a predict call afterwards
# would silently score a model that has already seen the test rows.
#
# NOTE: the saved pipelines contain the custom encoder classes defined in this
# notebook. Pickle stores classes by reference, so whatever loads these files
# must be able to import those classes under the same module name.
X_taxi = final_gird_dataset[taxi_model_inputs]
y_taxi = final_gird_dataset[taxi_model_output]
taxi_pipeline_full = clone(taxi_pipeline).fit(X_taxi, y_taxi)

joblib.dump(taxi_pipeline_full, 'taxi_density_model.pkl')

X_aqi = final_gird_dataset[aqi_model_inputs]
y_aqi = final_gird_dataset[aqi_model_output]
aqi_pipeline_full = clone(aqi_pipeline).fit(X_aqi, y_aqi)

joblib.dump(aqi_pipeline_full, 'aqi_model.pkl')

['aqi_model.pkl']