In [None]:
import os
import urllib.request
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, cross_val_score, TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
import xgboost as xgb 
import joblib

# Local directory (relative to the working directory) where the CSV is cached.
AIR_QUALITY_DATA_DIR = os.path.join('data', 'air_quality_data')

# Remote location of the Air Quality CSV: repository root plus file name.
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/asharvi1/UCI-Air-Quality-Data/master/'
AIR_QUALITY_URL = ''.join([DOWNLOAD_ROOT, 'AirQualityUCI.csv'])

In [None]:
def get_air_quality_data(air_quality_url=AIR_QUALITY_URL, aq_data_dir=AIR_QUALITY_DATA_DIR):
    """Return the raw Air Quality CSV as a DataFrame, downloading it on first use.

    The file is cached as ``AirQualityUCI.csv`` inside ``aq_data_dir``; the
    download is skipped whenever that file already exists.

    Parameters
    ----------
    air_quality_url : str
        URL the CSV is fetched from when no cached copy exists.
    aq_data_dir : str
        Directory holding the cached CSV; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        The first 15 columns of the CSV, parsed with ``;`` as the field
        separator and ``,`` as the decimal mark.

    Raises
    ------
    FileNotFoundError
        If no cached copy exists and the download fails. The underlying
        error is attached as ``__cause__``.
    """
    full_csv_path = os.path.join(aq_data_dir, 'AirQualityUCI.csv')
    if not os.path.exists(full_csv_path):
        os.makedirs(aq_data_dir, exist_ok=True)

        # Download to a sibling path and rename only on success, so an
        # interrupted transfer can never be mistaken for a valid cached file
        # by the os.path.exists() check on a later call.
        partial_path = full_csv_path + '.part'
        try:
            urllib.request.urlretrieve(air_quality_url, partial_path)
            os.replace(partial_path, full_csv_path)
        except Exception as e:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            # Fail here with the real reason instead of printing and letting
            # read_csv raise an unrelated-looking "file not found" afterwards.
            raise FileNotFoundError(
                f'Failed to download data from {air_quality_url}: {e}'
            ) from e

    return pd.read_csv(full_csv_path, sep=';', decimal=',', usecols=range(15))

def get_air_quality_clean_data(df=None, target='C6H6(GT)'):
    """Return a cleaned copy of the air-quality data.

    Cleaning steps, in order:

    1. every ``-200`` value is replaced with ``NaN``;
    2. rows where all columns other than ``Date`` and ``Time`` are ``NaN``
       are dropped;
    3. rows whose ``target`` value is ``NaN`` are dropped;
    4. the ``NMHC(GT)`` column is removed when present.

    The input frame is never modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Raw data to clean. When ``None`` the data is loaded with
        ``get_air_quality_data()``.
    target : str
        Name of the target column; rows lacking a value for it are removed.

    Returns
    -------
    pandas.DataFrame
        The cleaned data, keeping the original index labels of surviving rows.
    """
    if df is None:
        df = get_air_quality_data()

    # replace() without inplace returns a new frame, so the caller's data
    # stays untouched from here on.
    cleaned = df.replace(-200, np.nan)

    sensor_columns = cleaned.columns.difference(['Date', 'Time'])
    cleaned = cleaned.dropna(subset=sensor_columns, how='all')
    cleaned = cleaned.dropna(subset=[target])

    # NOTE(review): NMHC(GT) is dropped unconditionally; presumably because it
    # is too sparse to be useful - confirm against the dataset.
    if 'NMHC(GT)' in cleaned:
        cleaned = cleaned.drop(columns='NMHC(GT)')
    return cleaned

# Load the dataset (downloading it if no cached copy exists) and clean it
# using the default target column.
air_quality = get_air_quality_clean_data()

In [None]:
# Separate the regression target from the predictors.
target = 'C6H6(GT)'
y = air_quality[target]
X = air_quality.drop(columns=[target])

# Hold out the last 20% of rows as the test set. shuffle=False keeps the rows
# in their original order; random_state only governs shuffling, so it has no
# effect on this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

In [None]:
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Turn the ``Date`` and ``Time`` text columns into numeric calendar features.

    Month, day of week and hour are always encoded as sine/cosine pairs.
    Day of year, year and day of month are added according to the
    constructor flags.

    Parameters
    ----------
    use_day_of_year : bool
        Add ``DayOfYear_sin`` / ``DayOfYear_cos`` (period 365).
    use_year : bool
        Add the raw ``Year`` value.
    use_day : bool
        Add ``Day_sin`` / ``Day_cos`` (period 31).
    """

    def __init__(self, use_day_of_year=True, use_year=True, use_day=True):
        self.use_day_of_year = use_day_of_year
        self.use_year = use_year
        self.use_day = use_day

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    @staticmethod
    def _sin_cos(values, period):
        """Return the ``(sin, cos)`` encoding of ``values`` on a cycle of ``period``."""
        angle = 2 * np.pi * values / period
        return np.sin(angle), np.cos(angle)

    def transform(self, X, y=None):
        """Build the feature frame from ``X['Date']`` and ``X['Time']``.

        The two columns are joined with a space and parsed with the format
        ``%d/%m/%Y %H.%M.%S``; values that fail to parse are coerced rather
        than raising. The result is a DataFrame sharing ``X``'s index.
        """
        stamps = pd.to_datetime(
            X['Date'] + ' ' + X['Time'],
            format='%d/%m/%Y %H.%M.%S',
            errors='coerce',
        )
        parts = stamps.dt

        # Insertion order defines the output column order, so the always-on
        # features come first, followed by the optional ones.
        columns = {}
        columns['Month_sin'], columns['Month_cos'] = self._sin_cos(parts.month, 12)
        columns['DayOfWeek_sin'], columns['DayOfWeek_cos'] = self._sin_cos(parts.dayofweek, 7)
        columns['Hour_sin'], columns['Hour_cos'] = self._sin_cos(parts.hour, 24)

        if self.use_day_of_year:
            columns['DayOfYear_sin'], columns['DayOfYear_cos'] = self._sin_cos(parts.dayofyear, 365)
        if self.use_year:
            columns['Year'] = parts.year
        if self.use_day:
            columns['Day_sin'], columns['Day_cos'] = self._sin_cos(parts.day, 31)

        return pd.DataFrame(columns, index=X.index)

In [None]:
# Column groups: the two timestamp text columns, and every other predictor.
air_quality_dt_columns = ['Date', 'Time']
air_quality_num_columns = X.columns.drop(['Date', 'Time']).tolist()

# Numeric predictors: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Timestamp columns: derive calendar features, then fill any gaps with the
# most frequent value of each derived feature.
dt_pipeline = Pipeline(steps=[
    ('datetime', DateTimeTransformer()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Apply each sub-pipeline to its own column group; outputs are concatenated
# in the order listed (numeric first, then datetime features).
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, air_quality_num_columns),
    ('dt', dt_pipeline, air_quality_dt_columns),
])

# Fit the preprocessing on the training split only and transform it.
X_train_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# Seed the forest so repeated runs of the search select the same model;
# an unseeded forest makes the CV scores (and the chosen hyper-parameters)
# vary from run to run.
forest_reg = RandomForestRegressor(random_state=42)

# Ordered splits: each fold validates on rows that come after its training rows.
tscv = TimeSeriesSplit(n_splits=10)

# 3 * 6 * 3 * 3 * 3 = 486 candidates, each fitted on 10 splits.
param_grid = [
    {
        'n_estimators': [200, 300, 400],
        'max_features': [5, 8, 10, 15, 'log2', 'sqrt'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
]

grid_search = GridSearchCV(forest_reg, param_grid, cv=tscv,
                           scoring='neg_root_mean_squared_error',
                           return_train_score=True, n_jobs=-1)

# NOTE(review): X_train_prepared was produced by fitting the imputers/scaler
# on all of X_train, so every CV fold's validation rows already influenced the
# preprocessing statistics. Wrapping the preprocessing and the regressor in a
# single Pipeline passed to GridSearchCV would avoid that leakage.
grid_search.fit(X_train_prepared, y_train)
forest_reg_model = grid_search.best_estimator_

In [None]:
# Candidate regressors and the hyper-parameter grid to search for each one.
# Every estimator is seeded so results are reproducible.
models = {
    # 3 * 4 * 3 * 3 * 3 = 324 combinations.
    'forest_reg': {
        'estimator': RandomForestRegressor(random_state=42),
        'param_grid': [
            {
                'n_estimators': [200, 300, 400],
                'max_features': [5, 8, 'log2', 'sqrt'],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
        ]
    },
    # NOTE(review): 3**7 * 4 * 4 = 34,992 combinations; if this grid is
    # searched exhaustively with cross-validation it will be very expensive.
    # Consider a randomized search or a smaller grid.
    'xgboost': {
        'estimator': xgb.XGBRegressor(random_state=42),
        'param_grid': [
            {
                'n_estimators': [200, 300, 400],
                'max_depth': [4, 6, 8],
                'learning_rate': [0.05, 0.1, 0.3],
                'subsample': [0.7, 0.9, 1],
                'colsample_bytree': [0.6, 0.8, 1],
                'gamma': [0, 0.1, 0.3],
                'reg_lambda': [1, 5, 10],
                'reg_alpha': [0, 0.1, 0.5, 1],
                'min_child_weight': [1, 5, 10, 20]
            }
        ]
    },
    # 3 * 3 * 3 * 3 * 4 = 324 combinations.
    'hist_grad_boost': {
        'estimator': HistGradientBoostingRegressor(random_state=42),
        'param_grid': [
            {
                'max_iter': [200, 300, 400],
                'max_depth': [4, 6, 8],
                'learning_rate': [0.05, 0.1, 0.3],
                'l2_regularization': [1, 5, 10],
                # HistGradientBoostingRegressor has no `min_child_weight`
                # (that is an XGBoost parameter) and a search over it fails
                # with "Invalid parameter". `min_samples_leaf` is the
                # scikit-learn control for minimum leaf size.
                'min_samples_leaf': [1, 5, 10, 20]
            }
        ]
    }
}