In [None]:
import os
import urllib.request
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
import xgboost as xgb 
import joblib

# Remote location of the air-quality CSV and the local directory it is cached in.
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/asharvi1/UCI-Air-Quality-Data/master/'
AIR_QUALITY_URL = f'{DOWNLOAD_ROOT}AirQualityUCI.csv'
AIR_QUALITY_DATA_DIR = os.path.join('data', 'air_quality_data')

In [None]:
def get_air_quality_data(air_quality_url=AIR_QUALITY_URL, aq_data_dir=AIR_QUALITY_DATA_DIR):
    """Return the raw air-quality table, downloading the CSV on first use.

    Parameters
    ----------
    air_quality_url : str
        Location the CSV is fetched from when it is not cached yet.
    aq_data_dir : str
        Directory holding the cached ``AirQualityUCI.csv``.

    Returns
    -------
    pandas.DataFrame
        The first 15 columns of the CSV with all-empty columns removed.

    Raises
    ------
    FileNotFoundError
        If the file is not cached and the download fails.
    """
    full_csv_path = os.path.join(aq_data_dir, 'AirQualityUCI.csv')
    if not os.path.exists(full_csv_path):
        os.makedirs(aq_data_dir, exist_ok=True)

        # Download next to the final path and rename only on success, so an
        # interrupted transfer can never be mistaken for a valid cached file.
        partial_path = full_csv_path + '.part'
        try:
            urllib.request.urlretrieve(air_quality_url, partial_path)
            os.replace(partial_path, full_csv_path)
        except Exception as e:
            print(f'Failed to download data: {e}')
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise FileNotFoundError(
                f'Could not download {air_quality_url} to {full_csv_path}'
            ) from e

    # The file uses ';' as field separator and ',' as decimal mark.
    df = pd.read_csv(full_csv_path, sep=';', decimal=',', usecols=range(15))
    return df.dropna(axis=1, how='all')

def get_air_quality_clean_data(df=None, target='C6H6(GT)'):
    """Return a cleaned copy of the air-quality table.

    The value -200 is converted to NaN, rows with no sensor reading at all
    and rows without a target value are removed, and the 'NMHC(GT)' column
    is dropped when present. The frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Raw table; when omitted it is loaded with ``get_air_quality_data``.
    target : str
        Column that must be present (non-missing) in every kept row.

    Returns
    -------
    pandas.DataFrame
        The cleaned table, keeping the original index labels.

    Raises
    ------
    ValueError
        If the table had to be loaded and turned out to be empty.
    """
    if df is None:
        df = get_air_quality_data()
        if df.empty:
            raise ValueError('Data not loaded - the file is empty')

    # replace() without inplace returns a new frame, so the caller's data
    # is never modified by the steps below.
    cleaned = df.replace(-200, np.nan)

    sensor_columns = cleaned.columns.difference(['Date', 'Time'])
    cleaned = (
        cleaned
        .dropna(subset=sensor_columns, how='all')
        .dropna(subset=[target])
    )

    return cleaned.drop(columns='NMHC(GT)', errors='ignore')

# Load the dataset (no frame is passed, so it is read via get_air_quality_data)
# and clean it using the default target column.
air_quality = get_air_quality_clean_data()

In [257]:
# Separate the regression target from the predictors.
target = 'C6H6(GT)'
y = air_quality[target]
X = air_quality.drop(columns=target)

# shuffle=False keeps the rows in their original order, so the final 20% of
# the table becomes the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False, random_state=42
)

In [258]:
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Turn the 'Date' and 'Time' string columns into numeric calendar features.

    Month, day of week and hour are always encoded as sine/cosine pairs. The
    constructor flags switch on the optional day-of-year pair, the raw year
    and the day-of-month pair.
    """

    def __init__(self, use_day_of_year=True, use_year=True, use_day=True):
        self.use_day_of_year = use_day_of_year
        self.use_year = use_year
        self.use_day = use_day

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Build the calendar feature frame for ``X``.

        ``X`` must expose 'Date' and 'Time' string columns that together match
        ``%d/%m/%Y %H.%M.%S``; values that do not parse are coerced to NaT.
        The result shares ``X``'s index.
        """
        timestamps = pd.to_datetime(
            X['Date'] + ' ' + X['Time'],
            format='%d/%m/%Y %H.%M.%S',
            errors='coerce',
        )
        encoded = {}

        def add_cyclic(name, values, period):
            # Map a periodic quantity onto the unit circle as <name>_sin / <name>_cos.
            encoded[f'{name}_sin'] = np.sin(2 * np.pi * values / period)
            encoded[f'{name}_cos'] = np.cos(2 * np.pi * values / period)

        # Insertion order defines the output column order, so keep it stable.
        add_cyclic('Month', timestamps.dt.month, 12)
        add_cyclic('DayOfWeek', timestamps.dt.dayofweek, 7)
        add_cyclic('Hour', timestamps.dt.hour, 24)

        if self.use_day_of_year:
            add_cyclic('DayOfYear', timestamps.dt.dayofyear, 365)
        if self.use_year:
            encoded['Year'] = timestamps.dt.year
        if self.use_day:
            add_cyclic('Day', timestamps.dt.day, 31)

        return pd.DataFrame(encoded, index=X.index)

In [259]:
# Column groups: the two timestamp columns versus every remaining predictor.
air_quality_dt_columns = ['Date', 'Time']
air_quality_num_columns = X.drop(columns=air_quality_dt_columns).columns.tolist()

# Numeric predictors: fill gaps with the median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Timestamp columns: derive calendar features, then fill any gaps left by
# unparseable timestamps with the most frequent value.
dt_pipeline = Pipeline(steps=[
    ('datetime', DateTimeTransformer()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, air_quality_num_columns),
    ('dt', dt_pipeline, air_quality_dt_columns),
])

# NOTE(review): the preprocessing is fitted once on the whole training split,
# i.e. outside any cross-validation folds applied to X_train_prepared later.
X_train_prepared = full_pipeline.fit_transform(X_train)

In [260]:
# Candidate regressors and the hyper-parameter values sampled for each one.
# Every entry pairs an unfitted estimator with a list of search-space dicts.
models = {
    'forest_reg': {
        'estimator': RandomForestRegressor(random_state=42),
        'param_grid': [
            {
                'n_estimators': [200, 300, 400],
                'max_features': [5, 8, 'log2', 'sqrt'],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
            },
        ],
    },
    'xgboost': {
        'estimator': xgb.XGBRegressor(random_state=42),
        'param_grid': [
            {
                'n_estimators': [200, 300, 400],
                'max_depth': [4, 6, 8],
                'learning_rate': [0.05, 0.1, 0.3],
                'subsample': [0.7, 0.9, 1],
                'colsample_bytree': [0.6, 0.8, 1],
                'gamma': [0, 0.1, 0.3],
                'reg_lambda': [1, 5, 10],
                'reg_alpha': [0, 0.1, 0.5, 1],
                'min_child_weight': [1, 5, 10, 20],
            },
        ],
    },
    'hist_grad_boost': {
        'estimator': HistGradientBoostingRegressor(random_state=42),
        'param_grid': [
            {
                'max_iter': [200, 300, 400],
                'max_depth': [4, 6, 8],
                'learning_rate': [0.05, 0.1, 0.3],
                'l2_regularization': [1, 5, 10],
                'min_samples_leaf': [1, 5, 10, 20],
            },
        ],
    },
}

In [None]:
def train_models(models, X_train, y_train, cv=None, skipped_models=None, n_iter=300, n_jobs=1):
    """Run a randomized hyper-parameter search for every model that is not skipped.

    Parameters
    ----------
    models : dict
        Maps a model name to a dict with the keys 'estimator' and 'param_grid'.
    X_train, y_train
        Training features and target handed to the search's ``fit``.
    cv : cross-validation splitter, optional
        Defaults to ``TimeSeriesSplit(n_splits=10)``, created per call.
    skipped_models : iterable of str, optional
        Names of models that must not be trained.
    n_iter : int
        Number of parameter settings sampled per model.
    n_jobs : int
        Parallel jobs used by the search.

    Returns
    -------
    dict
        Maps each successfully trained model name to a dict holding the fitted
        search object ('search_cv') and its best estimator ('best_estimator').
        Models whose search raised are reported on stdout and left out.
    """
    skipped_models = skipped_models or []
    pending = {name: info for name, info in models.items() if name not in skipped_models}

    trained_models = {}
    if not pending:
        return trained_models

    # Build the default splitter per call rather than once at definition time.
    if cv is None:
        cv = TimeSeriesSplit(n_splits=10)

    for model_name, model_info in pending.items():
        print(f'Training model: {model_name.upper()}')

        estimator = model_info['estimator']
        param_grid = model_info['param_grid']
        if not param_grid:
            print(f'Skipping {model_name}: param_grid is empty')
            continue

        random_search = RandomizedSearchCV(
            estimator, param_distributions=param_grid, cv=cv,
            scoring='neg_root_mean_squared_error', random_state=42,
            return_train_score=True, n_jobs=n_jobs, n_iter=n_iter, verbose=1
        )

        # Best effort: a failing model is reported and the remaining ones still run.
        try:
            random_search.fit(X_train, y_train)
            trained_models[model_name] = {
                'search_cv': random_search,
                'best_estimator': random_search.best_estimator_
            }
        except Exception as e:
            print(f'Error during training {model_name}: {e}')

    return trained_models

In [None]:
def save_models(trained_models, save_dir='models', leaderboard_flag=True):
    """Write every model's search object and best estimator below ``save_dir``.

    Each model gets its own sub-directory containing 'search_cv.joblib' and
    'best_model.joblib'. When ``leaderboard_flag`` is true and at least one
    model was saved, a 'leaderboard.csv' sorted by cross-validated RMSE is
    written to ``save_dir`` as well. Failures for a single model are printed
    and do not stop the remaining models from being saved.
    """
    os.makedirs(save_dir, exist_ok=True)
    rows = []

    for name, bundle in trained_models.items():
        target_dir = os.path.join(save_dir, name)
        os.makedirs(target_dir, exist_ok=True)

        try:
            search = bundle['search_cv']
            joblib.dump(search, os.path.join(target_dir, 'search_cv.joblib'))
            joblib.dump(bundle['best_estimator'], os.path.join(target_dir, 'best_model.joblib'))
            print(f'{name}: saved')

            if leaderboard_flag:
                # best_score_ is a negated RMSE, so flip the sign for reporting.
                rows.append({
                    'Model': name,
                    'CV_RMSE': round(-search.best_score_, 4),
                    'Best_Params': str(search.best_params_),
                })
        except Exception as e:
            print(f'Error during saving {name}: {e}')

    if not leaderboard_flag:
        print('\nLeaderboard generation skipped')
        return

    if not rows:
        return

    leaderboard_path = os.path.join(save_dir, 'leaderboard.csv')
    table = pd.DataFrame(rows).sort_values('CV_RMSE').reset_index(drop=True)
    table.to_csv(leaderboard_path, index=False)
    print(f'\nLeaderboard saved to {leaderboard_path}')

In [None]:
def load_models(models, save_dir):
    """Read back previously saved artefacts for the given model names.

    A model is loaded only when both 'search_cv.joblib' and
    'best_model.joblib' exist in its sub-directory of ``save_dir``. Load
    errors are printed and the affected model is left out of the result.

    Returns a dict mapping model name to a dict with the keys 'search_cv'
    and 'best_estimator'.
    """
    restored = {}

    for model_name in models:
        model_dir = os.path.join(save_dir, model_name)
        search_cv_path = os.path.join(model_dir, 'search_cv.joblib')
        best_model_path = os.path.join(model_dir, 'best_model.joblib')

        if not (os.path.exists(search_cv_path) and os.path.exists(best_model_path)):
            continue

        # NOTE(review): joblib files are pickles - only load artefacts that
        # come from a trusted location.
        try:
            restored[model_name] = {
                'search_cv': joblib.load(search_cv_path),
                'best_estimator': joblib.load(best_model_path),
            }
        except Exception as e:
            print(f'Error loading model {model_name}: {e}')

    return restored

In [None]:
def train_save_models(models, X_train, y_train, save_dir='models', leaderboard_flag=True, cv=None, force_retrain=False):
    """Train the models that have no saved artefacts yet, then persist everything.

    Parameters
    ----------
    models : dict
        Model specifications as expected by ``train_models``.
    X_train, y_train
        Training features and target.
    save_dir : str
        Directory the artefacts are read from and written to.
    leaderboard_flag : bool
        Forwarded to ``save_models``.
    cv : cross-validation splitter, optional
        Defaults to ``TimeSeriesSplit(n_splits=10)``.
    force_retrain : bool
        When true, saved artefacts are ignored and every model is retrained.

    Returns
    -------
    dict
        Previously saved models (unless ``force_retrain``) merged with the
        newly trained ones; a newly trained entry wins on a name clash.
    """
    if cv is None:
        cv = TimeSeriesSplit(n_splits=10)

    if force_retrain:
        # Saved artefacts would be discarded anyway, so do not unpickle them.
        print('\nForce retrain is ON — all models will be retrained.\n')
        loaded_models = {}
    else:
        loaded_models = load_models(models, save_dir)
        if loaded_models:
            print('\nAlready trained models found and will be skipped:')
            for model in loaded_models:
                print(f'  - {model}')

    skipped_model_names = list(loaded_models.keys())

    trained_models = train_models(models, X_train, y_train, cv=cv, skipped_models=skipped_model_names)
    all_models = {**loaded_models, **trained_models}
    save_models(all_models, save_dir=save_dir, leaderboard_flag=leaderboard_flag)
    return all_models

# Train (or reuse saved) models on the preprocessed training split.
results = train_save_models(
    models,
    X_train_prepared,
    y_train,
    save_dir='models',
    leaderboard_flag=True,
    cv=TimeSeriesSplit(n_splits=10),
    force_retrain=False,
)

In [271]:
def evaluate_models(models, X_test, y_test):
    """Score every model's best estimator on the given test data.

    ``models`` maps a model name to a dict that holds the fitted estimator
    under 'best_estimator'. Returns a DataFrame with one row per model and the
    columns 'Model', 'RMSE' and 'R2Score', both metrics rounded to 4 decimals.
    """
    def score_row(name, estimator):
        # One prediction pass per model feeds both metrics.
        predictions = estimator.predict(X_test)
        rmse = root_mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        return {
            'Model': name,
            'RMSE': round(rmse, 4),
            'R2Score': round(r2, 4),
        }

    return pd.DataFrame([
        score_row(name, info['best_estimator'])
        for name, info in models.items()
    ])

# Apply the preprocessing with transform() only (no refit on the test rows),
# then score each model's best estimator on the held-out data.
X_test_prepared = full_pipeline.transform(X_test)
test_results = evaluate_models(results, X_test_prepared, y_test)
test_results

Unnamed: 0,Model,RMSE,R2Score
0,forest_reg,0.609,0.9913
1,xgboost,0.1847,0.9992
2,hist_grad_boost,0.2841,0.9981
