In [135]:
import os
import urllib.request
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Remote source of the CSV and the local directory used to cache it.
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/asharvi1/UCI-Air-Quality-Data/master/'
AIR_QUALITY_URL = f'{DOWNLOAD_ROOT}AirQualityUCI.csv'
AIR_QUALITY_DATA_DIR = os.path.join('data', 'air_quality_data')

In [None]:
def get_air_quality_data(air_quality_url=AIR_QUALITY_URL, aq_data_dir=AIR_QUALITY_DATA_DIR):
    """Load the air-quality CSV, downloading it into ``aq_data_dir`` if absent.

    Args:
        air_quality_url: URL the CSV is fetched from when no local copy exists.
        aq_data_dir: Directory holding the cached ``AirQualityUCI.csv``; it is
            created when missing.

    Returns:
        pandas.DataFrame built from the first 15 columns of the file, parsed
        with ``;`` as the field separator and ``,`` as the decimal mark.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``pandas.read_csv`` raise
        (e.g. network or parsing errors); they are not caught here.
    """
    full_csv_path = os.path.join(aq_data_dir, 'AirQualityUCI.csv')
    if not os.path.exists(full_csv_path):
        os.makedirs(aq_data_dir, exist_ok=True)
        # Download next to the final path and move into place only on success,
        # so an interrupted transfer never leaves a truncated file that later
        # calls would mistake for a valid cache.
        partial_csv_path = full_csv_path + '.part'
        try:
            urllib.request.urlretrieve(air_quality_url, partial_csv_path)
            os.replace(partial_csv_path, full_csv_path)
        finally:
            # Only true when the download or the move failed.
            if os.path.exists(partial_csv_path):
                os.remove(partial_csv_path)
    # NOTE(review): usecols=range(15) presumably skips trailing empty columns
    # in the raw file -- confirm against the CSV.
    return pd.read_csv(full_csv_path, sep=';', decimal=',', usecols=range(15))

def get_air_quality_clean_data(df=None):
    """Return a cleaned copy of the air-quality data.

    Cleaning steps, in order:
      1. Every ``-200`` value is replaced with NaN (it is treated as a
         missing-value marker).
      2. Rows in which all columns other than ``Date`` and ``Time`` are NaN
         are dropped.
      3. The ``NMHC(GT)`` column is removed when present.

    Args:
        df: Raw frame to clean. When ``None``, the data is loaded through
            ``get_air_quality_data()``.

    Returns:
        A new pandas.DataFrame; the frame passed in is left unmodified so the
        function can be re-run on the same raw data with identical results.
    """
    if df is None:
        df = get_air_quality_data()
    # Non-inplace operations: each step yields a new frame instead of
    # mutating the caller's object.
    cleaned = df.replace(-200, np.nan)
    sensor_columns = cleaned.columns.difference(['Date', 'Time'])
    cleaned = cleaned.dropna(subset=sensor_columns, how='all')
    # errors='ignore' mirrors the "drop only if the column exists" behaviour.
    return cleaned.drop(columns='NMHC(GT)', errors='ignore')
    
# Build the cleaned dataset from the default source (local cached CSV, downloaded first if absent).
air_quality = get_air_quality_clean_data()

In [None]:
# Separate the prediction target from the predictors, then hold out 20% of
# the rows for testing (fixed seed so the split is reproducible).
target = 'C6H6(GT)'
y = air_quality[target]
X = air_quality.drop(columns=target)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
class DateTimeTransformer(BaseEstimator, TransformerMixin):
    """Expand ``Date`` and ``Time`` string columns into calendar features.

    The output always contains ``Month``, ``DayOfWeek`` and ``Hour``; the
    constructor flags switch the remaining columns on or off.

    Args:
        use_day_of_year: Include a ``DayOfYear`` column.
        use_year: Include a ``Year`` column.
        use_day: Include a ``Day`` (day of month) column.
    """

    def __init__(self, use_day_of_year=True, use_year=True, use_day=True):
        self.use_day_of_year = use_day_of_year
        self.use_year = use_year
        self.use_day = use_day

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a DataFrame of calendar features sharing ``X``'s index.

        ``X['Date']`` and ``X['Time']`` are joined with a space and parsed
        with the format ``%d/%m/%Y %H.%M.%S``; values that fail to parse are
        coerced to NaT, so their derived features come out as missing.
        """
        combined = X['Date'] + ' ' + X['Time']
        timestamps = pd.to_datetime(combined, errors='coerce', format='%d/%m/%Y %H.%M.%S')
        parts = timestamps.dt

        # Always-present features, in output column order.
        columns = [
            ('Month', parts.month),
            ('DayOfWeek', parts.dayofweek),
            ('Hour', parts.hour),
        ]

        # Optional features: (enabled flag, output column, .dt attribute).
        optional = (
            (self.use_day_of_year, 'DayOfYear', 'dayofyear'),
            (self.use_year, 'Year', 'year'),
            (self.use_day, 'Day', 'day'),
        )
        for enabled, label, attribute in optional:
            if enabled:
                columns.append((label, getattr(parts, attribute)))

        return pd.DataFrame(dict(columns), index=X.index)

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Date/Time columns: derive calendar features, fill gaps with the most
# frequent value, then standardise.
dt_pipeline = Pipeline(steps=[
    ('datetime', DateTimeTransformer()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
])

# Every column of X other than Date/Time goes through the numeric branch.
air_quality_dt_columns = ['Date', 'Time']
air_quality_num_columns = X.columns.drop(air_quality_dt_columns).tolist()

full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, air_quality_num_columns),
    ('dt', dt_pipeline, air_quality_dt_columns),
])