In [67]:
import os
import urllib.request
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Remote source of the raw Air Quality CSV (GitHub raw-content base URL + file name).
DOWNLOAD_ROOT = 'https://raw.githubusercontent.com/asharvi1/UCI-Air-Quality-Data/master/'
AIR_QUALITY_URL = f'{DOWNLOAD_ROOT}AirQualityUCI.csv'

# Relative directory used as the default local location for the downloaded CSV.
AIR_QUALITY_DATA_DIR = os.path.join('data', 'air_quality_data')

In [75]:
def get_air_quality_data(air_quality_url=AIR_QUALITY_URL, aq_data_dir=AIR_QUALITY_DATA_DIR):
    """Load the Air Quality CSV, downloading it first if it is not cached locally.

    Parameters
    ----------
    air_quality_url : str
        URL the CSV is fetched from when no local copy exists.
    aq_data_dir : str
        Directory holding (or receiving) ``AirQualityUCI.csv``; created if missing.

    Returns
    -------
    pandas.DataFrame
        The first 15 columns of the file, parsed with ``;`` as the field
        separator and ``,`` as the decimal mark.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` or ``pandas.read_csv`` raise
    (e.g. network errors, a malformed file).
    """
    full_csv_path = os.path.join(aq_data_dir, 'AirQualityUCI.csv')
    if not os.path.exists(full_csv_path):
        os.makedirs(aq_data_dir, exist_ok=True)
        # Download to a side file and move it into place only once complete, so
        # an interrupted download is never mistaken for a valid cached copy.
        partial_path = full_csv_path + '.part'
        try:
            urllib.request.urlretrieve(air_quality_url, partial_path)
            os.replace(partial_path, full_csv_path)
        finally:
            # Only present here if the download or the move failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return pd.read_csv(full_csv_path, sep=';', decimal=',', usecols=range(15))

def get_air_quality_clean_data(df=None):
    """Return a cleaned copy of the air-quality data.

    Cleaning steps, in order:

    1. every ``-200`` value is replaced with ``NaN``;
    2. rows in which every column is ``NaN`` are dropped;
    3. the ``'NMHC(GT)'`` column is removed when present.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to clean. When omitted, the data is loaded with
        ``get_air_quality_data()``. The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    if df is None:
        df = get_air_quality_data()
    # Build new frames instead of using inplace=True so the caller's frame is
    # not silently altered and re-running a notebook cell stays idempotent.
    cleaned = df.replace(-200, np.nan).dropna(how='all')
    if 'NMHC(GT)' in cleaned:
        cleaned = cleaned.drop('NMHC(GT)', axis=1)
    return cleaned
    
# Load (downloading on first use) and clean the full data set with the default URL and cache directory.
air_quality = get_air_quality_clean_data()

In [76]:
# Column to predict; every other column is treated as a feature.
target = 'C6H6(GT)'
y = air_quality[target]
X = air_quality.drop(columns=[target])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: `air_quality` is rebound to a copy of the training features from here on,
# so this cell cannot be re-run without first re-running the loading cell.
air_quality = X_train.copy()

In [77]:
# Preprocessing chain: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Training features with the 'Date' and 'Time' columns removed.
air_quality_num_columns = air_quality.drop(['Date', 'Time'], axis=1)

# ColumnTransformer takes a column *selector* (labels, positions, mask, slice or
# callable), not the data itself, so hand it the labels of the frame above
# rather than the DataFrame.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, list(air_quality_num_columns.columns))
])