# Libraries

In [7]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # seaborn warning about not using data=... notation
import seaborn as sns

import os

from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Notebook-wide plotting default: hand a (25, 12) figure size to seaborn's rc settings
sns.set(rc = {'figure.figsize':(25, 12)})

# Data import

In [2]:
def loadRawData():
    """Load every city CSV found in ``data/`` into one hourly DataFrame.

    For each file:
      * a ``date`` timestamp is built from the ``year``/``month``/``day``/``hour``
        columns (which are kept, as strings, so they can be turned into dummies later),
      * all columns whose name contains ``'PM'`` (one per measuring station) are
        averaged into ``meanPM``, rounded to 2 decimals, and then dropped together
        with ``No`` and ``Iprec``,
      * ``source`` is set to the part of the filename that precedes ``'PM'``.

    Files are processed in sorted filename order so the resulting row order is
    reproducible; entries of ``data/`` that are not ``.csv`` files are ignored.

    Returns:
        pandas.DataFrame: ``date``, ``source`` and ``meanPM`` first, then the remaining
        columns, then ``dayOfWeek`` (``date.dt.dayofweek``). Only rows strictly after
        2012-01-01 00:00 are kept and the index is reset to 0..n-1.

    Raises:
        ValueError: if ``data/`` does not contain any ``.csv`` file.
    """
    dateParts = ['year', 'month', 'day', 'hour']
    frames = []

    # os.listdir returns entries in arbitrary order -> sort for a reproducible result
    for file in sorted(os.listdir('data')):
        # Skip anything that is not a data file (.DS_Store, .ipynb_checkpoints, notes, ...)
        if not file.lower().endswith('.csv'):
            continue

        # Date parts are read as strings: they are both concatenated into the timestamp
        # below and kept as categorical columns (will be used as dummies)
        temp = pd.read_csv(f'data/{file}', dtype = {part: str for part in dateParts})

        # Vectorised replacement for the deprecated parse_dates-dict / date_parser /
        # keep_date_col combination of read_csv
        temp['date'] = pd.to_datetime(
            temp['year'] + ' ' + temp['month'] + ' ' + temp['day'] + ' ' + temp['hour'],
            format = '%Y %m %d %H'
        )

        # Values for different stations in each city are similar, so we can take the mean of them
        targetCols = [col for col in temp.columns if 'PM' in col]
        temp['meanPM'] = temp[targetCols].mean(axis=1).round(2)

        temp = temp.drop(columns = targetCols + ['No', 'Iprec'])

        # Adding the source of the data from the filename
        temp['source'] = file.split('PM')[0]
        frames.append(temp)

    if not frames:
        raise ValueError("No .csv files found in the 'data' directory")

    df = pd.concat(frames, axis = 0)

    # Moving important columns to the front, will be useful when categorical columns are converted to dummies
    colsToMove = ['date', 'source', 'meanPM']
    df = df[colsToMove + [col for col in df.columns if col not in colsToMove]]
    df['dayOfWeek'] = df['date'].dt.dayofweek

    # Keep only observations strictly after 2012-01-01 00:00
    df = df[df.date > datetime(2012, 1, 1)]

    return df.reset_index(drop = True)

# Data wrangling

In [12]:
# Inspect the freshly loaded frame: column names, dtypes and non-null counts
loadRawData().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175315 entries, 0 to 175314
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   date           175315 non-null  datetime64[ns]
 1   source         175315 non-null  object        
 2   meanPM         158144 non-null  float64       
 3   year           175315 non-null  object        
 4   month          175315 non-null  object        
 5   day            175315 non-null  object        
 6   hour           175315 non-null  object        
 7   season         175314 non-null  float64       
 8   DEWP           174795 non-null  float64       
 9   HUMI           174465 non-null  float64       
 10  PRES           174452 non-null  float64       
 11  TEMP           174797 non-null  float64       
 12  cbwd           174802 non-null  object        
 13  Iws            174790 non-null  float64       
 14  precipitation  167445 non-null  float64       
 15  

In [61]:
def prepareTrainTestSet():
    """Turn the raw data into scaled train/test feature matrices and targets.

    Steps, in order:
      1. Load the data with ``loadRawData()`` and replace known sentinel values by NaN.
      2. Forward/backward fill gaps in every column except ``meanPM``.
      3. Lag the meteorological columns by 24 rows and add ``meanPM`` lagged by
         24, 24*7, 24*30 and 24*365 rows.
      4. One-hot encode the categorical columns.
      5. Flag rows after 2015-01-01 as test set and derive ``isDangerous``
         (``meanPM > 150``, computed before scaling).
      6. Standardise the numerical columns with a scaler fitted on the training rows only.
      7. Drop every row that still contains a NaN, then drop ``date`` and ``year``.

    Filling and lagging are done separately for each ``source`` so that values never
    leak from one city into the next one in the concatenated frame. The row-based
    lags assume that each source's rows are in chronological hourly order without
    gaps -- this is not checked here.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, y_train_class, y_test_class)`` where
        ``X_*`` hold all feature columns, ``y_*`` the standardised ``meanPM`` and
        ``y_*_class`` the 0/1 ``isDangerous`` flag.
    """
    df = loadRawData()

    ### Replace incorrect values with NaN ###
    df.DEWP          = df.DEWP.replace([-9999, -97], np.nan)
    df.HUMI          = df.HUMI.replace(-9999, np.nan)
    df.precipitation = df.precipitation.replace(999990, np.nan)


    # Grouping key used for every per-city operation below ('source' itself is
    # replaced by dummy columns later on, so keep a handle on it here)
    sourceKey = df['source']


    ### Fill missing values in independent variables ###
    # 'date' and 'source' never contain NaN, 'meanPM' is the target and must not be filled
    colsToFill = [col for col in df.columns if col not in ('meanPM', 'source', 'date')]

    # Missing values in the independent variables are rare, so they are just filled with
    # the previous value (or the next one at the very start) of the same city
    df[colsToFill] = df.groupby(sourceKey)[colsToFill].ffill()
    df[colsToFill] = df.groupby(sourceKey)[colsToFill].bfill()


    ### Lagging the variables ###
    independentCols = ['DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation']
    # 24 hours lag, shifted within each city so a city's first rows do not
    # receive the last observations of the previous city
    df[independentCols] = df.groupby(sourceKey)[independentCols].shift(24)

    lagHours = (('24h', 24), ('7d', 24 * 7), ('30d', 24 * 30), ('365d', 24 * 365))
    for suffix, hours in lagHours:
        df[f'meanPM_{suffix}'] = df.groupby(sourceKey)['meanPM'].shift(hours)


    ### Convert categorical to dummies ###
    catCols = ['source', 'month', 'day', 'hour', 'season', 'cbwd', 'dayOfWeek']
    parts = [df.drop(catCols, axis = 1)]
    parts.extend(pd.get_dummies(df[col], prefix = col) for col in catCols)
    df = pd.concat(parts, axis = 1)


    ### Designate last year (~20%) of the data as test set ###
    df['isTestSet'] = (df.date > datetime(2015, 1, 1)).astype(int)


    ### Target col for classification ###
    # Must be derived before scaling: the threshold is expressed in raw meanPM units
    df['isDangerous'] = (df.meanPM > 150).astype(int)


    ### Scale the numerical columns ###
    numCols = ['meanPM', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation', 'meanPM_24h', 'meanPM_7d', 'meanPM_30d', 'meanPM_365d']

    # Fit on the training rows only so no test-set statistics leak into the scaling
    scaler = StandardScaler()
    scaler.fit(df.loc[df.isTestSet == 0, numCols])

    df[numCols] = scaler.transform(df[numCols])


    ### Drop rows with NaN values ###
    # 25% of the dataset is dropped. This is just a quick analysis so it's ok
    # In a production model, the missing values should be investigated and filled with more sophisticated methods
    df = df.dropna(axis = 0, how = 'any').reset_index(drop = True)


    ### Drop unnecessary columns ###
    df = df.drop(['date', 'year'], axis = 1)

    independentCols = [col for col in df.columns if col not in ['meanPM', 'isTestSet', 'isDangerous']]

    isTrain = df.isTestSet == 0
    isTest  = df.isTestSet == 1

    X_train = df.loc[isTrain, independentCols]
    X_test  = df.loc[isTest, independentCols]

    y_train = df.loc[isTrain, 'meanPM']
    y_test  = df.loc[isTest, 'meanPM']

    y_train_class = df.loc[isTrain, 'isDangerous']
    y_test_class  = df.loc[isTest, 'isDangerous']

    return X_train, X_test, y_train, y_test, y_train_class, y_test_class

In [62]:
# Build the feature matrices plus the regression (meanPM) and classification (isDangerous) targets
X_train, X_test, y_train, y_test, y_train_class, y_test_class = prepareTrainTestSet()

In [None]:
    # sklearn pipeline did not work well
    
    #numCols = ['DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation', 'meanPM_24h', 'meanPM_7d', 'meanPM_30d', 'meanPM_365d']
    #catCols = ['source', 'month', 'day', 'hour', 'season', 'cbwd', 'dayOfWeek']
    #dropCols  = ['date', 'year']
    #passCols  = ['meanPM']
    #
    #
    #fullPipeline = ColumnTransformer([
    #    ('target', 'passthrough', passCols),
    #    ('num', StandardScaler(), numCols),
    #    ('cat', OneHotEncoder(), catCols),
    #    ('drop', 'drop', dropCols)
    #]) 
    #
    #df = pd.DataFrame(fullPipeline.fit_transform(df).todense())
    #
    ##df.columns = passCols + numCols + fullPipeline.named_transformers_['cat'].get_feature_names(catCols)

# Modeling