In [1]:
from sklearn.preprocessing import *
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
class DataHolder(pd.DataFrame):
    """
    """
    
    def __init__(self, path, multivariate=True):
        """
        """
        print(2)
        super().__init__(data=pd.read_csv(path, index_col=0))
        print(3)
        self.X_transformer = None
        self.y_transformer = None
        self.multivariate = multivariate
        
    def handle_na(self, data):
        """
        Handles missing values
        """

        # Interpolate NA's where possible, keeping sequence in mind
        data.interpolate(limit_direction='forward', limit_area='inside', inplace=True)

        # Drop rest of rows with NA's
        data.dropna(inplace=True)
        
        X = data.drop(TARGET, axis=1)
        y = data[TARGET]
        
        return X, y
            
        
    def preprocess(self, val=True):
        """
        Function to take care of preprocessing steps
        """
        
        # Parse time data and split features and target
        self['date'] = pd.to_datetime(self[TIME_COLS])
        self.drop(TIME_COLS, axis=1, inplace=True)
        self.set_index('date', inplace=True)
        
        X = self.drop(TARGET, axis=1)
        y = self[TARGET]
        
        # First split data before doing anything else
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
        X_val, y_val = None, None
        if val:
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2/0.8, shuffle=False)
            X_val, y_val = self.handle_na(pd.concat([X_val, y_val], axis=1))

        # Impute NA's and delete rest of NA's
        X_train, y_train = self.handle_na(pd.concat([X_train, y_train], axis=1))
        X_test, y_test = self.handle_na(pd.concat([X_test, y_test], axis=1))
                
        # Normalize numerical values and one hot encode categorical values
        self.X_transformer = make_column_transformer(
           (StandardScaler(),
            make_column_selector(dtype_include=np.number)),  
           (OneHotEncoder(),
            make_column_selector(dtype_include=object))
        )
        self.y_transformer = MinMaxScaler()
        
        # Fit scalers on train data and transform rest of data as well
        X_train = self.X_transformer.fit_transform(X_train)
        X_val = self.X_transformer.transform(X_val)
        X_test = self.X_transformer.transform(X_test)
        
        y_train = self.y_transformer.fit_transform(np.array(y_train).reshape(-1,1))
        y_val = self.y_transformer.transform(np.array(y_val).reshape(-1,1))
        y_test = self.y_transformer.transform(np.array(y_test).reshape(-1,1))
        
        # If univariate, X = y
        if not self.multivariate:
            X_train = y_train
            X_val = y_val
            X_test = y_test
        
        return X_train, X_val, X_test, y_train, y_val, y_test

In [10]:
"""
Cell with hyperparameters and arguments
The contents of this cell will have to be specified in the command line at the end
"""

DATA_PATH = 'PM_data.csv'
TARGET = 'pm2.5'
TIME_COLS = ['year', 'month', 'day', 'hour']
MULTIVARIATE = False

In [11]:
df = DataHolder(DATA_PATH, MULTIVARIATE)

X_train, X_val, X_test, y_train, y_val, y_test = df.preprocess()

2
3
