In [None]:
import csv
import math
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

## 1. Read data

In [None]:
# Loading of the raw monthly sensor CSVs is disabled; the next cell reads the
# `Taiwan_2017MM_loc.csv` files instead.
# Taiwan_201701 = pd.read_csv('sinica/201701_Taiwan.csv')
# Taiwan_201702 = pd.read_csv('sinica/201702_Taiwan.csv')
# Taiwan_201703 = pd.read_csv('sinica/201703_Taiwan.csv')

# The raw headers carry a leading space (' lat', ' lon'); these renames strip it.
# Taiwan_201701 = Taiwan_201701.rename(columns={' lat': 'lat', ' lon': 'lon'})
# Taiwan_201702 = Taiwan_201702.rename(columns={' lat': 'lat', ' lon': 'lon'})

# Station table consulted by the location-clustering step of `preprocessing`.
epa_loc_data_201701 = pd.read_csv('epa/EPA_LOC_data_201701.csv')

In [None]:
# Read the three monthly frames (201701, 201702, 201703) in file order.
Taiwan_201701_loc, Taiwan_201702_loc, Taiwan_201703_loc = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Taiwan_201701_loc.csv',
        'Taiwan_201702_loc.csv',
        'Taiwan_201703_loc.csv',
    )
)

In [None]:
# Show the first five rows as a quick sanity check of the loaded columns.
Taiwan_201701_loc.head()

## 2. Preprocessing

In [None]:
def drop_outliers(df, n_std=2):
    """Remove rows whose readings lie outside mean +/- ``n_std`` standard deviations.

    The filters run in sequence -- PM10, PM1, Temperature, Humidity -- and each
    one derives its bounds from the rows that survived the previous filter.
    For PM10 and PM1, rows whose value is exactly 0 are removed as well.

    Rows are selected with boolean masks instead of being dropped by index
    label, so a frame with repeated index labels loses only the offending rows
    (dropping by label would also remove every other row sharing that label).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``PM10``, ``PM1``, ``Temperature`` and
        ``Humidity``. It is not modified.
    n_std : float, default 2
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The rows that passed every filter, keeping their original index labels.
        The shape before and after filtering is printed as a side effect.
    """

    def _drop_outside_band(frame, column, drop_zero=False):
        """Return the rows of ``frame`` whose ``column`` value is within the band."""
        values = frame[column]
        center = values.mean()
        spread = n_std * values.std()
        # Comparisons against NaN are False, so rows with missing values are kept.
        rejected = (values > center + spread) | (values < center - spread)
        if drop_zero:
            rejected = rejected | (values == 0)
        return frame[~rejected]

    before = df.shape

    # (column, also drop exact zeros?) -- order matters because each filter
    # recomputes mean/std on what the previous one left behind.
    filters = (
        ('PM10', True),
        ('PM1', True),
        ('Temperature', False),
        ('Humidity', False),
    )
    for column, drop_zero in filters:
        df = _drop_outside_band(df, column, drop_zero)

    after = df.shape

    print(before, ' -> ', after)

    return df

In [None]:
# Taiwan_201701_CI95 = drop_outliers(Taiwan_201701_loc)
# Taiwan_201702_CI95 = drop_outliers(Taiwan_201702_loc)
# Taiwan_201703_CI95 = drop_outliers(Taiwan_201703_loc)

In [None]:
def preprocessing(df, cluster_loc=False, epa_loc=None):
    """Normalise the sensor readings and tidy the columns of one monthly frame.

    Steps, in order:

    1. Rescale ``PM10``, ``PM1``, ``Temperature`` and ``Humidity`` so that
       ``mean - 2*std`` maps to 0 and ``mean + 2*std`` maps to 1.
    2. Join ``Date`` and ``Time`` into a single ``period`` string column and
       drop the two originals.
    3. Optionally (``cluster_loc=True``) add a ``loc`` column holding the name
       of the nearest station, by squared longitude/latitude difference.
    4. Drop ``device_id``, ``lat`` and ``lon``.

    The work is done on a copy, so the caller's frame is left untouched and
    running the function twice on the same input gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above.
    cluster_loc : bool, default False
        Run the nearest-station step. Off by default, matching the previous
        behaviour where that step was commented out.
    epa_loc : pandas.DataFrame, optional
        Station table whose first three columns are read positionally as
        name, longitude, latitude. Defaults to the notebook-level
        ``epa_loc_data_201701``.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed frame.

    Raises
    ------
    ValueError
        If ``cluster_loc`` is set and the station table has no rows.
    """

    def _normalization(df):
        for feature_name in ['PM10', 'PM1', 'Temperature', 'Humidity']:
            max_value = df[feature_name].mean() + 2 * df[feature_name].std()
            min_value = df[feature_name].mean() - 2 * df[feature_name].std()
            df[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
        print('normalization DONE!')
        return df

    def _concat_datetime(df):
        # Vectorised string concatenation instead of a row-wise apply.
        df['period'] = df['Date'] + ' ' + df['Time']
        df = df.drop(['Date', 'Time'], axis=1)
        print('concat_datetime DONE!')
        return df

    def _cluster_loc(df, stations):
        if len(stations) == 0:
            raise ValueError('station table is empty; cannot assign locations')

        # Pull the station columns out once rather than re-iterating the
        # station frame for every sensor row.
        station_names = stations.iloc[:, 0].tolist()
        station_lon = stations.iloc[:, 1].values.astype(float)
        station_lat = stations.iloc[:, 2].values.astype(float)

        # At least 1 so that frames with fewer than 100 rows do not divide by zero.
        display_steps = max(1, len(df) // 100)
        loc_list = []
        for counter, (lon, lat) in enumerate(zip(df['lon'], df['lat'])):
            # Squared distance to every station; the latitude term previously
            # compared ``lat`` with itself and therefore always contributed 0.
            distance = (station_lon - lon) ** 2 + (station_lat - lat) ** 2
            # argmin picks the first station on ties, like a strict ``>`` scan.
            # NOTE(review): rows with NaN coordinates are not special-cased here.
            loc_list.append(station_names[int(distance.argmin())])
            if counter % display_steps == 0:
                p = counter // display_steps
                print('[%s] %d/100' % (time.strftime("%H:%M:%S", time.localtime()), p))
        df['loc'] = loc_list
        print('cluster_loc DONE!')
        return df

    def _drop_redundant_features(df):
        df = df.drop(['device_id', 'lat', 'lon'], axis=1)
        print('drop_redundant_features DONE!')
        return df

    # Work on a private copy: _normalization assigns columns in place.
    df = df.copy()
    df = _normalization(df)
    df = _concat_datetime(df)
    if cluster_loc:
        stations = epa_loc_data_201701 if epa_loc is None else epa_loc
        df = _cluster_loc(df, stations)
    df = _drop_redundant_features(df)
    return df

In [None]:
# Run the preprocessing pipeline on each monthly frame, in month order.
Taiwan_201701_precd, Taiwan_201702_precd, Taiwan_201703_precd = (
    preprocessing(monthly_frame)
    for monthly_frame in (Taiwan_201701_loc, Taiwan_201702_loc, Taiwan_201703_loc)
)

## 3. Grouping

In [None]:
# Bucket every month's rows by their 'loc' value.
Taiwan_201701_group, Taiwan_201702_group, Taiwan_201703_group = (
    monthly_frame.groupby('loc')
    for monthly_frame in (Taiwan_201701_precd, Taiwan_201702_precd, Taiwan_201703_precd)
)

In [None]:
def get_largest_group(df):
    """Find the group with the most rows, print it, and return it.

    Parameters
    ----------
    df : iterable of (name, group) pairs
        Typically a pandas ``GroupBy`` object; anything yielding
        ``(name, sized_object)`` pairs works.

    Returns
    -------
    tuple
        ``(name, length)`` of the largest group. The first group wins on ties.
        ``('', 0)`` is returned when there are no groups or every group is
        empty. Earlier versions only printed this pair and returned ``None``.
    """
    max_name = ''
    max_len = 0
    for name, group in df:
        length = len(group)
        # Strict comparison keeps the earliest group when lengths are equal.
        if max_len < length:
            max_name = name
            max_len = length
    print(max_name, max_len)
    return max_name, max_len

In [None]:
# Training rows: January group keyed '臺中市南屯區' (Nantun District, Taichung City).
# Validation rows: February group keyed '臺南市中西區' (West Central District, Tainan City).
# NOTE(review): training and validation come from different districts as well
# as different months -- confirm this is intentional.
train_group = Taiwan_201701_group.get_group('臺中市南屯區')
valid_group = Taiwan_201702_group.get_group('臺南市中西區')

## 4. Preprocessing of groups

In [None]:
def preprocessing_datetime_group(df):
    """Split one location's frame into LSTM-ready inputs and targets.

    Every column except ``loc``, ``period`` and ``PM2.5`` becomes a feature.
    The feature matrix is reshaped to ``(rows, 1, features)``, i.e. one
    timestep per sample. ``PM2.5`` is returned as the target vector.

    Returns
    -------
    (X, Y) : tuple of numpy.ndarray
        ``X`` has shape ``(len(df), 1, n_features)``; ``Y`` has shape ``(len(df),)``.
    """

    def _convert_datetime_to_norm_sec(df):
        # Currently unused (see the disabled call below): turns 'period' into a
        # min-max scaled 'seconds' column based on local-time epoch seconds.
        df['period'] = pd.to_datetime(df['period'])
        df['seconds'] = [time.mktime(stamp.timetuple()) for stamp in df.period]
        latest = df['seconds'].max()
        earliest = df['seconds'].min()
        df['seconds'] = (df['seconds'] - earliest) / (latest - earliest)
        return df

#     df = _convert_datetime_to_norm_sec(df)

    # Features first, then targets -- both are plain numpy arrays.
    feature_matrix = df.drop(['loc', 'period', 'PM2.5'], axis=1).values
    targets = df['PM2.5'].values

    # Insert a length-1 timestep axis between samples and features.
    n_samples = feature_matrix.shape[0]
    n_features = feature_matrix.shape[-1]
    sequence_input = feature_matrix.reshape(n_samples, 1, n_features)

    return sequence_input, targets

In [None]:
# Turn the selected groups into (samples, 1, features) inputs and PM2.5 targets.
trainX, trainY = preprocessing_datetime_group(train_group)
validX, validY = preprocessing_datetime_group(valid_group)

In [None]:
# Report the array shapes: train inputs, train targets, valid inputs, valid targets.
print(*(array.shape for array in (trainX, trainY, validX, validY)))

## 5. Build model

In [None]:
def LSTM_PM25(trainX, trainY, validX, validY, output_dim, epoch, batch_size):
    """Build, compile and fit a one-layer LSTM regressor, then return it.

    Architecture: ``LSTM(output_dim)`` followed by a single-unit ``Dense``
    layer with ReLU activation and uniform kernel initialisation. The model is
    compiled with MSE loss and the Adam optimiser, and fitted without
    shuffling so samples are fed in their given order.

    Parameters
    ----------
    trainX, validX : array of shape (samples, timesteps, features)
        The input shape is taken from ``trainX``.
    trainY, validY : array
        Regression targets for the corresponding inputs.
    output_dim : int
        Number of LSTM units.
    epoch : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.

    Returns
    -------
    The fitted Keras ``Sequential`` model.
    """
    timesteps, n_features = trainX.shape[1], trainX.shape[2]

    network = Sequential([
        LSTM(output_dim, input_shape=(timesteps, n_features)),
        Dense(units=1, kernel_initializer='uniform', activation='relu'),
    ])
    network.compile(loss='mse', optimizer='adam')

    network.fit(
        trainX,
        trainY,
        epochs=epoch,
        batch_size=batch_size,
        validation_data=(validX, validY),
        verbose=1,
        shuffle=False,
    )
    return network

In [None]:
# Baseline run: 50 LSTM units, 10 epochs, batch size 100.
model = LSTM_PM25(trainX, trainY, validX, validY, 50, 10, 100)

In [None]:
# Epoch sweep, part 1: same 50-unit / batch-100 setup trained for 15, 20 and 30 epochs.
model15, model20, model30 = (
    LSTM_PM25(trainX, trainY, validX, validY, 50, n_epochs, 100)
    for n_epochs in (15, 20, 30)
)

In [None]:
# Epoch sweep, part 2: same 50-unit / batch-100 setup trained for 40, 50 and 60 epochs.
model40, model50, model60 = (
    LSTM_PM25(trainX, trainY, validX, validY, 50, n_epochs, 100)
    for n_epochs in (40, 50, 60)
)

## 6. Predict 201703 PM2.5

In [None]:
def predict_201703_PM25(model, precd=None, groups=None):
    """Predict PM2.5 for every row of the preprocessed frame, group by group.

    Each group is stripped of ``PM2.5``, ``loc`` and ``period``, reshaped to
    ``(rows, 1, features)`` and passed to ``model.predict``. Predictions are
    written back to the row's *position* in ``precd``, so the frame does not
    need a default 0..n-1 index. Rows that belong to no group stay NaN.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        ``predict`` must return an array-like of shape ``(rows, 1)`` (only
        column 0 is used).
    precd : pandas.DataFrame, optional
        Frame whose rows are being predicted. Defaults to the notebook-level
        ``Taiwan_201703_precd``. Its index must be unique.
    groups : iterable of (name, DataFrame) with ``len()``, optional
        Groups of ``precd``, normally ``precd.groupby('loc')``. Defaults to
        the notebook-level ``Taiwan_201703_group``.

    Returns
    -------
    pandas.DataFrame
        One ``PM2.5`` column, rounded to 0 decimals, one row per row of
        ``precd``, in ``precd`` order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If a group contains index labels that are absent from ``precd``.
    """
    if precd is None:
        precd = Taiwan_201703_precd
    if groups is None:
        groups = Taiwan_201703_group

    predictions = np.full(len(precd), np.nan)
    total = len(groups)

    for counter, (name, group) in enumerate(groups, start=1):
        # Translate the group's index labels into row positions of ``precd``.
        positions = precd.index.get_indexer(group.index)
        if (positions < 0).any():
            raise ValueError('group %r has rows that are not present in the frame' % (name,))

        testX = group.drop(['PM2.5', 'loc', 'period'], axis=1).values
        testX = testX.reshape(testX.shape[0], 1, testX.shape[-1])
        predictions[positions] = np.asarray(model.predict(testX))[:, 0]

        print('%s Finished! %d/%d' % (name, counter, total))

    # Build the output once, after all groups are done (it used to be rebuilt
    # on every iteration and was undefined when there were no groups).
    result_df = pd.DataFrame(predictions, columns=["PM2.5"])
    result_df = result_df.round(0)

    return result_df

In [None]:
# Predictions from the 15-, 20- and 30-epoch models, in that order.
result15, result20, result30 = (
    predict_201703_PM25(fitted_model)
    for fitted_model in (model15, model20, model30)
)

In [None]:
# Predictions from the 40-, 50- and 60-epoch models, in that order.
result40, result50, result60 = (
    predict_201703_PM25(fitted_model)
    for fitted_model in (model40, model50, model60)
)

In [None]:
# Write one CSV per model; the header row is emitted by default, the index is omitted.
result15.to_csv('output_epoch15.csv', index=False)
result20.to_csv('output_epoch20.csv', index=False)
result30.to_csv('output_epoch30.csv', index=False)

In [None]:
# Write one CSV per model; the header row is emitted by default, the index is omitted.
result40.to_csv('output_epoch40.csv', index=False)
result50.to_csv('output_epoch50.csv', index=False)
result60.to_csv('output_epoch60.csv', index=False)