In [2]:
# ############################=> BEGIN IMPORT SECTION <=#############################
import os
import pandas as pd
import numpy as np
import random
from sklearn import preprocessing
from collections import deque
import time
import math
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard, ModelCheckpoint


# ############################=> END IMPORT SECTION <=#############################


# ############################=> BEGIN VARIABLE SECTION <=#############################

# creating variable, to keep prediction parameters
# using sixty seconds data to predict
SEQ_LEN = 60
# three minutes period for prediction
FUTURE_PERIOD_PREDICT = 3
# the type of data we trying to predict
STREET_TO_PREDICT = "georgia"
EPOCHS = 20
BATCH_SIZE = 100
# Name of the model
NAME = f"{STREET_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

# ############################=> END VARIABLE SECTION <=#############################

# ############################=> BEGIN FUNCTION SECTION <=#############################
def classify(current_co2_amount, future_co2_amount):
    if float(future_co2_amount) > float(current_co2_amount):
        return 1
    else:
        return 0


def preprocess_df(df):
    # drop the future columns
    df = df.drop(f"future_pollution_{STREET_TO_PREDICT}", 1)

    for col in df.columns:
        if col != "target":
            # normalize the data, because each columns has different value for different type of data
            df[col] = df[col].pct_change()
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(pd.notnull(df[col].values))
    df.dropna(inplace=True)
    sequential_data = []
    # it holds data for 60 seconds and popout the old item after this time
    prev_days = deque(maxlen=SEQ_LEN)

    for i in df.values:
        # not include the last item which is target
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])
    random.shuffle(sequential_data)

    # balance the data into higherCO2 and  lowerCO2

    higherPollution = []
    lowerPollution = []

    for seq, target in sequential_data:
        if target == 0:
            lowerPollution.append([seq, target])
        elif target == 1:
            higherPollution.append([seq, target])

    random.shuffle(higherPollution)
    random.shuffle(lowerPollution)

    lower = min(len(higherPollution), len(lowerPollution))

    higherPollution = higherPollution[:lower]
    lowerPollution = lowerPollution[:lower]

    sequential_data = higherPollution + lowerPollution
    random.shuffle(sequential_data)

    X = []
    y = []

    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)

    return np.array(X), y

# ############################=> END FUNCTION SECTION <=#############################

data_frame = pd.DataFrame()
# reading the data
fileNames = ["pollution", "traffic"]
for fileName in fileNames:
    dataset_name = f"{fileName}.csv"
    df = pd.read_csv(dataset_name, sep=";")
    # converting the Time columns values to time object
    df.Time = pd.to_datetime(df.Time)
    # setting time as index
    df.set_index("Time", inplace=True)

    # combing the data into the data_frame
    if len(data_frame) == 0:
        data_frame = df
    else:
        data_frame = data_frame.join(df)



print(data_frame.head())

print(data_frame.columns)


# combing the average of the two Cam
data_frame[f"totalPopulation_{STREET_TO_PREDICT}"] = data_frame[[f"mlk-{STREET_TO_PREDICT}-cam-3", f"mlk-{STREET_TO_PREDICT}-cam-1"]].sum(axis=1)

data_frame = data_frame[[f"totalPopulation_{STREET_TO_PREDICT}", f"mlk-{STREET_TO_PREDICT}"]]

# if there are gaps in data, use previously known values, take look at this
data_frame.fillna(method="ffill", inplace=True)
data_frame.dropna(inplace=True)

data_frame[f"future_pollution_{STREET_TO_PREDICT}"] = data_frame[f"mlk-{STREET_TO_PREDICT}"].shift(-FUTURE_PERIOD_PREDICT)


print(data_frame.head(100))
print(data_frame.columns)
# renaming f"mlk-{STREET_TO_PREDICT}" to  current pollution
data_frame.rename(columns={f"mlk-{STREET_TO_PREDICT}": f"current_pollution_{STREET_TO_PREDICT}"}, inplace=True)
print(data_frame[[f"current_pollution_{STREET_TO_PREDICT}"]].head(20))
print(data_frame[[f"future_pollution_{STREET_TO_PREDICT}"]].head())

print(data_frame.columns)
# creating the target data, by making a list of labels
data_frame["target"] = list(map(classify, data_frame[f"current_pollution_{STREET_TO_PREDICT}"], data_frame[f"future_pollution_{STREET_TO_PREDICT}"]))

print(data_frame.head(100))

# creating training data
times = sorted(data_frame.index.values)

# getting the last 35% of time data
last_35pct = times[-int(0.35 * len(times))]
print(last_35pct)

# creating validation data that are greater than last 5%
validation_data_df = data_frame[(data_frame.index >= last_35pct)]
# creating training data
training_data_df = data_frame[(data_frame.index < last_35pct)]

#preprocess_df(training_data_df)
train_x, train_y = preprocess_df(training_data_df)
test_x, test_y = preprocess_df(validation_data_df)

print(validation_data_df.head())

print(f"train dta : {len(train_x)}  validation : {len(test_x)}")
print(f"low pollution: {train_y.count(0)}, high: {train_y.count(1)}")
print(f"VALIDATION low pollution : { test_y.count(0)}, high : {test_y.count(1)}")



model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# Dense layer
model.add(LSTM(128, input_shape=(train_x.shape[1:])))
model.add(Dropout(0.2))
model.add(BatchNormalization())


model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))

opt = keras.optimizers.Adam(lr=0.001, decay=1e-6)
# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
 )

#plot_model(model, to_file="model.png")

# TensorBoard callback, to see the train data graph  command : board --logdir=logs
tensorboard = TensorBoard(log_dir=f"logs/{NAME}")

# unique file name that will include the epoch and the validation acc for that epoch
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"
# saves only the best ones
checkpoint = ModelCheckpoint("models/{}.model".format(filepath,
                            monitor='val_acc', verbose=1, save_best_only=True, mode='max'))


# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(test_x, test_y),
    callbacks=[tensorboard, checkpoint],
)

# Score model
score = model.evaluate(test_x, test_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))



                     mlk-peeples  mlk-magnolia  mlk-lindsay  mlk-houston  \
Time                                                                       
2019-03-25 20:50:00     8.572857      7.981429     9.332500     8.295625   
2019-03-25 21:00:00     7.873750      9.218125    10.310000     9.184286   
2019-03-25 21:10:00     8.982143      9.258333    11.130625     7.869375   
2019-03-25 21:20:00     8.585000     10.703125    10.182143     8.082143   
2019-03-25 21:30:00    10.153571     10.172857    11.306250     9.338750   

                     mlk-georgia  mlk-douglas  mlk-central  mlk-georgia-cam-3  \
Time                                                                            
2019-03-25 20:50:00          NaN     7.454375     7.227143                  0   
2019-03-25 21:00:00          NaN     8.806429     9.318750                  0   
2019-03-25 21:10:00     6.954375     7.821875    10.832143                  0   
2019-03-25 21:20:00     7.317500     8.599286     9.556875    



                     totalPopulation_georgia  current_pollution_georgia  \
Time                                                                      
2019-03-30 10:10:00                      446                   6.757857   
2019-03-30 10:20:00                      504                   6.370000   
2019-03-30 10:30:00                      488                   5.367143   
2019-03-30 10:40:00                      575                   5.643750   
2019-03-30 10:50:00                      548                   4.372500   

                     future_pollution_georgia  target  
Time                                                   
2019-03-30 10:10:00                  5.643750       0  
2019-03-30 10:20:00                  4.372500       0  
2019-03-30 10:30:00                  3.865000       0  
2019-03-30 10:40:00                  2.433333       0  
2019-03-30 10:50:00                  2.830000       0  
train dta : 398  validation : 210
low pollution: 199, high: 199
VALIDATION low pol

In [3]:
data_frame.head(100)

Unnamed: 0_level_0,totalPopulation_georgia,current_pollution_georgia,future_pollution_georgia,target
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-25 21:10:00,0,6.954375,8.420833,1
2019-03-25 21:20:00,0,7.317500,7.682000,1
2019-03-25 21:30:00,0,8.275714,8.727143,1
2019-03-25 21:40:00,0,8.420833,8.305000,0
2019-03-25 21:50:00,0,7.682000,8.301250,1
2019-03-25 22:00:00,0,8.727143,8.136667,0
2019-03-25 22:10:00,0,8.305000,8.190000,0
2019-03-25 22:20:00,0,8.301250,7.975833,0
2019-03-25 22:30:00,0,8.136667,8.159167,1
2019-03-25 22:40:00,0,8.190000,8.788750,1
