In [1]:
import ast
import json
import random

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from keras.layers import Dense, Activation, LSTM
from keras.models import Sequential
# from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.utils import class_weight
random.seed(1234)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Hyperparameters

# Input geometry: every sample is a sequence of `time_steps` states,
# each state carrying `data_dim` values.
time_steps, data_dim = 61, 1

# Optimisation settings
batch_size = 8
num_epoch = 30

# Fractions held out for testing and for validation during fitting
test_split = 0.1
validation_split = 0.1

In [3]:
# Load the dataset into a dictionary.
# The file is parsed as JSON first; ast.literal_eval is kept only as a fallback
# for files that were written as a Python dict literal (e.g. single quotes),
# because literal_eval cannot read JSON's true/false/null.
with open('baseline_data.json') as f:
    data = f.read()

try:
    data_dict_ = json.loads(data)
except ValueError:  # json.JSONDecodeError is a ValueError subclass
    data_dict_ = ast.literal_eval(data)

In [4]:
# Remove every sample whose glance sequence does not have exactly `time_steps` states.
# The ids are taken from the data itself (as a sorted snapshot) instead of a
# hard-coded range, so the cell works for any dataset size and can be re-run
# after samples have already been removed.
sample_ids = sorted(data_dict_['glance_sequence'], key=int)
for sample_id in sample_ids:
    if len(data_dict_['glance_sequence'][sample_id]) != time_steps:
        print(sample_id)
        # Drop the sample from every column so all columns keep the same ids
        for key in data_dict_.keys():
            data_dict_[key].pop(sample_id, None)

289
506
909
1203
1510
1788
2242
2505
2603
3426
3595
3674
3799
3853
4671


In [5]:
# Split every column of the dataset into a test part and a train part.
# Samples are visited in ascending id order; the first `test_split` fraction
# (smallest ids) becomes the test set, the rest the training set.
sample_ids = sorted(map(int, data_dict_['id'].keys()))

# Threshold derived from the actual number of remaining samples
# (previously a hard-coded count).
num_test = test_split * len(sample_ids)

data_dict_train = {key: [] for key in data_dict_}
data_dict_test = {key: [] for key in data_dict_}

for position, sample_id in enumerate(sample_ids):
    target = data_dict_test if position < num_test else data_dict_train
    # The inner dicts are keyed by the string form of the id
    for key in data_dict_:
        target[key].append(data_dict_[key][str(sample_id)])

In [6]:
# Features are the glance sequences, labels the intersection-proximity flag.
x_train, y_train = (
    data_dict_train['glance_sequence'],
    data_dict_train['proximity_to_an_intersection'],
)
x_test, y_test = (
    data_dict_test['glance_sequence'],
    data_dict_test['proximity_to_an_intersection'],
)

In [8]:
def balanceDataset(x_data, y_data, num_of_class):
    '''
    Balance a dataset by randomly oversampling the smaller classes.

    Every non-empty class is topped up with randomly chosen duplicates of its
    own samples until it is as large as the largest class. The combined data
    is then shuffled. The input sequences are not modified.

    @args:
        x_data: sequence of samples (glance sequences)
        y_data: sequence of integer labels with values from 0 to num_of_class - 1
        num_of_class: num of types of label
    @returns:
        (x_balanced, y_balanced): two tuples of equal length, shuffled in unison.
        Two empty tuples are returned for empty input.
    @raises:
        ValueError: if x_data and y_data differ in length
    Randomness comes from the global `random` module; seed it for reproducibility.
    '''
    # Work on copies so the caller's lists are left untouched
    x_balanced = list(x_data)
    y_balanced = list(y_data)

    if len(x_balanced) != len(y_balanced):
        raise ValueError('x_data and y_data must have the same length')
    if not y_balanced:
        return (), ()

    # One bucket of samples / labels per class
    x_by_class = [[] for _ in range(num_of_class)]
    y_by_class = [[] for _ in range(num_of_class)]
    for sample, label in zip(x_balanced, y_balanced):
        x_by_class[label].append(sample)
        y_by_class[label].append(label)

    # Every class is grown to the size of the largest one
    largest = max(len(labels) for labels in y_by_class)

    for samples, labels in zip(x_by_class, y_by_class):
        if not labels:
            # A class without any sample has nothing to duplicate
            continue
        for _ in range(largest - len(labels)):
            pick = random.randint(0, len(labels) - 1)
            x_balanced.append(samples[pick])
            y_balanced.append(labels[pick])

    # Shuffle samples and labels together; zip() is materialised first because
    # random.shuffle needs a mutable sequence (zip is an iterator on Python 3)
    paired = list(zip(x_balanced, y_balanced))
    random.shuffle(paired)
    x_balanced, y_balanced = zip(*paired)

    return x_balanced, y_balanced

In [9]:
# Oversample the training split so both of its 2 classes have the same size.
# x_train / y_train are overwritten here, so re-run the cells above before
# re-running this one.
# NOTE(review): the duplicates are created before model.fit's validation_split
# is taken, so copies of a training sample may also land in the validation
# portion - confirm this is acceptable.
x_train,y_train = balanceDataset(x_train,y_train,2)
# The test split is not rebalanced (the equivalent call for it is commented out):
# x_test,y_test = balanceDataset(x_test,y_test,2)

In [19]:
# Convert to numpy arrays. The sequences are reshaped to
# (samples, time_steps, data_dim), the input_shape the LSTM layer is built with.
x_train_np = np.array(x_train).reshape(-1, time_steps, data_dim)
y_train_np = np.array(y_train)
x_test_np = np.array(x_test).reshape(-1, time_steps, data_dim)
y_test_np = np.array(y_test)

In [24]:
def train(x_train_np, y_train_np, x_test_np, y_test_np, loss, metrics, balance_weight):
    '''
    Build a single-layer LSTM binary classifier, fit it and return it.

    @args:
        x_train_np: train glance sequence (numpy array), fed to an LSTM built
            with input_shape=(time_steps, data_dim)
        y_train_np: train label (numpy array)
        x_test_np: test glance sequence (numpy array); not used during
            training, kept so existing calls keep working
        y_test_np: test label (numpy array); not used during training
        loss: loss function passed to model.compile
        metrics: a metric (name or function) or a list of them, passed to
            model.compile; the first one, prefixed with 'val_', is the
            quantity monitored for early stopping
        balance_weight: if truthy, fit with scikit-learn's 'balanced' class
            weights computed from y_train_np; otherwise no class weighting
    @returns:
        the fitted keras Sequential model
    Reads the notebook-level hyperparameters time_steps, data_dim, batch_size,
    num_epoch and validation_split.
    '''
    # Accept a single metric as well as a list of metrics
    if not isinstance(metrics, (list, tuple)):
        metrics = [metrics]
    metrics = list(metrics)

    adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.001, amsgrad=False)

    model = Sequential()
    model.add(LSTM(128, dropout=0.0, recurrent_dropout=0.0, input_shape=(time_steps, data_dim)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss=loss,
                  optimizer=adam,
                  metrics=metrics)

    print('Train...')
    if balance_weight:
        classes = np.unique(y_train_np)
        # Keyword arguments work on old and current scikit-learn releases
        cw = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train_np)
        # Key the weights by the actual label values rather than by position
        cw_dict = {int(label): weight for label, weight in zip(classes, cw)}
    else:
        cw_dict = None

    # Monitor the validation value of the first metric; fall back to the
    # validation loss when no metric was requested.
    # NOTE(review): assumes Keras logs the metric under its given name
    # (true for 'binary_accuracy'); aliases may be logged differently.
    if metrics:
        monitor = 'val_' + getattr(metrics[0], '__name__', str(metrics[0]))
    else:
        monitor = 'val_loss'
    early_stopping = keras.callbacks.EarlyStopping(monitor=monitor, min_delta=0, patience=3, verbose=0, mode='auto')

    model.fit(x_train_np, y_train_np,
              batch_size=batch_size,
              epochs=num_epoch,
              validation_split=validation_split,
              class_weight=cw_dict,
              callbacks=[early_stopping],
              verbose=2)

    return model

In [None]:
# Run training and bind train()'s return value to `model`, the notebook-level
# name that the evaluation cell reads.
model = train(x_train_np, y_train_np, x_test_np, y_test_np,
              loss='binary_crossentropy', metrics='binary_accuracy', balance_weight=False)

Train...
Train on 6651 samples, validate on 739 samples
Epoch 1/30
 - 37s - loss: 0.6909 - binary_accuracy: 0.5294 - val_loss: 0.6875 - val_binary_accuracy: 0.5359
Epoch 2/30
 - 36s - loss: 0.6830 - binary_accuracy: 0.5553 - val_loss: 0.6815 - val_binary_accuracy: 0.5115
Epoch 3/30
 - 37s - loss: 0.6753 - binary_accuracy: 0.5701 - val_loss: 0.6666 - val_binary_accuracy: 0.5954
Epoch 4/30


In [None]:
# NOTE(review): this cell needs a fitted `model` at notebook scope; it is not
# created in this cell.

# Predict on the reshaped test array (same layout as the training input) and
# threshold the sigmoid outputs at 0.5 to get hard class labels.
predictions_ = model.predict(x_test_np, batch_size=batch_size)
predictions = np.where(predictions_.ravel() < 0.5, 0, 1)

# labels=[0, 1] forces a 2x2 matrix even if one class is absent from the
# test labels or from the predictions, so it always matches the axis labels.
cm = confusion_matrix(y_test_np, predictions, labels=[0, 1])
axis_index = ['Non-junction', 'Junction']
axis_predicted = ['Predicted ' + a_i for a_i in axis_index]
df_cm = pd.DataFrame(cm, index=axis_index, columns=axis_predicted)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(df_cm, annot=True, cmap='Blues', fmt='g', ax=ax)
ax.set_title('Test-set confusion matrix')

# Accuracy = correctly classified samples (matrix diagonal) / all test samples
print("accuracy:" ,np.trace(cm)/float(len(y_test_np)))

score, acc = model.evaluate(x_test_np, y_test_np,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Show the 'balanced' class weights for the training labels.
# `cw_dict` is only assigned inside train() and is therefore not visible at
# notebook scope, so the weights are recomputed here for inspection.
cw_classes = np.unique(y_train_np)
cw_values = class_weight.compute_class_weight(class_weight='balanced', classes=cw_classes, y=y_train_np)
cw_dict = {int(label): weight for label, weight in zip(cw_classes, cw_values)}
cw_dict