In [1]:
from glob import glob
import os, sys
from os.path import join, dirname

import datetime, time
import csv
from glob import glob
import chardet
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, Normalizer, MaxAbsScaler, MinMaxScaler

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.python.client import device_lib
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Flatten, Dense, Conv2D, SimpleRNN, LSTM, GRU, Reshape, RepeatVector, MaxPooling2D, Dropout, Bidirectional, Attention, BatchNormalization
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.optimizers import Adadelta, RMSprop,SGD,Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import model_to_dot

import imblearn

from IPython.display import SVG

import matplotlib.pyplot as plt
%matplotlib inline

# print(tf.__version__)
# device_lib.list_local_devices()

# Global Parameters

In [2]:
# Run configuration read by the cells below.

# Width of the one-hot label encoding and of the model's output layer when
# bi_class == 0 (multi-class mode). The trailing comment gives the class map.
# NOTE(review): the class map lists 8 categories but the value is 0, and
# np.eye(0) is an empty matrix -- set this before switching to multi-class mode.
num_classes = 0                           # {"0" : "Playing", "1" : "Talking", "2" : "Petting", "3" : "TV / Radio", "4" : "Eating / Cooking", "5" : "Moved It", "6" : "None of the above", "7" : "Other"}
# Not referenced by any cell visible in this notebook section.
time_offset = 10
# Number of consecutive rows per window handed to X_preprocess.
window_size = 50 ## change 10 to 50
# X_preprocess advances by window_size * overlap_ratio rows between windows.
overlap_ratio = 0.5
# 0 keeps the multi-class target; k > 0 relabels class (k - 1) as 1 and every
# other class as 0.
bi_class = 1                              # Binary Classification (1 : Playing or not, 2 : Talking or not, 3 : Petting or not, 4: TV / Radio or not, 5 : Eating / Cooking or not, 6 : Moved It or not)
# Not referenced by any cell visible in this notebook section.
cross_val = 0
# Not referenced by any cell visible in this notebook section (the
# train_test_split and SMOTE calls below do not receive a random state).
rand_st=1
# 0 uses every RowID together; any other value first splits RowIDs into
# Korea (first character '1') and US (everything else).
mode = 0                                 # Split data {0: Didn't split, 1: US only, 2: Korea only, 3: train with US and test with Korea 4: train with Korea and test with US}

# Load Data

In [4]:
# CSV path, relative to the notebook's working directory.
data_fname = '../Data/Preprocessed(new)/preprocessed_data(New collar_2).csv'
# Load the table as a DataFrame. Later cells read the columns 'RowID',
# 'sound category', 'orientation_cat', 'audioLevel', 'ir', 'Modality_cat'
# and the remaining names listed in feature_col_name.
data = pd.read_csv(data_fname)

# Preprocess

In [5]:
# Append indicator (one-hot) columns for the two categorical fields:
# the 'sound category' indicators first, then the 'orientation_cat' ones.
sound_onehot = pd.get_dummies(data['sound category'])
orientation_onehot = pd.get_dummies(data['orientation_cat'])
data = pd.concat([data, sound_onehot, orientation_onehot], axis=1)

In [6]:
# Create rowID list
# Unique RowID values in order of first appearance; later cells build one
# sample (feature matrix + label) per RowID.
rowID_list = np.array(data['RowID'].drop_duplicates())
# Convert the DataFrame to a NumPy record array so fields can be selected by
# name with plain NumPy indexing. From here on `data` is no longer a DataFrame,
# so this cell and the cells above cannot be re-run without reloading the CSV.
data = data.to_records(index=False)

In [7]:
# Standardize the 'audioLevel' and 'ir' fields in place. The scaler is re-fit
# for each column on the full dataset (i.e. before any train/test split).
# NOTE(review): writing into a record-array field keeps that field's dtype;
# if a column was read as an integer type the scaled values would be cast
# back to integers -- confirm both columns are floating point.
scaler = StandardScaler()
for column in ('audioLevel', 'ir'):
    column_2d = data[column].reshape(-1, 1)  # scaler expects (n_samples, n_features)
    data[column] = scaler.fit_transform(column_2d).ravel()

In [8]:
# Partition the RowIDs by collection site. A RowID whose first character is
# '1' is treated as Korea; every other RowID is treated as US. Both lists stay
# empty when mode == 0.
us_rowIDs = []
korea_rowIDs = []

if mode != 0:
    korea_rowIDs = [row_id for row_id in rowID_list if row_id[0] == '1']
    us_rowIDs = [row_id for row_id in rowID_list if row_id[0] != '1']

In [9]:
# Record-array fields used as model inputs, in the order they appear along the
# last axis of every window.
feature_col_name = [
    # columns read directly from the CSV
    'accX', 'accY', 'accZ',
    'chord', 'ir', 'full', 'iaq', 'pressure', 'gasResistance',
    'temperature', 'humidity', 'staticIaq',
    'co2Equivalent', 'breathVocEquivalent', 'breathVocAccuracy',
    'audioLevel',
    # indicator columns -- presumably the ones produced by one-hot encoding
    # 'sound category'; verify against the CSV values
    'Loud', 'Moderate', 'Quiet',
    # indicator columns -- presumably the ones produced by one-hot encoding
    # 'orientation_cat'; verify against the CSV values
    'Landscape Left Back', 'Landscape Left Front',
    'Landscape Right Back', 'Landscape Right Front',
    'Portrait Down Back', 'Portrait Down Front',
    'Portrait Up Back', 'Portrait Up Front',
]

# Field holding the class label; one value is read per RowID.
target_col_name = ['Modality_cat']

In [10]:
def _collect_sessions(row_ids):
    """Build one feature matrix and one label per RowID.

    For every id in ``row_ids`` the matching records of ``data`` are selected;
    the ``feature_col_name`` fields become a 2-D array (rows x features) and
    the label is the ``target_col_name`` field of the first matching record.

    Returns:
        (features, targets): two lists aligned with ``row_ids``.
    """
    features, targets = [], []
    for row_id in row_ids:
        session = data[data['RowID'] == row_id]
        features.append(np.array(session[feature_col_name].tolist()))
        # first record, first (only) target field
        targets.append(np.array(session[target_col_name][0][0].tolist()))
    return features, targets


# All six containers exist in every mode; only the ones relevant to the
# current mode are filled.
X, Y = [], []
us_X, us_Y = [], []
korea_X, korea_Y = [], []

if mode != 0:
    us_X, us_Y = _collect_sessions(us_rowIDs)
    korea_X, korea_Y = _collect_sessions(korea_rowIDs)
else:
    X, Y = _collect_sessions(rowID_list)
if bi_class != 0:
    # Collapse the multi-class label into one-vs-rest: class (bi_class - 1)
    # becomes 1, every other class becomes 0.
    positive_class = bi_class - 1
    if mode != 0:
        us_Y = [1 if label == positive_class else 0 for label in us_Y]
        korea_Y = [1 if label == positive_class else 0 for label in korea_Y]
    else:
        Y = [1 if label == positive_class else 0 for label in Y]
def X_preprocess(X, window_size, overlap_ratio):
    """Cut every sequence in ``X`` into fixed-length, overlapping windows.

    Windows start every ``int(window_size * overlap_ratio)`` rows (at least 1).
    Note that ``overlap_ratio`` therefore acts as the stride fraction; it equals
    the overlapping fraction only for 0.5. If the last regular window does not
    end on the final row, one extra window aligned to the end of the sequence
    (``sequence[-window_size:]``) is appended so no trailing rows are lost.

    The stride is computed once as an integer, so every window taken from a
    sequence of at least ``window_size`` rows has exactly ``window_size`` rows.
    (Previously a fractional stride was accumulated as a float and truncated
    per window, which produced short or empty windows, and a stride that
    truncated to 0 raised ZeroDivisionError.)

    Args:
        X: iterable of sliceable sequences (lists or arrays of rows).
        window_size: number of rows per window; must be positive.
        overlap_ratio: fraction of ``window_size`` used as the stride.

    Returns:
        A list with one entry per input sequence; each entry is the list of
        windows for that sequence. A non-empty sequence shorter than
        ``window_size`` yields a single window holding the whole sequence, and
        an empty sequence yields an empty list.

    Raises:
        ValueError: if ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    # One integer stride shared by the range() below and the remainder check.
    stride = max(1, int(window_size * overlap_ratio))

    processed_X = []
    for sequence in X:
        n_rows = len(sequence)
        windows = []

        if n_rows >= window_size:
            last_start = n_rows - window_size
            for start in range(0, last_start + 1, stride):
                windows.append(sequence[start:start + window_size])
            # Regular windows stop short of the end: add an end-aligned window.
            if last_start % stride != 0:
                windows.append(sequence[-window_size:])
        elif n_rows > 0:
            # Too short for a full window: keep what there is as one window.
            windows.append(sequence[-window_size:])

        processed_X.append(windows)

    return processed_X
# Window every sequence, then one-hot encode the labels: num_classes columns
# in multi-class mode (bi_class == 0), otherwise 2 columns.
label_width = num_classes if bi_class == 0 else 2
onehot_lookup = np.eye(label_width)  # row k is the one-hot vector for class k

if mode != 0:
    us_X = X_preprocess(us_X, window_size, overlap_ratio)
    korea_X = X_preprocess(korea_X, window_size, overlap_ratio)
    us_Y = onehot_lookup[us_Y]
    korea_Y = onehot_lookup[korea_Y]
else:
    X = X_preprocess(X, window_size, overlap_ratio)
    Y = onehot_lookup[Y]
# Subsample X Data size

def subsample(X, min_len):
    """Reduce every sequence in ``X`` to exactly ``min_len`` items.

    A sequence that already has ``min_len`` items is passed through unchanged
    (same object). Any other sequence is resampled by picking ``min_len`` items
    at evenly spaced positions ``(j * len(sequence)) // min_len`` for
    ``j = 0 .. min_len - 1``, so the picks span the whole sequence and the
    index never runs past the end.

    The previous implementation derived its sampling interval from
    ``np.array(sequence).shape[1]`` (the size of one item) rather than from the
    number of items, and carried its fractional remainder from one sequence to
    the next, so the selected positions did not depend on the sequence length.

    Args:
        X: iterable of indexable sequences (e.g. lists of windows).
        min_len: number of items every returned sequence must have.

    Returns:
        A list with one entry per input sequence, each holding ``min_len``
        items (a sequence shorter than ``min_len`` is filled by repeating
        items).

    Raises:
        ValueError: if a sequence is empty while ``min_len`` is positive.
    """
    sampled_X = []

    for sequence in X:
        n_items = len(sequence)

        if n_items == min_len:
            sampled_X.append(sequence)
            continue

        if n_items == 0:
            raise ValueError("cannot subsample an empty sequence to length %d" % min_len)

        # Integer arithmetic keeps the positions exact and strictly below n_items.
        picked = [sequence[(j * n_items) // min_len] for j in range(min_len)]
        sampled_X.append(picked)

    return sampled_X
# calculate minimum length

# Shortest sequence length (number of windows) that subsample() will reduce
# every sequence to. 99999999 is the starting sentinel and is what remains if
# the corresponding list is empty.
min_len = 99999999
min_us_len = 99999999
min_korea_len = 99999999

if mode == 0:
    min_len = min([min_len] + [len(seq) for seq in X])
else:
    min_us_len = min([min_us_len] + [len(seq) for seq in us_X])
    min_korea_len = min([min_korea_len] + [len(seq) for seq in korea_X])
    # Both populations are cut to the shorter of the two minima.
    min_len = min_korea_len if min_korea_len < min_us_len else min_us_len
# Drop negative samples whose features exactly duplicate a positive sample (binary mode)

def _drop_negatives_matching_positives(samples, labels):
    """Remove negative samples whose features equal those of a positive sample.

    A sample is positive when ``labels[i][1] == 1`` (second one-hot column).
    Positive samples are always kept; a non-positive sample is dropped when
    ``np.array_equal`` reports it identical to at least one positive sample.

    Returns:
        (kept_samples, kept_labels, positive_idx, dropped_idx) where the first
        two are lists in the original order and the last two are the index
        lists of positive and dropped samples.
    """
    positive_idx = [i for i in range(len(labels)) if labels[i][1] == 1]
    positive_lookup = set(positive_idx)
    # Convert each positive sample once instead of once per comparison.
    positive_arrays = [np.array(samples[i]) for i in positive_idx]

    dropped_idx = []
    if positive_arrays:
        for j in range(len(samples)):
            if j in positive_lookup:
                continue
            candidate = np.array(samples[j])
            if any(np.array_equal(pos, candidate) for pos in positive_arrays):
                dropped_idx.append(j)
    dropped_lookup = set(dropped_idx)

    kept_samples, kept_labels = [], []
    for k in range(len(labels)):
        if k not in dropped_lookup:
            kept_samples.append(samples[k])
            kept_labels.append(labels[k])

    return kept_samples, kept_labels, positive_idx, dropped_idx


# Index bookkeeping (positives / dropped negatives) per population.
target_list = []
us_target_list = []
korea_target_list = []
del_list = []
us_del_list = []
korea_del_list = []

if bi_class != 0:
    if mode == 0:
        inputs, Target, target_list, del_list = _drop_negatives_matching_positives(X, Y)
    else:
        us_inputs, us_Target, us_target_list, us_del_list = \
            _drop_negatives_matching_positives(us_X, us_Y)
        korea_inputs, korea_Target, korea_target_list, korea_del_list = \
            _drop_negatives_matching_positives(korea_X, korea_Y)
else:
    # Multi-class mode performs no de-duplication. The samples are passed
    # through unchanged so the names read by the following split step exist
    # (they were previously left undefined, causing a NameError there).
    if mode == 0:
        inputs, Target = list(X), list(Y)
    else:
        us_inputs, us_Target = list(us_X), list(us_Y)
        korea_inputs, korea_Target = list(korea_X), list(korea_Y)
# differentiate data based on mode

if mode in (0, 1, 2):
    # Single-population modes: subsample one pool to min_len windows per
    # sequence, then draw a random 80/20 train/test split from it.
    if mode == 0:
        X = subsample(inputs, min_len)
        pool_X, pool_Y = X, Target
    elif mode == 1:
        us_X = subsample(us_inputs, min_len)
        pool_X, pool_Y = us_X, us_Target
    else:
        korea_X = subsample(korea_inputs, min_len)
        pool_X, pool_Y = korea_X, korea_Target
    X_train, X_test, Y_train, Y_test = train_test_split(pool_X, pool_Y, test_size=0.2)
else:
    # Cross-population modes: mode 3 trains on US and tests on Korea; any
    # other value trains on Korea and tests on US.
    if mode == 3:
        train_inputs, train_labels = us_inputs, us_Target
        test_inputs, test_labels = korea_inputs, korea_Target
    else:
        train_inputs, train_labels = korea_inputs, korea_Target
        test_inputs, test_labels = us_inputs, us_Target
    X_train = subsample(train_inputs, min_len)
    X_test = subsample(test_inputs, min_len)
    Y_train = train_labels
    Y_test = test_labels

In [11]:
# Re-draw the random 80/20 split of the pooled data. This is only meaningful
# in mode 0: in every other mode `Target` is not defined, and running this
# unconditionally would also replace the per-country / cross-country split
# prepared by the previous step.
if mode == 0:
    X_train, X_test, Y_train, Y_test = train_test_split(X, Target, test_size=0.2)

# Model

In [17]:
def build_model(num_classes, input_shape):
    """Assemble the Conv2D -> Reshape -> SimpleRNN -> softmax classifier.

    Args:
        num_classes: number of units in the final softmax layer.
        input_shape: shape of one sample, without the batch axis.

    Returns:
        An uncompiled Keras ``Model`` named ``'RNN'``.
    """
    model_input = Input(shape=input_shape)

    conv_features = Conv2D(
        filters=27,
        kernel_size=8,
        strides=(1, 1),
        padding='same',
        activation='relu',
        name='conv1',
    )(model_input)

    # Regroup the convolution output into a sequence of 200-wide vectors; the
    # number of values per sample must therefore be a multiple of 200.
    rnn_sequence = Reshape(target_shape=(-1, 200), name='reshape1')(conv_features)
    rnn_state = SimpleRNN(units=200, activation='tanh', name='rnn1')(rnn_sequence)
    class_probs = Dense(units=num_classes, activation='softmax', name='fc1')(rnn_state)

    return Model(inputs=model_input, outputs=class_probs, name='RNN')

In [18]:
def auroc(y_true, y_pred):
    """Wrap scikit-learn's ``roc_auc_score`` as a TensorFlow op.

    ``tf.py_func`` is not available in TensorFlow 2.x; ``tf.numpy_function``
    is used instead, which calls ``roc_auc_score`` on the NumPy values of the
    two tensors.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A ``tf.double`` tensor holding the score.
    """
    return tf.numpy_function(roc_auc_score, (y_true, y_pred), tf.double)

In [19]:
def auc(y_true, y_pred):
    """Return an AUC tensor built with ``tf.metrics.auc``.

    Takes element ``[1]`` of the value returned by ``tf.metrics.auc`` and runs
    ``tf.local_variables_initializer()`` in the Keras backend session before
    returning it.

    NOTE(review): ``tf.metrics.auc``, ``K.get_session`` and
    ``tf.local_variables_initializer`` look like TensorFlow 1.x graph-mode
    APIs -- confirm they exist in the installed version. The only use of this
    function visible in the notebook is a commented-out ``model.compile``
    call; the active calls use ``keras.metrics.AUC()`` instead.
    """
    # Local name shadows the function itself inside this body.
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

# Train Model

In [20]:
# Build and compile the classifier. In binary mode the training set is first
# oversampled with SMOTE; the test set is left untouched.
# NOTE(review): both branches use 'binary_crossentropy' on a softmax output;
# for the multi-class branch confirm this is intended rather than
# 'categorical_crossentropy'.
if bi_class == 0:    
    # Multi-class: input shape is taken from the first training sample.
    model = build_model(num_classes=num_classes, input_shape=np.array(X_train[0]).shape)
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', keras.metrics.AUC(), tfa.metrics.F1Score(num_classes=num_classes)])
else:
    sm = imblearn.over_sampling.SMOTE()         # random state do not set
    # Remember the 4-D shape (samples, axis1, axis2, axis3) so it can be
    # restored after resampling.
    origin_shape = np.array(X_train).shape
    # The sampler receives one flat feature vector per sample.
    new_X_train = np.array(X_train).reshape(origin_shape[0], origin_shape[1]*origin_shape[2]*origin_shape[3])
    Y_train = np.array(Y_train).astype('float64')
    # Overwrites X_train / Y_train with the resampled set, so this cell is not
    # idempotent: re-running it resamples the already resampled data.
    X_train, Y_train = sm.fit_resample(new_X_train, Y_train)
    temp = X_train.shape
    # Restore the per-sample 3-D layout for the new number of samples.
    X_train = X_train.reshape([temp[0], origin_shape[1], origin_shape[2], origin_shape[3]])
    # The reshape to (temp[0],) only succeeds if the sampler returned one label
    # value per sample; those values are then re-encoded as 2-column one-hot.
    # NOTE(review): presumably the sampler collapses the one-hot input to a
    # single integer column -- verify against the installed imblearn version.
    Y_train = np.eye(2)[Y_train.reshape(temp[0])]
    model = build_model(num_classes=2, input_shape=np.array(X_train[0]).shape)
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', keras.metrics.AUC()])
# Alternative compile call using the custom `auc` metric (disabled):
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', auc])



Model: "RNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 276, 50, 27)]     0         
_________________________________________________________________
conv1 (Conv2D)               (None, 276, 50, 27)       46683     
_________________________________________________________________
reshape1 (Reshape)           (None, 1863, 200)         0         
_________________________________________________________________
rnn1 (SimpleRNN)             (None, 200)               80200     
_________________________________________________________________
fc1 (Dense)                  (None, 2)                 402       
Total params: 127,285
Trainable params: 127,285
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 150 epochs on the whole training set (no validation data is
# passed). The former transpose([0,1,2,3]) kept every axis in place, so the
# array is handed over as-is.
train_inputs = np.array(X_train)
hist = model.fit(x=train_inputs, y=Y_train, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150

In [None]:
# Display training loss (left axis) and training accuracy (right axis) per epoch.
fig, loss_ax = plt.subplots()
acc_ax = loss_ax.twinx()  # second y-axis sharing the epoch x-axis

loss_ax.plot(hist.history['loss'], 'y', label='train loss')
acc_ax.plot(hist.history['accuracy'], 'b', label='train acc')

loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
acc_ax.set_ylabel('accuracy')  # label previously misspelled as 'accuray'

loss_ax.legend(loc='upper left')
acc_ax.legend(loc='lower left')

plt.show()

# Test Model

In [None]:
# Score the held-out set and report ROC AUC.
# Note: assigning to `auc` rebinds the name of the `auc` metric function
# defined in the Model section of this notebook.
predictions = model.predict(np.array(X_test))

if bi_class == 0:
    auc = roc_auc_score(Y_test, predictions, multi_class='raise')
    auc_caption = 'Multiclass Test AUC: '
else:
    auc = roc_auc_score(Y_test, predictions)
    auc_caption = 'Test AUC: '

print(auc_caption, auc)

In [None]:
# Count how many test samples were assigned to each class (arg-max of the
# predicted scores); one slot per label column.
predicted_class = np.argmax(predictions, axis=1)
frequency = np.bincount(predicted_class, minlength=len(Y_test[0])).astype(np.float64)

frequency

In [None]:
# Evaluate on the held-out set. The number of returned values follows the
# metrics given at compile time: loss, accuracy, AUC (+ F1 in multi-class mode).
eval_inputs = np.array(X_test)
eval_labels = np.array(Y_test)

if bi_class == 0:
    test_loss, test_acc, test_auc, test_F1 = model.evaluate(eval_inputs, eval_labels, verbose=2)
else:
    test_loss, test_acc, test_auc = model.evaluate(eval_inputs, eval_labels, verbose=2)

print('\nAccuracy:', test_acc)