In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the token-level dataset and pull out the three columns used below:
# the sentence id of each row, the token text, and the token's label.
data = pd.read_csv('times.csv')
sentences, tokens, labels = (
    np.array(data.sentence),
    np.array(data.token),
    np.array(data.label),
)

In [4]:
# Encode every token as a list of integer symbol ids (one id per character).

# Cells that are not strings (e.g. a missing value read from the CSV) are
# replaced in place by the empty token so they can be iterated like any other.
# isinstance() is used instead of an exact type comparison so that str
# subclasses are kept rather than silently blanked.
for i, token in enumerate(tokens):
    if not isinstance(token, str):
        tokens[i] = ''

# symbol_map: character -> id, inverted_symbol_map: id -> character.
# Ids start at 1; 0 is never assigned to a character here, which leaves it
# free for the zero padding applied by fitToSize.
symbol_map = {}
inverted_symbol_map = {}
symbol_counter = 1
encoded_tokens = []
for token in tokens:
    encoded_token = []
    for symbol in token:
        # First occurrence of a character: allocate the next free id.
        if symbol not in symbol_map:
            symbol_map[symbol] = symbol_counter
            inverted_symbol_map[symbol_counter] = symbol
            symbol_counter += 1
        encoded_token.append(symbol_map[symbol])
    encoded_tokens.append(encoded_token)

In [5]:
def fitToSize(word, n):
    """Return a new list holding the first ``n`` items of ``word``.

    When ``word`` has fewer than ``n`` items the result is filled up with
    zeros until it has ``n`` items. ``word`` itself is left untouched because
    the slice below creates a copy.

    Args:
        word: list of symbol ids.
        n: target length.

    Returns:
        The truncated or zero-padded copy.
    """
    wordFit = word[:n]
    missing = n - len(word)
    while missing > 0:
        wordFit.append(0)
        missing -= 1
    return wordFit

In [6]:
# Bring every encoded token to exactly 8 symbol ids (cut or zero-padded).
# 8 is the size of the last input dimension declared for the model's Input layer.
# Slice assignment keeps the same list object, so the update stays in place.
encoded_tokens[:] = [fitToSize(symbol_ids, 8) for symbol_ids in encoded_tokens]

In [7]:
# One-hot encode each label over five classes, in the fixed order
# NONE, D_ST, D_PT, D_EN, D_OR. A label outside this set keeps an
# all-zero vector.
label_slots = {'NONE': 0, 'D_ST': 1, 'D_PT': 2, 'D_EN': 3, 'D_OR': 4}
encoded_labels = []
for label in labels:
    ans = [0] * len(label_slots)
    slot = label_slots.get(label)
    if slot is not None:
        ans[slot] = 1
    encoded_labels.append(ans)

In [8]:
# Group the per-token rows into one variable-length sequence per sentence:
# batchesX[s] collects the padded symbol-id vectors of sentence s and
# batchesY[s] the matching one-hot labels, both in row order.
#
# The number of containers comes from the largest sentence id rather than
# from the id on the last row, so rows that are not sorted by sentence cannot
# index past the end of the lists; an empty dataset simply yields no batches.
sentence_count = int(np.max(sentences)) + 1 if len(sentences) else 0
batchesX = [[] for _ in range(sentence_count)]
batchesY = [[] for _ in range(sentence_count)]
for sentence_id, token_vector, label_vector in zip(sentences, encoded_tokens, encoded_labels):
    batchesX[sentence_id].append(token_vector)
    batchesY[sentence_id].append(label_vector)

In [9]:
from keras.layers import Input
from keras.layers import TimeDistributed
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dense
from keras.models import Model

Using TensorFlow backend.


In [39]:
# Sentence tagger: every token of a sentence is classified into one of five labels.
# NOTE(review): the argument spellings used below (consume_less, output_dim on Dense,
# init, Model(input=..., output=...), the 'fmeasure' metric) look like Keras 1.x API —
# confirm the installed Keras version before re-running.

# Input: one sentence per sample, a variable number of tokens, 8 integer symbol ids per token
# (summary below reports shape (None, None, 8)).
inputLayer = Input(dtype='int32', shape=(None,8,), name = 'inputLayer')
# Per-token character embedding: each symbol id becomes a 10-dimensional vector.
# input_dim=169 is a hard-coded vocabulary size; it presumably has to exceed the largest id
# produced by the symbol encoding cell (symbol_counter - 1) — TODO confirm against the data.
# mask_zero is not set, so the padding id 0 is presumably embedded like a real symbol — verify.
embeddingLayer = TimeDistributed(Embedding(input_dim = 169, output_dim = 10, trainable=True, 
                                         name = 'embeddingLayer'), name='TDEmbedding')(inputLayer)
# Per-token LSTM over the 8 embedded symbols; return_sequences=False keeps a single
# 50-dimensional vector per token (summary reports (None, None, 50)).
lstmLayer = TimeDistributed(LSTM(50, return_sequences=False, consume_less='gpu', 
                                name = 'lstmLayer'), name='TDlstm')(embeddingLayer)
# Per-token softmax over 5 outputs, matching the five positions of the one-hot label vectors.
outputLayer = TimeDistributed(Dense(output_dim=5,init='zero',activation='softmax',name='outputLayer'),
                              name='TDout')(lstmLayer)
model = Model(input=inputLayer, output=outputLayer)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy','fmeasure'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
inputLayer (InputLayer)          (None, None, 8)       0                                            
____________________________________________________________________________________________________
TDEmbedding (TimeDistributed)    (None, None, 8, 10)   1690        inputLayer[0][0]                 
____________________________________________________________________________________________________
TDlstm (TimeDistributed)         (None, None, 50)      12200       TDEmbedding[0][0]                
____________________________________________________________________________________________________
TDout (TimeDistributed)          (None, None, 5)       255         TDlstm[0][0]                     
Total params: 14,145
Trainable params: 14,145
Non-trainable params: 0
_____________________

In [20]:
# Number of sentences used for training: the first 80% of the sentence batches.
# The remaining sentences are the ones the prediction cell runs on.
butches_count_train = int(0.8 * len(batchesX))
print(butches_count_train)

17528


In [134]:
def _ragged_object_array(sequences):
    """Pack variable-length sequences into a 1-D object array, one element per sequence.

    Building the array element by element avoids handing ragged nested lists
    to np.array(), which newer NumPy versions reject unless dtype=object is
    requested, and guarantees a 1-D result even when all sequences happen to
    have the same length.
    """
    packed = np.empty(len(sequences), dtype=object)
    for position, sequence in enumerate(sequences):
        packed[position] = sequence
    return packed


# Train for 50 epochs, one sentence per gradient step.
# Sentences have different token counts, so each one is fed as its own batch
# of size 1 instead of being stacked into a single rectangular tensor.
trainX = _ragged_object_array(batchesX[:butches_count_train])
trainY = _ragged_object_array(batchesY[:butches_count_train])
for epoch in range(50):
    # Apply the same random permutation to inputs and targets so they stay aligned.
    shuffle_indexes = np.random.permutation(len(trainX))
    trainX = trainX[shuffle_indexes]
    trainY = trainY[shuffle_indexes]
    print("epoch: ", epoch)
    for i in range(len(trainX)):
        # Wrap the sentence in a list to add the leading batch dimension.
        model.train_on_batch(np.array([trainX[i]]), np.array([trainY[i]]))

epoch:  0
epoch:  1
epoch:  2
epoch:  3
epoch:  4
epoch:  5
epoch:  6
epoch:  7
epoch:  8
epoch:  9
epoch:  10
epoch:  11
epoch:  12
epoch:  13
epoch:  14
epoch:  15
epoch:  16
epoch:  17
epoch:  18
epoch:  19
epoch:  20
epoch:  21
epoch:  22
epoch:  23
epoch:  24
epoch:  25
epoch:  26
epoch:  27
epoch:  28
epoch:  29
epoch:  30
epoch:  31
epoch:  32
epoch:  33
epoch:  34
epoch:  35
epoch:  36
epoch:  37
epoch:  38
epoch:  39
epoch:  40
epoch:  41
epoch:  42
epoch:  43
epoch:  44
epoch:  45
epoch:  46
epoch:  47
epoch:  48
epoch:  49


In [141]:
# Predict the held-out sentences (everything after the training split),
# one sentence per call; each result keeps its leading batch dimension of 1.
answer = [
    model.predict_on_batch(np.array([sentence_tokens]))
    for sentence_tokens in batchesX[butches_count_train:]
]
# answer = []
# for i in range(len(batchesX[:butches_count_train])):
#     answer.append(model.predict_on_batch(np.array([batchesX[i]])))

In [142]:
# Turn every per-token score vector into its label name by taking the
# highest-scoring class; names are listed in the same order as the one-hot
# label encoding. Results for all sentences are flattened into one list.
class_names = ('NONE', 'D_ST', 'D_PT', 'D_EN', 'D_OR')
decoded_answer = []
for sentence_prediction in answer:
    # Index 0 drops the batch dimension of size 1.
    for token_scores in sentence_prediction[0]:
        best_class = np.argmax(token_scores)
        # An index without a name is skipped, not reported.
        if best_class < len(class_names):
            decoded_answer.append(class_names[best_class])

In [143]:
# Write the predicted labels to answer.csv, indexed by the labels of the
# trailing rows of the dataset.
# NOTE(review): this pairing assumes the CSV rows are ordered by sentence id so
# that the last len(decoded_answer) labels belong to the held-out sentences — confirm.
#
# The slice uses an explicit start offset because labels[-0:] would select
# every label (not none) when there are no predictions.
gold_start = max(len(labels) - len(decoded_answer), 0)
df = pd.DataFrame(data=decoded_answer, index=labels[gold_start:])
df.to_csv('answer.csv')
# df = pd.DataFrame(data= decoded_answer, index = labels[:len(decoded_answer)])                        
# df.to_csv('answer_train.csv')