In [1]:
import numpy as np
import pandas as pd

In [17]:
# Load the corpus and pull out the three columns used below as NumPy arrays.
# NOTE(review): the later cells index `sentences`, `tokens` and `labels` in
# parallel, so the file presumably holds one row per token -- confirm.
data = pd.read_csv('times.csv')
sentences, tokens, labels = (
    np.array(data[column]) for column in ('sentence', 'token', 'label')
)

In [34]:
# Build a character vocabulary on the fly and encode every token as a list of
# integer character ids.
#   symbol_map          : character -> id
#   inverted_symbol_map : id -> character
# Ids are handed out in order of first appearance, starting at 1, so no
# character is ever mapped to 0 (fitToSize pads with 0).
encoded_tokens = []
symbol_map = {}
inverted_symbol_map = {}
symbol_counter = 1
for token in tokens:
    encoded_token = []
    for symbol in token:
        # Single lookup; `is None` is the idiomatic test for a missing key.
        symbol_id = symbol_map.get(symbol)
        if symbol_id is None:
            symbol_id = symbol_counter
            symbol_map[symbol] = symbol_id
            inverted_symbol_map[symbol_id] = symbol
            symbol_counter += 1
        encoded_token.append(symbol_id)
    encoded_tokens.append(encoded_token)

In [36]:
def fitToSize(word, n):
    """Return a copy of ``word`` cut to ``n`` items and zero-padded up to ``n``.

    Parameters
    ----------
    word : list
        Sequence to resize. It is sliced, never modified in place.
    n : int
        Target length.

    Returns
    -------
    list
        ``word[:n]``, followed by ``n - len(word)`` zeros when ``word`` is
        shorter than ``n``.
    """
    fitted = word[:n]
    missing = n - len(word)
    # Only touch the slice when padding is actually required.
    if missing > 0:
        fitted.extend([0] * missing)
    return fitted

In [37]:
# Bring every encoded token to a fixed length of 8 character ids
# (longer tokens are cut, shorter ones are padded with 0 by fitToSize).
# Slice assignment keeps the same list object, as the original loop did.
encoded_tokens[:] = [fitToSize(codes, 8) for codes in encoded_tokens]

In [154]:
# One-hot encode the tag of every token into a 5-element vector.
# A label that is not one of the five known tags keeps an all-zero vector.
label_slots = {'NONE': 0, 'D_ST': 1, 'D_PT': 2, 'D_EN': 3, 'D_OR': 4}
encoded_labels = []
for label in labels:
    ans = [0] * 5
    slot = label_slots.get(label)
    if slot is not None:
        ans[slot] = 1
    encoded_labels.append(ans)

In [155]:
# Group the encoded tokens and their one-hot labels by sentence id:
# batchesX[s] / batchesY[s] hold, in file order, the tokens / labels of
# sentence s.
# The number of buckets is taken from the largest sentence id rather than
# from the last row, so the grouping also works when the rows are not sorted
# by sentence; an empty input yields empty lists instead of an IndexError.
num_sentences = int(np.max(sentences)) + 1 if len(sentences) else 0
batchesX = [[] for _ in range(num_sentences)]
batchesY = [[] for _ in range(num_sentences)]
for sentence_id, token_codes, label_vector in zip(sentences, encoded_tokens, encoded_labels):
    batchesX[sentence_id].append(token_codes)
    batchesY[sentence_id].append(label_vector)

In [111]:
from keras.layers import Input
from keras.layers import TimeDistributed
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dense
from keras.models import Model

Using TensorFlow backend.


In [186]:
# Token tagger built from characters. The input has a variable-length
# sequence axis whose elements are vectors of 8 integer ids.
inputLayer = Input(shape=(None, 8), dtype='int32', name='inputLayer')

# Embedding applied to every element of the sequence axis.
# NOTE(review): input_dim=67 is hard-coded; presumably it is the number of
# ids in symbol_map plus the padding id 0 -- confirm against the data.
charEmbedding = Embedding(input_dim=67, output_dim=10, trainable=True,
                          name='embeddingLayer')
embeddingLayer = TimeDistributed(charEmbedding, name='TDEmbedding')(inputLayer)

# The LSTM runs over the 8 embedded ids of each element and, with
# return_sequences=False, keeps only its final 50-dimensional output.
tokenEncoder = LSTM(50, return_sequences=False, consume_less='gpu',
                    name='lstmLayer')
lstmLayer = TimeDistributed(tokenEncoder, name='TDlstm')(embeddingLayer)

# Softmax over the 5 classes for every element of the sequence.
tokenClassifier = Dense(output_dim=5, init='zero', activation='softmax',
                        name='outputLayer')
outputLayer = TimeDistributed(tokenClassifier, name='TDout')(lstmLayer)

model = Model(input=inputLayer, output=outputLayer)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'fmeasure'],
)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
inputLayer (InputLayer)          (None, None, 8)       0                                            
____________________________________________________________________________________________________
TDEmbedding (TimeDistributed)    (None, None, 8, 10)   670         inputLayer[0][0]                 
____________________________________________________________________________________________________
TDlstm (TimeDistributed)         (None, None, 50)      12200       TDEmbedding[0][0]                
____________________________________________________________________________________________________
TDout (TimeDistributed)          (None, None, 5)       255         TDlstm[0][0]                     
Total params: 13,125
Trainable params: 13,125
Non-trainable params: 0
_____________________

In [187]:
# Train on the first 7000 sentences, one sentence per batch, reporting
# progress every 500 batches.
training_pairs = zip(batchesX[:7000], batchesY[:7000])
for i, (sentence_tokens, sentence_labels) in enumerate(training_pairs):
    model.train_on_batch(np.array([sentence_tokens]), np.array([sentence_labels]))
    if not i % 500:
        print("Batch num: ", i)

Batch num:  0
Batch num:  500
Batch num:  1000
Batch num:  1500
Batch num:  2000
Batch num:  2500
Batch num:  3000
Batch num:  3500
Batch num:  4000
Batch num:  4500
Batch num:  5000
Batch num:  5500
Batch num:  6000
Batch num:  6500


In [188]:
# Predict the remaining sentences (index 7000 onwards), again one sentence
# per batch; each entry of `answer` is the model output for one sentence.
answer = [
    model.predict_on_batch(np.array([sentence_tokens]))
    for sentence_tokens in batchesX[7000:]
]

In [189]:
# Turn every per-token score vector into its tag name: the position of the
# largest score selects the tag. Positions outside 0..4 add nothing.
class_names = {0: 'NONE', 1: 'D_ST', 2: 'D_PT', 3: 'D_EN', 4: 'D_OR'}
decoded_answer = []
for prediction in answer:
    # prediction[0] is the single sentence contained in the batch.
    for token_scores in prediction[0]:
        tag = class_names.get(int(np.argmax(token_scores)))
        if tag is not None:
            decoded_answer.append(tag)

In [190]:
# Write the predicted tags to answer.csv, indexed by the true label of the
# same token.
# The true labels are selected from the sentence ids instead of a hard-coded
# row offset: predictions were produced for sentences 7000 and up, sentence
# by sentence, with tokens in file order inside each sentence. A stable sort
# of the rows by sentence id reproduces exactly that order, so the index
# stays aligned even if the dataset size or row order changes.
rows_by_sentence = np.argsort(sentences, kind='mergesort')  # mergesort is stable
held_out_rows = rows_by_sentence[sentences[rows_by_sentence] >= 7000]
df = pd.DataFrame(data=decoded_answer, index=labels[held_out_rows])
df.to_csv('answer.csv')