In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
np.random.seed(0)
plt.style.use("ggplot")
import tensorflow as tf
from keras.utils import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot.tf_keras import PlotLossesCallback

In [2]:
data = pd.read_csv('./LSTM/datav1.4.csv',encoding ='latin1')
data = data.fillna(method = 'ffill')
# Unique words
print(data['Word'].nunique())
print(data['Tag'].nunique())
words = list(set(data['Word'].values))
words.append('ENDPAD')
num_words = len(words)
tags= list(set(data['Tag'].values))
num_tags = len(tags)
num_words, num_tags

35177
17


(35178, 17)

In [3]:
class sg(object):
    def __init__(self,data):
        self.n_sent = 1
        self.data = data
        af = lambda s: [(w,p,t) for w,p,t in zip(s['Word'].values.tolist(),
                                                s['POS'].values.tolist(),
                                                s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(af)
        self.sentences = [s for s in self.grouped]
g = sg(data)
s = g.sentences
s[2]

[('Helicopter', 'NN', 'O'),
 ('gunships', 'NNS', 'O'),
 ('Saturday', 'NNP', 'B-tim'),
 ('pounded', 'VBD', 'O'),
 ('militant', 'JJ', 'O'),
 ('hideouts', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('Orakzai', 'NNP', 'B-geo'),
 ('tribal', 'JJ', 'O'),
 ('region', 'NN', 'O'),
 (',', ',', 'O'),
 ('where', 'WRB', 'O'),
 ('many', 'JJ', 'O'),
 ('Taliban', 'NNP', 'B-org'),
 ('militants', 'NNS', 'O'),
 ('are', 'VBP', 'O'),
 ('believed', 'VBN', 'O'),
 ('to', 'TO', 'O'),
 ('have', 'VB', 'O'),
 ('fled', 'VBN', 'O'),
 ('to', 'TO', 'O'),
 ('avoid', 'VB', 'O'),
 ('an', 'DT', 'O'),
 ('earlier', 'JJR', 'O'),
 ('military', 'JJ', 'O'),
 ('offensive', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('nearby', 'JJ', 'O'),
 ('South', 'NNP', 'B-geo'),
 ('Waziristan', 'NNP', 'I-geo'),
 ('.', '.', 'O')]

In [4]:
wi = {w: i+1 for i,w in enumerate(words)}
ti = {t: i for i,t, in enumerate(tags)}
wi

{'Aung': 1,
 'Melanne': 2,
 'basin': 3,
 'punishment': 4,
 'Nations': 5,
 '608': 6,
 'Broadcaster': 7,
 'non-binding': 8,
 '28,000': 9,
 'selling': 10,
 'lntelligence': 11,
 'arch-rivals': 12,
 '2,500': 13,
 'exploit': 14,
 'fire': 15,
 '169.9': 16,
 'post-Castro': 17,
 '557': 18,
 'categorized': 19,
 'generators': 20,
 'Tomiaki': 21,
 'Sugiarto': 22,
 'cheered': 23,
 'WTO': 24,
 'Dagger': 25,
 'Saturday': 26,
 'Islamist-controlled': 27,
 'Rostropovich': 28,
 'welcoming': 29,
 'abused': 30,
 '7,200': 31,
 'Figueredo': 32,
 'selecting': 33,
 'resisting': 34,
 'Sen': 35,
 'Statements': 36,
 'sign': 37,
 'road-building': 38,
 'Josh': 39,
 'pastures': 40,
 '726': 41,
 'Archaeologists': 42,
 'Royal-Dutch': 43,
 'Security': 44,
 'Carib': 45,
 'Qantas': 46,
 '1955': 47,
 'Saudi-Syrian': 48,
 'physicist': 49,
 'abducted': 50,
 'Turkmens': 51,
 'punk': 52,
 'non-believers': 53,
 'symptoms': 54,
 'hitch': 55,
 'Netzarim': 56,
 '200-mile': 57,
 'pinions': 58,
 '1990': 59,
 'grassroots': 60,
 'rea

In [5]:
ml = 50
X= [[wi[w[0]] for w in s ] for s in s]
X = pad_sequences(maxlen=ml,sequences = X,padding ='post',value = num_words-1)
y= [[ti[w[2]] for w in s ] for s in s]
y = pad_sequences(maxlen=ml,sequences = y,padding ='post',value = ti["O"])
y = [to_categorical(i,num_classes = num_tags) for i in y]
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=1)

In [10]:
input_word = Input(shape=(ml,))
m = Embedding(input_dim = num_words, output_dim = ml,input_length = ml)(input_word)
m= SpatialDropout1D(0.1)(m)
m = Bidirectional(LSTM(units=100, return_sequences = True, recurrent_dropout =0.1))(m)
out= TimeDistributed(Dense(num_tags,activation = 'softmax'))(m)
m = Model(input_word,out)
m.summary()
m.compile(loss='mse', optimizer='adam')

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 50, 50)            1758900   
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 50, 50)           0         
 lDropout1D)                                                     
                                                                 
 bidirectional_2 (Bidirectio  (None, 50, 200)          120800    
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 50, 17)           3417      
 tributed)                                                       
                                                           

In [11]:
es= EarlyStopping(monitor='val_accuracy',patience=1,verbose=0,mode='max',restore_best_weights=False)
cb = [PlotLossesCallback(),es]
h = m.fit(
    x_train, np.array(y_train),
    validation_split = 0.2,
    batch_size=32,
    epochs=3,
    verbose = 1
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [12]:
m.evaluate(x_test,np.array(y_test))



0.00135014986153692