# LSTM part-of-speech tagging and supertagging for the French Treebank

This notebook trains a part-of-speech tagger and supertagger for the French Treebank using a vanilla bidirectional LSTM network.

Run the following cell to load the Keras packages.

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import pickle


from keras.models import Model, load_model
from keras.layers import Bidirectional, Dense, Input, Dropout, LSTM, Activation, TimeDistributed, BatchNormalization, concatenate, Concatenate
from keras.layers.embeddings import Embedding
from keras.constraints import max_norm
from keras import regularizers
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.initializers import glorot_uniform
from keras import backend as K
from sklearn.model_selection import train_test_split

from grail_data_utils import *

%matplotlib inline

np.random.seed(1)

### Read the TLGbank file

In [134]:
# sentences with verified parses
# number of sentences, train: 9449, test: 3150, dev: 3150
# NOTE(review): the counts above may be stale -- the recorded output of a later
# cell shows Y (built from S) with only 4320 rows; confirm against 'parsed.txt'.
# read_maxentdata is presumably supplied by the star import of grail_data_utils.
# Of the returned values, only words, S and maxLen are used by the cells visible
# in this notebook.
words, Y1, Y2, S, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxLen = read_maxentdata('parsed.txt')

In [117]:


# Report the basic dimensions of the data set.
# NOTE(review): numClasses is not assigned anywhere in the visible notebook, and
# numSuperClasses is only assigned in a later cell. Unless grail_data_utils
# exports them, this cell fails on a fresh kernel run top to bottom; define both
# names before this point.
print()
print("Longest sentence   : ", maxLen)
print("Number of POS tags : ", numClasses)
print("Number of supertags: ", numSuperClasses)



Longest sentence   :  140
Number of POS tags :  32
Number of supertags:  891


## 1. Split the input into train/dev/test

Split the full training set into 60% train, 20% dev and 20% test.

In [5]:
def load_obj(name):
    """Unpickle and return the object stored in the file ``name + '.pkl'``.

    Args:
        name: path of the pickle file without its ``.pkl`` extension.

    Returns:
        Whatever object was pickled into that file.

    Note:
        ``pickle.load`` can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    path = name + '.pkl'
    with open(path, 'rb') as handle:
        obj = pickle.load(handle)
    return obj


## 2. Create auxiliary mappings

Load the previously pickled mappings from supertags and the two sets of part-of-speech tags to integers and back.

In [8]:
# Each line loads one forward mapping together with its inverse; every mapping
# lives in its own '<name>.pkl' file and is read through load_obj. The files are
# read in the order the names appear (left to right, top to bottom).

# Supertag and part-of-speech <-> index mappings.
super_to_index, index_to_super = load_obj('super_to_index'), load_obj('index_to_super')
pos1_to_index, index_to_pos1 = load_obj('pos1_to_index'), load_obj('index_to_pos1')
pos2_to_index, index_to_pos2 = load_obj('pos2_to_index'), load_obj('index_to_pos2')

# p1..p4 <-> integer mappings.
p1_to_integer, integer_to_p1 = load_obj('p1_to_integer'), load_obj('integer_to_p1')
p2_to_integer, integer_to_p2 = load_obj('p2_to_integer'), load_obj('integer_to_p2')
p3_to_integer, integer_to_p3 = load_obj('p3_to_integer'), load_obj('integer_to_p3')
p4_to_integer, integer_to_p4 = load_obj('p4_to_integer'), load_obj('integer_to_p4')

# s1..s7 <-> integer mappings.
s1_to_integer, integer_to_s1 = load_obj('s1_to_integer'), load_obj('integer_to_s1')
s2_to_integer, integer_to_s2 = load_obj('s2_to_integer'), load_obj('integer_to_s2')
s3_to_integer, integer_to_s3 = load_obj('s3_to_integer'), load_obj('integer_to_s3')
s4_to_integer, integer_to_s4 = load_obj('s4_to_integer'), load_obj('integer_to_s4')
s5_to_integer, integer_to_s5 = load_obj('s5_to_integer'), load_obj('integer_to_s5')
s6_to_integer, integer_to_s6 = load_obj('s6_to_integer'), load_obj('integer_to_s6')
s7_to_integer, integer_to_s7 = load_obj('s7_to_integer'), load_obj('integer_to_s7')


In [116]:
# Size of the supertag index space: one slot per known supertag plus one extra.
# Presumably the extra slot is index 0, used for padding (the recorded Y[0]
# output ends in a run of zeros) -- confirm against lists_to_indices.
numSuperClasses = len(index_to_super) + 1

# Encode every supertag sequence in S as a row of maxLen indices.
# lists_to_indices is presumably supplied by the star import of grail_data_utils.
Y = lists_to_indices(S, super_to_index, maxLen)

In [45]:
# Sanity check: show the first encoded sentence and the overall array shape.
print(Y[0])
print(np.shape(Y))

[ 221.  599.  264.  890.   11.  597.  319.   38.  264.  890.   11.  597.
  653.  597.  315.   20.  585.  756.   11.  597.  174.   11.  597.   57.
    1.  597.  653.  597.  319.  162.   11.  597.   57.    1.  597.  319.
  174.   11.  597.  319.  315.  330.  832.  221.  609.  426.  387.  597.
  429.   11.  597.  319.  653.  597.  174.  221.  653.  597.  315.  162.
   11.  597.  319.  653.  597.  629.   11.  597.  724.  361.   11.  597.
   57.  597.  174.  221.  315.  447.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.]
(4320, 140)


In [128]:
# Keep only the first 2795 encoded sentences as model input.
# NOTE(review): 2795 is a magic number; it matches the number of rows in the
# recorded shape of Left, so presumably it is the number of sentences that have
# bracket annotations -- confirm, and derive it from the bracket data instead.
Yin = Y[:2795]
print(np.shape(Yin))

(2795, 140)


In [26]:
def read_data(file):
    """Read a whitespace-tokenised text file, one token sequence per line.

    Args:
        file: path of the text file to read.

    Returns:
        A NumPy array with one entry per line of the file.

        * If every line has the same number of tokens, a 2-D array of strings
          with shape ``(lines, tokens)``.
        * Otherwise a 1-D ``dtype=object`` array whose elements are Python
          lists of token strings. A blank line yields an empty list.

    Ragged rows are stored in an explicit object array because calling
    ``np.asarray`` directly on lists of unequal length raises ``ValueError``
    on recent NumPy releases.
    """
    with open(file, 'r') as f:
        rows = [line.strip().split() for line in f]

    # Rectangular (or empty) input: plain conversion is well defined.
    if len({len(row) for row in rows}) <= 1:
        return np.asarray(rows)

    # Ragged input: fill an object array element by element so NumPy never
    # tries to infer a multi-dimensional shape from the nested lists.
    ragged = np.empty(len(rows), dtype=object)
    for i, row in enumerate(rows):
        ragged[i] = row
    return ragged

In [27]:
# Load the two bracket annotation files, one token sequence per line.
# Presumably each token is the number of brackets opening (left) or closing
# (right) at that word -- confirm against how the files were generated.
LeftList  = read_data('brackets_left.txt')
RightList = read_data('brackets_right.txt')

In [104]:
def l_to_indices(X, max_len):
    """Convert rows of numeric tokens into a zero-padded float array.

    Args:
        X: sequence of rows (e.g. the array returned by ``read_data``); each
            row is a sequence of tokens that ``float()`` can parse.
        max_len: number of positions per row in the output.

    Returns:
        A float64 array of shape ``(len(X), max_len, 1)``. Position ``j`` of
        row ``i`` holds ``float(X[i][j])``; positions past the end of a row
        stay 0. Rows longer than ``max_len`` are truncated to their first
        ``max_len`` tokens.

    A token that cannot be parsed as a number is reported on stdout and
    stored as 0.
    """
    m = len(X)                                       # number of examples

    X_indices = np.zeros((m, max_len, 1))

    for i in range(m):
        # Slice to max_len so an over-long row cannot index past the array.
        for j, w in enumerate(X[i][:max_len]):
            try:
                X_indices[i, j, 0] = float(w)
            except (TypeError, ValueError):
                # Only conversion failures are handled here; anything else
                # (including KeyboardInterrupt) propagates to the caller.
                print("Not a float/integer: ", w)
                X_indices[i, j, 0] = 0  # unknown

    return X_indices


In [105]:
# Regression targets for the two model outputs: zero-padded float arrays of
# shape (number of sentences, maxLen, 1), as built by l_to_indices.
Left = l_to_indices(LeftList, maxLen)
Right = l_to_indices(RightList, maxLen)

In [46]:
# Sanity check on the left-bracket targets.
# NOTE(review): the recorded output below is 2-D, but l_to_indices as defined
# in this notebook returns a 3-D array, and this cell's execution count is lower
# than that of the cell defining the function. The output is probably stale;
# re-run the cell.
print(Left[0])
print(np.shape(Left))

[ 1.  2.  2.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  2.  0.  1.  2.
  1.  1.  1.  4.  0.  1.  2.  1.  1.  0.  0.  1.  2.  0.  1.  2.  1.  0.
  1.  1.  1.  0.  2.  0.  1.  3.  1.  0.  1.  0.  1.  2.  4.  0.  1.  0.
  1.  0.  1.  0.  2.  0.  1.  2.  0.  1.  0.  1.  2.  0.  2.  0.  2.  0.
  1.  1.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
(2795, 140)


In [118]:
# Model input: one row of maxLen integer indices per sentence.
# NOTE(review): despite the name, the fit cell feeds this input with supertag
# indices (Yin, derived from S), not word indices.
sentence_indices = Input(shape = (maxLen,), dtype = 'int32')

In [119]:
# Trainable 32-dimensional embedding over the numSuperClasses input indices.
# mask_zero=True makes Keras treat index 0 as padding for downstream layers.
emb = Embedding(numSuperClasses,32,trainable=True,mask_zero=True)(sentence_indices)

In [120]:
# Bidirectional LSTM with 128 units per direction; return_sequences=True keeps
# one output vector per position (256 features, per the recorded summary).
X = Bidirectional(LSTM(128, return_sequences=True))(emb)

In [121]:
# Position-wise dense projection to 32 features, shared by both output heads.
# No activation is given, so it is linear; kernel norm is capped at 5.
X = TimeDistributed(Dense(32,kernel_constraint=max_norm(5.)))(X)

In [122]:
# First output head (trained against Left): one scalar per position.
# The ReLU clamps predictions to be non-negative.
L =  TimeDistributed(Dense(1,kernel_constraint=max_norm(5.)))(X)
outl = Activation('relu')(L)

In [123]:
# Second output head (trained against Right): one scalar per position.
# The ReLU clamps predictions to be non-negative.
R =  TimeDistributed(Dense(1,kernel_constraint=max_norm(5.)))(X)
outr = Activation('relu')(R)

In [124]:
# Assemble the two-output model (outl first, outr second) and show its layers.
model = Model(sentence_indices, [outl, outr])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 140)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 140, 32)      28512       input_5[0][0]                    
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) (None, 140, 256)     164864      embedding_3[0][0]                
__________________________________________________________________________________________________
time_distributed_12 (TimeDistri (None, 140, 32)      8224        bidirectional_7[0][0]            
__________________________________________________________________________________________________
time_distr

In [125]:
# Train both heads as regressions with mean absolute error, one loss per output.
model.compile(optimizer='rmsprop', loss=['mae','mae']) 

In [126]:
# Fit on the supertag indices with [Left, Right] as targets, in the same order
# as the model outputs. validation_split=0.2 holds out 20% of the rows for
# validation (559 of 2795 in the recorded run).
history = model.fit(Yin, [Left,Right], epochs=30, batch_size=32,validation_split=0.2)

Train on 2236 samples, validate on 559 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [130]:
# Predict for the sentences from row 4000 onward; these lie outside the first
# 2795 rows that were passed to fit.
Lpred, Rpred = model.predict(Y[4000:])

In [135]:
# The words of sentence 4000, which corresponds to Lpred[0] / Rpred[0].
print(words[4000])

['-', '21', 'septembre', '2000', ':', 'les', 'révélations', 'posthumes', 'de', 'Jean-Claude', 'Méry', '(', 'cassette', 'Méry', ')', 'sont', 'publiées', 'par', 'le', 'journal', 'Le', 'Monde', '.']


In [131]:
# Predictions of the first output head for that sentence, one value per position
# (including the padded positions, so all maxLen rows are printed).
print(Lpred[0])

[[ 3.51808262]
 [ 0.        ]
 [ 1.06027424]
 [ 0.        ]
 [ 2.71995115]
 [ 0.        ]
 [ 2.04945779]
 [ 0.        ]
 [ 1.07729733]
 [ 1.14353406]
 [ 0.44215032]
 [ 1.10520411]
 [ 1.05459976]
 [ 0.        ]
 [ 2.13619494]
 [ 0.        ]
 [ 1.00193095]
 [ 0.99981958]
 [ 1.06076026]
 [ 1.04573667]
 [ 1.08062315]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.     

In [132]:
# Predictions of the second output head for that sentence, one value per
# position (including the padded positions, so all maxLen rows are printed).
print(Rpred[0])

[[ 0.        ]
 [ 1.01456225]
 [ 0.        ]
 [ 2.68298173]
 [ 0.        ]
 [ 0.93558681]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 1.7421037 ]
 [ 0.        ]
 [ 0.        ]
 [ 6.3043251 ]
 [ 0.        ]
 [ 1.07220888]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 9.12233067]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]
 [ 0.     