In [1]:
import pickle
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras import backend as K
from keras import optimizers
from keras import regularizers
from keras.constraints import max_norm
from keras.initializers import glorot_uniform
from keras.layers import Bidirectional, Dense, Input, Dropout, LSTM, Activation, TimeDistributed, BatchNormalization, concatenate, Concatenate
from keras.layers.embeddings import Embedding
from keras.models import Model, load_model
from keras.preprocessing import sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from grail_data_utils import *


np.random.seed(1)


Using TensorFlow backend.


In [124]:
def save_obj(obj, name):
    """Serialize `obj` with pickle into the file `<name>.pkl`.

    The '.pkl' extension is appended to `name`; an existing file is overwritten.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb+') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in the file `<name>.pkl`.

    NOTE: unpickling can execute arbitrary code, so only use this on trusted files.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Lookup tables stored as '<name>.pkl' pickles in the working directory.
# Each table is loaded together with the table for the opposite direction.

# POS tag <-> class index, for the two tag sets (pos1 and pos2).
pos1_to_index, index_to_pos1 = load_obj('pos1_to_index'), load_obj('index_to_pos1')
pos2_to_index, index_to_pos2 = load_obj('pos2_to_index'), load_obj('index_to_pos2')

# Prefix <-> integer id, one pair of tables per prefix length (1 to 4 characters).
p1_to_integer, integer_to_p1 = load_obj('p1_to_integer'), load_obj('integer_to_p1')
p2_to_integer, integer_to_p2 = load_obj('p2_to_integer'), load_obj('integer_to_p2')
p3_to_integer, integer_to_p3 = load_obj('p3_to_integer'), load_obj('integer_to_p3')
p4_to_integer, integer_to_p4 = load_obj('p4_to_integer'), load_obj('integer_to_p4')

# Suffix <-> integer id, one pair of tables per suffix length (1 to 7 characters).
s1_to_integer, integer_to_s1 = load_obj('s1_to_integer'), load_obj('integer_to_s1')
s2_to_integer, integer_to_s2 = load_obj('s2_to_integer'), load_obj('integer_to_s2')
s3_to_integer, integer_to_s3 = load_obj('s3_to_integer'), load_obj('integer_to_s3')
s4_to_integer, integer_to_s4 = load_obj('s4_to_integer'), load_obj('integer_to_s4')
s5_to_integer, integer_to_s5 = load_obj('s5_to_integer'), load_obj('integer_to_s5')
s6_to_integer, integer_to_s6 = load_obj('s6_to_integer'), load_obj('integer_to_s6')
s7_to_integer, integer_to_s7 = load_obj('s7_to_integer'), load_obj('integer_to_s7')

In [None]:
# Size of the tagger's output layer for the pos1 tag set. Tag indices start at 1
# (see pos1_to_index), so one extra slot is added for index 0.
numClasses = 1 + len(index_to_pos1)

In [113]:
# Keep the class count tied to the pos1 tag set: the weights loaded further down
# come from 'best_pos1.h5' and predictions are decoded with index_to_pos1, so
# sizing the output layer from index_to_pos2 (as this cell used to) is a mistake.
numClasses = len(index_to_pos1) + 1

In [29]:
# Show the tag -> class index table of the pos1 tag set.
print(pos1_to_index)

{'PRO': 1, 'CS': 2, 'ADVWH': 3, 'DET': 4, 'PROREL': 5, 'ADJWH': 6, 'PROWH': 7, 'VPP': 8, 'VIMP': 9, 'VS': 10, 'P+PRO': 11, 'P': 12, 'VPR': 13, 'ADV': 14, 'CLO': 15, 'NPP': 16, 'ADJ': 17, 'CLR': 18, 'ET': 19, 'I': 20, 'NC': 21, 'CLS': 22, 'VINF': 23, 'P+D': 24, 'DETWH': 25, 'PONCT': 26, 'CC': 27, 'V': 28, 'PREF': 29}


In [3]:
def read_text_file(filename, encoding='utf-8'):
    """Read a whitespace-tokenized text file, one sentence per line.

    Arguments:
    filename -- path of the text file to read
    encoding -- text encoding of the file (default 'utf-8'); passed explicitly so that
                accented characters are decoded the same way on every platform

    Returns:
    text -- dictionary mapping the 0-based line number to the list of tokens on that line
            (a blank line maps to an empty list)
    lines -- number of lines read
    maxlen -- number of tokens on the longest line (0 for an empty file)
    """
    text = {}
    maxlen = 0
    with open(filename, 'r', encoding=encoding) as f:
        for lineno, line in enumerate(f):
            # split() without arguments already ignores leading/trailing whitespace
            tokens = line.split()
            text[lineno] = tokens
            maxlen = max(maxlen, len(tokens))
    return text, len(text), maxlen

In [4]:
def text_vocab(text):
    """Collect the set of distinct tokens occurring in `text`.

    `text` maps line numbers to lists of tokens, as returned by read_text_file.
    """
    return {token for tokens in text.values() for token in tokens}

In [6]:
# Load the sentences to tag: token lists per line, the line count and the longest line length.
text, numLines, maxline = read_text_file('input.txt')

In [9]:
# Sanity check: the first tokenized sentence and the number of lines read.
print(text[0])
print(numLines)

['Le', 'crédit', 'foncier', 'va', 'prendre', 'une', 'participation', 'de', '20', '%', 'dans', 'le', 'capital', 'de', "l'", 'immobilière', 'constructions', 'de', 'Paris', '(', 'ICP', ')', ',', 'qui', 'détient', "d'", 'importantes', 'participations', 'dans', 'des', 'sociétés', 'immobilières', "d'", 'investissement', '(', 'SII', ')', 'cotées', ',', 'comme', 'Sefimeg', ',', 'Cofimeg', ',', 'et', 'dans', 'plusieurs', 'sicomi', ',', '"', 'dans', 'lesquelles', 'le', 'crédit', 'foncier', 'est', 'déjà', 'présent', '"', 'précise', 'le', 'communiqué', '.']
20


In [10]:
# Fixed sequence length fed to the network; 266 is also the input length shown
# in the summary of the trained model loaded further down.
maxLen = 266

# Set of distinct tokens in the input text.
vocab = text_vocab(text)

In [11]:
# Display the whole vocabulary set (large output).
print(vocab)

{'ce', 'convertibles', 'formées', 'deux', 'cosmétiques', 'Toledo', 'travail', 'dès', 'les', 'Paris', 'plus', 'Europe', 'communiqué', 'il', 'tchécoslovaque', 'associations', 'processus', 'où', 'participations', 'malgré', 'sont', 'européenne', 'devant', 'animaux', 'incapables', 'tests', '1989', 'lesquelles', 'ouest-allemands', 'réunion', 'Vaclav', 'assistance', 'octobre', 'atteignait', 'Biscaye', 'avait', 'transférable', 'effet', 'se', 'chiffre', '-', 'mixtes', 'aux', 'Luft', 'discours', 'simple', 'compte', 'reculé', 'détérioration', '12', 'date', 'CEE', 'industrie', '...', '10', 'importante', 'milliards', 'va', 'intervienne', 'hauteur', 'cotées', 'autoriser', 'un', 'financière', 'En', 'commerce', 'selon', 'douze', 'investissement', 'trouve', 'constructions', 'par', 'production', 'actuellement', 'ensemble', 'échanges', 'ICP', 'moyennes', 'SII', 'réalisé', 'autonomie', '10,7', "n'", 'envisagent', 'Mr', 'baisse', 'vingt-cinq', 'parfums', 'majoritaires', 'alors', 'est-allemand', 'approuvée'

In [81]:
# Build the token <-> integer index tables for this input's vocabulary.
# NOTE(review): indexify is not defined in this notebook; presumably it comes from grail_data_utils -- confirm.
word_to_index, index_to_word = indexify(vocab)

In [133]:
def word_to_prefvec(word, alen, afset, af_to_int):
    """One-hot encode the prefix of length `alen` of `word`.

    Arguments:
    word -- the word to encode
    alen -- prefix length, in characters
    afset -- collection of known prefixes of that length
    af_to_int -- dictionary mapping each known prefix, and the special keys
                 '*UNK*' and '*OOR*', to an integer

    Returns:
    the to_categorical encoding (len(afset)+1 classes) of the prefix's integer;
    '*OOR*' is encoded when the word is shorter than `alen`, '*UNK*' when the
    prefix is not in `afset`
    """
    if len(word) < alen:
        key = '*OOR*'
    else:
        prefix = word[:alen]
        key = prefix if prefix in afset else '*UNK*'
    return to_categorical(af_to_int[key], len(afset)+1)


def word_to_sufvec(word, alen, afset, af_to_int):
    """One-hot encode the suffix of length `alen` of `word`.

    Arguments:
    word -- the word to encode
    alen -- suffix length, in characters
    afset -- collection of known suffixes of that length
    af_to_int -- dictionary mapping each known suffix, and the special keys
                 '*UNK*' and '*OOR*', to an integer

    Returns:
    the to_categorical encoding (len(afset)+1 classes) of the suffix's integer;
    '*OOR*' is encoded when the word is shorter than `alen`, '*UNK*' when the
    suffix is not in `afset`
    """
    if len(word) < alen:
        key = '*OOR*'
    else:
        suffix = word[-alen:]
        key = suffix if suffix in afset else '*UNK*'
    return to_categorical(af_to_int[key], len(afset)+1)


In [134]:
# Known prefixes, one collection per prefix length (1 to 4): the keys of the prefix tables.
prefix1, prefix2, prefix3, prefix4 = (
    table.keys()
    for table in (p1_to_integer, p2_to_integer, p3_to_integer, p4_to_integer)
)

# Known suffixes, one collection per suffix length (1 to 7): the keys of the suffix tables.
suffix1, suffix2, suffix3, suffix4, suffix5, suffix6, suffix7 = (
    table.keys()
    for table in (
        s1_to_integer, s2_to_integer, s3_to_integer, s4_to_integer,
        s5_to_integer, s6_to_integer, s7_to_integer,
    )
)

In [135]:
def word_to_prefix_vector(word):
    """Concatenate the one-hot encodings of the 1- to 4-character prefixes of `word`."""
    specs = (
        (1, prefix1, p1_to_integer),
        (2, prefix2, p2_to_integer),
        (3, prefix3, p3_to_integer),
        (4, prefix4, p4_to_integer),
    )
    pieces = [word_to_prefvec(word, alen, afset, af_to_int) for alen, afset, af_to_int in specs]
    return np.concatenate(pieces)

def word_to_suffix_vector(word):
    """Concatenate the one-hot encodings of the 1- to 7-character suffixes of `word`."""
    specs = (
        (1, suffix1, s1_to_integer),
        (2, suffix2, s2_to_integer),
        (3, suffix3, s3_to_integer),
        (4, suffix4, s4_to_integer),
        (5, suffix5, s5_to_integer),
        (6, suffix6, s6_to_integer),
        (7, suffix7, s7_to_integer),
    )
    pieces = [word_to_sufvec(word, alen, afset, af_to_int) for alen, afset, af_to_int in specs]
    return np.concatenate(pieces)


In [136]:
def compute_affixes(vocab):
    """Compute the prefix and suffix feature vectors of every word in `vocab`.

    Each word is lower-cased and its digits 0-8 are replaced by '9' before the
    affixes are extracted; the returned dictionaries are keyed by the original,
    unnormalized word.

    Arguments:
    vocab -- iterable of words

    Returns:
    word_to_prefix -- dictionary mapping each word to its word_to_prefix_vector encoding
    word_to_suffix -- dictionary mapping each word to its word_to_suffix_vector encoding
    """
    # Requires the `re` module (imported explicitly at the top of the notebook);
    # the pattern is loop-invariant, so it is compiled once.
    digit_pattern = re.compile(r'[0-8]')

    word_to_suffix = {}
    word_to_prefix = {}

    for word in vocab:
        normalized = digit_pattern.sub('9', word.lower())
        word_to_prefix[word] = word_to_prefix_vector(normalized)
        word_to_suffix[word] = word_to_suffix_vector(normalized)

    return word_to_prefix, word_to_suffix

# Affix feature vectors for every word of the input vocabulary.
word_to_prefix, word_to_suffix = compute_affixes(vocab)


In [38]:
# Pre-trained word vectors in binary word2vec format, loaded from a path relative to the working directory.
wv = KeyedVectors.load_word2vec_format('../wang2vec/frwiki_cwindow50_10.bin', binary=True)
# Length of the zero vector given to words without a pre-trained vector.
# NOTE(review): presumably equals the dimensionality of the loaded vectors -- confirm.
veclength = 50

In [36]:
def remove_prefix(text, prefix):
    """Return `text` without its leading `prefix`; `text` is returned unchanged if it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text

In [39]:
# Look up a pre-trained vector for every vocabulary word; words without one
# are recorded in `unknowns` and get an all-zero vector.
word_to_vec_map = {}
unknowns = set()
invoc = 0

for w in vocab:
    # Normalize the surface form, then drop a leading "-t-" or "-" before the lookup.
    # NOTE(review): normalize_word is not defined in this notebook; presumably from grail_data_utils -- confirm.
    wn = normalize_word(w)
    wr = remove_prefix(wn, "-t-")
    wr = remove_prefix(wr, "-")
    try:
        vec = wv[wr]
    except KeyError:
        # Only a missing vocabulary entry is expected here; a bare `except`
        # would also hide unrelated failures (typos, interrupts, ...).
        unknowns.add(w)
        vec = np.zeros(veclength)
    else:
        invoc = invoc + 1
    word_to_vec_map[w] = vec

print('Unknowns: ', len(unknowns))
print('In vocabulary: ', invoc)


Unknowns:  63
In vocabulary:  312


In [102]:
# Widths of the concatenated one-hot affix vectors: word_to_prefvec / word_to_sufvec
# emit len(keys)+1 entries per affix length, so the totals follow from the tables
# instead of being hard-coded.
pref_dim = sum(len(keys) + 1 for keys in (prefix1, prefix2, prefix3, prefix4))
suff_dim = sum(
    len(keys) + 1
    for keys in (suffix1, suffix2, suffix3, suffix4, suffix5, suffix6, suffix7)
)

# Zero-initialized inputs, one row per input line and maxLen positions per line;
# positions past the end of a sentence stay at zero.
X_pref = np.zeros((numLines, maxLen, pref_dim))
X_suff = np.zeros((numLines, maxLen, suff_dim))
X_word_emb = np.zeros((numLines, maxLen, veclength))
X_indices = np.zeros((numLines, maxLen))

In [137]:
# Fill the input arrays token by token: row = sentence, column = position in the sentence.
for i in range(numLines):
    for j, word in enumerate(text[i]):
        X_pref[i, j, :] = word_to_prefix[word]
        X_suff[i, j, :] = word_to_suffix[word]
        X_word_emb[i, j, :] = word_to_vec_map[word]
        X_indices[i, j] = word_to_index[word]

In [45]:
# Inspect the prefix feature array (numpy abbreviates the output).
print(X_pref)

[[[ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 ..., 
 [[ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  

In [138]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a frozen Keras Embedding() layer whose rows are the vectors of word_to_vec_map.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to fixed-length vectors (in this notebook: the
                       pre-trained word vectors, or the prefix / suffix feature vectors)
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- pretrained layer Keras instance

    Raises:
    ValueError -- if word_to_vec_map is empty, since the vector size cannot be determined
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty: cannot determine the embedding dimension")

    vocab_len = len(word_to_index) + 2           # adding 1 for 'unknown' and 1 to fit Keras embedding
    # Read the vector size from an arbitrary entry instead of a hard-coded word,
    # so the function also works for vocabularies that lack that word.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]
    
    # One row per index; rows that no word maps to stay all-zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Row "index" holds the vector of the word with that index in the vocabulary.
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Frozen layer (trainable=False); mask_zero=True makes Keras treat index 0 as padding.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False, mask_zero=True)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))
    
    # Load the matrix into the layer.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [139]:
def POS_model(input_shape, word_to_vec_map, word_to_prefix, word_to_suffix, word_to_index):
    """
    Function creating the graph for the part-of-speech tagger model
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its pre-trained word vector
    word_to_prefix -- dictionary mapping every word in a vocabulary into its prefix feature vector
    word_to_suffix -- dictionary mapping every word in a vocabulary into its suffix feature vector
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a model instance in Keras

    Note: the size of the output layer is read from the module-level variable numClasses.
    Note: keep the layer creation order as is; the notebook later copies trained weights
    by their position in get_weights().
    """
    
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    # Create three frozen lookup layers over the same word indices: word vectors, prefix features, suffix features
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    prefix_emb = pretrained_embedding_layer(word_to_prefix, word_to_index)
    suffix_emb = pretrained_embedding_layer(word_to_suffix, word_to_index)
    
    # Propagate sentence_indices through the lookup layers, you get back the per-token vectors
    embeddings = embedding_layer(sentence_indices)   
    pref = prefix_emb(sentence_indices)
    suff = suffix_emb(sentence_indices)
    # Project the prefix / suffix features down to 32 dimensions each
    P = Dense(32,kernel_constraint=max_norm(5.))(pref)
    S = Dense(32,kernel_constraint=max_norm(5.))(suff)
    # Per-token representation: word vector + projected prefix + projected suffix
    merged = concatenate([embeddings,P,S])
    X = Dropout(0.5)(merged)
    
    # Propagate the merged features through a bidirectional LSTM layer with 128-dimensional hidden state
    # per direction, returning a batch of sequences.
    X = Bidirectional(LSTM(128, recurrent_dropout=0.2, kernel_constraint=max_norm(5.), return_sequences=True))(X)
    X = BatchNormalization()(X)
    Y = TimeDistributed(Dropout(0.2))(X)
    # Add a (time distributed) Dense layer with a softmax activation: one distribution over numClasses per position
    Y = TimeDistributed(Dense(numClasses, activation='softmax'))(Y)
    
    # Create Model instance which converts sentence_indices into Y.
    model = Model(inputs=sentence_indices,outputs=Y)
        
    return model


In [140]:
# Build a tagger whose lookup layers cover only the vocabulary of the input text.
model = POS_model((maxLen,), word_to_vec_map, word_to_prefix, word_to_suffix, word_to_index)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 266)          0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 266, 5788)    2182076     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 266, 14983)   5648591     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 266, 50)      18850       input_5[0][0]                    
__________________________________________________________________________________________________
dense_12 (

In [88]:
# Load the saved pos1 model from 'best_pos1.h5'; its weights are transplanted into `model` below.
trained_model = load_model('best_pos1.h5')

In [89]:
# Compare with the summary of `model`: same layer layout, much larger embedding tables.
trained_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 266)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 266, 5788)    175387976   input_1[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 266, 14983)   454014866   input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 266, 50)      1515100     input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (D

In [128]:
# Weight arrays of the loaded model, as a list.
weights = trained_model.get_weights()

In [141]:
# Weight arrays of the freshly built model, as a list.
weights2 = model.get_weights()

In [96]:
# Compare the shapes of the first three weight arrays of both models,
# then show how many weight arrays the trained model has.
for k in range(3):
    print(np.shape(weights[k]))
    print(np.shape(weights2[k]))

print(len(weights))

(30302, 5788)
(377, 5788)
(30302, 14983)
(377, 14983)
(30302, 50)
(377, 50)
19


In [142]:
# Entries 0-2 are the embedding tables, whose row count differs between the two models
# (see the shapes printed above), so the freshly built ones are kept; every other
# weight array is taken from the trained model.
for i in range(3,len(weights)):
    weights2[i] = weights[i]

In [143]:
# Install the combined weights (own embedding tables + trained remaining layers).
model.set_weights(weights2)

In [144]:
# Tag every sentence; the model's only input is the word-index array.
predictions = model.predict(X_indices)

In [111]:
# Show the class index -> tag table used to decode the predictions.
print(index_to_pos1)

{1: 'PRO', 2: 'CS', 3: 'ADVWH', 4: 'DET', 5: 'PROREL', 6: 'ADJWH', 7: 'PROWH', 8: 'VPP', 9: 'VIMP', 10: 'VS', 11: 'P+PRO', 12: 'P', 13: 'VPR', 14: 'ADV', 15: 'CLO', 16: 'NPP', 17: 'ADJ', 18: 'CLR', 19: 'ET', 20: 'I', 21: 'NC', 22: 'CLS', 23: 'VINF', 24: 'P+D', 25: 'DETWH', 26: 'PONCT', 27: 'CC', 28: 'V', 29: 'PREF'}


In [151]:
# Print every sentence as "word|TAG" pairs, one sentence per output line.
# The loops cover all rows and all positions: the earlier `len(X_indices)-1`
# bound silently dropped the last sentence.
for i in range(len(X_indices)):
    for j in range(len(X_indices[i])):
        # Entries still at 0 were never filled, i.e. lie past the end of the sentence.
        if X_indices[i][j] != 0:
            num = np.argmax(predictions[i][j])
            wi = int(X_indices[i][j])
            print(index_to_word[wi], end='')
            print('|', end='')
            print(index_to_pos1[num], end=' ')
    print()

Le|DET crédit|NC foncier|ADJ va|V prendre|VINF une|DET participation|NC de|P 20|DET %|NC dans|P le|DET capital|NC de|P l'|DET immobilière|NPP constructions|NPP de|P Paris|NPP (|PONCT ICP|NPP )|PONCT ,|PONCT qui|PROREL détient|V d'|DET importantes|ADJ participations|NC dans|P des|DET sociétés|NC immobilières|ADJ d'|P investissement|NC (|PONCT SII|NPP )|PONCT cotées|VPP ,|PONCT comme|ADV Sefimeg|NPP ,|PONCT Cofimeg|NPP ,|PONCT et|CC dans|P plusieurs|DET sicomi|NC ,|PONCT "|PONCT dans|P lesquelles|PROREL le|DET crédit|NC foncier|ADJ est|V déjà|ADV présent|ADJ "|PONCT précise|V le|DET communiqué|NC .|PONCT 
Dans|P le|DET capital|NC d'|P ICP|NC ,|PONCT où|PROREL le|DET groupe|NC centenaire|ADJ Blanzy|NPP est|V majoritaire|ADJ ,|PONCT on|CLS trouve|V également|ADV les|DET mutuelles|NC du|P+D Mans|NPP ,|PONCT à|P hauteur|NC de|P 20|DET %|NC .|PONCT 
Le|DET vice-|PREF premier|ADJ ministre|NC est-allemand|ADJ en|P charge|NC des|P+D affaires|NC économiques|ADJ ,|PONCT Mme|NC Christa|NPP Luft|NPP