## LSTM part-of-speech tagging for the French Treebank: 

This notebook trains a part-of-speech tagger for the French Treebank using a vanilla bi-direction LSTM network.

Run the following cell to load the Keras packages.

In [38]:
import numpy as np
import matplotlib.pyplot as plt
import re

from keras.models import Model
from keras.layers import Bidirectional, Dense, Input, Dropout, LSTM, Activation, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.initializers import glorot_uniform
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

%matplotlib inline

np.random.seed(1)

In [2]:
def normalize_word(orig_word):
    word = orig_word.lower()
    # get rid of all special symbols unless all symbols are special
    word2 = re.sub('\W+','', word)
    if word2 != '':
        word = word2
    return word
    

In [184]:
def read_maxentdata(file):
    with open(file, 'r') as f:
        vocabulary = set()
        vnorm = set()
        partsofspeech1 = set()
        partsofspeech2 = set()
        superset = set()
        sentno = 0
        maxlen = 0
        words = []
        postags1 = []
        postags2 = []
        supertags = []
        allwords = []
        allpos1 = []
        allpos2 = []
        allsuper = []
        for line in f:
            line = line.strip().split()
            length = len(line)
            if (length > maxlen):
                maxlen = length
            for l in range(length):
                item = line[l].split('|')
                orig_word = item[0]
                word = normalize_word(orig_word)
                postag = item[1]
                supertag = item[2]
                poslist = postag.split('-')
                pos1 = poslist[0]
                pos2 = poslist[1]
                vocabulary.add(orig_word)
                vnorm.add(word)
                partsofspeech1.add(pos1)
                partsofspeech2.add(pos2)
                superset.add(supertag)
                words.append(orig_word)
                postags1.append(pos1)
                postags2.append(pos2)
                supertags.append(supertag)
            allwords.append(words)
            allpos1.append(postags1)
            allpos2.append(postags2)
            allsuper.append(supertags)
            words = []
            postags1 = []
            postags2 = []
            supertags = []
            
        X = np.asarray(allwords)
        Y1 = np.asarray(allpos1)
        Y2 = np.asarray(allpos2)
        Z = np.asarray(allsuper)
        return X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen

In [230]:
X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxLen = read_maxentdata('m2.txt')
# X, Y1, Y2, Z, vocabulary, partsofspeech1, partsofspeech2, superset, maxLen = read_maxentdata('aa1.txt')

print(X[1])
print(Y2[1])

numClasses = len(partsofspeech2)+1
numSuperClasses = len(superset)+1

print()
print("Longest sentence   : ", maxLen)
print("Number of POS tags : ", numClasses)
print("Number of supertags: ", numSuperClasses)


print("de" in vocabulary)
print("de" in vnorm)
print("le" in vocabulary)
print("le" in vnorm)

['La', 'Caixa', 'prend', '49', '%', 'de', 'Lyonnaise', 'Espana', ',', 'dont', 'le', 'groupe', 'français', 'ne', 'détiendra', 'plus', 'que', '51', '%', ';', 'en', 'échange', ',', 'elle', 'cède', 'à', 'celui', '-ci', '3,5', '%', 'de', 'la', 'SGAB', 'et', '19,6', '%', "d'", 'ACESA', ',', 'première', 'société', 'concessionnaire', "d'", 'autoroutes', 'en', 'Espagne', '(', '500', 'kilomètres', "d'", 'autoroutes', 'et', 'un', 'chiffre', "d'", 'affaires', '1990', 'de', '37', 'milliards', 'de', 'pesetas', ')', ',', 'dont', 'elle', 'conserve', '26,6', '%', '.']
['DET:ART', 'NAM', 'VER:pres', 'NUM', 'SYM', 'PRP', 'NAM', 'NAM', 'PUN', 'PRO:REL', 'DET:ART', 'NOM', 'ADJ', 'ADV', 'VER:futu', 'ADV', 'ADV', 'NUM', 'SYM', 'PUN', 'PRP', 'NOM', 'PUN', 'PRO:PER', 'VER:pres', 'PRP', 'PRO:DEM', 'ADV', 'NUM', 'SYM', 'PRP', 'DET:ART', 'ABR', 'KON', 'NUM', 'SYM', 'PRP', 'ABR', 'PUN', 'NUM', 'NOM', 'ADJ', 'PRP', 'NOM', 'PRP', 'NAM', 'PUN', 'NUM', 'NOM', 'PRP', 'NOM', 'KON', 'DET:ART', 'NOM', 'PRP', 'NOM', 'NUM',

In [187]:
# split the training data into the standard 60% train, 20% dev, 20% test 
X_train, X_testdev, Y_train, Y_testdev = train_test_split(X, Y2, test_size=0.4)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_testdev, Y_testdev, test_size=0.5)
print("Train: ", X_train.shape)
print("Test:  ", X_test.shape)
print("Dev:   ", X_dev.shape)


Train:  (9449,)
Test:   (3150,)
Dev:    (3150,)


In [188]:
# create a bi-directional mapping (using two dictionaries) translating elements of a set to and from integers

def indexify (set):
    i = 1
    item_to_index = {}
    index_to_item = {}

    for item in set:
        item_to_index[item] = i
        index_to_item[i] = item
        i = i + 1

    return item_to_index, index_to_item    

In [189]:
# create mapping for the two POS tagset and for the supertags

super_to_index, index_to_super = indexify(superset)
pos1_to_index, index_to_pos1 = indexify(partsofspeech1)
pos2_to_index, index_to_pos2 = indexify(partsofspeech2)
print(pos2_to_index)

{'PUN': 1, 'ABR': 2, 'NUM': 3, 'NAM': 4, 'VER:infi': 5, 'ADV': 6, 'VER:pper': 7, 'VER:futu': 8, 'VER:cond': 9, 'VER:impf': 10, 'PRO': 11, 'SYM': 12, 'PRO:IND': 13, 'PRP': 14, 'VER:ppre': 15, 'VER:subp': 16, 'PRO:PER': 17, 'DET:ART': 18, 'VER:pres': 19, 'KON': 20, 'PRO:POS': 21, 'VER:simp': 22, 'PUN:cit': 23, 'PRO:DEM': 24, 'NOM': 25, 'PRP:det': 26, 'INT': 27, 'PRO:REL': 28, 'ADJ': 29, 'DET:POS': 30, 'VER:impe': 31}


In [231]:
def read_vecs(file):
    with open(file, 'r') as f:
        words = set()
        vocabn = vnorm
        vocab = vocabulary
        word_to_vec_map = {}
        emsize = 0
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            if (curr_word in vocabn):
                vocabn.remove(curr_word)
                vocab.discard(curr_word)
                words.add(curr_word)                
                emb = np.array(line[1:], dtype=np.float64)
                emsize = np.size(emb)
                features = word_features(curr_word)
                suf = suffix_vector(curr_word)
                word_to_vec_map[curr_word] = np.concatenate((emb,suf,features))

        for w in vocab:
            words.add(w)
            wn = normalize_word(w)
            emb = np.zeros(emsize)
            suf = suffix_vector(wn)
            try:
                vec = word_to_vec_map[wn]
            except:  
                print(w)
                features = word_features(w, unknown=True)
                word_to_vec_map[w] = np.concatenate((emb,suf,features))
            else:
                emb = vec[0:emsize]
                features = word_features(w)
                word_to_vec_map[w] = np.concatenate((emb,suf,features))

        for w in vocabn:
            words.add(w)
            try:
                vec = word_to_vec_map[w]
            except:  
                print(w)
                features = word_features(w, unknown=True)
                emb = np.zeros(emsize)
                suf = suffix_vector(w)
                word_to_vec_map[w] = np.concatenate((emb,suf,features))
        
                
        i = 2  # keep 1 for unknown
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map


### 2.1 - Overview of the model

Here is the Emojifier-v2 you will implement:

<img src="images/emojifier-v2.png" style="width:700px;height:400px;"> <br>
<caption><center> **Figure 3**: Emojifier-V2. A 2-layer LSTM sequence classifier. </center></caption>



In [232]:
word_to_index, index_to_word, word_to_vec_map = read_vecs('/Volumes/LaCie/Corpus/fastText/wiki.fr.vec')

1911
55000
Ledoucin
127
stations-service
eaux-Dumez
Pif-gadget
36,7
18,27
2002
472
1961
32,2
11.01.02.
40,95
centre-gauche
110,44
6,40
750
Mesrahi
soixante-neuf
exploration-production
12
123,75
1953
12,5
50,6
2,96
35-49
3
17.500
56
35,7
3.400
Mirage-2000
10500
Gaîté-lyrique
113
2,1
Peylevade
FR3
Sous-traités
6ème
280
6
Taxdisk
B-2
108
295,70
5,52
44.000
ville-campagne
54,8
sous-emploi
Bade-Wurtemberg
9,1
92-1268
1975
coûts-bénéfices
50
1988-1991
anglo-néerlandaise
2,50
Warner-Seven
1.360
671
Bordeaux-Mérignac
2,95
2266106333
A3
2664
1,7
77,4
Rose-Croix
N1
101
1949
député-maire
740
640
5,4
Saint-Pierre-et-Miquelon
1842
8,77
150.000
Champs-Elysées
7
93
politico-financiers
Paris-Montsouris
VP-Schickedanz
A4
1982-1983
Saint-Sauveur
Euroc
1914-1915
expo-vente
minifundisme
28,5
sociétés-écrans
arrière-pays
France-Maîtrise
liguait
606
7,6
18,50
13.719
déjeunâmes
Peletons
Etat-caisses
9,442
39,6
353
55,4
4,35
1.875
245
Mireille-Bénédicte
6.300
ultraminces
Cushionning
1ère
Marriott-prince-de-Ga

8e
108,20
Eco-Systeme
vingt-deux
80
685
appuis-tête
232,59
8000
trente-six
1,5180
est-à-dire
multimédiatiques
10
24
1810
20,2
1640
544.412
Hurand
époumonés
4.890
1,57
RD192
104
0,6
sous-préfet
ejideros
49,32
8,50
85
BGAG
peronnellement
847
1611,4
250000
36e
avant-coureur
démocrate-chrétien
Corradetti
privatisables
maine-anjou
42
350
9,6
26,4
GAN-CIC
1920-1921
quittâmes
1943
Edenor
Paris-Nantes
Béghin-Say
1900
enlisera
RPR-UDF
40.000
92.535
1947
assurance-vieillesse
348
contre-visite
1,20
1.000
Dynaction
dix-huitième
Mafouz
mollétiste
24,2
Port-Louis
30,4
immolons
Bourg-Saint-Maurice
Sainte-Honorine
Fitz-Pegado
A36
770
dégraissages
Aritmos
Pepsi-Cola
Epéda-Bertrand-Faure
oeils
3.379,50
Elmalek
78,5
32
voitures-ventouses
Louise-Yvonne
Munchener
109,70
Ma'Pub
191
ultrafrais
98
MTV3
120000
Pointe-des-Galets
Arouy
30,3
révisaient
Geneimwirtschaft
1900-1904
hôtellerie-restauration
73.000
Niedenhoff
George-V
73,8
3.066.400
Heyris
monte-matériaux
gouvenement
Saint-Vincent-de-Paul
370
Saint-Ger

île-de-France
0,60
Pierre-François
18
Jean-Benoit
227.000
Boulogne-sur-mer
fabricants-grossistes
17605
385
Ratier-Figeac
550
sous-sol
0,3
abordions
155.000
3.040
marchandise-étalon
1,25
89,9
Indochine-France
2,569
Kimberly-Clark
23.700
1871
soixante-deux
1,4450
avant-veille
1875
Sochata
488.000
tourisme-loisirs
GEC-Alsthom
Ileau
surperformé
228
Moellemann
2.303
Sapeurs-pompiers
titres-chocs
495.000
Saint-Siège
1.326
Langoni
8
270
1983
3,4
interpellez
485.000
1750
Gensis
trente-huit
0,74
40,13
3.130
95,8
Tywersus
bleu-verte
Punta-del-Este
Jacques-Bailleurs
Senhao
34000
Rouméas
Montigny-les-Metz
découvrimes
downgrading
Grand-Charmont
28,8
Servan-Schreiber
sous-traitants
1988-1990
2.251
60.000
703.000
335,6
Saint-Martin-Rivoli
zapperaient
16,10
Fère-Champenoise
:
Meurthe-et-Moselle
66,72
13,6
3.028.300
15,1
top-down
1986-1988
637
hors-médias
hypothécables
540
1857
188
3,333
1.883,64
Saurer-Diederichs
2A
5,75
Tluszczowego
embrassions
178
plurifonctionnalité
38.503
conjoncturiste
5,10
Cofre

ultrafertile
alahadji
opfi
étalonboeuf
sousestimer
tampering
1073
60000
waldeckrousseau
duperier
peugeotcitroën
eysymontt
saintvincentdepaul
19831988
954
décretloi
1372
granddallas
cinquantesix
parisnantes
11044
356
552000
mariemarvingt
1212
41000
jeandaniel
85300
nondits
mathussière
merlingérin
29910
ciater
alliotmarie
aprèsmidi
jfkennedy
korauto
096
energiesud
hommeterminal
872
218479
11220
34070
617
19931997
grandprairie
parisbâle
etatcaisses
marienoëlle
1125
fainsvéel
75000
5630
20005
fauxsemblant
desktopiv
n1
chrétiendémocrate
làmême
4867
roosel
pursang
lyonfigaro
23259
prêtàporter
528
257
chegrouche
g20
9747
935
sousestimé
etatnation
karenztag
trompeloeil
4119
2303
10062
montreuxchâteau
quarantehuit
passepasse
1326
ctcoe
maineetloire
bjoernskov
francoitaliens
malaimée
pointedesgalets
servanschreiber
2534
316883
trèsconsidérable
134804
luimême
2200
contreexemple
1416
udfrpr
voltaredonda
francetélécom
6829
metalbox
transcet
estallemand
006
ponviane
664
132000
bartabac
soidisant
fnm

montpellieri
958
soixantedeux
24000
3800
sousgouverneur
plandesetangs
256
hautegaronne
628
jeansébastien
alhuwalia
88560
fnseacnja
1002
994
nousmêmes
plazaathénée
112000
grandduché
2215
dassaultelectronique
ernewein
pleinair
matrahachette
800000
essuietout
savoirfaire
19791981
alsacelorraine
soixanteneuf
rhônealpes
dixhuitième
dresdnerbank
3035
goetzfrid
1568
23700
sapeurspompiers
quarantetrois
15180
sousévalués
france2
4065
yachtclub
toujoursplus
19666
303
240000
endessous
seineetmarne
bouchesdurhône
3356
341
239
psacitroën
fetese
sandoglass
5200
2231
matuikhin
commutanttype
colombierfontaine
sociétésécrans
185
confartigianato
compuadd
u2
357
151
476
semiremorque
261
odudu
19901992
psapeugeotcitroën
84000
6700
yonnne
avantprojet
7960
74000
322
schaaff
9192
piraben
tiersmonde
costdisk
r19
jeanmathieu
10422
8769
miavril
strausskahn
539
châteaumargaux
ouestallemande
522
morganstanley
francosuédoise
topdown
cellesci
contrerapport
quatrevingtdixneuf
lavelinge
meurtheetmoselle
2660
13139
ma

### 2.2 Keras and mini-batching 

In this exercise, we want to train Keras using mini-batches. However, most deep learning frameworks require that all sequences in the same mini-batch have the same length. This is what allows vectorization to work: If you had a 3-word sentence and a 4-word sentence, then the computations needed for them are different (one takes 3 steps of an LSTM, one takes 4 steps) so it's just not possible to do them both at the same time.

The common solution to this is to use padding. Specifically, set a maximum sequence length, and pad all sequences to the same length. For example, of the maximum sequence length is 20, we could pad every sentence with "0"s so that each input sentence is of length 20. Thus, a sentence "i love you" would be represented as $(e_{i}, e_{love}, e_{you}, \vec{0}, \vec{0}, \ldots, \vec{0})$. In this example, any sentences longer than 20 words would have to be truncated. One simple way to choose the maximum sequence length is to just pick the length of the longest sentence in the training set. 


### 2.3 - The Embedding layer

In Keras, the embedding matrix is represented as a "layer", and maps positive integers (indices corresponding to words) into dense vectors of fixed size (the embedding vectors). It can be trained or initialized with a pretrained embedding. In this part, you will learn how to create an [Embedding()](https://keras.io/layers/embeddings/) layer in Keras, initialize it with the GloVe 50-dimensional vectors loaded earlier in the notebook. Because our training set is quite small, we will not update the word embeddings but will instead leave their values fixed. But in the code below, we'll show you how Keras allows you to either train or leave fixed this layer.  

The `Embedding()` layer takes an integer matrix of size (batch size, max input length) as input. This corresponds to sentences converted into lists of indices (integers), as shown in the figure below.

<img src="images/embedding1.png" style="width:700px;height:250px;">
<caption><center> **Figure 4**: Embedding layer. This example shows the propagation of two examples through the embedding layer. Both have been zero-padded to a length of `max_len=5`. The final dimension of the representation is  `(2,max_len,50)` because the word embeddings we are using are 50 dimensional. </center></caption>

The largest integer (i.e. word index) in the input should be no larger than the vocabulary size. The layer outputs an array of shape (batch size, max input length, dimension of word vectors).

The first step is to convert all your training sentences into lists of indices, and then zero-pad all these lists so that their length is the length of the longest sentence. 

**Exercise**: Implement the function below to convert X (array of sentences as strings) into an array of indices corresponding to words in the sentences. The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 

In [233]:
def lists_to_indices(X, item_to_index, max_len, normalize=False):

    m = X.shape[0]                                   # number of training examples
    
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m,max_len))

    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split it into words. You should get a list of words.
        list = X[i]

        j = 0
        
        # Loop over the words of sentence_words
        for w in list:
            if normalize == True:
                w = normalize_word(w)
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            try:
                X_indices[i, j] = item_to_index[w]
            except:
                X_indices[i, j] = 1  # unknown
            # Increment j to j + 1
            j = j + 1
            
    return X_indices


In [234]:
# GRADED FUNCTION: sentences_to_indices

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m,max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split it into words. You should get a list of words.
        sentence_words = X[i]
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            w = normalize_word(w)
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            try:
                X_indices[i, j] = word_to_index[w]
            except:
                print("Unknown: ", w)
                X_indices[i, j] = 1   # index for unknown words
            # Increment j to j + 1
            j = j + 1
            
    ### END CODE HERE ###
    
    return X_indices

Run the following cell to check what `sentences_to_indices()` does, and check your results.

In [235]:
sentences_to_indices(X_train, word_to_index, maxLen)

array([[  3.00000000e+00,   9.77800000e+03,   2.54500000e+04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.24330000e+04,   1.64340000e+04,   2.36640000e+04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.85590000e+04,   1.43930000e+04,   1.40000000e+01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  3.47370000e+04,   3.61060000e+04,   1.43070000e+04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.31430000e+04,   1.65460000e+04,   1.63290000e+04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.37200000e+04,   3.26670000e+04,   2.43880000e+04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

Let's build the `Embedding()` layer in Keras, using pre-trained word vectors. After this layer is built, you will pass the output of `sentences_to_indices()` to it as an input, and the `Embedding()` layer will return the word embeddings for a sentence. 

**Exercise**: Implement `pretrained_embedding_layer()`. You will need to carry out the following steps:
1. Initialize the embedding matrix as a numpy array of zeroes with the correct shape.
2. Fill in the embedding matrix with all the word embeddings extracted from `word_to_vec_map`.
3. Define Keras embedding layer. Use [Embedding()](https://keras.io/layers/embeddings/). Be sure to make this layer non-trainable, by setting `trainable = False` when calling `Embedding()`. If you were to set `trainable = True`, then it will allow the optimization algorithm to modify the values of the word embeddings. 
4. Set the embedding weights to be equal to the embedding matrix 

In [236]:
# GRADED FUNCTION: pretrained_embedding_layer

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 2                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["aime"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len,emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len,emb_dim,trainable=False,mask_zero=True)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [237]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

weights[0][1][3] = 0.0


**Expected Output**:

<table>
    <tr>
        <td>
            **weights[0][1][3] =**
        </td>
        <td>
           -0.3403
        </td>
    </tr>
</table>

## 2.3 Building the Emojifier-V2

Lets now build the Emojifier-V2 model. You will do so using the embedding layer you have built, and feed its output to an LSTM network. 

<img src="images/emojifier-v2.png" style="width:700px;height:400px;"> <br>
<caption><center> **Figure 3**: Emojifier-v2. A 2-layer LSTM sequence classifier. </center></caption>


**Exercise:** Implement `Emojify_V2()`, which builds a Keras graph of the architecture shown in Figure 3. The model takes as input an array of sentences of shape (`m`, `max_len`, ) defined by `input_shape`. It should output a softmax probability vector of shape (`m`, `C = 5`). You may need `Input(shape = ..., dtype = '...')`, [LSTM()](https://keras.io/layers/recurrent/#lstm), [Dropout()](https://keras.io/layers/core/#dropout), [Dense()](https://keras.io/layers/core/#dense), and [Activation()](https://keras.io/activations/).

In [238]:
# POS_model

def POS_model(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # returning a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Add a (time distributed) Dense layer followed by a softmax activation
    X = TimeDistributed(Dense(numClasses, activation='softmax'))(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=sentence_indices,outputs=X)
        
    return model

Run the following cell to create your model and check its summary. Because all sentences in the dataset are less than 10 words, we chose `max_len = 10`.  You should see your architecture, it uses "20,223,927" parameters, of which 20,000,050 (the word embeddings) are non-trainable, and the remaining 223,877 are. Because our vocabulary size has 400,001 words (with valid indices from 0 to 400,000) there are 400,001\*50 = 20,000,050 non-trainable parameters. 

In [239]:
model = POS_model((maxLen,), word_to_vec_map, word_to_index)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 266)               0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 266, 510)          18783300  
_________________________________________________________________
lstm_7 (LSTM)                (None, 266, 128)          327168    
_________________________________________________________________
time_distributed_7 (TimeDist (None, 266, 32)           4128      
Total params: 19,114,596
Trainable params: 331,296
Non-trainable params: 18,783,300
_________________________________________________________________


As usual, after creating your model in Keras, you need to compile it and define what loss, optimizer and metrics your are want to use. Compile your model using `categorical_crossentropy` loss, `adam` optimizer and `['accuracy']` metrics:

In [240]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

It's time to train your model. Your Emojifier-V2 `model` takes as input an array of shape (`m`, `max_len`) and outputs probability vectors of shape (`m`, `number of classes`). We thus have to convert X_train (array of sentences as strings) to X_train_indices (array of sentences as list of word indices), and Y_train (labels as indices) to Y_train_oh (labels as one-hot vectors).

In [241]:
print(pos2_to_index)
X_train_indices = lists_to_indices(X_train, word_to_index, maxLen)
Y_train_indices = lists_to_indices(Y_train, pos2_to_index, maxLen)
Y_train_oh = to_categorical(Y_train_indices, num_classes=numClasses)

{'PUN': 1, 'ABR': 2, 'NUM': 3, 'NAM': 4, 'VER:infi': 5, 'ADV': 6, 'VER:pper': 7, 'VER:futu': 8, 'VER:cond': 9, 'VER:impf': 10, 'PRO': 11, 'SYM': 12, 'PRO:IND': 13, 'PRP': 14, 'VER:ppre': 15, 'VER:subp': 16, 'PRO:PER': 17, 'DET:ART': 18, 'VER:pres': 19, 'KON': 20, 'PRO:POS': 21, 'VER:simp': 22, 'PUN:cit': 23, 'PRO:DEM': 24, 'NOM': 25, 'PRP:det': 26, 'INT': 27, 'PRO:REL': 28, 'ADJ': 29, 'DET:POS': 30, 'VER:impe': 31}


In [242]:
print(Y_train_indices[1])
print(Y_train_oh[1])

[ 25.  14.  18.  25.  29.   1.  20.  26.  23.  25.  14.  25.  23.  20.  17.
  19.  18.  25.  26.  25.   1.  18.  25.  14.  18.   2.  14.  25.  14.  25.
   6.  19.   6.  29.  14.   5.  24.  25.   1.  14.  18.  25.   6.   7.   1.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.  

Fit the Keras model on `X_train_indices` and `Y_train_oh`. We will use `epochs = 50` and `batch_size = 32`.

In [None]:
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
2016/9449 [=====>........................] - ETA: 2:10 - loss: 0.0051 - acc: 0.9986

Your model should perform close to **100% accuracy** on the training set. The exact accuracy you get may be a little different. Run the following cell to evaluate your model on the test set. 

In [36]:
X_dev_indices = lists_to_indices(X_dev, word_to_index, max_len = maxLen)
Y_dev_indices = lists_to_indices(Y_dev, pos2_to_index, max_len = maxLen)
Y_dev_oh = to_categorical(Y_dev_indices, num_classes = numClasses)
loss, acc = model.evaluate(X_dev_indices, Y_dev_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.948129268325


You should get a test accuracy of about 94.8% for a vanilla model using only aa1.txt.
A vanilla POS model on the full training set gets a dev accuracy of 99.85%!

In [39]:
def plot_confusion_matrix(y_actu, y_pred, title='Confusion matrix', cmap=plt.cm.gray_r):
    
    df_confusion = pd.crosstab(y_actu, y_pred.reshape(y_pred.shape[0],), rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    df_conf_norm = df_confusion / df_confusion.sum(axis=1)
    
    plt.matshow(df_confusion, cmap=cmap) # imshow
    #plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(df_confusion.columns))
    plt.xticks(tick_marks, df_confusion.columns, rotation=45)
    plt.yticks(tick_marks, df_confusion.index)
    #plt.tight_layout()
    plt.ylabel(df_confusion.index.name)
    plt.xlabel(df_confusion.columns.name)


In [None]:
print(Y_dev.shape)
print('           '+ label_to_emoji(0)+ '    ' + label_to_emoji(1) + '    ' +  label_to_emoji(2)+ '    ' + label_to_emoji(3)+'   ' + label_to_emoji(4))
print(pd.crosstab(Y_dev, pred_test.reshape(56,), rownames=['Actual'], colnames=['Predicted'], margins=True))
plot_confusion_matrix(Y_dev, pred_test)

In [57]:
# This code allows you to see the mislabelled examples

y_dev_oh = to_categorical(Y_dev_indices, num_classes = numClasses)
X_dev_indices = lists_to_indices(X_test, word_to_index, maxLen)
pred = model.predict(X_dev_indices)

for i in range(len(X_dev)):
    x = X_dev_indices
    for j in range(maxLen):
        num = np.argmax(pred[i][j])
        if(num != Y_dev_indices[i][j]):
            print('Expected POS tag: '+ X_dev[i][j] + '|' + Y_dev[i][j] + ' prediction: '+ X_dev[i][j] + '|' + index_to_pos2[num])

Expected POS tag: ces|PRO:DEM prediction: ces|DET:ART
Expected POS tag: handicaps|NOM prediction: handicaps|NAM
Expected POS tag: connus|VER:pper prediction: connus|PRP:det
Expected POS tag: (|PUN prediction: (|NOM
Expected POS tag: alourdis|VER:pper prediction: alourdis|PRP
Expected POS tag: par|PRP prediction: par|NOM
Expected POS tag: le|DET:ART prediction: le|ADJ
Expected POS tag: poids|NOM prediction: poids|PRP
Expected POS tag: encore|ADV prediction: encore|DET:ART
Expected POS tag: considérable|ADJ prediction: considérable|NAM
Expected POS tag: de|PRP prediction: de|PUN
Expected POS tag: dépit|NOM prediction: dépit|DET:ART
Expected POS tag: d|PRP prediction: d|NOM
Expected POS tag: une|DET:ART prediction: une|ADJ
Expected POS tag: diversification|NOM prediction: diversification|PUN
Expected POS tag: agricole|ADJ prediction: agricole|NUM
Expected POS tag: vers|PRP prediction: vers|SYM
Expected POS tag: l|DET:ART prediction: l|PRP:det
Expected POS tag: notamment|ADV prediction: no

IndexError: list index out of range

In [284]:
model.save('tt_pos.h5')

## Feature vectors

In [12]:
def read_suffixes(file):
    i = 1
    suffixes = {}

    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            suffixes[line] = i
            i = i + 1

    return suffixes       

In [106]:
french_suffixes = read_suffixes('suffixes.txt')
print(french_suffixes)
print(len(french_suffixes))

{'a': 1, 'able': 2, 'acé': 3, 'acée': 4, 'acées': 5, 'acés': 6, 'ade': 7, 'adique': 8, 'age': 9, 'ai': 10, 'aie': 11, 'aient': 12, 'ail': 13, 'aille': 14, 'ailler': 15, 'ain': 16, 'aine': 17, 'aire': 18, 'ais': 19, 'aise': 20, 'aises': 21, 'aison': 22, 'ait': 23, 'al': 24, 'âmes': 25, 'amment': 26, 'ance': 27, 'ane': 28, 'ant': 29, 'ante': 30, 'antes': 31, 'ants': 32, 'archie': 33, 'ard': 34, 'asse': 35, 'assent': 36, 'asses': 37, 'assiez': 38, 'assions': 39, 'at': 40, 'ât': 41, 'ate': 42, 'âtes': 43, 'ateur': 44, 'atif': 45, 'ation': 46, 'âtre': 47, 'atrice': 48, 'aud': 49, 'bourg': 50, 'céphale': 51, 'chorie': 52, 'culteur': 53, 'dingue': 54, 'drome': 55, 'e': 56, 'é': 57, 'eau': 58, 'eaux': 59, 'ectomie': 60, 'èdre': 61, 'édrique': 62, 'ée': 63, 'el': 64, 'elle': 65, 'ème': 66, 'émie': 67, 'ence': 68, 'ène': 69, 'ent': 70, 'er': 71, 'erai': 72, 'eraie': 73, 'eraient': 74, 'erais': 75, 'erait': 76, 'èrent': 77, 'eresse': 78, 'erez': 79, 'erie': 80, 'erons': 81, 'eront': 82, 'esque': 

In [109]:
def suffix_vector(word, suffixes=french_suffixes):
    length = len(suffixes)+1
    vector = np.zeros(length)
    for suf,num in suffixes.items():
        if word.endswith(suf):
            vector[num] = 1.0
        else:
            vector[num] = 0.0
    return vector        

In [108]:
suffix_vector("seraient", suffixes)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [113]:
def word_features(word, unknown=False):
    list = []
    if word.isupper():
        list.append(1.0)
    else:
        list.append(0.0)
    if word[0].isupper():
        list.append(1.0)
    else:
        list.append(0.0)
    if word.isnumeric():
        list.append(1.0)
    else:
        list.append(0.0)
    if word.isalnum():
        list.append(0.0)
    else:
        list.append(1.0)
    if "-" in word:
        list.append(1.0)
    else:
        list.append(0.0)
    if unknown:
         list.append(1.0)
    else:
        list.append(0.0)
    
    ar = np.asarray(list)
    return ar

In [89]:
print(word_features("ABCD"))
print(word_features("Abcd"))
print(word_features("1234"))
print(word_features("*%"))
print(word_features("Ab-cd"))



[ 1.  1.  0.  0.  0.]
[ 0.  1.  0.  0.  0.]
[ 0.  0.  1.  0.  0.]
[ 0.  0.  0.  1.  0.]
[ 0.  1.  0.  1.  1.]


### Training the Supertagger

We split the data as before, only using Z (supertags) instead of Y2 (treetagger POStag set) as the goal

In [90]:
# split the training data into the standard 60% train, 20% dev, 20% test 
X_train, X_testdev, Y_train, Y_testdev = train_test_split(X, Z, test_size=0.4)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_testdev, Y_testdev, test_size=0.5)
print("Train: ", X_train.shape)
print("Test:  ", X_test.shape)
print("Dev:   ", X_dev.shape)

Train:  (293,)
Test:   (98,)
Dev:    (98,)


In [92]:
# Super_model

def Super_model(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # returning a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Add a (time distributed) Dense layer followed by a softmax activation
    X = TimeDistributed(Dense(numSuperClasses, activation='softmax'))(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=sentence_indices,outputs=X)
        
    return model

In [93]:
supermodel = Super_model((maxLen,), word_to_vec_map, word_to_index)
supermodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 88)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 88, 300)           1039500   
_________________________________________________________________
lstm_2 (LSTM)                (None, 88, 128)           219648    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 88, 337)           43473     
Total params: 1,302,621
Trainable params: 263,121
Non-trainable params: 1,039,500
_________________________________________________________________


In [94]:
supermodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [95]:
X_train_indices = lists_to_indices(X_train, word_to_index, maxLen)
Y_train_indices = lists_to_indices(Y_train, super_to_index, maxLen)
Y_train_oh = to_categorical(Y_train_indices, num_classes=numSuperClasses)

In [98]:
print(Y_train_indices[2])
print(Y_train_oh[2])

[ 278.  277.   31.   30.  286.   89.  277.   31.  301.  258.   55.  231.
   31.   50.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.]
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]]


In [99]:
supermodel.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a229c4a20>

In [101]:
X_dev_indices = lists_to_indices(X_dev, word_to_index, max_len = maxLen)
Y_dev_indices = lists_to_indices(Y_dev, super_to_index, max_len = maxLen)
Y_dev_oh = to_categorical(Y_dev_indices, num_classes = numSuperClasses)
loss, acc = supermodel.evaluate(X_dev_indices, Y_dev_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.720135607282
