In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

from keras.models import Model
from keras.layers import Bidirectional, Dense, Input, Dropout, LSTM, Activation, TimeDistributed, BatchNormalization, concatenate, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.initializers import glorot_uniform
from sklearn.model_selection import train_test_split

from itertools import chain

from grail_data_utils import *

%matplotlib inline

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(1)

Using TensorFlow backend.


In [2]:
# Load the entire corpus from 'm2.txt'.
# NOTE(review): read_maxentdata is not defined in this notebook; presumably it is
# provided by the `grail_data_utils` star-import — confirm.
X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxLen = read_maxentdata('m2.txt')

In [3]:
# Output sizes are one larger than the number of distinct labels.
# NOTE(review): presumably the extra slot is reserved for padding/index 0 — confirm.
numSuperClasses = 1 + len(superset)
numClasses = 1 + len(partsofspeech2)

# Summary of the corpus dimensions, preceded by an empty line.
print()
for caption, count in (
    ("Longest sentence      : ", maxLen),
    ("Number of words       : ", len(vocabulary)),
    ("Number of norm. words : ", len(vnorm)),
    ("Number of POS tags    : ", numClasses),
    ("Number of supertags   : ", numSuperClasses),
):
    print(caption, count)



Longest sentence      :  266
Number of words       :  30300
Number of norm. words :  28223
Number of POS tags    :  32
Number of supertags   :  891


In [4]:
# Split the data into 60% train, 20% dev and 20% test:
# first hold out 40% of (X, Y2), then cut that held-out part in half.
# No random_state is passed, so the shuffling draws from NumPy's global RNG;
# the order of these two calls therefore matters for reproducing the split.
X_train, X_testdev, Y_train, Y_testdev = train_test_split(X, Y2, test_size=0.4)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_testdev, Y_testdev, test_size=0.5)
print("Train: ", X_train.shape)
print("Test:  ", X_test.shape)
print("Dev:   ", X_dev.shape)


Train:  (9449,)
Test:   (3150,)
Dev:    (3150,)


In [110]:

def get_features(string, cat):
    """Extract the feature labels encoded in a lexicon feature string.

    The result is the union of:
      * for verbs only (``cat == "v"``): every comma-separated item between
        the first ``<`` and the last ``>`` of ``string``;
      * the value that follows ``cat=`` (up to the next ``,`` or ``]``);
      * every ``@name`` marker (up to the next ``,`` or ``]``) except ``@e``.

    Args:
        string: the raw feature description.
        cat: the part of speech of the entry.

    Returns:
        A set of feature strings (empty when nothing matches).
    """
    features = set()

    # the <...> frame is only read for verbs
    if cat == "v":
        frame = re.search(r"<(.*)>", string)
        if frame is not None:
            features.update(frame.group(1).split(','))

    category = re.search(r"cat=(.*?)[,\]]", string)
    if category is not None:
        features.add(category.group(1))

    # keep every @marker apart from the "e" marker
    features.update(
        marker for marker in re.findall(r"@(.*?)[,\]]", string) if marker != "e"
    )
    return features
            
    
def read_lefff(file):
    """Read a Lefff-style lexicon file and collect the tag set of every word.

    Each line is split on tabs; column 0 is read as the word form, column 2 as
    the part of speech and column 3 as the feature string (passed to
    ``get_features``).  The ligatures "æ"/"œ" in the word form are replaced by
    "ae"/"oe".

    Single-token entries are always kept.  Two-token entries are kept only
    when the second token is one of que/qu'…, de/d'…, à or priori; the entry
    is then stored under its first token with an extra ``Next:…`` tag.
    Entries with any other number of tokens are ignored.

    Args:
        file: path of the lexicon file.

    Returns:
        A tuple ``(vocabulary, tags, word_pos_map, max_word_len)`` where
        ``vocabulary`` is the set of stored words read from the file,
        ``word_pos_map`` maps each word to a frozenset of its tags,
        ``tags`` is the set of distinct frozensets in ``word_pos_map`` and
        ``max_word_len`` is the length of the longest entry in column 0
        (measured before a two-token entry is reduced to its first token).
    """

    def next_feature(second):
        """Return the tag triggered by the second token of an entry, or None."""
        # Tokens come from str.split(), so they never contain spaces; only
        # the elided forms and the bare words need to be recognised.
        if second.startswith("qu'") or second == 'que':
            return "Next:que"
        if second.startswith("d'") or second == 'de':
            return "Next:de"
        if second == 'à':
            return "Next:à"
        if second == 'priori':
            return "Next:priori"
        return None

    vocabulary = set()
    word_pos_map = {}
    max_word_len = 0
    with open(file, 'r') as f:
        for line in f:
            fields = line.strip().split("\t")
            w = fields[0].replace("æ", "ae").replace("œ", "oe")
            max_word_len = max(max_word_len, len(w))
            wlist = w.split()

            # Reset for every entry so that no value leaks in from the
            # previous line (the original compared with `==` here instead of
            # assigning).
            nextfeat = None
            if len(wlist) == 2:
                w = wlist[0]
                nextfeat = next_feature(wlist[1])
                if nextfeat is None:
                    continue
            elif len(wlist) != 1:
                continue

            if w == "c'est-à-dire":
                w = "est-à-dire"
                nextfeat = "Prev:ce"
            pos = fields[2]
            features = fields[3]
            vocabulary.add(w)
            valset = word_pos_map.setdefault(w, set())
            valset.add(pos)
            valset.update(get_features(features, pos))
            if nextfeat is not None:
                valset.add(nextfeat)

    # Second halves of Latin expressions get a dedicated tag, replacing
    # whatever the lexicon stored for them.
    # NOTE(review): 'contratio' looks like a typo for 'contrario' — confirm.
    for w in ['capella', 'contratio', 'fortiori', 'latere', 'minima', 'posteriori',  'priori']:
        word_pos_map[w] = set(['priori'])

    # Freeze the tag sets so they are hashable and can be collected in a set.
    word_pos_map = {key: frozenset(val) for key, val in word_pos_map.items()}
    tags = set(word_pos_map.values())

    return vocabulary, tags, word_pos_map, max_word_len


In [111]:
# Read the lexicon: vocabulary (v), distinct tag sets (t), word -> tag-set map (wpm)
# and the length of the longest entry (maxWordLen).
v, t, wpm, maxWordLen = read_lefff('lefff-ext-3.0.txt')

In [7]:
print(maxWordLen)
# `t` holds thousands of frozensets; show only a small sample instead of
# flooding the cell output with the whole collection.
print(list(t)[:5])

60
{frozenset({'Obj:cla|sn', 'Loc:(loc-sn|y)', 'pers', 'Dloc:(de-sn|en)', 'v', 'Suj:cln|sn', 'C3p'}), frozenset({'se_moyen', 'être', 'v', 'I1p', 'Suj:cln|scompl|sinf|sn', 'S1p', 'Suj:scompl|sinf|sn', 'pers', 'Obj:(cla|scompl|sinf|sn)', 'CtrlSujObj'}), frozenset({'Obj:(cla|de-sinf|qcompl|scompl|sn)', 'Objà:(cld|à-sn)', 'Obj:cla|sn', 'Att:sa|sn', 'v', 'C2p', 'Obj:sinf', 'Suj:cln|scompl|sinf|sn', 'AttObj', 'CtrlObjàObj', 'pers', 'Suj:cln|sn', 'CtrlSujObj'}), frozenset({'G', 'pers', 'Obj:(cla|sn)', 'v', 'Att:de-sa|de-sn', 'Suj:cln|scompl|sinf|sn'}), frozenset({'pers', 'Obj:(cla|qcompl|scompl|sn)', 'F3p', 'v', 'Suj:cln|scompl|sinf|sn'}), frozenset({'pers', 'Obl:(par-sn)', 'v', 'F2s', 'Suj:cln|scompl|sinf|sn'}), frozenset({'Obj:(cla|sn)', 'v', 'CtrlSujObjà', 'Suj:cln|scompl|sinf|sn', 'imperative', 'P2p', 'Objà:(sur-scompl|sur-sn|y|à-qcompl|à-scompl|à-sinf|à-sn)', 'pers', 'Y2p'}), frozenset({'Obj:cla|sn', 'Kmp', 'active', 'v', 'passive', 'Suj:cln|scompl|sinf|sn', 'Obl2:(par-sn)', 'pers', 'adj

In [8]:
# number of distinct tag sets in the lexicon
print(len(t))

12244


In [9]:
# tag set stored for the word "Jean"
print(wpm["Jean"])

frozenset({'np', 'hum', 'fs', 'ms'})


In [10]:
# tag set stored for the word "est"
print(wpm["est"])

frozenset({'Att:(sa|à-sinf|à-sn)', 'v', 'AttSuj', 'Att:(de-sinf|scompl|sn)', 'Suj:cln|scompl|sinf|sn', 'ms', 'auxEtre', 'pers', 'adj', 'nc', 'fêtre', 'P3s'})


In [11]:
# tag set stored for the word "été"
print(wpm["été"])

frozenset({'Att:(sa|à-sinf|à-sn)', 'active', 'v', 'K', 'AttSuj', 'Att:(de-sinf|scompl|sn)', 'ms', 'Suj:cln|scompl|sinf|sn', 'auxEtre', 'pers', 'nc', 'fêtre'})


In [None]:
# tag set stored for the word "était"
print(wpm["était"])

In [None]:
# tag set stored for the word "faut"
print(wpm["faut"])

In [None]:
# tag set stored for the word "que"
print(wpm["que"])

In [None]:
# tag set stored for the elided form "qu'"
print(wpm["qu'"])

In [None]:
# tag set stored for "priori" (one of the entries read_lefff overrides by hand)
print(wpm["priori"])

In [None]:
# tag set stored for the word "importe"
print(wpm["importe"])

In [None]:
# number of words in the lexicon vocabulary
print(len(v))

In [113]:
# Collect every individual feature that occurs in at least one tag set.
fset = set()
for frozen in t:
    fset.update(frozen)
print(fset)
print(len(fset))
# Width of the feature target vectors: one more than the number of features.
# NOTE(review): presumably index 0 is left unused by indexify — confirm.
outFeatures = 1 + len(fset)

{'P3p', 'parento', 'poncts', 'Obj:cla|scompl|sn', 'CtrlObjàObj', 'Obj:(cla|de-sinf|scompl|sn)', 'Obl:(sur-sn)', 'Objà:(à-scompl|à-sinf|à-sn)', 'Dloc:(de-sn)', 'p_P1p', 'Obl:(pour-sinf|sinf)', 'Att:(sinf)', 'Obl:(pour-sinf)', 'Obj:(cla|qcompl|scompl|sn)', 'Suj:cln|sn', 'det', 'CtrlSujAtt', 'Obl:(sinf)', 'Kms', 'Obj:(cla|scompl|sinf|sn)', 'pro', 'NV:', 'Obl:(vers-sn)', 'Objà:(y|à-scompl|à-sn)', 'T2s', 'Att:(sa|sn)', 'Suj:de-sinf|qcompl|scompl|sn', 'négatif', 'Suj:cln|qcompl|scompl|sn', 'fêtre', 'Obj:cla|sn', 'CtrlObjàSuj', 's_P1s', 'advp', 'Objde:(de-scompl|de-sn|en|scompl)', 'Suj:(cln|scompl|sinf|sn)', 'CtrlObjObl', 'Obl:en-sn', 'S3s', 's_P2s', 'year', 'Suj:(cln|qcompl|scompl|sinf|sn)', '3', 'Obl:(à-sn)', 'Att:(cla|sn)', 'Objà:cld|à-sn', 'C1p', 'adv', '3s', 'Objde:(de-scompl|de-sn|en|scompl|sinf)', 'ACompSubj', 's_P3s', 'F2p', 'P12s', 'clg', 'Obl:(à-sinf)', 'sym', 'pron', 'Obj:cla|qcompl|scompl|sinf|sn', 'Obl:(après-sn)', 'Att:sa|sinf|sn', 'suffAdj', 'Suj:qcompl|scompl|sn', ':NV', 'P1p'

In [114]:
# Build the feature <-> integer lookup tables.
# NOTE(review): indexify is not defined in this notebook (presumably from
# grail_data_utils). fset is a set, so the assigned indices may depend on set
# iteration order — confirm they are stable across kernel restarts.
feature_to_integer, integer_to_feature = indexify(fset)

In [65]:
# first sentence of the corpus, and the feature name stored at the hard-coded index 357
print(X[0])
print(integer_to_feature[357])

['Lyonnaise-Dumez', 'vient', "d'", 'hispaniser', 'sa', 'filiale', 'espagnole', 'et', "d'", 'étendre', 'ses', 'participations', 'en', 'Espagne', ',', 'tout', 'en', 'resserrant', 'ses', 'liens', 'avec', 'la', 'Caixa', ',', 'première', 'caisse', "d'", 'épargne', 'espagnole', 'et', "l'", 'un', 'des', 'premiers', 'établissements', 'financiers', 'de', 'la', 'péninsule', 'ibérique', ',', 'à', 'laquelle', 'elle', 'est', 'liée', 'depuis', 'longtemps', 'dans', 'la', 'Société', 'générale', 'des', 'eaux', 'de', 'Barcelone', '(', 'SGAB', ')', ',', 'premier', 'groupe', 'espagnol', 'de', 'services', '(', 'la', 'Caixa', 'détient', 'aussi', '2', '%', 'du', 'capital', 'de', 'Lyonnaise-Dumez', ')', '.']
Obl:(avec-sn)


In [66]:
def get_characters(vocabulary):
    """Return the character alphabet used to encode the words of ``vocabulary``.

    The alphabet starts from a fixed set (a few extra symbols, the
    begin/end-of-sentence and begin/end-of-word markers, the unknown marker
    and the ten digits), is extended with every character of every word, and
    finally has a fixed list of unwanted characters removed.

    Args:
        vocabulary: an iterable of words.

    Returns:
        A set of one-character strings plus the multi-character markers.
    """
    excluded = set(['|', '«', '»', 'µ', '§', 'þ', 'ø', '_', '¯', 'ð', 'æ', '~', '©', '@', '~'])

    alphabet = set(['^', 'Û', '³', 'Ê', '<BOS>', '<BOW>', '<EOW>', '<EOS>', '<UNK>', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
    for word in vocabulary:
        # iterating over a word yields its characters
        alphabet.update(word)

    return alphabet - excluded

In [67]:
# Character alphabet of the lexicon vocabulary, and its size.
characters = get_characters(v)
maxChars = len(characters)
print(characters)
print(maxChars)

# Character <-> integer lookup tables.
# NOTE(review): indexify is not defined in this notebook (presumably from
# grail_data_utils); `characters` is a set, so the indices may depend on set
# iteration order — confirm.
character_to_integer, integer_to_character = indexify(characters)

{'w', '>', 'O', '<BOW>', ':', '8', 'ë', 'ï', 'Û', 'ê', '.', 'Å', '<EOW>', 'S', 'º', '²', 'å', '9', 'J', 'Z', '!', 'F', 't', '$', '7', 'í', 'W', '0', 'Ê', 'c', 'P', '+', '=', 'è', 'Á', 'D', 'E', 'K', 'ç', '[', '3', 'Y', '5', 'r', 'ü', '°', 'h', 'ò', '2', 'õ', '³', 'ö', 'û', 'g', 'U', 'È', 'H', 'o', '^', '<', 'k', '/', 'A', ')', '<EOS>', 'j', '%', 'q', 'X', 'ã', '&', 'R', 'M', 'ú', 'ì', 'Î', 'î', 'B', 'b', '-', 'V', 'v', 'l', 'a', 'N', 'p', 'e', '4', '"', '<UNK>', 'é', 'I', ']', '±', 'ó', 'Q', 'ù', '½', 'À', '<BOS>', '1', 'C', 'n', 'L', 'f', 'ô', 'i', 'â', '(', 'x', 'ä', 'm', 'u', 'É', 'à', '?', 'G', '*', 'á', 'ñ', '6', 's', 'z', 'Ç', ';', "'", 'd', 'y', ',', 'T'}
130


In [68]:
# Decomposition of individual characters into sets of simpler feature symbols.
# Characters that do not appear as a key are not decomposed.
mapping = {
    # mathematical symbols, tagged with their position relative to the operands
    "=": {'math', 'infix'},
    "<": {'math', 'infix'},
    ">": {'math', 'infix'},
    "+": {'math', 'infix'},
    "*": {'math', 'infix'},
    "±": {'math', 'prefix'},
    "°": {'math', 'postfix'},
    "º": {'math', 'postfix'},
    "²": {'math', 'postfix'},
    "³": {'math', 'postfix'},

    # digits (and the one-half sign) all collapse onto the symbol '9'
    "0": {'9'}, "1": {'9'}, "2": {'9'}, "3": {'9'}, "4": {'9'},
    "5": {'9'}, "6": {'9'}, "7": {'9'}, "8": {'9'}, "½": {'9'},

    # unaccented capitals: lowercase letter + 'maj'
    "A": {'a', 'maj'}, "B": {'b', 'maj'}, "C": {'c', 'maj'}, "D": {'d', 'maj'},
    "E": {'e', 'maj'}, "F": {'f', 'maj'}, "G": {'g', 'maj'}, "H": {'h', 'maj'},
    "I": {'i', 'maj'}, "J": {'j', 'maj'}, "K": {'k', 'maj'}, "L": {'l', 'maj'},
    "M": {'m', 'maj'}, "N": {'n', 'maj'}, "O": {'o', 'maj'}, "P": {'p', 'maj'},
    "Q": {'q', 'maj'}, "R": {'r', 'maj'}, "S": {'s', 'maj'}, "T": {'t', 'maj'},
    "U": {'u', 'maj'}, "V": {'v', 'maj'}, "W": {'w', 'maj'}, "X": {'x', 'maj'},
    "Y": {'y', 'maj'}, "Z": {'z', 'maj'},

    # accented lowercase letters: base letter + diacritic
    "á": {'a', 'aigu'},
    "à": {'a', 'grave'},
    "ä": {'a', 'uml'},
    "â": {'a', 'circ'},
    "ã": {'a', 'tilde'},
    "å": {'a', 'overring'},
    "ç": {'c', 'cedil'},
    "é": {'e', 'aigu'},
    "è": {'e', 'grave'},
    "ê": {'e', 'circ'},
    "ë": {'e', 'uml'},
    "ï": {'i', 'uml'},
    "í": {'i', 'aigu'},
    "î": {'i', 'circ'},
    "ì": {'i', 'grave'},
    "ñ": {'n', 'tilde'},
    "ö": {'o', 'uml'},
    "ó": {'o', 'aigu'},
    "õ": {'o', 'tilde'},
    "ô": {'o', 'circ'},
    "ò": {'o', 'grave'},
    "û": {'u', 'circ'},
    "ú": {'u', 'aigu'},
    "ù": {'u', 'grave'},
    "ü": {'u', 'uml'},

    # accented capitals: base letter + 'maj' + diacritic
    "Á": {'a', 'maj', 'aigu'},
    "À": {'a', 'maj', 'grave'},
    "Å": {'a', 'maj', 'overring'},
    "Ç": {'c', 'maj', 'cedil'},
    "È": {'e', 'maj', 'grave'},
    "É": {'e', 'maj', 'aigu'},
    "Ê": {'e', 'maj', 'circ'},
    "Î": {'i', 'maj', 'circ'},
    "Û": {'u', 'maj', 'circ'},
}



In [69]:
def compute_symset(characters, mapping):
    """Return the set of feature symbols needed to describe ``characters``.

    A character that is a key of ``mapping`` contributes all the symbols of
    its mapped set; any other character contributes itself.

    Args:
        characters: an iterable of characters.
        mapping: a dict from character to a set of feature symbols.

    Returns:
        A new set of symbols.
    """
    symset = set()
    for c in characters:
        # explicit membership test instead of a bare `except`, so that only a
        # missing key (and no other error) selects the fallback
        if c in mapping:
            symset.update(mapping[c])
        else:
            symset.add(c)
    return symset

In [70]:
# Feature symbols that the characters of the alphabet decompose into.
sset = compute_symset(characters, mapping)
# Width of the character feature vectors: one more than the number of symbols.
maxCharFeat = 1 + len(sset)

print(sset)
print(len(sset))
print(mapping['+'])

{'w', 'k', '/', ')', '<EOS>', '<BOW>', 'c', ':', 'j', '%', '<BOS>', 'aigu', 'grave', 'cedil', 'circ', 'q', 'uml', 'n', '.', 'f', '&', '<EOW>', 'i', '[', '(', 'prefix', 'b', 'x', '-', '9', 'm', 'u', 'r', 'v', 'math', 'infix', '!', 'l', '?', 'a', 'p', 'e', 'h', 't', '"', 'overring', '<UNK>', '$', 's', 'g', 'z', 'tilde', ';', 'd', "'", ']', 'y', 'postfix', ',', 'o', '^', 'maj'}
62
{'math', 'infix'}


In [71]:
def char_to_charfeatures(char, mpg, symbset):
    """Return the set of feature symbols that describe ``char``.

    Args:
        char: the character to describe.
        mpg: dict from character to a set of feature symbols.
        symbset: the set of known feature symbols.

    Returns:
        A new set: a copy of ``mpg[char]`` when the character is mapped,
        ``{char}`` when the character is itself a known symbol, and
        ``{'<UNK>'}`` otherwise.
    """
    # explicit membership test instead of a bare `except`, so unrelated
    # errors are no longer silently turned into a fallback
    if char in mpg:
        # copy, so callers cannot modify the shared table entry
        return set(mpg[char])
    if char in symbset:
        return set([char])
    return set(['<UNK>'])

In [72]:
# sanity check: feature symbols of the capital letter "A"
char_to_charfeatures("A", mapping, sset)

{'a', 'maj'}

In [73]:
# Feature symbol <-> integer lookup tables.
# NOTE(review): indexify is not defined in this notebook; sset is a set, so the
# indices may depend on set iteration order — confirm.
charf_to_integer, integer_to_charf = indexify(sset)

In [74]:
# the feature symbols that received an index
charf_to_integer.keys()

dict_keys(['w', 'k', '/', ')', '<EOS>', '<BOW>', 'c', ':', 'j', '%', '<BOS>', 'aigu', 'grave', 'cedil', 'circ', 'q', 'uml', 'n', '.', 'f', '&', '<EOW>', 'i', '[', '(', 'prefix', 'b', 'x', '-', '9', 'm', 'u', 'r', 'v', 'math', 'infix', '!', 'l', '?', 'a', 'p', 'e', 'h', 't', '"', 'overring', '<UNK>', '$', 's', 'g', 'z', 'tilde', ';', 'd', "'", ']', 'y', 'postfix', ',', 'o', '^', 'maj'])

In [75]:
def char_to_one_hot(char, mpg, c_to_in, symbset, mcf):
    """Encode ``char`` as a vector with one entry per feature symbol.

    The character is decomposed with ``char_to_charfeatures``; each of its
    feature symbols is one-hot encoded over ``mcf`` classes and the one-hot
    rows are summed.

    Args:
        char: the character to encode.
        mpg: dict from character to a set of feature symbols.
        c_to_in: dict from feature symbol to its integer index.
        symbset: the set of known feature symbols.
        mcf: length of the returned vector (number of classes).

    Returns:
        A 1-D array of length ``mcf``.
    """
    features = char_to_charfeatures(char, mpg, symbset)

    # one (float) class index per feature symbol
    indices = np.array([c_to_in[feature] for feature in features], dtype=float)

    one_hot_rows = to_categorical(indices, mcf)
    return np.sum(one_hot_rows, axis=0)
   

In [76]:
# sanity check: feature vector of the capital letter "A"
Y0 = char_to_one_hot("A", mapping, charf_to_integer, sset, maxCharFeat)

print(Y0)


[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.]


In [77]:
# Lookup matrix: row i holds the feature vector of the character whose index is i.
# Rows whose index is not assigned to any character stay all-zero.
transform = np.zeros((1 + maxChars, maxCharFeat))
for char, i in character_to_integer.items():
    transform[i] = char_to_one_hot(char, mapping, charf_to_integer, sset, maxCharFeat)

transform.shape

(131, 63)

In [78]:
def word_to_one_hot(word, mpg, c_to_in, symbset, mcf, bos, eos):
    """Encode ``word`` as a matrix of character feature vectors.

    The matrix has ``len(word) + 2`` rows of length ``mcf``: a start marker
    row, one row per character, and an end marker row.

    Args:
        word: the word to encode.
        mpg, c_to_in, symbset, mcf: passed through to ``char_to_one_hot``.
        bos: if true the start marker is '<BOS>', otherwise '<BOW>'.
        eos: if true the end marker is '<EOS>', otherwise '<EOW>'.

    Returns:
        A 2-D array of shape ``(len(word) + 2, mcf)``.
    """
    chars = list(word)
    n_rows = len(chars) + 2
    Y = np.zeros((n_rows, mcf))

    start_marker = '<BOS>' if bos else '<BOW>'
    end_marker = '<EOS>' if eos else '<EOW>'
    Y[0] = char_to_one_hot(start_marker, mpg, c_to_in, symbset, mcf)
    Y[n_rows - 1] = char_to_one_hot(end_marker, mpg, c_to_in, symbset, mcf)

    # characters occupy rows 1 .. len(word)
    for position, char in enumerate(chars, 1):
        Y[position] = char_to_one_hot(char, mpg, c_to_in, symbset, mcf)

    return Y

In [79]:
# sanity check: encode "Jean" with sentence-level start and end markers
Jean = word_to_one_hot('Jean', mapping, charf_to_integer, sset, maxCharFeat, True, True)
print(np.shape(Jean))
print(Jean)

(6, 63)
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.

In [80]:
# Total number of words in the training sentences.
totalWords = 0
for i in range(len(X_train)):
    totalWords += len(X_train[i])

print(totalWords)

264432


In [None]:
# compute the character index input matrix:
# one row per word of X_train, holding <BOW>, the character indices, <EOW>,
# and zeros in the remaining columns

X_charint_input = np.zeros((totalWords, maxChars))

xindex = 0

# visit every sentence (the original range stopped one sentence short)
for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        charslist = list(X_train[i][j])
        X_charint_input[xindex][0] = character_to_integer['<BOW>']
        # the characters occupy columns 1..len(charslist), so the end marker
        # goes in the column right after them
        X_charint_input[xindex][len(charslist) + 1] = character_to_integer['<EOW>']
        for c in range(len(charslist)):
            char = charslist[c]
            d = c+1
            try:
                X_charint_input[xindex][d] = character_to_integer[char]
            except KeyError:
                # warn when unknown character is encountered
                print(i)
                print(charslist)
        xindex = xindex + 1

print(np.shape(X_charint_input))

In [81]:
def compute_total_words(word_matrix):
    """Return the total number of words in ``word_matrix``.

    Args:
        word_matrix: an indexable sequence of sentences, each of which
            supports ``len``.

    Returns:
        The sum of the lengths of all sentences (0 for an empty matrix).
    """
    return sum(len(word_matrix[i]) for i in range(len(word_matrix)))


In [82]:
def word_matrix_to_charint_matrix(word_matrix, max_chars, char_to_int=None):
    """Encode every word of ``word_matrix`` as a row of character indices.

    Each row holds a start marker in column 0 ('<BOS>' for the first word of
    a sentence, '<BOW>' otherwise), the indices of the word's characters in
    columns 1..len(word), and an end marker in the next column ('<EOS>' for
    the last word of a sentence, '<EOW>' otherwise).  Remaining columns are
    zero.  A character without an index is reported on stdout and left as
    zero.

    Args:
        word_matrix: an indexable sequence of sentences (sequences of words).
        max_chars: number of columns of the result; must be at least the
            length of the longest word plus two, otherwise writing the end
            marker raises IndexError.
        char_to_int: optional dict from character/marker to integer index.
            Defaults to the notebook-level ``character_to_integer``.

    Returns:
        A float array of shape ``(total number of words, max_chars)``.
    """
    if char_to_int is None:
        # backward-compatible default: the table built earlier in the notebook
        char_to_int = character_to_integer

    total_words = sum(len(word_matrix[i]) for i in range(len(word_matrix)))
    X_charint = np.zeros((total_words, max_chars))

    xindex = 0

    # iterate over sentences in input matrix
    for i in range(len(word_matrix)):
        sentence = word_matrix[i]
        last = len(sentence) - 1
        # iterate over words in sentence
        for j in range(len(sentence)):
            charslist = list(sentence[j])
            row = X_charint[xindex]
            # add beginning/end of word/sentence tags to word
            row[0] = char_to_int['<BOS>' if j == 0 else '<BOW>']
            row[len(charslist) + 1] = char_to_int['<EOS>' if j == last else '<EOW>']
            for c, char in enumerate(charslist):
                try:
                    row[c + 1] = char_to_int[char]
                except KeyError:
                    # warn when unknown character is encountered
                    print("Unknown character: ", end='')
                    print(i, end=' ')
                    print(charslist)
            # increase word counter
            xindex = xindex + 1
    return X_charint

In [83]:
# character index matrix for all words of the training sentences
# NOTE(review): the row width passed here is maxChars (the alphabet size), not a
# maximum word length — confirm this is intended.
X_charint_input = word_matrix_to_charint_matrix(X_train, maxChars)

In [84]:
# shape of the character index matrix, and the row of the sixth word
print(np.shape(X_charint_input))
print(X_charint_input[5])

(264432, 130)
[   4.  129.   13.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.]


In [85]:
# training sentence at position 397, and the index assigned to the character 'U'
print(X_train[397])
print(character_to_integer['U'])

['-', '^', 'Reuters', 'Doctoring', 'Photos', 'from', 'Beirut', '?']
55


In [107]:
# compensate for some missing words in lefff

def outword_mapping(word):
    """Map a corpus word to the set of lexicon words used to look it up.

    The checks are applied in this order:
      1. contracted forms (matched case-insensitively) map to their parts;
      2. a single uppercase character maps to 'Alpha';
      3. a word for which ``is_numeral`` is true maps to 'dix-sept';
      4. a few acronyms (matched exactly) map to one representative word;
      5. anything else maps to itself.

    Args:
        word: the corpus word.

    Returns:
        A new set of one or two strings.
    """
    contractions = {
        "auquel": ['à', 'lequel'],
        "auxquels": ['à', 'lesquels'],
        "auxquelles": ['à', 'lesquelles'],
        "aux": ['à', 'les'],
        "duquel": ['de', 'lequel'],
        "desquels": ['des', 'lesquels'],
        "desquelles": ['des', 'lesquelles'],
        "des": ['de', 'les'],
        "au": ['à', 'le'],
        "du": ['de', 'le'],
        "deçà": ['dessous'],
    }
    acronyms = {
        "CNRS": 'centre',
        "CRS": 'centre',
        "CSA": 'compagnie',
        "EDF": 'électricité',
        "ANPE": 'agence',
        "ONG": 'organisation',
        "ONU": 'organisation',
        # spelled 'appelation' originally, which the lexicon lookup reported
        # as unknown
        "AOC": 'appellation',
        "ALENA": 'accord',
        "ASSEDIC": 'association',
        "PIB": 'produit',
        "SMIC": 'salaire',
    }

    lword = word.lower()
    if lword in contractions:
        return set(contractions[lword])
    # isupper must be *called*: the bare method object is always truthy, which
    # turned every one-character token (punctuation, digits, 'a', 'à', ...)
    # into 'Alpha'
    if len(word) == 1 and word.isupper():
        return set(['Alpha'])
    if is_numeral(word):
        return set(["dix-sept"])
    if word in acronyms:
        return set([acronyms[word]])

    return set([word])
       

In [101]:
def wordf_to_one_hot(word, outword_mapping, wpm, f_to_in, fset, outf):
    """Encode the lexicon features of ``word`` as a single vector.

    The word is first expanded with ``outword_mapping``; each resulting word
    is looked up in ``wpm`` (as is, then lowercased) and the features of all
    words found are merged.  A word found in neither form is reported on
    stdout and contributes no features.

    Args:
        word: the corpus word.
        outword_mapping: function mapping a word to a set of lexicon words.
        wpm: dict from lexicon word to its set of features.
        f_to_in: dict from feature to its integer index.
        fset: unused; kept for compatibility with existing callers.
        outf: length of the returned vector (number of classes).

    Returns:
        A 1-D array of length ``outf``: the sum of the one-hot encodings of
        the collected features (all zeros when nothing was found).
    """
    featureset = set()
    for candidate in outword_mapping(word):
        if candidate in wpm:
            featureset.update(wpm[candidate])
            continue
        lowered = candidate.lower()
        if lowered in wpm:
            featureset.update(wpm[lowered])
        else:
            # Unknown words add nothing.  The features already gathered from
            # the other candidates are kept: clearing them here made the
            # result depend on the iteration order of the candidate set.
            print("Unknown: ", candidate)

    # one (float) class index per feature
    indices = np.zeros(len(featureset))
    for position, feature in enumerate(featureset):
        indices[position] = f_to_in[feature]
    Y = to_categorical(indices, outf)

    return np.sum(Y, axis=0)

In [88]:
def normalize_capitals(word):
    """Return ``word`` with the accented capitals "É" and "Î" replaced by "E" and "I".

    Args:
        word: the string to normalize.

    Returns:
        The normalized string (the original version computed it but returned
        nothing).
    """
    word = word.replace("É", "E")
    word = word.replace("Î", "I")
    return word
        

In [89]:
# sanity check: feature vector of the contracted form "aux"
vector = wordf_to_one_hot("aux", outword_mapping, wpm, feature_to_integer, fset, outFeatures)
print(np.shape(vector))
print(vector)
print(vector[0])

[ 216.  196.  312.  115.  245.  285.   77.  163.  296.   48.   16.]
(358,)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [102]:
# sanity check: feature vector of the first word of the first training sentence
print(X_train[0][0])
vec = wordf_to_one_hot(X_train[0][0], outword_mapping, wpm, feature_to_integer, fset, outFeatures)
print(vec)

Des
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1

In [103]:
def compute_word_targets(X, outword_mapping, wpm, feature_to_integer, fset, outFeatures):
    """Build the feature target matrix for every word of ``X``.

    Args:
        X: an indexable sequence of sentences (sequences of words).
        outword_mapping, wpm, feature_to_integer, fset, outFeatures: passed
            through unchanged to ``wordf_to_one_hot``.

    Returns:
        An array with one row per word (sentences in order, words in order
        within each sentence) and ``outFeatures`` columns.
    """
    Y = np.zeros((compute_total_words(X), outFeatures))

    row = 0
    for i in range(len(X)):
        sentence = X[i]
        for j in range(len(sentence)):
            Y[row] = wordf_to_one_hot(sentence[j], outword_mapping, wpm, feature_to_integer, fset, outFeatures)
            row += 1

    return Y

In [115]:
# Feature targets for every word of the training sentences.
# NOTE: this rebinds Y_train, which earlier held the label split returned by
# train_test_split.
Y_train = compute_word_targets(X_train, outword_mapping, wpm, feature_to_integer, fset, outFeatures)

Unknown:  CBV
Unknown:  Guintoli
Unknown:  ÉTATS-UNIS
Unknown:  Popi
Unknown:  Bayard
Unknown:  HLM
Unknown:  TVA
Unknown:  Doubin
Unknown:  Jean-Louis
Unknown:  Pétriat
Unknown:  FNAC
Unknown:  Ruymbeke
Unknown:  BCCI
Unknown:  Mahfouz
Unknown:  bank
Unknown:  Jean-Yves
Unknown:  Chamard
Unknown:  RPR
Unknown:  UDF
Unknown:  UDC
Unknown:  Etats
Unknown:  Guinée-Equatoriale
Unknown:  Sierra-Léone
Unknown:  Pronasol
Unknown:  M3
Unknown:  Bundesbank
Unknown:  UNIM
Unknown:  Schlesinger
Unknown:  Bundesbank
Unknown:  PS
Unknown:  Etats
Unknown:  Balmary
Unknown:  Etat
Unknown:  OCDE
Unknown:  SME
Unknown:  PNB
Unknown:  Etats
Unknown:  RPR
Unknown:  UDF
Unknown:  rétrocommissions
Unknown:  Strauss-Kahn
Unknown:  CNPF
Unknown:  UPA
Unknown:  Trente-cinq
Unknown:  RMI
Unknown:  Jean-Pierre
Unknown:  Jean-Pierre
Unknown:  PAP
Unknown:  Lodz
Unknown:  Bydgoszcz
Unknown:  Wroclaw
Unknown:  Zielona-Gora
Unknown:  Szczecin
Unknown:  Bérégovoy
Unknown:  Gomez
Unknown:  Eisswein
Unknown:  Thomson

Unknown:  PDO
Unknown:  Partex
Unknown:  ISBN
Unknown:  TF
Unknown:  Sabouret
Unknown:  Poullain
Unknown:  m²
Unknown:  18ème
Unknown:  Indosuez
Unknown:  ex-
Unknown:  BP
Unknown:  Olivetti
Unknown:  Philips
Unknown:  Thomson
Unknown:  Electronics
Unknown:  TCE
Unknown:  CTCOE
Unknown:  Nestlé
Unknown:  Neiertz
Unknown:  Pentland
Unknown:  BTF
Unknown:  Gbm.H
Unknown:  BTF
Unknown:  Pentland
Unknown:  Clinvest
Unknown:  AGF
Unknown:  Worms
Unknown:  BNP
Unknown:  Banexi
Unknown:  BTF
Unknown:  Gbm.H
Unknown:  Etat
Unknown:  Sochaux-Montbéliard
Unknown:  Louise-Yvonne
Unknown:  Casetta
Unknown:  RPR
Unknown:  Curien
Unknown:  Mellick
Unknown:  Bull
Unknown:  Nec
Unknown:  GPL
Unknown:  AEG
Unknown:  Lagardère
Unknown:  TF
Unknown:  zig-Zag
Unknown:  Bouygues
Unknown:  mini-
Unknown:  micro-
Unknown:  DEFICIT
Unknown:  UNEDIC
Unknown:  RDA
Unknown:  Joffre
Unknown:  Who's
Unknown:  next
Unknown:  m2
Unknown:  SCA
Unknown:  SCA
Unknown:  Pelège
Unknown:  Electronique
Unknown:  Bundesbank

Unknown:  VLM
Unknown:  AL
Unknown:  Sogeti
Unknown:  Kampf
Unknown:  Daimler-Benz
Unknown:  UAW
Unknown:  General
Unknown:  Motors
Unknown:  Dassault
Unknown:  Falcon-2000
Unknown:  Alenia
Unknown:  côte-d'ivoire
Unknown:  semi-
Unknown:  Sogenal
Unknown:  OPA
Unknown:  Berger-Levrault
Unknown:  Elf
Unknown:  rétro-
Unknown:  Taxdisk
Unknown:  trente-six
Unknown:  assurance-vieillesse
Unknown:  Orlyval
Unknown:  Bari
Unknown:  Etat
Unknown:  Thomson
Unknown:  Doutriaux
Unknown:  Maastricht
Unknown:  Etat
Unknown:  Etats-nations
Unknown:  Expanso
Unknown:  Prado
Unknown:  Desfossés
Unknown:  Desfossés
Unknown:  CIRI
Unknown:  A2
Unknown:  FR3
Unknown:  Mellick
Unknown:  Britannnique
Unknown:  Smouha
Unknown:  Roden
Unknown:  quasi-
Unknown:  Hongkong
Unknown:  CEE
Unknown:  Saouma
Unknown:  Brégou
Unknown:  Usine-Publications
Unknown:  Point-clé
Unknown:  Indochine
Unknown:  Azeglio
Unknown:  Ciampi
Unknown:  ex-
Unknown:  RDA
Unknown:  Deutscher
Unknown:  Fernsehfunk
Unknown:  DFF
Unk

Unknown:  GATT
Unknown:  Volkswagen
Unknown:  JAX
Unknown:  jésuitière
Unknown:  LUXEMBOURG
Unknown:  Elysée
Unknown:  Bercy
Unknown:  Etat
Unknown:  Gaîté-lyrique
Unknown:  SGS-Thomson
Unknown:  QPL
Unknown:  Kier
Unknown:  Etat
Unknown:  CBV
Unknown:  Indosuez
Unknown:  OPA
Unknown:  CPR
Unknown:  Rambaut
Unknown:  Eramet
Unknown:  cinquante-cinq
Unknown:  Financial
Unknown:  Times
Unknown:  Echos
Unknown:  Chérèque
Unknown:  projet-clé
Unknown:  Arguydal
Unknown:  Giscard
Unknown:  Mitterrand
Unknown:  SME
Unknown:  Jean-Claude
Unknown:  Trichet
Unknown:  Etat
Unknown:  FRANCE
Unknown:  UTA
Unknown:  Steinkühler
Unknown:  IG
Unknown:  Metall
Unknown:  sino-britannique
Unknown:  réalignement
Unknown:  SME
Unknown:  Denvers
Unknown:  FR3
Unknown:  A2
Unknown:  FR3
Unknown:  Maly
Unknown:  Delpey
Unknown:  Jean-Bedel
Unknown:  Bokassa
Unknown:  intra-
Unknown:  Etat
Unknown:  Washington-New-York
Unknown:  Jean-Claude
Unknown:  Trichet
Unknown:  franco-allemand
Unknown:  ZDS
Unknown:  M

Unknown:  HLM
Unknown:  PTT
Unknown:  Koweit
Unknown:  Sutherland
Unknown:  Miyazawa
Unknown:  Neiertz
Unknown:  Times
Unknown:  DAF
Unknown:  sit-in
Unknown:  Wikipédia
Unknown:  Nicholls
Unknown:  BCCI
Unknown:  trente-sept
Unknown:  vingt-cinq
Unknown:  Baylet
Unknown:  DM
Unknown:  DM
Unknown:  Sligos
Unknown:  FR3
Unknown:  Aegon
Unknown:  CEE
Unknown:  Etat
Unknown:  Evaluée
Unknown:  SPEP
Unknown:  Schneider
Unknown:  minimum-vieillesse
Unknown:  Interland
Unknown:  Jean-Pierre
Unknown:  appelation
Unknown:  Etat
Unknown:  Bundesbank
Unknown:  Etat
Unknown:  CNPF
Unknown:  Usinor-Sacilor
Unknown:  forge-matriçage
Unknown:  Nexus
Unknown:  Payment
Unknown:  Halphen
Unknown:  OPEP
Unknown:  Genève
Unknown:  Papon
Unknown:  INSEE
Unknown:  Etat
Unknown:  Rocard
Unknown:  Etat
Unknown:  Chevènement
Unknown:  CERES
Unknown:  ex-
Unknown:  RFA
Unknown:  OPR
Unknown:  Arjil
Unknown:  FETESE
Unknown:  COB
Unknown:  OPCVM
Unknown:  mi-
Unknown:  semi-
Unknown:  vingt-cinq
Unknown:  toyot

Unknown:  TUC
Unknown:  congé-maladie
Unknown:  FNB
Unknown:  Philips
Unknown:  Moody's
Unknown:  Ileau
Unknown:  Muzelle
Unknown:  Mermaz
Unknown:  Bergons
Unknown:  leitender
Unknown:  Angestellter
Unknown:  Yoshitomi
Unknown:  Pharmaceuticals
Unknown:  Éric
Unknown:  Halphen
Unknown:  Olszewski
Unknown:  île-de-France
Unknown:  ex-
Unknown:  BP
Unknown:  Etat
Unknown:  Saint-pères
Unknown:  Fininvest
Unknown:  Aubry
Unknown:  Calvet
Unknown:  autoentretenue
Unknown:  Ducros
Unknown:  Volkswagen
Unknown:  AG
Unknown:  JAX
Unknown:  Co.
Unknown:  Nihon
Unknown:  Keizai
Unknown:  Shimbun
Unknown:  1er
Unknown:  Lebègue
Unknown:  BNP
Unknown:  AJEF
Unknown:  Genève
Unknown:  Castille
Unknown:  GATT
Unknown:  Adidas
Unknown:  mono-industrie
Unknown:  Knightley
Unknown:  The
Unknown:  disinformation
Unknown:  campaign
Unknown:  The
Unknown:  Guardian
Unknown:  October
Unknown:  micro-
Unknown:  micro-
Unknown:  Thomson-CSF
Unknown:  PANCHO
Unknown:  HLM
Unknown:  Ile-de-France
Unknown:  P

Unknown:  Stone
Unknown:  jazz-funk
Unknown:  hip-hop
Unknown:  Accor
Unknown:  Desfossés
Unknown:  sud-coréen
Unknown:  Telecom
Unknown:  MCI
Unknown:  Zeller
Unknown:  CDS
Unknown:  Etat
Unknown:  Zorn
Unknown:  CNT
Unknown:  ETHANOL
Unknown:  Alcatel
Unknown:  Rockwell
Unknown:  Argos
Unknown:  Soditik
Unknown:  Metpart
Unknown:  Wagram
Unknown:  Midy
Unknown:  Jean-Louis
Unknown:  Gergorin
Unknown:  Ruymbeke
Unknown:  Clearstream
Unknown:  FIJ
Unknown:  Rotterdam
Unknown:  Bundesrat
Unknown:  Waigel
Unknown:  Lander
Unknown:  Valeo
Unknown:  BNP
Unknown:  UAP
Unknown:  Gutman
Unknown:  Laigneau
Unknown:  contre-pouvoir
Unknown:  HSBC
Unknown:  Midland
Unknown:  Bank
Unknown:  Indochine
Unknown:  Plon
Unknown:  chambériens
Unknown:  Malvy
Unknown:  Kabuto-cho
Unknown:  35e
Unknown:  16e
Unknown:  Ferrari
Unknown:  Vernholes
Unknown:  Godart
Unknown:  XIIe
Unknown:  Jones
Unknown:  European
Unknown:  Medicines
Unknown:  Agency
Unknown:  Chaath
Unknown:  Jaeggi
Unknown:  Cassandre
Unk

Unknown:  Andes
Unknown:  M3
Unknown:  Editions
Unknown:  Bourin
Unknown:  Dumas
Unknown:  Ciba
Unknown:  Ciba-Geigy
Unknown:  CSMF
Unknown:  assurance-maladie
Unknown:  pro-
Unknown:  Hongkong
Unknown:  Po
Unknown:  O'Kane
Unknown:  time
Unknown:  'm
Unknown:  scared
Unknown:  The
Unknown:  Guardian
Unknown:  December
Unknown:  Bercy
Unknown:  cadeaux-souvenirs
Unknown:  IPSN
Unknown:  GRS
Unknown:  Repsol
Unknown:  Elf
Unknown:  Cauteretz
Unknown:  USGS
Unknown:  GEC-Alsthom
Unknown:  Skoda
Unknown:  Mercedes-Benz
Unknown:  ci
Unknown:  Smith
Unknown:  Aubry
Unknown:  Bolatto
Unknown:  Notat
Unknown:  UNEDIC
Unknown:  Etat
Unknown:  IFIL
Unknown:  Agnelli
Unknown:  Worms
Unknown:  Wall
Unknown:  Street
Unknown:  mini-
Unknown:  BRUXELLES
Unknown:  États-UNIS
Unknown:  Denvers
Unknown:  FR3
Unknown:  concernante
Unknown:  repreneurs
Unknown:  Renon
Unknown:  SNECMA
Unknown:  Channel
Unknown:  Five
Unknown:  Thames
Unknown:  Television
Unknown:  City
Unknown:  TV
Unknown:  CEE
Unknown:

Unknown:  1er
Unknown:  Arguydal
Unknown:  ..
Unknown:  ..
Unknown:  réassureurs
Unknown:  SCOR
Unknown:  non-
Unknown:  Papon
Unknown:  Bousquet
Unknown:  Léotin
Unknown:  Etats
Unknown:  ex-
Unknown:  Riberolles
Unknown:  Jean-Claude
Unknown:  Méry
Unknown:  Etat
Unknown:  Etat
Unknown:  Chalvon
Unknown:  redécollage
Unknown:  GPA
Unknown:  Evoquant
Unknown:  Tchuruk
Unknown:  Pasqua
Unknown:  Giraud
Unknown:  Pandraud
Unknown:  Balkany
Unknown:  Mauroy
Unknown:  Delors
Unknown:  hôtel-restaurant
Unknown:  Montmartre
Unknown:  Adidas
Unknown:  Pentland
Unknown:  Sécuripost
Unknown:  Lagardère
Unknown:  Syveton
Unknown:  Kadosh
Unknown:  Gourcuff
Unknown:  Ribéry
Unknown:  Nasri
Unknown:  maitre
Unknown:  next
Unknown:  jump
Unknown:  next
Unknown:  wave
Unknown:  next
Unknown:  North-Field
Unknown:  Qatargas
Unknown:  conspirationniste
Unknown:  Ruymbeke
Unknown:  Neddy
Unknown:  m3
Unknown:  kg
Unknown:  déja
Unknown:  Renon
Unknown:  Northern
Unknown:  Telecom
Unknown:  Pasqua
Unkn

Unknown:  Nyrup
Unknown:  Rasmussen
Unknown:  SEM
Unknown:  exploration-production
Unknown:  ex-
Unknown:  Renon
Unknown:  Etat
Unknown:  Neues
Unknown:  Deutschland
Unknown:  SME
Unknown:  vingt-quatre
Unknown:  1er
Unknown:  Cooke
Unknown:  Aga
Unknown:  Frigoscandia
Unknown:  CEGF
Unknown:  AMR
Unknown:  Canadian
Unknown:  Guillen
Unknown:  CNPF
Unknown:  Bonn
Unknown:  Bundesbank
Unknown:  recapitalisation
Unknown:  France-soir
Unknown:  savings
Unknown:  and
Unknown:  loans
Unknown:  Etat
Unknown:  Humboldt
Unknown:  Saussure
Unknown:  Chimboraço
Unknown:  après-référendum
Unknown:  chronotachygraphes
Unknown:  ex-
Unknown:  NHS
Unknown:  Hilmar
Unknown:  Kopper
Unknown:  Deutsche
Unknown:  Bank
Unknown:  DM
Unknown:  Kissinger
Unknown:  Giscard
Unknown:  RMI
Unknown:  monde-campus
Unknown:  1er
Unknown:  élisabethains
Unknown:  maitrisée
Unknown:  PAC
Unknown:  Eutelsat
Unknown:  2-F4
Unknown:  Kourou
Unknown:  Laffont-Leenhard
Unknown:  vice-
Unknown:  Bundesbank
Unknown:  Tietm

Unknown:  air-bag
Unknown:  Bonny
Unknown:  Port-Harcourt
Unknown:  Agnelli
Unknown:  chronotachygraphe
Unknown:  Ghigonis
Unknown:  FNTR
Unknown:  Certeau
Unknown:  Gallimard
Unknown:  Mariet
Unknown:  CAC
Unknown:  succursalistes
Unknown:  SFH
Unknown:  pétrochimique
Unknown:  SOLEUVRE
Unknown:  Berlusconi
Unknown:  Denvers
Unknown:  FR3
Unknown:  Rhône-Poulenc
Unknown:  Etat
Unknown:  Belot
Unknown:  Mc.
Unknown:  Donnell
Unknown:  CIP
Unknown:  cuisinistes
Unknown:  Adidas
Unknown:  Jean-Yves
Unknown:  Haberer
Unknown:  AGF
Unknown:  Rubicon
Unknown:  Niedermowwe
Unknown:  Deutsche
Unknown:  RPR
Unknown:  Bonbel
Unknown:  PEA
Unknown:  Chin-Feun
Unknown:  XIIIe
Unknown:  IIIe
Unknown:  HLM
Unknown:  majorisme
Unknown:  citizen's
Unknown:  CNJA
Unknown:  Maastricht
Unknown:  Agnelli
Unknown:  Exor
Unknown:  FR3
Unknown:  Carnival
Unknown:  Cruise
Unknown:  BRS
Unknown:  Toshiba
Unknown:  Corp
Unknown:  sud-coréenne
Unknown:  Samsung
Unknown:  Electronics
Unknown:  Co
Unknown:  Ltd
U

Unknown:  Fitzpatrick
Unknown:  Dauzier
Unknown:  Michel-Edouard
Unknown:  Leclerc
Unknown:  SGAB
Unknown:  Caixa
Unknown:  BBV
Unknown:  Iberduero
Unknown:  Tiananmen
Unknown:  crève-misère
Unknown:  EMEA
Unknown:  H/C/562
Unknown:  pro-
Unknown:  eurosceptiques
Unknown:  SME
Unknown:  franco-britannique
Unknown:  Paribas
Unknown:  Paribas
Unknown:  Télécom
Unknown:  Vignemale
Unknown:  Poullain
Unknown:  Etat
Unknown:  Pasqua
Unknown:  Hoeffel
Unknown:  Atalla
Unknown:  Jaffra
Unknown:  H.
Unknown:  Vuong
Unknown:  Viêtnam
Unknown:  XVIII
Unknown:  No.
Unknown:  OPEP
Unknown:  Jean-Pierre
Unknown:  Dumont
Unknown:  quasi-
Unknown:  assurance-maladie
Unknown:  pub-info
Unknown:  Cousances
Unknown:  Baudesson
Unknown:  IV
Unknown:  Marnaval
Unknown:  XIXe
Unknown:  Espingo
Unknown:  Indosuez
Unknown:  Haussmann
Unknown:  volta-redonda
Unknown:  CSN
Unknown:  Langoni
Unknown:  Sombart
Unknown:  ejidales
Unknown:  Taiwan
Unknown:  Braniff
Unknown:  Airlines
Unknown:  HLM
Unknown:  UNEDIC

Unknown:  Maastricht
Unknown:  Tim.
Unknown:  Diff.
Unknown:  Blas
Unknown:  XIXe
Unknown:  Flandre
Unknown:  Dominati
Unknown:  Marrel
Unknown:  Indochine
Unknown:  GATT
Unknown:  AMB
Unknown:  Bf.G
Unknown:  AMB
Unknown:  Worms
Unknown:  IFIL
Unknown:  SME
Unknown:  PNB
Unknown:  ING
Unknown:  Sviluppo
Unknown:  Finanziaria
Unknown:  Reuter
Unknown:  STASI
Unknown:  Modrow
Unknown:  pin's
Unknown:  Kosovo
Unknown:  Rescue
Unknown:  Committee
Unknown:  IRC
Unknown:  Lockheed
Unknown:  U-2
Unknown:  SR-71
Unknown:  F-117A
Unknown:  Reebok
Unknown:  Rubin
Unknown:  Fennal
Unknown:  chaine
Unknown:  CEE
Unknown:  Wang
Unknown:  Alfonsin
Unknown:  OPEP
Unknown:  Genève
Unknown:  non-
Unknown:  Falcon-2000
Unknown:  Dassault
Unknown:  Alenia
Unknown:  IRI
Unknown:  congés-conversion
Unknown:  Worms
Unknown:  IFIL
Unknown:  Agnelli
Unknown:  CEE
Unknown:  côte-d'ivoire
Unknown:  FMI
Unknown:  stand-by
Unknown:  SED
Unknown:  Léotard
Unknown:  Joxe
Unknown:  Mellick
Unknown:  Cluzel
Unknown:

Unknown:  Etats
Unknown:  sub-
Unknown:  socio-professionnelles
Unknown:  World
Unknown:  Wildlife
Unknown:  Fund
Unknown:  Desktop
Unknown:  IV
Unknown:  Buchsbaum
Unknown:  vice-
Unknown:  Zenith
Unknown:  Altus
Unknown:  Thomson
Unknown:  HLM
Unknown:  ZDS
Unknown:  UNICE
Unknown:  CEEP
Unknown:  EPI
Unknown:  1er
Unknown:  PCF
Unknown:  drolatiques
Unknown:  Jean-Pierre
Unknown:  The
Unknown:  Messenger
Unknown:  Moverman
Unknown:  Etat
Unknown:  CEE
Unknown:  Fininvest
Unknown:  Lafont
Unknown:  Edouard
Unknown:  Leclerc
Unknown:  Edouard
Unknown:  Leclerc
Unknown:  Vnesheconombank
Unknown:  ex-
Unknown:  politico-
Unknown:  RPR
Unknown:  PS
Unknown:  PCF
Unknown:  Abou-Dhabi
Unknown:  Perez
Unknown:  Gaube
Unknown:  Aurélien
Unknown:  Sèze
Unknown:  Télécom
Unknown:  Etat
Unknown:  Ameen
Unknown:  Izzadeen
Unknown:  damn
Unknown:  and
Unknown:  war
Unknown:  Daily
Unknown:  Mirror
Unknown:  of
Unknown:  Lanka
Unknown:  no
Unknown:  precise
Unknown:  provided
Unknown:  accessed
Un

In [116]:
# Encode the dev split for the word-level model: inputs via word_matrix_to_charint_matrix,
# targets via compute_word_targets.
# NOTE(review): the "Unknown:  <word>" lines in this cell's output are presumably printed by
# one of these helpers for words missing from its lookup tables -- confirm in grail_data_utils.
X_dev_charint = word_matrix_to_charint_matrix(X_dev, maxChars)
# NOTE: this rebinds Y_dev, which previously held the dev labels produced by train_test_split,
# so re-running earlier cells after this one changes what Y_dev refers to.
Y_dev = compute_word_targets(X_dev, outword_mapping, wpm, feature_to_integer, fset, outFeatures)

Unknown:  ULN
Unknown:  Bongrain
Unknown:  Besnier
Unknown:  3e
Unknown:  RPR-UDF
Unknown:  Marti
Unknown:  Lingesler
Unknown:  popularisation
Unknown:  Quarante-deux
Unknown:  évènemens
Unknown:  Agnelli
Unknown:  Apollonies
Unknown:  ci
Unknown:  ATT
Unknown:  CSN
Unknown:  Hongkong
Unknown:  OAT
Unknown:  Airways
Unknown:  vingt-quatre
Unknown:  and
Unknown:  General
Unknown:  Workers
Unknown:  Teulade
Unknown:  Ernewein
Unknown:  dix-mille
Unknown:  CSG
Unknown:  Hurand
Unknown:  Wikipédia
Unknown:  Doubin
Unknown:  BNP
Unknown:  Meciar
Unknown:  Vaclav
Unknown:  Tchécoslovaquie
Unknown:  Bank
Unknown:  of
Unknown:  credit
Unknown:  and
Unknown:  BCCI
Unknown:  Jones
Unknown:  Shakespeare
Unknown:  ejideros
Unknown:  Fortech
Unknown:  Tecphy
Unknown:  Egalement
Unknown:  Etat
Unknown:  demi-
Unknown:  Castro
Unknown:  Théobald
Unknown:  Cottave
Unknown:  Laffore
Unknown:  Toshiba
Unknown:  OM
Unknown:  IPSN
Unknown:  GRS
Unknown:  primo-accession
Unknown:  ULN
Unknown:  Kohl
Unknow

Unknown:  Marionnaux
Unknown:  Moon
Unknown:  Paribas
Unknown:  Poliet
Unknown:  cimentière
Unknown:  Flandres
Unknown:  René-Coty
Unknown:  Etat
Unknown:  Cemex
Unknown:  Compania
Unknown:  Valenciana
Unknown:  Cementos
Unknown:  Rochot
Unknown:  Hansen
Unknown:  Cornéa
Unknown:  Jean-Louis
Unknown:  Normandin
Unknown:  Sopharga
Unknown:  NEW-YORK
Unknown:  Bouygues
Unknown:  OPA
Unknown:  OPEP
Unknown:  78e
Unknown:  FIFA
Unknown:  SELCUK
Unknown:  Pasqua
Unknown:  Berlusconi
Unknown:  Falconbridge
Unknown:  mi-
Unknown:  DEC
Unknown:  Strong
Unknown:  Cedel
Unknown:  Backes
Unknown:  GATT
Unknown:  DASA
Unknown:  Bundesbank
Unknown:  Ramond
Unknown:  Mont-Perdu
Unknown:  appelations
Unknown:  cléricalisant
Unknown:  cléricafard
Unknown:  cléricanaille
Unknown:  jésuitard
Unknown:  Guérard
Unknown:  Eugénie-les-Bains
Unknown:  Dutournier
Unknown:  feuillants
Unknown:  Senderens
Unknown:  Jean-Jacques
Unknown:  Descamps
Unknown:  Etat
Unknown:  Wikipédia
Unknown:  ARRCO
Unknown:  UNIR

Unknown:  Viannet
Unknown:  Krasucki
Unknown:  emploi-solidarité
Unknown:  ex-
Unknown:  TUC
Unknown:  sportswear
Unknown:  casualwear
Unknown:  Zajac
Unknown:  Rouméas
Unknown:  Kassbohrer
Unknown:  Evobus
Unknown:  non-
Unknown:  Fragonard
Unknown:  Saemes
Unknown:  Dominati
Unknown:  Hector-Malot
Unknown:  Général-Beuret
Unknown:  XV
Unknown:  Saint-Martin-Rivoli
Unknown:  Delessert
Unknown:  XVI
Unknown:  Italcementi
Unknown:  MADRID
Unknown:  male
Unknown:  oscuro
Unknown:  Repubblica
Unknown:  Pirani
Unknown:  Halphen
Unknown:  Clearstream
Unknown:  BT
Unknown:  Pinault
Unknown:  Hosneld
Unknown:  réalignements
Unknown:  CNJA
Unknown:  und
Unknown:  Schikedanz
Unknown:  trente-et-un
Unknown:  Flégny
Unknown:  Fragonard
Unknown:  RMI
Unknown:  ETAM
Unknown:  Manos
Unknown:  AT
Unknown:  CBV
Unknown:  OPA
Unknown:  Exor
Unknown:  Généval
Unknown:  Ominco
Unknown:  LTV
Unknown:  Thomson
Unknown:  Ciolina
Unknown:  Tiberi
Unknown:  sud-africain
Unknown:  Alcatel-Alsthom
Unknown:  non

Unknown:  RFA
Unknown:  hedge
Unknown:  funds
Unknown:  américano-hongrois
Unknown:  Soros
Unknown:  Salam
Unknown:  Daher
Unknown:  Airways
Unknown:  Edouard
Unknown:  Balladur
Unknown:  Yonnne
Unknown:  Fillon
Unknown:  Evgueni
Unknown:  Primakov
Unknown:  Murmann
Unknown:  Kohl
Unknown:  Jean-François
Unknown:  Barèges
Unknown:  sous-estimé
Unknown:  UBS
Unknown:  Phillips
Unknown:  Paribas
Unknown:  VSD
Unknown:  Bundesbank
Unknown:  Etat
Unknown:  CSCE
Unknown:  Paris-Dakar
Unknown:  Octopussy
Unknown:  Reuters
Unknown:  admits
Unknown:  altering
Unknown:  Beirut
Unknown:  UAW
Unknown:  SDP
Unknown:  SPD
Unknown:  SPD
Unknown:  SPD
Unknown:  DATAR
Unknown:  Jean-Pierre
Unknown:  Duport
Unknown:  Jean-Pierre
Unknown:  Perier
Unknown:  CCSDN
Unknown:  San-Francisco
Unknown:  GIE
Unknown:  Wouts
Unknown:  Möllemann
Unknown:  Wikipédia
Unknown:  contre-partie
Unknown:  Accor
Unknown:  ICP
Unknown:  Rocard
Unknown:  Etat
Unknown:  TVA
Unknown:  UNEDIC
Unknown:  chauffeurs-routiers
Unkn

Unknown:  Economiquement
Unknown:  RMI
Unknown:  OPA
Unknown:  BERD
Unknown:  entrepreunariat
Unknown:  Etat
Unknown:  Elf-Aquitaine
Unknown:  Saouma
Unknown:  BNP
Unknown:  PEA
Unknown:  Etat
Unknown:  Paribas
Unknown:  MODEF
Unknown:  Mineau
Unknown:  Bérégovoy
Unknown:  Northern
Unknown:  Telecom
Unknown:  Dumez
Unknown:  Bagnères
Unknown:  Luchon
Unknown:  Tourmalet
Unknown:  Hourquette
Unknown:  Louron
Unknown:  Luchon
Unknown:  Designe
Unknown:  Flins
Unknown:  Northwest
Unknown:  Airlines
Unknown:  Foucauld
Unknown:  Monts-Maudits
Unknown:  trente-cinq
Unknown:  Jones
Unknown:  Ier
Unknown:  Ier
Unknown:  Hongkong
Unknown:  Creeks
Unknown:  East
Unknown:  ZDS
Unknown:  Bull
Unknown:  Delors
Unknown:  Camdessus
Unknown:  Attali
Unknown:  Jean-Claude
Unknown:  Trichet
Unknown:  Jean-Claude
Unknown:  Financial
Unknown:  Times
Unknown:  sud-
Unknown:  Diderot
Unknown:  Mireille-Bénédicte
Unknown:  Chalmin
Unknown:  PAC
Unknown:  MATIF
Unknown:  ALAIN
Unknown:  LEBAUBE
Unknown:  Lyon

In [117]:
def pretrained_embedding_layer():
    """
    Creates a frozen Keras Embedding() layer from character symbols (represented as integer
    indices) to character feature vectors, using the precomputed notebook-level "transform"
    matrix as its weights.

    The layer is created with trainable=False, so the weights are not updated during training,
    and with mask_zero=True, so index 0 is treated as padding by downstream layers.

    Returns:
    embedding_layer -- pretrained layer Keras instance

    Raises:
    ValueError -- if "transform" is not a 2-D matrix with maxChars+1 rows
    """

    weights = np.asarray(transform)
    vocab_len = maxChars + 1                # adding 1 to fit Keras embedding (requirement)

    # Fail early with a readable message instead of a shape error from inside set_weights().
    if weights.ndim != 2 or weights.shape[0] != vocab_len:
        raise ValueError(
            "transform must have shape (%d, emb_dim), got %s" % (vocab_len, weights.shape)
        )

    # The feature dimensionality is read from the matrix itself rather than hard-coded,
    # so the layer stays valid if the character feature set changes.
    emb_dim = weights.shape[1]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False, mask_zero=True)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None,))

    # Load the precomputed character features; the layer is now pretrained.
    embedding_layer.set_weights([weights])

    return embedding_layer

In [118]:
def Embedding_model(input_shape):
    """
    Builds the model: pretrained embedding -> LSTM -> batch normalisation -> dropout ->
    sigmoid Dense layer.

    Arguments:
    input_shape -- shape of a single input sample (without the batch dimension), passed to Input()

    Returns:
    model -- Keras Model mapping the int32 input to outFeatures sigmoid outputs
    """
    char_indices = Input(shape=input_shape, dtype='int32')
    char_features = pretrained_embedding_layer()(char_indices)

    # return_sequences=False: only the final 128-dimensional LSTM output is kept,
    # so each input sequence is summarised by a single vector.
    hidden = LSTM(128, return_sequences=False)(char_features)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(0.5)(hidden)

    # One sigmoid unit per output feature.
    feature_scores = Dense(outFeatures, activation='sigmoid')(hidden)

    return Model(inputs=char_indices, outputs=feature_scores)

In [119]:
# Instantiate the model with inputs of length maxChars and print its layer summary.
# NOTE(review): maxChars is used here as the input sequence length, while
# pretrained_embedding_layer() uses maxChars+1 as the embedding vocabulary size --
# confirm that sharing one constant for both is intended.
emb = Embedding_model((maxChars,))
emb.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 130)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 130, 63)           8253      
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               98304     
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 359)               46311     
Total params: 153,380
Trainable params: 144,871
Non-trainable params: 8,509
_________________________________________________________________


In [120]:
# Configure training: Adam optimizer, mean-absolute-error loss, MAE also reported as a metric.
# NOTE(review): the output layer uses sigmoid activations; if the targets are 0/1 feature
# indicators, 'binary_crossentropy' is the usual loss -- confirm that MAE is intentional.
emb.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mae'],
)

In [None]:
# Fit for 30 epochs in shuffled mini-batches of 32, validating on the dev split after each epoch.
# The returned History object is kept for later inspection of the loss curves.
history = emb.fit(
    X_charint_input,
    Y_train,
    validation_data=(X_dev_charint, Y_dev),
    shuffle=True,
    batch_size=32,
    epochs=30,
)

Train on 264432 samples, validate on 88008 samples
Epoch 1/30

In [None]:
# Print the shapes of the training inputs/targets, then of the dev inputs/targets.
for array in (X_charint_input, Y_train, X_dev_charint, Y_dev):
    print(np.shape(array))



In [None]:
# Plot the per-epoch training and validation loss recorded in the History object.
fig, ax = plt.subplots()
for series in ('loss', 'val_loss'):
    ax.plot(history.history[series])
ax.set_title('model train vs validation loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:

# Small hand-made example: one sentence of three words, used to try the model by hand.
X_ex = np.array([
    ['Jean', 'aime', 'Marie'],
])
print(compute_total_words(X_ex))

# Encode the example with the same helper and maxChars setting used for the dev split.
X_ex_ci = word_matrix_to_charint_matrix(X_ex, maxChars)

In [None]:
# Inspect the encoded example: overall shape, then its first row.
print(np.shape(X_ex_ci))
print(X_ex_ci[0])
# Show the entry stored under index 130 of integer_to_character.
# NOTE(review): 130 is a hard-coded index; it matches the input length shown in the model
# summary, so it is presumably tied to maxChars -- confirm and replace with a named value.
print(integer_to_character[130])

In [None]:
# Run the model on the encoded example; as the last expression of the cell,
# the prediction result is displayed as the cell output.
emb.predict(X_ex_ci)