In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

from keras.models import Model
from keras.layers import Bidirectional, Dense, Input, Dropout, LSTM, Activation, TimeDistributed, BatchNormalization, concatenate, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.initializers import glorot_uniform
from sklearn.model_selection import train_test_split

from itertools import chain

from grail_data_utils import *

%matplotlib inline

np.random.seed(1)

Using TensorFlow backend.


In [2]:
# entire corpus
X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxLen = read_maxentdata('m2.txt')

In [3]:
numClasses = len(partsofspeech2)+1
numSuperClasses = len(superset)+1

print()
print("Longest sentence      : ", maxLen)
print("Number of words       : ", len(vocabulary))
print("Number of norm. words : ", len(vnorm))
print("Number of POS tags    : ", numClasses)
print("Number of supertags   : ", numSuperClasses)



Longest sentence      :  266
Number of words       :  30300
Number of norm. words :  28223
Number of POS tags    :  32
Number of supertags   :  891


In [21]:
# split the training data into the standard 60% train, 20% dev, 20% test 
X_train, X_testdev, Y_train, Y_testdev = train_test_split(X, Y2, test_size=0.4)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_testdev, Y_testdev, test_size=0.5)
print("Train: ", X_train.shape)
print("Test:  ", X_test.shape)
print("Dev:   ", X_dev.shape)


Train:  (9449,)
Test:   (3150,)
Dev:    (3150,)


In [133]:

def get_features(string, cat):
    fset = set()

    if (cat == "v"):
        m0 = re.search(r"<(.*)>", string)
        if m0 is not None:
            for item in m0.group(1).split(','):
                fset.add(item)

    m1 = re.search(r"cat=(.*?)[,\]]", string)
    if m1 is not None:
        fset.add(m1.group(1))

    for m2 in re.findall(r"@(.*?)[,\]]", string):
        if not (m2 == "e"):
            fset.add(m2)
    return fset
            
    
def read_lefff(file):
    vocabulary = set()
    tags = set()
    word_pos_map = {}
    max_word_len = 0
    with open(file, 'r') as f:
        for line in f:
            line = line.strip().split("\t")
            w = line[0]
            w = w.replace("æ", "ae")
            w = w.replace("œ", "oe")
            word_len = len(w)
            if (word_len > max_word_len):
                max_word_len = word_len
            wlist = w.split()
            if (len(wlist) == 2):
                w = wlist[0]
                nextfeat == None
                if (wlist[1]).startswith("qu'"):
                    nextfeat = "Next:que"
                if (wlist[1]).startswith("que "):
                    nextfeat = "Next:que"
                if (wlist[1] == 'que'):
                    nextfeat = "Next:que"
                if (wlist[1]).startswith("d'"):
                    nextfeat = "Next:de"
                if (wlist[1]).startswith("de "):
                    nextfeat = "Next:de"
                if (wlist[1] == 'de'):
                    nextfeat = "Next:de"
                if (wlist[1]).startswith("à "):
                    nextfeat = "Next:à"
                if (wlist[1] == 'à'):
                    nextfeat = "Next:à"
                if (wlist[1] == 'priori'):
                    nextfeat = "Next:priori"
            elif (len(wlist) == 1):
                nextfeat = None
                
            if (len(wlist) == 1) or ((len(wlist) ==2) and (nextfeat is not None)):    
                pos = line[2]
                features = line[3]
                vocabulary.add(w)
                valset = word_pos_map.get(w)
                if valset is None:
                    valset = set()
                valset.add(pos)
                fts = get_features(features, pos)
                valset = valset.union(fts)
                if nextfeat is not None:
                    valset.add(nextfeat)
                word_pos_map[w] = valset

    for w in ['capella', 'contratio', 'fortiori', 'latere', 'minima', 'posteriori',  'priori']:
        word_pos_map[w] = set(['priori'])
                
    for key in iter(word_pos_map.keys()):
        word_pos_map[key] = frozenset(word_pos_map[key])
        
    for val in iter(word_pos_map.values()):
        tags.add(val)
        
    return vocabulary, tags, word_pos_map, max_word_len


In [134]:
v, t, wpm, maxWordLen = read_lefff('lefff-ext-3.0.txt')

In [135]:
print(maxWordLen)
print(t)

60
{frozenset({'Suj:cln|sn', 'Obj:(cla|pour-sn|sn)', 'Obj:(cla|sn)', 'Att:(sn)', 'F1s', 'Suj:cln|scompl|sinf|sn', 'Objde:de-sn', 'pers', 'v'}), frozenset({'I12s', 'Dloc:(de-sn|en)', 'Loc:(loc-sn|y)', 'Suj:cln|sn', 'Obj:cla|sn', 'pers', 'v'}), frozenset({'Obl:(sinf)', 'Objà:y|à-sn', 'CtrlSujObl', 'Obj:cla|sn', 'J1s', 'Suj:cln|scompl|sinf|sn', 'Loc:(loc-sn|y)', 'pers', 'être', 'v'}), frozenset({'active', 'nc', 'Obj:cla|sn', 'Suj:cln|qcompl|scompl|sinf|sn', 'Obl2:(de-sn)', 'CompSubj', 'passive', 'Obl2:(par-sn)', 'v', 'Obj:à-sinf', 'adj', 'AttObj', 'Obj:(cla|sn)', 'CtrlSujObj', 'Obj:(cla|qcompl|scompl|sinf|sn)', 'AttSuj', 'Suj:à-sinf', 'Kfs', 'Suj:cln|sn', 'pers', 'fs', 'Att:sa|sn'}), frozenset({'Obj:(cla|sn)', 'T2p', 'Suj:cln|scompl|sinf|sn', 'Obl:(contre-sn)', 'pers', 'v'}), frozenset({'CtrlSujObjà', 'Objà:(y|à-scompl|à-sinf|à-sn)', 'Suj:cln|scompl|sinf|sn', 'F3s', 'pers', 'v'}), frozenset({'T1p', 'Obj:cla|sn', 'Objà:(avec-sn|cld|à-sn)', 'Objde:(de-scompl|de-sinf|de-sn|en)', 'Suj:cln|sco

In [7]:
print(len(t))

12244


In [8]:
print(wpm["Jean"])

frozenset({'ms', 'np', 'fs', 'hum'})


In [9]:
print(wpm["est"])

frozenset({'Att:(sa|à-sinf|à-sn)', 'adj', 'nc', 'AttSuj', 'fêtre', 'ms', 'Att:(de-sinf|scompl|sn)', 'Suj:cln|scompl|sinf|sn', 'P3s', 'pers', 'v', 'auxEtre'})


In [10]:
print(wpm["été"])

frozenset({'Att:(sa|à-sinf|à-sn)', 'K', 'active', 'nc', 'AttSuj', 'fêtre', 'ms', 'Att:(de-sinf|scompl|sn)', 'Suj:cln|scompl|sinf|sn', 'pers', 'v', 'auxEtre'})


In [11]:
print(wpm["était"])

frozenset({'Att:(sa|à-sinf|à-sn)', 'I3s', 'AttSuj', 'fêtre', 'Att:(de-sinf|scompl|sn)', 'Suj:cln|scompl|sinf|sn', 'pers', 'v', 'auxEtre'})


In [12]:
print(wpm["faut"])

frozenset({'impers', 'P3s', 'Obj:cla|scompl|sinf|sn', 'Objà:(cld|à-sn)', 'CompSubj', 'v'})


In [13]:
print(wpm["que"])

frozenset({'pro_acc', 'que', 'pri', 'que_restr', 'prel'})


In [14]:
print(wpm["qu'"])

frozenset({'pro_acc', 'que', 'pri', 'que_restr', 'prel'})


In [15]:
print(wpm["priori"])

frozenset({'priori'})


In [16]:
print(wpm["importe"])

frozenset({'impers', 'Obj:(cla|sn)', 'Objà:(cld|à-sn)', 'Suj:de-sinf|scompl|sn', 'imperative', 'pers', 'Suj:cln|scompl|sinf|sn', 'Suj:cln|sn', 'PS13s', 'Y2s', 'v'})


In [17]:
print(len(v))

404891


In [165]:
fset = set()
for frozen in t:
    for f in frozen:
        fset.add(f)
print(fset)
print(len(fset))
outFeatures = len(fset)

{'3ms', 'year', 'NV:', 's_P1s', 'C12s', 'v', 'S1s', 'PFIJTSC', 'J12s', 'CtrlSujObl', 'Loc:(loc-sn|y)', 'Suj:sn|sinf|scompl', 'CtrlObjObjà', 'Obj:cla|scompl|sn', 'pseudo-en', 'CtrlSujObj', 'S1p', 'weekday', 'Objà:(à-scompl|à-sinf)', 'etr', 's_P2p', 'Objà:(sur-sn|y|à-sn)', 'Objde:de-scompl|de-sn|en', 'Loc:(sur-sn)', 'Objà:(en-sn|y|à-sn)', 'Suj:(cln|sn)', 'Loc:(cll|loc-sn)', 'clg', 'CtrlSujLoc', 'Obj:cla|qcompl|scompl|sinf|sn', 'GP:', 'f', 'AttSuj', 'T1s', 'fs_P3s', 'ponctw', 'fs', 'pseudo-y', 'Objà:(y|à-scompl|à-sinf|à-sn)', 'Suj:(sn)', 'cln', 'Obl:(par-sn)', 'pro_gen', 'Att:(comme-sa|comme-sn|sa|sinf|sn)', 'Objde:(de-scompl|de-sn|en|scompl|sinf)', ':PV', 'imperative', 'Loc:(à-sn)', 'Obl:(dans-sn)', 'Kfp', 'Obj:(cla|de-sinf|sn)', 'P3p', 'F2s', 'Objà:(cld|y|à-scompl|à-sinf|à-sn)', 'ms', 'passive', 'PS3p', 'Suj:scompl|sinf|sn', 'T2s', 'pro', 'suffAdj', 'p_P3s', 'Obj:(cla|scompl|sinf|sn)', 's_P3p', 'Objà:(sur-sn|y)', 'Obl:(sur-sn)', 'Objà:y|à-sn', 'Obj:(scompl|sinf)', 'Loc:(y|à-sn)', 'Objde

In [19]:
feature_to_integer, integer_to_feature = indexify(fset)

In [20]:
print(X[0])

['Lyonnaise-Dumez', 'vient', "d'", 'hispaniser', 'sa', 'filiale', 'espagnole', 'et', "d'", 'étendre', 'ses', 'participations', 'en', 'Espagne', ',', 'tout', 'en', 'resserrant', 'ses', 'liens', 'avec', 'la', 'Caixa', ',', 'première', 'caisse', "d'", 'épargne', 'espagnole', 'et', "l'", 'un', 'des', 'premiers', 'établissements', 'financiers', 'de', 'la', 'péninsule', 'ibérique', ',', 'à', 'laquelle', 'elle', 'est', 'liée', 'depuis', 'longtemps', 'dans', 'la', 'Société', 'générale', 'des', 'eaux', 'de', 'Barcelone', '(', 'SGAB', ')', ',', 'premier', 'groupe', 'espagnol', 'de', 'services', '(', 'la', 'Caixa', 'détient', 'aussi', '2', '%', 'du', 'capital', 'de', 'Lyonnaise-Dumez', ')', '.']


In [347]:
def get_characters(vocabulary):

    characters = set(['^', 'Û', '<BOS>', '<BOW>', '<EOW>', '<EOS>', '<UNK>', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
    for word in vocabulary:
        clist = list(word)
        for char in clist:
            characters.add(char)
    
    remove = set(['|', '«', '»', 'µ', '§', 'þ', 'ø', '_', '¯', 'ð', 'æ', '~', '©', '@', '~'])
    
    return characters - remove

In [348]:
characters = get_characters(v)
print(characters)
print(len(characters))

maxChars = len(characters)

character_to_integer, integer_to_character = indexify(characters)

{',', 'î', 'B', '.', 'O', 'e', 'o', 'T', '*', '<BOW>', '0', 'v', 'z', 'r', '(', '=', 'U', 'P', '[', 'I', 'L', 'ç', "'", '-', 'V', 'È', 'É', 'u', 'ö', 'Y', '<UNK>', '8', '±', 'S', 'c', '<EOS>', 'm', 's', 'ì', '%', 'á', '&', ')', 'Û', 'W', '1', 'â', '4', 'Á', 'é', 'f', 'J', 'D', 'û', 'l', '½', 'ã', '6', 'N', ']', '<EOW>', 'M', 'G', 'A', 'ï', 'g', 'H', '?', 'C', 'å', '²', '/', 'K', ';', 'd', '$', 'ó', 'w', 'Î', 'À', 'k', 'Å', 'h', 'p', 'a', 'E', 'x', 'à', 'ä', '2', 'ù', '<', '°', '7', 'õ', 'ú', 'ô', 'y', '^', 'ò', '"', '5', '!', 'X', 't', 'b', ':', 'R', 'ñ', 'ü', 'í', '<BOS>', 'F', '>', '9', 'q', 'Z', 'ë', 'Ç', 'j', 'ê', 'º', 'è', '3', '+', 'Q', 'i', 'n'}
128


In [349]:
mapping = {}

mapping["="] = set(['math', 'infix'])
mapping["<"] = set(['math', 'infix'])
mapping[">"] = set(['math', 'infix'])


mapping["+"] = set(['math', 'infix'])
mapping["*"] = set(['math', 'infix'])
mapping["±"] = set(['math', 'prefix'])
mapping["°"] = set(['math', 'postfix'])
mapping["º"] = set(['math', 'postfix'])
mapping["²"] = set(['math', 'postfix'])


mapping["0"] = set(['9'])
mapping["1"] = set(['9'])
mapping["2"] = set(['9'])
mapping["3"] = set(['9'])
mapping["4"] = set(['9'])
mapping["5"] = set(['9'])
mapping["6"] = set(['9'])
mapping["7"] = set(['9'])
mapping["8"] = set(['9'])
mapping["½"] = set(['9'])

mapping["A"] = set(['a', 'maj'])
mapping["B"] = set(['b', 'maj'])
mapping["C"] = set(['c', 'maj'])
mapping["D"] = set(['d', 'maj'])
mapping["E"] = set(['e', 'maj'])
mapping["F"] = set(['f', 'maj'])
mapping["G"] = set(['g', 'maj'])
mapping["H"] = set(['h', 'maj'])
mapping["I"] = set(['i', 'maj'])
mapping["J"] = set(['j', 'maj'])
mapping["K"] = set(['k', 'maj'])
mapping["L"] = set(['l', 'maj'])
mapping["M"] = set(['m', 'maj'])
mapping["N"] = set(['n', 'maj'])
mapping["O"] = set(['o', 'maj'])
mapping["P"] = set(['p', 'maj'])
mapping["Q"] = set(['q', 'maj'])
mapping["R"] = set(['r', 'maj'])
mapping["S"] = set(['s', 'maj'])
mapping["T"] = set(['t', 'maj'])
mapping["U"] = set(['u', 'maj'])
mapping["V"] = set(['v', 'maj'])
mapping["W"] = set(['w', 'maj'])
mapping["X"] = set(['x', 'maj'])
mapping["Y"] = set(['y', 'maj'])
mapping["Z"] = set(['z', 'maj'])

mapping["á"] = set(['a', 'aigu'])
mapping["à"] = set(['a', 'grave'])
mapping["ä"] = set(['a', 'uml'])
mapping["â"] = set(['a', 'circ'])
mapping["ã"] = set(['a', 'tilde'])
mapping["å"] = set(['a', 'overring'])

mapping["ç"] = set(['c', 'cedil'])

mapping["é"] = set(['e', 'aigu'])
mapping["è"] = set(['e', 'grave'])
mapping["ê"] = set(['e', 'circ'])
mapping["ë"] = set(['e', 'uml'])


mapping["ï"] = set(['i', 'uml'])
mapping["í"] = set(['i', 'aigu'])
mapping["î"] = set(['i', 'circ'])
mapping["ì"] = set(['i', 'grave'])

mapping["ñ"] = set(['n', 'tilde'])

mapping["ö"] = set(['o', 'uml'])
mapping["ó"] = set(['o', 'aigu'])
mapping["õ"] = set(['o', 'tilde'])
mapping["ô"] = set(['o', 'circ'])
mapping["ò"] = set(['o', 'grave'])

mapping["û"] = set(['u', 'circ'])
mapping["ú"] = set(['u', 'aigu'])
mapping["ù"] = set(['u', 'grave'])
mapping["ü"] = set(['u', 'uml'])

mapping["Á"] = set(['a', 'maj', 'aigu'])
mapping["À"] = set(['a', 'maj', 'grave'])
mapping["Å"] = set(['a', 'maj', 'overring'])

mapping["Ç"] = set(['c', 'maj', 'cedil'])

mapping["È"] = set(['i', 'maj', 'grave'])
mapping["É"] = set(['i', 'maj', 'aigu'])

mapping["Î"] = set(['i', 'maj', 'circ'])

mapping["Û"] = set(['u', 'maj', 'circ'])



In [350]:
def compute_symset (characters, mapping):
    symset = set()
    for c in characters:
        try:
            syms = mapping[c]
            symset = symset.union(syms)
        except:
            symset.add(c)
    return symset

In [351]:
sset = compute_symset(characters, mapping)
print(sset)

maxCharFeat = len(sset)+1
print(len(sset))

print(mapping['+'])

{',', '.', 'postfix', 'e', 'o', 'k', '&', '<BOW>', '<BOS>', 'infix', ')', 'maj', 'p', 'a', 'h', 'v', 'x', '9', 'z', 'f', 'q', 'circ', 'r', '(', 'l', '[', ']', 'grave', 'uml', 'y', '<EOW>', 'j', '^', 'prefix', 'cedil', 'g', "'", '-', 'tilde', '"', '?', 'overring', 'u', '!', '<UNK>', '/', ';', 'd', 't', 'c', '$', '<EOS>', 'aigu', 'b', 'm', 'w', ':', 'i', 'n', 's', '%', 'math'}
62
{'math', 'infix'}


In [352]:
def char_to_charfeatures (char, mpg, symbset):
    try:
        feats = mpg[char]
    except:
        if (char in symbset):
            feats = set([char])
        else:
            feats = set(['<UNK>'])
    
    return feats

In [353]:
char_to_charfeatures("A", mapping, sset)

{'a', 'maj'}

In [354]:
charf_to_integer, integer_to_charf = indexify(sset)

In [355]:
charf_to_integer.keys()

dict_keys([',', '.', 'postfix', 'e', 'o', 'k', '&', '<BOW>', '<BOS>', 'infix', ')', 'maj', 'p', 'a', 'h', 'v', 'x', '9', 'z', 'f', 'q', 'circ', 'r', '(', 'l', '[', ']', 'grave', 'uml', 'y', '<EOW>', 'j', '^', 'prefix', 'cedil', 'g', "'", '-', 'tilde', '"', '?', 'overring', 'u', '!', '<UNK>', '/', ';', 'd', 't', 'c', '$', '<EOS>', 'aigu', 'b', 'm', 'w', ':', 'i', 'n', 's', '%', 'math'])

In [356]:
def char_to_one_hot(char, mpg, c_to_in, symbset, mcf):
    fset = char_to_charfeatures(char, mpg, symbset)
    numf = len(fset)
    
    tmp = np.zeros(numf)
    index = 0
    for item in fset:
        tmp[index] = c_to_in[item]
        index = index + 1
    Y = to_categorical(tmp, mcf)
    return np.sum(Y,axis=0)
   

In [357]:
Y0 = char_to_one_hot("A", mapping, charf_to_integer, sset, maxCharFeat)

print(Y0)


[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [358]:
transform = np.zeros((maxChars+1, maxCharFeat))
for char, i in character_to_integer.items():
    transform[i] = char_to_one_hot(char, mapping, charf_to_integer, sset, maxCharFeat)
    
np.shape(transform)

(129, 63)

In [359]:
def word_to_one_hot(word, mpg, c_to_in, symbset, mcf, bos, eos):
    ls = list(word)
    length = len(ls)+2
    end = length - 1
    Y = np.zeros((length,mcf))
    if bos:
        Y[0] = char_to_one_hot('<BOS>', mpg, c_to_in, symbset, mcf)
    else:    
        Y[0] = char_to_one_hot('<BOW>', mpg, c_to_in, symbset, mcf)
    if eos:
        Y[end] = char_to_one_hot('<EOS>', mpg, c_to_in, symbset, mcf)
    else:    
        Y[end] = char_to_one_hot('<EOW>', mpg, c_to_in, symbset, mcf)

    for i in range(0,end-1):
        Y[i+1] = char_to_one_hot(ls[i], mpg, c_to_in, symbset, mcf)
        
    return Y

In [360]:
Jean = word_to_one_hot('Jean', mapping, charf_to_integer, sset, maxCharFeat, True, True)
print(np.shape(Jean))
print(Jean)

(6, 63)
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.

In [361]:
totalWords = 0
for i in range(len(X_train)-1):
    for j in range(len(X_train[i])):
        totalWords = totalWords + 1
  
print(totalWords)

264421


In [362]:
# this precomputes the results of the embedding layer

X_input = np.zeros((totalwords,maxWordLen+2,maxCharFeat))

xindex = 0

for i in range(len(X_train)-1):
    end = len(X_train[i])
    for j in range(len(X_train[i])):
        bool1 = (j is 0)
        bool2 = (j is end-1)
        X_tmp = word_to_one_hot(X_train[i][j], mapping, charf_to_integer, sset, maxCharFeat, bool1, bool2)       
        for k in range(len(X_tmp)):
            X_input[xindex][k] = X_tmp[k]
        xindex = xindex + 1
        
print(np.shape(X_input))

(264421, 62, 63)


In [379]:
X_charint_input = np.zeros((totalwords,maxChars))

xindex = 0

for i in range(len(X_train)-1):
    end = len(X_train[i])
    for j in range(len(X_train[i])):
        charslist = list(X_train[i][j])
        X_charint_input[xindex][0] = character_to_integer['<BOW>']
        X_charint_input[xindex][len(charslist)] = character_to_integer['<EOW>']
        for c in range(len(charslist)):
            char = charslist[c]
            d = c+1
            try:
                X_charint_input[xindex][d] = character_to_integer[char]
            except:
                print(i)
                print(charslist)
        xindex = xindex + 1
        
print(np.shape(X_charint_input))

(264421, 128)


In [367]:
print(X_train[397])
print(character_to_integer['U'])

['-', '^', 'Reuters', 'Doctoring', 'Photos', 'from', 'Beirut', '?']
17


In [368]:
def outword_mapping(word):
    lword = word.lower
    if lword == "aux":
        return set(['à', 'les'])
    if lword == "des":
         return set(['de', 'les'])
    if lword == "au":
        return set(['à', 'le'])
    if lword == "du":
         return set(['de', 'le'])
    if len(word) == 1 and word.isupper:
        return set(['Alpha'])
    if is_numeral(word):
        return set(["dix-sept"])
    return set([word])
       

In [369]:
def wordf_to_one_hot(word, outword_mapping, wpm, f_to_in, fset, outf):
    wordset = outword_mapping(word)
    featureset = set()
    for word in wordset:
        if word in wpm.keys():
            featureset = featureset.union(wpm[word])
        else:
            wordl = word.lower()
            if wordl in wpm.keys():
                featureset = featureset.union(wpm[wordl])
            else:
                print("Unknown: ", word)
                featureset = set()
            
    length = len(featureset)
    Y = np.zeros((length,outf))

    i = 0
    tmp = np.zeros(len(featureset))
    for f in featureset:
        tmp[i] = f_to_in[f]
        i = i + 1
        
    Y = to_categorical(tmp, outf)

    return np.sum(Y,axis=0)

In [370]:
def normalize_capitals(word):
    word = word.replace("É", "E")
    word = word.replace("Î", "I")
        

In [371]:
vector = wordf_to_one_hot("aux", outword_mapping, wpm, feature_to_integer, fset, outFeatures)
print(np.shape(vector))
print(vector)
print(vector[0])

Unknown:  aux
(357,)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  

In [372]:
print(X_train[0][0])
vec = wordf_to_one_hot(X_train[0][0], outword_mapping, wpm, feature_to_integer, fset, outFeatures)
print(vec)

Des
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0

In [373]:
def empty_mapping(X):
    if is_numeral(X):
        X = "dix-sept"
    if (X == "CBV"):
        X = "Conseil"
    if (X == "D"):
        X = "alpha"
    return X

Y_train = np.zeros((totalwords,outFeatures))

yindex = 0

for i in range(len(X_train)-1):
    for j in range(len(X_train[i])):
        Y_train[yindex] = wordf_to_one_hot(X_train[i][j], outword_mapping, wpm, feature_to_integer, fset, outFeatures)
        yindex = yindex + 1



Unknown:  CBV
Unknown:  aux
Unknown:  Guintoli
Unknown:  ÉTATS-UNIS
Unknown:  au
Unknown:  aux
Unknown:  aux
Unknown:  Popi
Unknown:  Bayard
Unknown:  HLM
Unknown:  aux
Unknown:  aux
Unknown:  TVA
Unknown:  Doubin
Unknown:  Jean-Louis
Unknown:  Pétriat
Unknown:  FNAC
Unknown:  Ruymbeke
Unknown:  au
Unknown:  BCCI
Unknown:  Mahfouz
Unknown:  bank
Unknown:  Jean-Yves
Unknown:  Chamard
Unknown:  RPR
Unknown:  UDF
Unknown:  UDC
Unknown:  aux
Unknown:  Etats
Unknown:  Guinée-Equatoriale
Unknown:  Sierra-Léone
Unknown:  Pronasol
Unknown:  M3
Unknown:  Bundesbank
Unknown:  UNIM
Unknown:  Schlesinger
Unknown:  Bundesbank
Unknown:  -ci
Unknown:  PS
Unknown:  au
Unknown:  Etats
Unknown:  Balmary
Unknown:  Etat
Unknown:  au
Unknown:  OCDE
Unknown:  SME
Unknown:  PNB
Unknown:  au
Unknown:  Etats
Unknown:  au
Unknown:  RPR
Unknown:  UDF
Unknown:  au
Unknown:  rétrocommissions
Unknown:  Strauss-Kahn
Unknown:  au
Unknown:  CNPF
Unknown:  UPA
Unknown:  aux
Unknown:  au
Unknown:  Trente-cinq
Unknown:  

Unknown:  CNRS
Unknown:  MPG
Unknown:  ITCF
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  Leibinger
Unknown:  Vingt-cinq
Unknown:  Eltsine
Unknown:  macro-
Unknown:  recapitalisation
Unknown:  Etat
Unknown:  aux
Unknown:  Cooke
Unknown:  1er
Unknown:  Farinet
Unknown:  Verneau
Unknown:  Education
Unknown:  Vindé
Unknown:  Edition
Unknown:  ISBN
Unknown:  aux
Unknown:  Jean-François
Unknown:  Burgelin
Unknown:  Toulouse-Paris
Unknown:  Paris-Nantes
Unknown:  Paris-Bâle
Unknown:  aux
Unknown:  au
Unknown:  CHOMAGE
Unknown:  INSEE
Unknown:  Dominati
Unknown:  FIJ
Unknown:  procroate
Unknown:  au
Unknown:  SMIC
Unknown:  au
Unknown:  OPA
Unknown:  Exor
Unknown:  Etat
Unknown:  au
Unknown:  aux
Unknown:  Dreyfus
Unknown:  Kleinwort
Unknown:  Securities
Unknown:  au
Unknown:  au
Unknown:  Vieux-Colombier
Unknown:  Mercedes-Benz
Unknown:  au
Unknown:  Reebok
Unknown:  mal-aimée
Unknown:  Genève
Unknown:  Sutherland
Unknown:  Mobis
Unknown:  franchisés
Unknown:  Delaporte
Unknown:  Vieux-Colomb

Unknown:  Houcine
Unknown:  Academy
Unknown:  aux
Unknown:  au
Unknown:  Ssangyong
Unknown:  Korando
Unknown:  Korando
Unknown:  Family
Unknown:  Alcatel
Unknown:  Telettra
Unknown:  au
Unknown:  surengagés
Unknown:  au
Unknown:  bar-tabac
Unknown:  CAC
Unknown:  aux
Unknown:  ultramarines
Unknown:  Belfond
Unknown:  Kuo
Unknown:  Éric
Unknown:  Halphen
Unknown:  Poullain
Unknown:  HLM
Unknown:  Schuller
Unknown:  SAR
Unknown:  au
Unknown:  aux
Unknown:  Procopio
Unknown:  persona
Unknown:  grata
Unknown:  Dumas
Unknown:  II
Unknown:  Hongkong
Unknown:  non-
Unknown:  au
Unknown:  aux
Unknown:  Etat
Unknown:  aux
Unknown:  Pentland
Unknown:  Dachaud
Unknown:  au
Unknown:  CNPF
Unknown:  UNEDIC
Unknown:  Touraine
Unknown:  deficiency
Unknown:  payments
Unknown:  CEE
Unknown:  Etats-Unis
Unknown:  au
Unknown:  Denormandie
Unknown:  trente-sept
Unknown:  UJJEF
Unknown:  BFCE
Unknown:  au
Unknown:  Au
Unknown:  CSN
Unknown:  usine-symbole
Unknown:  au
Unknown:  PIB
Unknown:  aérobic
Unknow

Unknown:  Clinvest
Unknown:  ANC
Unknown:  Mandela
Unknown:  Klerk
Unknown:  Andreuzza
Unknown:  aux
Unknown:  Angiox
Unknown:  AC
Unknown:  Reebok
Unknown:  au
Unknown:  Jean-Jacques
Unknown:  Rollat
Unknown:  RMI
Unknown:  TUC
Unknown:  SIVP
Unknown:  aux
Unknown:  HLM
Unknown:  Lehalle
Unknown:  au
Unknown:  Asko
Unknown:  non-
Unknown:  au
Unknown:  HLM
Unknown:  rogatoire
Unknown:  Vandingenen
Unknown:  Portelli
Unknown:  SAP
Unknown:  fabricant-grossiste
Unknown:  au
Unknown:  aux
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  Belfond
Unknown:  Echiquier
Unknown:  CEE
Unknown:  Au
Unknown:  Au
Unknown:  Corbusier
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  Hongkong
Unknown:  au
Unknown:  pipi-room
Unknown:  au
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  New-York
Unknown:  UAP
Unknown:  AGF
Unknown:  GAN
Unknown:  aux
Unknown:  CSG
Unknown:  Bongrain
Unknown:  Sogénor
Unknown:  au
Unknown:  au
Unknown:  Pasqua
Unknown:  au
Unknown:  au
Unknown:  PAC
Unknown:  Parretti
Unknown:  OC

Unknown:  Besançon
Unknown:  Elf
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  francaise
Unknown:  Champs-Elysées
Unknown:  CVS
Unknown:  au
Unknown:  Bf.G.
Unknown:  Bf.G
Unknown:  Deutschland
Unknown:  Jean-Yves
Unknown:  Haberer
Unknown:  Bf.G
Unknown:  sine
Unknown:  qua
Unknown:  Frydman
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  Eysymontt
Unknown:  hyperinflation
Unknown:  Sanofi
Unknown:  Bongrain
Unknown:  eurobanques
Unknown:  -ci
Unknown:  PRI
Unknown:  au
Unknown:  ejidos
Unknown:  Maastricht
Unknown:  au
Unknown:  au
Unknown:  Fondiaria
Unknown:  AGF
Unknown:  Thomson
Unknown:  franco-italien
Unknown:  Elfi
Unknown:  BNP
Unknown:  UAP
Unknown:  Maladetta
Unknown:  au
Unknown:  trente-deux
Unknown:  au
Unknown:  Jankowski
Unknown:  Marchelli
Unknown:  france
Unknown:  Giraud
Unknown:  Ile-de-France
Unknown:  Etienne
Unknown:  Vatelot
Unknown:  FMI
Unknown:  S.
Unknown:  L.
Unknown:  Rao
Unknown:  Éric
Unknown:  Halphen
Unknown:  Poulla

Unknown:  Dassault
Unknown:  CFAO
Unknown:  au
Unknown:  GATT
Unknown:  Etat
Unknown:  Etat
Unknown:  Elf-Aquitaine
Unknown:  au
Unknown:  au
Unknown:  aux
Unknown:  Moody's
Unknown:  cash-flow
Unknown:  Philips
Unknown:  reparait
Unknown:  Equateur
Unknown:  percutanée
Unknown:  ICP
Unknown:  SME
Unknown:  au
Unknown:  Besnier
Unknown:  Italcementi
Unknown:  Zelnik
Unknown:  Virgin
Unknown:  Bundesbank
Unknown:  Mc.
Unknown:  Donnell
Unknown:  au
Unknown:  J.-F.-Kennedy
Unknown:  New-York
Unknown:  au
Unknown:  3e
Unknown:  au
Unknown:  Blum
Unknown:  aux
Unknown:  sous-évaluation
Unknown:  au
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  ACOSS
Unknown:  Dassler
Unknown:  reclasssement
Unknown:  Exor
Unknown:  Ominco
Unknown:  SBF
Unknown:  Pointe-des-Galets
Unknown:  aux
Unknown:  Tamatave
Unknown:  au
Unknown:  Ruymbeke
Unknown:  CD-ROM
Unknown:  Clearstream
Unknown:  Bagnères
Unknown:  au
Unknown:  Bénasque
Unknown:  UER
Unknown:  Euronews
Unknown:  UTA
Unknown:  ICL
Unknown:  au
Un

Unknown:  au
Unknown:  au
Unknown:  Schlesinger
Unknown:  Bundesbank
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  Angiox
Unknown:  Arianespace
Unknown:  CEE
Unknown:  B.
Unknown:  Lathière
Unknown:  Ladreit
Unknown:  Lacharrière
Unknown:  toujours-plus
Unknown:  autoparodie
Unknown:  Mauroy-Delors
Unknown:  aux
Unknown:  multifonctionnel
Unknown:  au
Unknown:  emploi-solidarité
Unknown:  TUC
Unknown:  aux
Unknown:  congé-maladie
Unknown:  FNB
Unknown:  Philips
Unknown:  Moody's
Unknown:  Ileau
Unknown:  Muzelle
Unknown:  aux
Unknown:  Mermaz
Unknown:  Bergons
Unknown:  leitender
Unknown:  Angestellter
Unknown:  Au
Unknown:  au
Unknown:  Yoshitomi
Unknown:  Pharmaceuticals
Unknown:  au
Unknown:  Éric
Unknown:  Halphen
Unknown:  au
Unknown:  est-à-dire
Unknown:  Olszewski
Unknown:  île-de-France
Unknown:  CSA
Unknown:  au
Unknown:  ex-
Unknown:  BP
Unknown:  au
Unknown:  Etat
Unknown:  Saint-pères
Unknown:  Fininvest
Unknown:  au
Unknown:  Aubry
Unknown:  Calvet
Unknown:  autoentrete

Unknown:  Thomson-CSF
Unknown:  mi-
Unknown:  au
Unknown:  Exchange
Unknown:  au
Unknown:  Barèges
Unknown:  au
Unknown:  franco-italien
Unknown:  SGS-Thomson
Unknown:  au
Unknown:  Hénin
Unknown:  GMF
Unknown:  Asturies
Unknown:  Looses
Unknown:  au
Unknown:  CAC
Unknown:  INSEE
Unknown:  au
Unknown:  au
Unknown:  Etats-Unis
Unknown:  Etat
Unknown:  Etats-Unis
Unknown:  SNAV
Unknown:  au
Unknown:  Onché
Unknown:  Krasucki
Unknown:  au
Unknown:  au
Unknown:  Paris-Montsouris
Unknown:  Brochand
Unknown:  Imbot
Unknown:  au
Unknown:  Pernod-Ricard
Unknown:  Ricard
Unknown:  au
Unknown:  Bercy
Unknown:  CEE
Unknown:  au
Unknown:  EMBLÉE
Unknown:  au
Unknown:  Tiberi
Unknown:  au
Unknown:  au
Unknown:  Edimbourg
Unknown:  CDU
Unknown:  SPD
Unknown:  Waigel
Unknown:  Vogel
Unknown:  Dumas
Unknown:  au
Unknown:  Lanquetin
Unknown:  aux
Unknown:  SNPMI
Unknown:  vice-
Unknown:  SPD
Unknown:  RFA
Unknown:  Vogel
Unknown:  Rau
Unknown:  Berlin-Est
Unknown:  au
Unknown:  Au
Unknown:  CIFUS
Unkno

Unknown:  Gomez
Unknown:  Thomson
Unknown:  CSF
Unknown:  au
Unknown:  Etat
Unknown:  auto-
Unknown:  Baloutchistan
Unknown:  au
Unknown:  quatre-vingts
Unknown:  Elf
Unknown:  BCCI
Unknown:  Legrand
Unknown:  Midwest
Unknown:  au
Unknown:  Etat
Unknown:  Ruymbeke
Unknown:  Talancé
Unknown:  Arlington
Unknown:  Willow-Run
Unknown:  GM
Unknown:  Grim
Unknown:  Berlusconi
Unknown:  aux
Unknown:  trente-deux
Unknown:  Staedelin
Unknown:  BNP
Unknown:  aux
Unknown:  au
Unknown:  aux
Unknown:  aux
Unknown:  Haussmann
Unknown:  Godart
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  XXe
Unknown:  Bérégovoy
Unknown:  Méziré
Unknown:  au
Unknown:  Lander
Unknown:  aux
Unknown:  aux
Unknown:  trachytique
Unknown:  Pichincha
Unknown:  aux
Unknown:  au
Unknown:  CNPA
Unknown:  Alpe-d'Huez
Unknown:  Maastricht
Unknown:  Tchernobyl
Unknown:  Philips
Unknown:  aux
Unknown:  aux
Unknown:  Bundesbank
Unknown:  UAP
Unknown:  au
Unknown:  Victoire-Colonia
Unknown:  aux
Unknown:  Etats-Unis


Unknown:  Faure
Unknown:  aux
Unknown:  Stern
Unknown:  BERD
Unknown:  kids
Unknown:  CNPF
Unknown:  auquel
Unknown:  Nehru
Unknown:  Leigh-Pemberton
Unknown:  réalignement
Unknown:  SME
Unknown:  Etat
Unknown:  Lienemann
Unknown:  Rotterdam
Unknown:  Bentsen
Unknown:  Möllemann
Unknown:  Seiters
Unknown:  SMIC
Unknown:  ADP
Unknown:  Elf
Unknown:  au
Unknown:  CCAS
Unknown:  Maastricht
Unknown:  Hang-Seng
Unknown:  Angiox
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  quasi-
Unknown:  Haddad
Unknown:  aux
Unknown:  aux
Unknown:  Disney
Unknown:  Prévost-Desprez
Unknown:  rétro-
Unknown:  au
Unknown:  TV6
Unknown:  Turner
Unknown:  CNN
Unknown:  Moscow
Unknown:  Independant
Unknown:  Broadcasting
Unknown:  Au
Unknown:  franco-allemande
Unknown:  cinquante-quatre
Unknown:  Jean-François
Unknown:  Régnier
Unknown:  quarante-cinq
Unknown:  OPFI
Unknown:  au
Unknown:  Etat
Unknown:  aux
Unknown:  chiffre-clé
Unknown:  au
Unknown:  au
Unknown:  8e
Unknown:  Calvet
Unknown:  PSA-Citroën
Un

Unknown:  Biba
Unknown:  Puel
Unknown:  Besançon
Unknown:  Jean-Pierre
Unknown:  Halphen
Unknown:  Triboulottes
Unknown:  au
Unknown:  au
Unknown:  aux
Unknown:  Val-Fréjus
Unknown:  GATT
Unknown:  Vaclav
Unknown:  au
Unknown:  au
Unknown:  contre-pouvoirs
Unknown:  Elf
Unknown:  Loïk
Unknown:  Floch-Prigent
Unknown:  Lopoukhine
Unknown:  exploration-production
Unknown:  Etats
Unknown:  Wang
Unknown:  Senhao
Unknown:  CNCC
Unknown:  CEE
Unknown:  AELE
Unknown:  EEE
Unknown:  Tardieu
Unknown:  Au
Unknown:  Indosuez
Unknown:  Jeancourt-Galignani
Unknown:  Etat
Unknown:  Souain
Unknown:  Mexico-City
Unknown:  SED
Unknown:  Gysi
Unknown:  aux
Unknown:  60e
Unknown:  Réveilhac
Unknown:  Maastricht
Unknown:  Au
Unknown:  ejido
Unknown:  précoloniale
Unknown:  COB
Unknown:  aux
Unknown:  VEV
Unknown:  CBV
Unknown:  CPR
Unknown:  CGM
Unknown:  CPR
Unknown:  auxquels
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  Sendowski
Unknown:  trente-neuf
Unknown:  Lesieur
Unknown:  au
Unknown:  Ducros


Unknown:  au
Unknown:  Gorbatchev
Unknown:  PC
Unknown:  valses-hésitations
Unknown:  RDA
Unknown:  anti-
Unknown:  CNCC
Unknown:  Etat
Unknown:  aux
Unknown:  BBC
Unknown:  Elargir
Unknown:  BBC
Unknown:  Chicago
Unknown:  Zenith
Unknown:  Bull
Unknown:  aux
Unknown:  aux
Unknown:  Naf
Unknown:  Naf
Unknown:  au
Unknown:  CIATER
Unknown:  au
Unknown:  Cusiana
Unknown:  Cupiaga
Unknown:  aux
Unknown:  Bundesbank
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  Etat
Unknown:  RMI
Unknown:  Economie
Unknown:  INSEE
Unknown:  au
Unknown:  RMI
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  au
Unknown:  Laske
Unknown:  État-voyou
Unknown:  Denoël
Unknown:  anglo-néerlandais
Unknown:  Ponthieux
Unknown:  Echos
Unknown:  Marboré
Unknown:  Berlusconi
Unknown:  aux
Unknown:  coca-cola
Unknown:  Mercosur
Unknown:  sud-américains
Unknown:  Jack-Yves
Unknown:  Bohbot
Unknown:  Dominati
Unknown:  SAN-FRANCISCO
Unknown:  Evalué
Unknown:  Bongrain
Unknown:  Besnier
Unknown:  UAP
Unknown:  Nordstern

Unknown:  au
Unknown:  AB
Unknown:  BCE
Unknown:  au
Unknown:  Bérégovoy
Unknown:  aux
Unknown:  rupiahs
Unknown:  Serang
Unknown:  au
Unknown:  CNJA
Unknown:  aux
Unknown:  CNPF
Unknown:  au
Unknown:  Birt
Unknown:  Hurand
Unknown:  Boucault
Unknown:  hors-cadre
Unknown:  aux
Unknown:  75ème
Unknown:  ejido
Unknown:  Christchurch
Unknown:  New
Unknown:  Zealand
Unknown:  Herald
Unknown:  USPA
Unknown:  Lafont
Unknown:  ex-
Unknown:  Sabouret
Unknown:  Deluchat
Unknown:  CSA
Unknown:  CEE
Unknown:  antidumping
Unknown:  est-européen
Unknown:  classless
Unknown:  society
Unknown:  Dupont
Unknown:  Orsini
Unknown:  Dupont
Unknown:  au
Unknown:  Au
Unknown:  GATT
Unknown:  ex-
Unknown:  Maastricht
Unknown:  réalignement
Unknown:  au
Unknown:  SME
Unknown:  Buba
Unknown:  Elf
Unknown:  au
Unknown:  KLM
Unknown:  aux
Unknown:  au
Unknown:  Equipment
Unknown:  Comau
Unknown:  Seiko
Unknown:  Olivetti
Unknown:  Gèdre
Unknown:  diner
Unknown:  au
Unknown:  Girard
Unknown:  soixante-neuf
Unknow

Unknown:  Riboud
Unknown:  Giscard
Unknown:  BSN
Unknown:  Evian
Unknown:  au
Unknown:  au
Unknown:  Jean-René
Unknown:  Agnelli
Unknown:  Exor
Unknown:  Grenet
Unknown:  DRH
Unknown:  saint-Gobain
Unknown:  Northern
Unknown:  Telecom
Unknown:  MMB
Unknown:  Lagardère
Unknown:  Lyonnaise-Dumez
Unknown:  Caixa
Unknown:  SGAB
Unknown:  Caixa
Unknown:  Lyonnaise-Dumez
Unknown:  LTV
Unknown:  Aerospace
Unknown:  Defense
Unknown:  Etat
Unknown:  Etat
Unknown:  Sabouret
Unknown:  rauco
Unknown:  suon
Unknown:  tartarea
Unknown:  tromba
Unknown:  CNPF
Unknown:  non-
Unknown:  ARRCO
Unknown:  Etat
Unknown:  au
Unknown:  Sachs
Unknown:  aux
Unknown:  Renon
Unknown:  SNECMA
Unknown:  Bénichou
Unknown:  au
Unknown:  ALGER
Unknown:  au
Unknown:  Honda
Unknown:  Mazda
Unknown:  aux
Unknown:  au
Unknown:  Etat
Unknown:  AGF
Unknown:  Nérac
Unknown:  Kwan
Unknown:  Sirven
Unknown:  Elf
Unknown:  CEL
Unknown:  PEL
Unknown:  Genève
Unknown:  Eno
Unknown:  Lessons
Unknown:  how
Unknown:  to
Unknown:  Ir

Unknown:  crève-misère
Unknown:  au
Unknown:  EMEA
Unknown:  H/C/562
Unknown:  pro-
Unknown:  eurosceptiques
Unknown:  au
Unknown:  SME
Unknown:  franco-britannique
Unknown:  Paribas
Unknown:  Paribas
Unknown:  Télécom
Unknown:  aux
Unknown:  au
Unknown:  Vignemale
Unknown:  au
Unknown:  Poullain
Unknown:  Etat
Unknown:  Pasqua
Unknown:  Hoeffel
Unknown:  Atalla
Unknown:  Jaffra
Unknown:  H.
Unknown:  Vuong
Unknown:  Viêtnam
Unknown:  XVIII
Unknown:  No.
Unknown:  Au
Unknown:  OPEP
Unknown:  Jean-Pierre
Unknown:  Dumont
Unknown:  quasi-
Unknown:  assurance-maladie
Unknown:  au
Unknown:  pub-info
Unknown:  au
Unknown:  Cousances
Unknown:  Baudesson
Unknown:  IV
Unknown:  Marnaval
Unknown:  au
Unknown:  XIXe
Unknown:  Espingo
Unknown:  Indosuez
Unknown:  Haussmann
Unknown:  volta-redonda
Unknown:  CSN
Unknown:  aux
Unknown:  Langoni
Unknown:  Sombart
Unknown:  ejidales
Unknown:  Taiwan
Unknown:  Braniff
Unknown:  Airlines
Unknown:  HLM
Unknown:  aux
Unknown:  UNEDIC
Unknown:  au
Unknown:

Unknown:  au
Unknown:  aux
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  aux
Unknown:  non-
Unknown:  CIP
Unknown:  Etat
Unknown:  aux
Unknown:  BELGIQUE
Unknown:  au
Unknown:  Pinault-Printemps
Unknown:  ISBN
Unknown:  of
Unknown:  Credit
Unknown:  and
Unknown:  BCCI
Unknown:  au
Unknown:  au
Unknown:  OCDE
Unknown:  Mc
Unknown:  Guffey
Unknown:  Grand-Prairie
Unknown:  LTV
Unknown:  Thomson
Unknown:  LTV
Unknown:  au
Unknown:  Clearstream
Unknown:  ex-
Unknown:  Cedel
Unknown:  Euroclear
Unknown:  Menem
Unknown:  Eysymontt
Unknown:  au
Unknown:  Bérégovoy
Unknown:  Northern
Unknown:  Telecom
Unknown:  Elf-Atochem
Unknown:  au
Unknown:  Yoshitomi
Unknown:  Pharmaceuticals
Unknown:  Akishima
Unknown:  Portelli
Unknown:  Vandingenen
Unknown:  GATT
Unknown:  au
Unknown:  au
Unknown:  SED
Unknown:  Modrow
Unknown:  au
Unknown:  Babangida
Unknown:  Maradona
Unknown:  FRANCFORT
Unknown:  CGCT
Unknown:  Ericsson
Unknown:  ex-
Unknown:  ITT
Unknown:  au
Unknown:  est-européennes
Unknown:  Yerg

Unknown:  au
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  au
Unknown:  Au
Unknown:  Deutsche
Unknown:  bank
Unknown:  PIB
Unknown:  SME
Unknown:  Zenith
Unknown:  reconcourir
Unknown:  air-bag
Unknown:  Sochaux-Montbéliard
Unknown:  Propria
Unknown:  barégine
Unknown:  étalon-or
Unknown:  aux
Unknown:  ptt
Unknown:  CIC
Unknown:  Cazalet
Unknown:  Var-Matin
Unknown:  Khrunichev
Unknown:  Kvant
Unknown:  Almaz
Unknown:  UNEDIC
Unknown:  Jean-Pierre
Unknown:  Soisson
Unknown:  robusta
Unknown:  Mesrahi
Unknown:  Tardivat
Unknown:  Elie
Unknown:  Fellous
Unknown:  Rubin
Unknown:  Pentland
Unknown:  Adidas
Unknown:  UNEDIC
Unknown:  au
Unknown:  Electricidad
Unknown:  Argentina
Unknown:  Keller
Unknown:  DRH
Unknown:  Thomson-CSF
Unknown:  Pellerin
Unknown:  Jones
Unknown:  Jonson
Unknown:  Ier
Unknown:  Palladio
Unknown:  URL
Unknown:  aux
Unknown:  Frigoscandia
Unknown:  aux
Unknown:  Etats-Unis
Unknown:  ex-
Unknown:  RDA
Unknown:  au
Unknown:  DM
Unknown:  DM
Unknown:  Au
Unknown:  -ci

In [374]:
def pretrained_embedding_layer():
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = maxChars+1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = 63      # define dimensionality of your GloVe word vectors (= 50)
    
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len,emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
#    for char, index in character_to_integer.items():
#        emb_matrix[index, :] = char_to_index[char]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len,emb_dim,trainable=False,mask_zero=True)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([transform])
    
    return embedding_layer

In [399]:
def Embedding_model(input_shape):
    word_indices = Input(shape = input_shape, dtype = 'int32')
    embedding_layer = pretrained_embedding_layer()
    embeddings = embedding_layer(word_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # returning a batch of sequences.
    X = LSTM(128, return_sequences=False)(embeddings)
    X = BatchNormalization()(X)
    Y = Dropout(0.5)(X)
    # Add a (time distributed) Dense layer followed by a softmax activation
    Y = Dense(outFeatures, activation='sigmoid')(Y)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=word_indices,outputs=Y)
        
    return model

In [400]:
emb = Embedding_model((maxChars,))
emb.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        (None, 128)               0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 128, 63)           8127      
_________________________________________________________________
lstm_12 (LSTM)               (None, 128)               98304     
_________________________________________________________________
batch_normalization_12 (Batc (None, 128)               512       
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 357)               46053     
Total params: 152,996
Trainable params: 144,613
Non-trainable params: 8,383
_________________________________________________________________


In [401]:
emb.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mae'])

In [None]:
history = emb.fit(X_charint_input, Y_train, epochs = 10, batch_size = 32, shuffle=True)

Epoch 1/10

In [294]:
print(np.shape(X_input))

(264421, 62, 64)


In [295]:
print(X_input[0])

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
