# Commencement

## Google drive

### **Open drive**

In [1]:
import os

from google.colab import drive
drive.mount('drive')

!mkdir -p drive -v

cwd = os.getcwd()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at drive


In [0]:
monchemin = 'drive/My Drive/Traitement de la langue/'

dir_path  = os.path.join(cwd, monchemin)
dirs = os.listdir(dir_path)
os.chdir(dir_path)

### **Chemin vers les fichiers**

In [5]:
path_jap = 'UD_Japanese-Modern/'
path_fr = 'UD_French-GSD/'
path_en = 'UD_English-LinES/'
path_du = 'UD_Dutch-LassySmall/'

print("\nDossier fichiers JAP :")
!ls "$path_jap" | grep ".conllu"
jap_test = "ja_modern-ud-test.conllu"

print("\nDossier fichiers FR :")
!ls "$path_fr" | grep ".conllu"
fr_dev = "fr_gsd-ud-dev.conllu"
fr_test = "fr_gsd-ud-test.conllu"
fr_train = "fr_gsd-ud-train.conllu"

print("\nDossier fichiers EN :")
!ls "$path_en" | grep ".conllu"
en_dev = "en_lines-ud-dev.conllu"
en_test = "en_lines-ud-test.conllu"
en_train = "en_lines-ud-train.conllu"

print("\nDossier fichiers NL :")
!ls "$path_du" | grep ".conllu"
nl_dev = "nl_lassysmall-ud-dev.conllu"
nl_test = "nl_lassysmall-ud-test.conllu"
nl_train = "nl_lassysmall-ud-train.conllu"


Dossier fichiers JAP :
ja_modern-ud-test.conllu

Dossier fichiers FR :
fr_gsd-ud-dev.conllu
fr_gsd-ud-test.conllu
fr_gsd-ud-train.conllu

Dossier fichiers EN :
en_lines-ud-dev.conllu
en_lines-ud-test.conllu
en_lines-ud-train.conllu

Dossier fichiers NL :
nl_lassysmall-ud-dev.conllu
nl_lassysmall-ud-test.conllu
nl_lassysmall-ud-train.conllu


## Import

In [6]:
import sys

import numpy as np
import keras
from keras.models import Model, Sequential
from keras.layers import Input, Dense

Using TensorFlow backend.


# Structures importantes

## Lecture des fichiers

### **Class Word**

In [0]:
class Word:
  def __init__(self):
    self.featDic = {}

  def getFeat(self, featName):
    if(not featName in self.featDic):
      print('WARNING : feat', featName, 'does not exist')
      return None
    else:
      return self.featDic[featName]

  def setFeat(self, featName, featValue):
    self.featDic[featName] = featValue

  def affiche(self, mcd):
    for elt in mcd:
      feat, status = elt
      print(self.getFeat(feat), '\t', end='')
    print('')

        
  @staticmethod
  def fakeWord(mcd):
    w = Word()
    for elt in mcd:
      feat, status = elt
      w.setFeat(feat, 'ROOT')
    return w

  @staticmethod
  def invalidGov():
    return 123456789

  @staticmethod
  def invalidLabel():
    return ''

### **Class WordBuffer**

In [0]:
class WordBuffer:
  def __init__(self, mcd=None):
    self.currentIndex = 0
    self.array = []
    self.length = 0
    self.mcd = mcd

  def addWord(self, w):
    self.array.append(w)
    self.length += 1

  def affiche(self, mcd):
    for w in self.array:
      w.affiche(mcd)

  def getLength(self):
    return self.length

  def getCurrentIndex(self):
    return self.currentIndex

  def getWord(self, index):
    return self.array[index]

  def getCurrentWord(self):
    return self.getWord(self.currentIndex)

  def nextSentence(self):
    sentence = []
    sentence.append(Word.fakeWord(self.mcd))
    if self.currentIndex == self.length - 1 :
      return False
    while self.currentIndex < self.length :
      sentence.append(self.getCurrentWord())
      if int(self.getCurrentWord().getFeat('EOS')) == 1 :
        self.currentIndex += 1
        return sentence
      else:
        self.currentIndex += 1

  def readFromMcfFile(self, mcfFilename):
    try:
      mcfFile = open(mcfFilename, encoding='utf-8')
    except IOError:
      print(mcfFilename, " : ce fichier n'existe pas")
      exit(1)
    tokens = []
    for ligne in mcfFile:
      tokens = ligne.split("\t")
      w = Word()
      for i in range(0, len(tokens)):
        if(self.mcd[i][0] == 'GOV'):
          w.setFeat(self.mcd[i][0], tokens[i])
          w.setFeat('GOVABS', str(self.length + int(tokens[i]))) # compute absolute index of governor
        else:
          w.setFeat(self.mcd[i][0], tokens[i])
      self.addWord(w)
    mcfFile.close();

  def readFromConlluFile(self, conlluFilename):
    try:
      conlluFile = open(conlluFilename, encoding='utf-8')
    except IOError:
      print(conlluFilename, " : ce fichier n'existe pas")
      exit(1)
    tokens = []
    for ligne in conlluFile:
      if ligne[0] == '\n' :
        self.getWord(self.currentIndex - 1).setFeat('EOS', '1')
        next
      elif ligne[0] == '#' :
        next
      else :
        tokens = ligne.split("\t")
        if '-' not in tokens[0]:
          w = Word()
          for i in range(0, len(tokens)):
            if i == 0 :
              w.setFeat('INDEX', tokens[i])
            if i == 1 :
              w.setFeat('FORM', tokens[i])
            if i == 2 :
              w.setFeat('LEMMA', tokens[i])
            if i == 3 :
              w.setFeat('POS', tokens[i])
            if i == 4 :
              w.setFeat('X1', tokens[i])
            if i == 5 :
              w.setFeat('MORPHO', tokens[i])
            if i == 6 :
              w.setFeat('GOV', tokens[i])
            if i == 7 :
              w.setFeat('LABEL', tokens[i])
            if i == 8 :
              w.setFeat('X2', tokens[i])
            if i == 9 :
              w.setFeat('X3', tokens[i])
          w.setFeat('EOS', '0')
          self.addWord(w)
    conlluFile.close();

  def end(self):
    if(self.getCurrentIndex() >= self.getLength()):
      return True
    else:
      return False

### Class Dico

In [0]:
class Dicos:
  def __init__(self, mcd=False, fileName=False, verbose=False):
    self.content = {}
    self.locked = False
    if mcd :
      for elt in mcd :
        name, status = elt;
        if(status == 'SYM') : 
          self.content[name] = ['NULL', 'ROOT']
    elif fileName :
      try:
        dicoFile = open(fileName, encoding='utf-8')
      except IOError:
        print(fileName, 'does not exist')
        exit(1)
      for ligne in dicoFile:
        if ligne[0] == '#' and ligne[1] == '#' :
          currentDico = ligne[2:-1]
          self.content[currentDico] = []
          if(verbose): 
            print('in module', __name__, 'create dico', currentDico)
        else:
          value = ligne[:-1]
          if not value in self.content[currentDico] :
            self.content[currentDico].append(value)
            if(verbose): 
              print('in module', __name__, 'adding entry', value, 'to', currentDico)
      dicoFile.close()
      self.lock()


  def populateFromMcfFile(self, mcfFilename, mcd, verbose=False):
    try:
      mcfFile = open(mcfFilename, encoding='utf-8')
    except IOError:
      print('cannot open', mcfFilename)
      exit(1)
    tokens = []
    for ligne in mcfFile:
      tokens = ligne.split()
      for i in range(0, len(tokens)):
        if mcd[i][1] == 'SYM' :
          if not tokens[i] in self.content[mcd[i][0]] :
            self.content[mcd[i][0]].append(tokens[i])
            if(verbose): 
              print('in module:', __name__, 'adding value ', tokens[i], 'to dico', mcd[i][0]) 
    mcfFile.close();
    for e in self.content:
      print('DICO', e, ':\t', len(self.content[e]), 'entries')

  def populateFromConlluFile(self, conlluFilename, verbose=False):
    try:
      conlluFile = open(conlluFilename, encoding='utf-8')
    except IOError:
      print('cannot open', conlluFilename)
      exit(1)
    mcd = (('INDEX', 'INT'), ('FORM', 'INT'), ('LEMMA', 'INT'), ('POS', 'SYM'), ('X1', 'INT'), ('MORPHO', 'INT'), ('GOV', 'SYM'), ('LABEL', 'SYM'), ('X2', 'SYM'), ('X3', 'SYM'))
    tokens = []
    for ligne in conlluFile:
      if ligne[0] != '\n' and ligne[0] != '#' :
        tokens = ligne.split("\t")
        for i in range(0, len(tokens)):
          if mcd[i][1] == 'SYM' :
            if not tokens[i] in self.content[mcd[i][0]] :
              self.content[mcd[i][0]].append(tokens[i])
              if(verbose): 
                print('in module:', __name__, 'adding value ', tokens[i], 'to dico', mcd[i][0]) 
    conlluFile.close();
    for e in self.content:
      print('DICO', e, ':\t', len(self.content[e]), 'entries')

  def lock(self):
    self.locked = True
    for key in self.content.keys():
      self.content[key] = tuple(self.content[key])

  def print(self):
    for dico in self.content.keys():
      print(dico, self.content[dico])

  def printToFile(self, filename):
    try:
      dicoFile = open(filename, 'w', encoding='utf-8')
    except IOError:
      print('cannot open', filename)
      exit(1)
    for dico in self.content.keys():
      dicoFile.write('##')
      dicoFile.write(dico)
      dicoFile.write('\n')
      for elt in self.content[dico]:
        dicoFile.write(elt)
        dicoFile.write('\n')
    dicoFile.close()

  def getCode(self, dicoName, symbol, verbose=False) :
    if(verbose) : print('in module ', __name__, 'getCode(', dicoName, ',', symbol, ')')
    if not self.locked :
      print('Dicos must be locked before they can be accessed')
      return False
    if not dicoName in self.content :
      print('no such dico as', dicoName)
      return False
    return self.content[dicoName].index(symbol)

  def getSymbol(self, dicoName, code) :
    if not self.locked :
      print('Dicos must be locked before they can be accessed')
      return False
    if not dicoName in self.content :
      print('no such dico as', dicoName)
      return False
    return self.content[dicoName][code]

  def add(self, dicoName, symbol) :
    if not dicoName in self.content :
      self.content[dicoName] = {symbol}
    else:
      self.content[dicoName].add(symbol)

### Test

* **Word / WordBuffer**

In [10]:
mcd =(('INDEX', 'INT'), ('FORM', 'INT'), ('LEMMA', 'INT'), ('POS', 'SYM'), ('X1', 'INT'), ('MORPHO', 'INT'), ('GOV', 'SYM'), ('LABEL', 'SYM'), ('X2', 'SYM'), ('X3', 'SYM'))

wb = WordBuffer(mcd);
wb.readFromConlluFile(path_fr+f1_fr);


def printSentence(sentence, mcd):
    for i in range(0, len(sentence)):
        sentence[i].affiche(mcd)

        
sentence = wb.nextSentence()
sentence[1].affiche(mcd)
print()
for m in mcd:
  print(sentence[1].getFeat(m[0]), "\t=>\t", m[0])  
print()
printSentence(sentence, mcd)

1 	Aviator 	Aviator 	PROPN 	_ 	_ 	0 	root 	_ 	SpaceAfter=No
 	

1 	=>	 INDEX
Aviator 	=>	 FORM
Aviator 	=>	 LEMMA
PROPN 	=>	 POS
_ 	=>	 X1
_ 	=>	 MORPHO
0 	=>	 GOV
root 	=>	 LABEL
_ 	=>	 X2
SpaceAfter=No
 	=>	 X3

ROOT 	ROOT 	ROOT 	ROOT 	ROOT 	ROOT 	ROOT 	ROOT 	ROOT 	ROOT 	
1 	Aviator 	Aviator 	PROPN 	_ 	_ 	0 	root 	_ 	SpaceAfter=No
 	
2 	, 	, 	PUNCT 	_ 	_ 	1 	punct 	_ 	_
 	
3 	un 	un 	DET 	_ 	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 	4 	det 	_ 	_
 	
4 	film 	film 	NOUN 	_ 	Gender=Masc|Number=Sing 	1 	appos 	_ 	_
 	
5 	sur 	sur 	ADP 	_ 	_ 	7 	case 	_ 	_
 	
6 	la 	le 	DET 	_ 	Definite=Def|Gender=Fem|Number=Sing|PronType=Art 	7 	det 	_ 	_
 	
7 	vie 	vie 	NOUN 	_ 	Gender=Fem|Number=Sing 	4 	nmod 	_ 	_
 	
8 	de 	de 	ADP 	_ 	_ 	9 	case 	_ 	_
 	
9 	Hughes 	Hughes 	PROPN 	_ 	_ 	7 	nmod 	_ 	SpaceAfter=No
 	
10 	. 	. 	PUNCT 	_ 	_ 	1 	punct 	_ 	_
 	


In [0]:
sentence = wb.nextSentence()
sentNb = 1
while sentence :
  for i in range(1, len(sentence)):
    d_ind = i
  sentNb += 1
  sentence = wb.nextSentence()

* **Dico**

In [0]:
if len(sys.argv) < 3 :
    print('usage:', sys.argv[0], 'conllu_file (input) dico_file (output)')
    exit(1)


mcd = (('INDEX', 'INT'), ('FORM', 'INT'), ('LEMMA', 'INT'), ('POS', 'SYM'), ('X1', 'INT'), ('MORPHO', 'INT'), ('GOV', 'SYM'), ('LABEL', 'SYM'), ('X2', 'SYM'), ('X3', 'SYM'))


print('populating dicos from file ', "UD_French-GSD/fr_gsd-ud-train.conllu")
dicos = Dicos(mcd)
dicos.populateFromConlluFile("UD_French-GSD/fr_gsd-ud-train.conllu", verbose=False)
dicos.lock()
print('saving dicos in file ', sys.argv[2])
dicos.printToFile(sys.argv[2])

populating dicos from file  UD_French-GSD/fr_gsd-ud-train.conllu
DICO POS :	 20 entries
DICO GOV :	 238 entries
DICO LABEL :	 54 entries
DICO X2 :	 3 entries
DICO X3 :	 60 entries
saving dicos in file  /root/.local/share/jupyter/runtime/kernel-dd1eefec-65f6-4299-95d7-cc914dad9774.json


## FEATURE

### Differentes features

* **Feature 1**
> $G.0.POS$     ->     retourne la partie de discours du gouverneur     
> $D.0.POS$     ->     retourne la partie de discours du dépendant   
> $DIST$           ->     retourne la distance entre G et D

In [0]:
# Fonction de création de feature
# X : entrée donnée
# Y : sortie attendue
def create_feature1(sentence):
  X = []
  Y = []
  for i in range(1, len(sentence)):
    if sentence[i].getFeat("GOV") == "_":
      break
    d_ind = i                                     # indice du dépendant
    g_ind = int(sentence[d_ind].getFeat("GOV"))   # indice du gouverneur
    label = sentence[d_ind].getFeat("LABEL")      # label de la liaison
    g_pos = sentence[g_ind].getFeat("POS")        # partie de discours du gouverneur
    d_pos = sentence[d_ind].getFeat("POS")        # partie de discours du dépendant
    dist = g_ind - d_ind                          # indice G - indice D
    X.append([g_pos, d_pos, dist])
    Y.append(label)
  return X, Y

* **Feature 2**
> $G.0.POS$                   ->     retourne la partie de discours du gouverneur  
> $G.0.LEMMA$           ->     retourne le lemme du gouverneur  
> $G.0.MORPHO$        ->     retourne la morphologie du gouverneur  
> $D.0.POS$                   ->     retourne la partie de discours du dépendant  
> $D.0.LEMMA$           ->     retourne le lemme du dépendant  
> $D.0.MORPHO$        ->     retourne la morphologie du dépendant  
> $DIST$                          ->     retourne la distance entre G et D  

In [0]:
# Fonction de création de feature
# X : entrée donnée
# Y : sortie attendue
def create_feature2(sentence):
  X = []
  Y = []
  for i in range(1, len(sentence)):
    d_ind = i                                     # indice du dépendant
    g_ind = int(sentence[d_ind].getFeat("GOV"))   # indice du gouverneur
    label = sentence[d_ind].getFeat("LABEL")      # label de la liaison
    g_pos = sentence[g_ind].getFeat("POS")        # partie de discours du gouverneur
    g_lem = sentence[g_ind].getFeat("LEMMA")      # lemme du gouverneur
    g_mor = sentence[g_ind].getFeat("MORPHO")     # morphologie du gouverneur
    d_pos = sentence[d_ind].getFeat("POS")        # partie de discours du dépendant
    d_lem = sentence[d_ind].getFeat("LEMMA")      # lemme du dépendant
    d_mor = sentence[d_ind].getFeat("MORPHO")     # morphologie du dépendant
    dist = g_ind - d_ind                          # indice G - indice D
    X.append((g_pos, g_lem, g_mor, d_pos, d_lem, d_mor, dist))
    Y.append((label))
  return X, Y

* **Feature 3**
> $G.0.POS$                   ->     retourne la partie de discours du gouverneur  
> $G.0.LEMMA$           ->     retourne le lemme du gouverneur  
> $G.0.MORPHO$        ->     retourne la morphologie du gouverneur  
> $G.-1.POS$                  ->     retourne la partie de discours du mot précédent le gouverneur  
> $G.1.POS$                    ->     retourne la partie de discours du mot suivant le gouverneur  
> $D.0.POS$                   ->     retourne la partie de discours du dépendant  
> $D.0.LEMMA$           ->     retourne le lemme du dépendant  
> $D.0.MORPHO$        ->     retourne la morphologie du dépendant  
> $D.-1.POS$                  ->     retourne la partie de discours du mot précédent le dépendant  
> $D.1.POS$                    ->     retourne la partie de discours du mot suivant le dépendant  
> $DIST$                          ->     retourne la distance entre G et D  

In [0]:
# Fonction de création de feature
# X : entrée donnée
# Y : sortie attendue
def create_feature3(sentence):
  X = []
  Y = []
  size = len(sentence)
  for i in range(1, size):
    d_ind = i                                     # indice du dépendant
    g_ind = int(sentence[d_ind].getFeat("GOV"))   # indice du gouverneur
    label = sentence[d_ind].getFeat("LABEL")      # label de la liaison
    g_pos = sentence[g_ind].getFeat("POS")        # partie de discours du gouverneur
    g_lem = sentence[g_ind].getFeat("LEMMA")      # lemme du gouverneur
    g_mor = sentence[g_ind].getFeat("MORPHO")     # morphologie du gouverneur
      # partie de discours du mot avant G
    g_min = sentence[g_ind-1].getFeat("POS") if ((g_ind-1) >= 0) else '_'
      # partie de discours du mot après G
    g_add = sentence[g_ind+1].getFeat("POS") if ((g_ind+1) < size) else '_'  
    d_pos = sentence[d_ind].getFeat("POS")        # partie de discours du dépendant
    d_lem = sentence[d_ind].getFeat("LEMMA")      # lemme du dépendant
    d_mor = sentence[d_ind].getFeat("MORPHO")     # morphologie du dépendant
      # partie de discours du mot avant D
    d_min = sentence[d_ind-1].getFeat("POS") if ((d_ind-1) >= 0) else '_'
      # partie de discours du mot après D
    d_add = sentence[d_ind+1].getFeat("POS") if ((d_ind+1) < size) else '_'
    dist = g_ind - d_ind                          # indice G - indice D
    X.append((g_pos, g_lem, g_mor, g_min, g_add, d_pos, d_lem, d_mor, d_min, d_add, dist))
    Y.append((label))
  return X, Y

### Creations

* **feature 1**

In [14]:
mcd =(('INDEX', 'INT'), ('FORM', 'INT'), ('LEMMA', 'INT'), ('POS', 'SYM'), ('X1', 'INT'), ('MORPHO', 'INT'), ('GOV', 'SYM'), ('LABEL', 'SYM'), ('X2', 'SYM'), ('X3', 'SYM'))
wb = WordBuffer(mcd);
wb.readFromConlluFile(path_fr+f1_fr);


X = []
Y = []
sentence = wb.nextSentence()
sentNb = 1
while sentence :
  x, y = create_feature1(sentence)
  for s in x:
    X.append(s)
  for s in y:
    Y.append(s)
  sentNb += 1
  sentence = wb.nextSentence()
  
X = np.asarray(X)
Y = np.asarray(Y)
  
print(X)
print(X.shape)
print(Y)
print(Y.shape)

[['ROOT' 'PROPN' '-1']
 ['PROPN' 'PUNCT' '-1']
 ['NOUN' 'DET' '1']
 ...
 ['PROPN' 'DET' '1']
 ['VERB' 'PROPN' '-4']
 ['VERB' 'PUNCT' '-14']]
(35768, 3)
['root' 'punct' 'det' ... 'det' 'obl:arg' 'punct']
(35768,)


* **feature 2**

* **feature 3**

## **Apprentissage**

In [15]:
def create_model():
  model = Sequential()
  model.add(Dense(50, input_dim=3, activation='relu'))
  model.add(Dense(2, activation = "softmax"))
  model.compile(optimizer="Adam",
              loss='binary_crossentropy',
              metrics=['accuracy'])
  return model

one_hot_labels = keras.utils.to_categorical(Y, num_classes=17)

m = create_model()
m.fit(X, one_hot_labels)
m.summary()

ValueError: ignored

# Algorithme : Chu Liu Edmonds

In [0]:
G = [
   [1,2,3,4,5],
   [1,1,1,1,1],
   [2,3,5,1,2],
   [8,5,2,1,4],
   [9,6,3,4,1]
]

def CLE(Graph, root):
    return "a"


m = np.argmax(G, axis=0)

for i in range(5):
  print(G[i][m[i]])