In [1]:
import pandas as ps
import numpy as np 
import os
import re

In [2]:
folder = []
for item in os.listdir('data'):
    if(len(item.split('.')) == 1):
        folder.append(item)

In [3]:
folder

['Flore du Cameroun tome 10 1970 Ombellales',
 'Flore du Cameroun tome 12 1972 Loganiaceae',
 'Flore du Cameroun tome 16 1973 Sapindaceae',
 'Flore du Cameroun tome 17 1974 Amaranthaceae',
 'Flore du Cameroun tome 2 1964 Sapotacees',
 'Flore du Cameroun tome 6 1967 Cucurbitaceae',
 'Flore du Cameroun tome 9 1970 Caesalpinioidae']

# =================================
# Find organes and descriptors

In [4]:
from bs4 import BeautifulSoup as bs
import requests

url="https://fr.wikipedia.org/wiki/Glossaire_de_botanique"
response = requests.get(url)

html = response.content

soup = bs(html, 'lxml')


In [5]:
def get_glossaire(soup):
    glossaire = []
    div = soup.find("div", {"class" : "mw-parser-output"})
    uls = div.find_all("ul")
    for i, ul in enumerate(uls):
        lis = ul.find_all("li")
        for li in lis:
            if li.find("b") != None:
                noun = li.get_text().split(':')[0][:-1].strip()
                if re.search("adjectif",li.get_text()) or re.search("se dit\s",li.get_text()) or re.search("é$",noun) or re.search("qualifie\s",li.get_text()):
                    glossaire.append((noun.split(' ')[0].lower(), "adj"))
                    #print(noun.strip().split(' ')[0].lower() + ': adj')
                else:
                    glossaire.append((noun.split(' ')[0].lower(), "noun"))
                    #print(noun.strip().split(' ')[0].lower() + ': adj')
    return glossaire

In [6]:
glossaire = get_glossaire(soup)

In [7]:
glossaire[:10]

[('abaxial', 'adj'),
 ('aberrant', 'adj'),
 ('abortif', 'adj'),
 ('abrévié', 'adj'),
 ('abruptement', 'noun'),
 ('abscise', 'noun'),
 ('absciser', 'noun'),
 ('abscisse', 'noun'),
 ('abscission', 'noun'),
 ('acanthocarpe', 'adj')]

In [8]:
organes = [
    "racine",
    "tige",
    "bourgeon",
    "rhizome",
    "tubercule",
    "bulbe",
    "collet",
    "limbe",
    "lobe",
    "pétiole",
    "sessile",
    "foliole",
    "calice",
    "sépale",
    "calicule",
    "corolle",
    "anthère"
    "tépale",
    "stipule",
    "foliole",
    "carpelle",
    "grappe ",
    "vrille",
    "feuille"
]

In [9]:
adjs = [gls[0].lower() for gls in glossaire if gls[1] == "adj"]
nouns = [gls[0].lower() for gls in glossaire if gls[1] == "noun"]

In [10]:
from nltk import pos_tag, word_tokenize
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='french')

Espèce

In [11]:
def f_remove_accents(old):
    """
    Removes common accent characters, lower form.
    Uses: regex.
    """
    new = old.lower()
    new = re.sub(r'[àáâãäå]', 'a', new)
    new = re.sub(r'[èéêë]', 'e', new)
    new = re.sub(r'[ìíîï]', 'i', new)
    new = re.sub(r'[òóôõö]', 'o', new)
    new = re.sub(r'[ùúûü]', 'u', new)
    return new

In [12]:
def especes_pages(path):
    pages = []
    for i in range(len(os.listdir(path))):
        with open(path+'/'+ str(i+1) +'.txt', 'r') as fp:
            lines = fp.readlines()
            #print("=================")
            for j in range(len(lines)):
                l = lines[j].strip()
                l = l.replace('  ',' ')
                l = f_remove_accents(l)
                x = re.search("cle.?\sdes\sespece", l)
                if(x):
                    pages.append(i+1)
                    pages.append(i+2)
                    pages.append(i+3)
    return pages

In [13]:
def get_especes(pages):
    especes = []
    for p in pages:
        with open(path+'/'+ str(p) +'.txt', 'r') as fp:
            lines = fp.readlines()
            for j in range(len(lines)):
                l = lines[j].strip()
                l = l.replace('  ',' ')
                l = f_remove_accents(l)
                x = re.findall("\s\w\.\s\w+$", l)
                x = re.findall("[A-Za-z0-9]\.\s\w\.\s\w+\.?$", l)
                if(x):
                    especes.append(x)
    especes = np.array(especes).reshape(-1)
    esp = [esp.strip().split(' ')[2][:-1] for esp in especes]
    initials = [esp.strip().split(' ')[1][:-1].upper() for esp in especes]
    return esp, initials

def preprocess(line):
    l = line.strip()
    l = l.replace('  ',' ')
    #l = f_remove_accents(l)
    return l


In [14]:
def get_especes_names(especes, initials, path):
    esp_names = []
    for i in range(len(os.listdir(path))):
        with open(path+'/'+ str(i+1) +'.txt', 'r') as fp:
            lines = fp.readlines()
            for j in range(len(lines)):
                l = preprocess(lines[j])
                for i, esp in enumerate(especes):
                    contains = l.find(esp.capitalize()) != -1 or l.find(esp) != -1 
                    if contains:
                        exp = "^[1-9]\.\s{}[a-z]+.*".format(initials[i])
                        x = re.findall(exp, l)
                        if(x):
                            full_name = x[0].strip()[3:]
                            full_name = full_name.split(' ')[0]+' '+full_name.split(' ')[1]
                            esp_names.append(full_name)
    return esp_names

In [15]:
def espece_(especes, initials, path,l):
    esp_names = []
    pos=[]
    l = preprocess(l)
    for i, esp in enumerate(especes):
        contains = l.find(esp.capitalize()) != -1 or l.find(esp) != -1 
        if contains:
            exp = "^[1-9]\.\s{}[a-z]+.*".format(initials[i])
            x = re.findall(exp, l)
            if(x):
                full_name = x[0].strip()[3:]
                full_name = full_name.split(' ')[0]+' '+full_name.split(' ')[1]
                start = [m.start() for m in re.finditer(full_name, l)]
                for st in start:
                    pos.append([st,st+len(full_name),'ESPECE'])
    return pos

In [16]:
def is_digit(c):
    try:
        d = int(c)
        return True
    except :
        return False

In [17]:
colors = [
    "rouge",
    "rose",
    "bleu",
    "pourpre",
    "violet",
    "jaune",
    "orange",
    "crème",
    "blanc",
    "blanche",
    "vert",
    "brun"
]

In [18]:
###===============================
# utils functions 
###===============================

def prepare(sentence):
    sent = sentence.replace("\n", "").lower()
    sent = preprocess(sent)
    sent = " ".join(word_tokenize(sent))
    return sent

def did_overlap(pos, begin, end):
    for i, p in enumerate(pos):
        if p[0] == begin or p[1] == end:
            return True
    return False

def fix_overlap(temp):
    for i in range(len(temp)-1):
        for j in temp[i+1:]:
            if temp[i][0] == j[0] or temp[i][1] == j[1]:
                len_i = temp[i][1] - temp[i][0]
                len_j = j[1] - j[0]
                if(len_i > len_j):
                    temp.remove(j)
                else:
                    temp.remove(temp[i])
    return temp

def get_measure_begining(sent, last_index):
    last = last_index
    while True:
        virg_ind = sent[:last].rfind(",")
        if sent[virg_ind - 1].isdigit():
            last = virg_ind
        else:
            break
    if virg_ind == -1:
        begin = re.search("[0-9i]+",sent[:last]).span()[0]
    else:
        begin = virg_ind + re.search("[0-9i]+",sent[virg_ind:]).span()[0]
    return begin

In [19]:
#Extract organs
def organe_(sent):
    pos = []
    tokens = word_tokenize(sent)
    for org in organes:
        stem = org
        start = [m.start() for m in re.finditer(stem, sent)]
        if len(start) != 0:
            for st in start:
                begin = st
                end = st + sent[begin:].find(' ')
                if did_overlap(pos, begin, end):
                    continue
                else:
                    pos.append([begin, end, 'ORGAN' ])
    return pos
#Extract descriptor
def descripteur(sent):
    pos = []
    tokens = word_tokenize(sent)
    for adj in adjs:
        stem = adj
        start = [m.start() for m in re.finditer(stem, sent)]
        if len(start) != 0:
            for st in start:
                begin = st
                end = st + sent[begin:].find(' ')
                if did_overlap(pos, begin, end):
                    continue
                else:
                    pos.append([begin, end, 'DESC' ])
    # try to find descriptors using POS (adjectivs ends with é,ée,és,ées)
    start = [m.start() for m in re.finditer("\w+é\s|\w+ée\s|\w+és\s|\w+ées\s", sent)]
    if len(start) != 0:
        for st in start:
            begin = st
            end = st + sent[begin:].find(' ')
            if did_overlap(pos, begin, end):
                continue
            else:
                pos.append([begin, end, 'DESC' ])
    #pos = list(set(pos))
    return pos

#Extract measure
def measure(sent):
    pos = []
    tokens = word_tokenize(sent)
    if(re.search(" m | cm | mm ", sent)):
        measures = re.findall(" m | cm | mm ", sent)
        start = [m.start() for m in re.finditer(" m | cm | mm ", sent)]
        for i, st in enumerate(start):
            try:
                begin = get_measure_begining(sent, st)
            except:
                print(sent)
                continue
            end = st + len(measures[i]) -1    
            if did_overlap(pos, begin, end):
                continue
            else:
                pos.append([begin, end, 'MEASURE' ])
    return pos
    
def color_1(sent):
    pos = []
    tokens = word_tokenize(sent)
    for c in colors:
        start = [m.start() for m in re.finditer(c , sent)]
        if len(start) != 0:
            for st in start:
                begin = st
                # solve the problem of finding DESC & COLOR in the same Entity
                if sent[begin:].find('-') == -1:
                    end = begin + sent[begin:].find(' ')
                else:
                    
                    end = begin + min(sent[begin:].find(' '),sent[begin:].find('-'))
                # for example verticalisé is classified as a color so we check its length 
                if did_overlap(pos, begin, end) or ((end - begin) > len(c)+2) :
                    continue
                else:
                    pos.append([begin, end, 'COLOR' ])
    return pos
# word ends with âtre
def color_2(sent):
    pos = []
    tokens = word_tokenize(sent)
    for c in colors:
        stem = c
        start = [m.start() for m in re.finditer("\s\w+âtre|\s\w+âtres" , sent)]
        if len(start) != 0:
            for st in start:
                begin = st+1
                end = begin + sent[begin:].find(' ')
                if did_overlap(pos, begin, end):
                    continue
                else:
                    pos.append([begin, end, 'COLOR'] )
    return pos
# combine both functions
def color(sent):
    pos = color_1(sent) + color_2(sent)
    pos = fix_overlap(pos)
    return pos


## Construct Dataset for training + Training

In [20]:
from nltk import sent_tokenize

In [21]:
import pandas as ps
import numpy as np 
import os
import re

In [109]:
%%time

import random
import tqdm
from spacy.training import Example
import spacy
DATAS = []
paths = ['data/'+ fold for fold in folder]
for path in paths:
    #path = 'data/' + 'Flore du Cameroun tome 12 1972 Loganiaceae'
    pages = especes_pages(path)
    especes, initials = get_especes(pages)
    for i in range(len(os.listdir(path))):
        with open(path+'/'+ str(i+1) +'.txt', 'r') as fp:
            lines = fp.read()
            sentences = sent_tokenize(lines)
            for sent in sentences:
                sent = sent.replace("\n", "").lower()
                sent = preprocess(sent)
                sent = " ".join(word_tokenize(sent))
                res = descripteur(sent) + organe_(sent) + measure(sent) + list(set(color(sent))) + espece_(especes,initials, path,sent)
                if len(res) != 0:
                    res = {'entities' : res}
                    result = (sent , res)
                    DATAS.append(result)

CPU times: total: 1min 32s
Wall time: 1min 45s


Now we need to run this script on all the articles to go throw each paragraph and get all the descriptors ,(the script may need to be stoped manuallay) 

In [54]:
%%time

import random
import tqdm
from spacy.training import Example
import spacy
DATAS = []
paths = ['data/'+ fold for fold in folder]
for path in paths:
    #path = 'data/' + 'Flore du Cameroun tome 12 1972 Loganiaceae'
    pages = especes_pages(path)
    especes, initials = get_especes(pages)
    c=0
    for i in range(len(os.listdir(path))):
        with open(path+'/'+ str(i+1) +'.txt', 'r') as fp:
            lines = fp.read()
            sentences = lines.replace('\n\n','\n').split('\n')
            k=0
            while(k<len(sentences)):
                sent=sentences[k]
                if(espece_(especes,initials, path,sent) or c==1):
                    #print(espece_(especes,initials, path,sent))
                    match=re.search('[a-z\-]*type\s?:|materiel.*camerounais',f_remove_accents(sent))
                    while(not match and k<len(sentences)):
                        sent=sentences[k]
                        sent = sent.replace("\n", "").lower()
                        sent = preprocess(sent)
                        sent = " ".join(word_tokenize(sent))
                        #print('********************')
                        #print(sent)
                        res = descripteur(sent) + organe_(sent) + measure(sent) + color(sent) + espece_(especes,initials, path,sent)
                        #print('**************************')
                        if len(res) != 0:
                            res = {'entities' : res}
                            result = [sent , res]
                            DATAS.append(result)
                        match=re.search('[a-z\-]*type\s?:|materiel.*camerounais',f_remove_accents(sent))
                        if(match):
                            c=0
                        else:
                            c=1
                        k=k+1
                    #print(match)
                k=k+1

hh > m 6 1/3 ínférícur ' à anthères oblongues ; staminodes
x cm , très fins ; fleurs 2-8 , très petites . pédicelles grêles , glabres ,
— m . gilgianacogn. , repert . sp . nov. 12 : 5o3 ( igi3 ) .
fleurs petites , longues de 3 mm , larges de a mm , en courts
lég . gabon : m ( 1948 ) .
bractées subpersistantes longues de 15-18 mm . bractéoles de i4-
Wall time: 2min 5s
Parser   : 497 ms


In [55]:
len(DATAS)

3651

In [56]:
import pickle
TD=[]
with open('train_data.pkl','rb') as pick:
    TD.append(pickle.load(pick))

In [57]:
TD=TD[0]

In [58]:
TD[50:220]

[['1 peucedanum winkleri wolff', {'entities': [[2, 21, 'espèce']]}],
 ['2 peucedanum zenkeri engler ex wolff', {'entities': [[2, 20, 'espèce']]}],
 ['3 peucedanum angustisectum ( engler ) norman',
  {'entities': [[2, 26, 'espèce']]}],
 ['1 anthocleista djalonensis a chevalier', {'entities': [[2, 26, 'espèce']]}],
 ["Tessmannia est un genre de plantes dicotylédones de la famille des Fabaceae ( Légumineuses ) , sous-famille des Caesalpinioideae , originaire d'Afrique , qui comprend une douzaine d'espèces acceptées . Ce sont des arbres dont certaines espèces sont exploitées pour leur résine ( copal ) ou leur bois utilisé comme bois d'œuvre ( menuiserie , traverses de chemin de fer , manches d'outils , poteaux ) . 2 anthocleista laxiflora baker",
  {'entities': [[404, 426, 'espèce']]}],
 ['3 anthocleista liebrechtsiana de wild & th dur',
  {'entities': [[2, 29, 'espèce']]}],
 ['4 anthocleista microphylla wernham', {'entities': [[2, 26, 'espèce']]}],
 ['5 anthocleista obanensis wernham', {'

In [59]:
len(TD)

653

creating train vaidation and test sets

In [60]:
DATAC=DATAS.copy()

In [61]:
train=DATAC[0:int(len(DATAC)*0.8)]
test=DATAC[int(len(DATAC)*0.8):len(DATAC)]

now we are going to add species titles to the snetences containing the descreptors 

In [62]:
DATAC=train.copy()

we need to create train and test sets that contains the some kind of corpus that is why we are going to apply the same processing on both 

In [63]:
for i,el in enumerate(TD[0:350]):
    if i in range(100,200):
        #print("########## before:")
        #print(DATAC[i][0])
        #print("######### After:")
        
        DATAC[i][0]=TD[i][0]+'\n'+DATAC[i][0]
        #print(DATAC[i][0])

            #print('######### entites:')
            #print(ent)
            #print('######### old entites:')
            #print(DATAC[i][1]['entities'])
        for j,ent in enumerate(DATAC[i][1]['entities']):
            DATAC[i][1]['entities'][j][0]=DATAC[i][1]['entities'][j][0]+1+len(TD[i][0])
            DATAC[i][1]['entities'][j][1]=DATAC[i][1]['entities'][j][1]+1+len(TD[i][0])
        for j,ent in enumerate(TD[i][1]['entities']):
            DATAC[i][1]['entities'].insert(j,ent)
        #print('##### NEW ENTITIES')
        #print(DATAC[i][1]['entities'])
    else:
        DATAC.append(TD[i])

In [64]:
DATAS[100]

['2 crossonephelis unijugatus ( pellegrin ) leenhouts\nde quelques fleurs . ombelle légère , à pedoncule grêle , sans invo-',
 {'entities': [[2, 27, 'espèce'], [102, 107, 'DESC']]}]

In [65]:
train=DATAC.copy()

In [66]:
train[100]

['2 crossonephelis unijugatus ( pellegrin ) leenhouts\nde quelques fleurs . ombelle légère , à pedoncule grêle , sans invo-',
 {'entities': [[2, 27, 'espèce'], [102, 107, 'DESC']]}]

In [67]:
len(TD)

653

let'us build a test set 

In [68]:
DATAC=test.copy()

In [69]:
len(DATAC)

731

In [70]:

for i in range(50,100):
    #print("########## before:")
    #print(DATAC[i][0])
    #print("######### After:")
    
    DATAC[i][0]=TD[i][0]+'\n'+DATAC[i][0]
    #print(DATAC[i][0])

        #print('######### entites:')
        #print(ent)
        #print('######### old entites:')
        #print(DATAC[i][1]['entities'])
    for j,ent in enumerate(DATAC[i][1]['entities']):
        DATAC[i][1]['entities'][j][0]=DATAC[i][1]['entities'][j][0]+1+len(TD[i][0])
        DATAC[i][1]['entities'][j][1]=DATAC[i][1]['entities'][j][1]+1+len(TD[i][0])
    for j,ent in enumerate(TD[i][1]['entities']):
        DATAC[i][1]['entities'].insert(j,ent)
    #print('##### NEW ENTITIES')
    #print(DATAC[i][1]['entities'])

for el in TD[350:653]:
    DATAC.append(el)

In [73]:
test=DATAC

In [74]:
test[50]

['1 peucedanum winkleri wolff\nde 10-22 cm , pubérulent à glabre , canaliculé .',
 {'entities': [[2, 21, 'espèce'],
   [64, 74, 'DESC'],
   [55, 61, 'DESC'],
   [31, 39, 'MEASURE']]}]

In [77]:
import pickle
with open ("train_data_desc_mixte.pkl","wb") as pick:
    pickle.dump(train,pick)

In [78]:
import pickle
with open ("test_data_desc_mixte.pkl","r") as pick:
    pickle.dump(test,pick)

In [1]:
import pickle
train=[]
with open("train_data_desc_mixte.pkl",'rb') as pick:
    train.append(pickle.load(pick))
test=[]
with open("test_data_desc_mixte.pkl",'rb') as pick:
    test.append(pickle.load(pick))

In [5]:
train=train[0]
test=test[0]

In [17]:
#trying to copy and run some code
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

def convert(lang: str, TRAIN_DATA, output_path: Path):
    skipped_counter=0
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in TRAIN_DATA:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            try:
                span = doc.char_span(start, end, label=label)
            except:
                skipped_counter+=1
                continue

            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
                skipped_counter+=1
            else:
                ents.append(span)
        try:
            doc.ents = ents
            db.add(doc)
        except:
            skipped_counter+=1
            continue
    db.to_disk(output_path)
    return(skipped_counter)

In [18]:
s=convert('fr',train[0:2500],'ner_data/train{:}.spacy'.format(len(train)))
ss=convert('fr',train[2500:3170],'ner_data/valid{:}.spacy'.format(len(train)))


'arrondi , strié , atteignant 20-22 cm ; 5-g folioles articulées sur le'


'pétiolule de 4-6 cm , limbe elliptique-oblong ( 4,5-8,5 x 12-20 cm ) ,'


'pétiolule de 4-6 cm , limbe elliptique-oblong ( 4,5-8,5 x 12-20 cm ) ,'


'riqué long de 1-2 cm ; 8-10 fleurs par capitule , comprimées , sessiles ,'


'ensemble ; étamines à anthère oblongue , longue de i,5 mm ; disque'


'neuses ; pétiole grêle , variant de 10 à 20 mm , avec quelques poils'


'étroit est parfois masqué et le limbe semble pelté , marge 7-9-lobée ,'


'chaque lobe peu saillant , 3-denté .'


'sessiles , subsessiles , ou portés sur un pédoncule grêle pouvant'


'sessiles , subsessiles , ou portés sur un pédoncule grêle pouvant'


'atteindre 10 mm , opposés aux feuilles ou terminaux , constitués de'


"6-10 fleurs subsessiles pourvues de bractées courtes ; pas d'involu-"


"6-10 fleurs subsessiles pourvues de bractées courtes ; pas d'involu-"


'o,5 mm , entiers , à apex infléchi ; étamines très courtes , longues de'


'o