### En este notebook crearemos el conjunto de entrenamiento y entrenaremos el modelo

In [1]:
import json
from glob import glob
import random
import spacy
from spacy.lang.es import Spanish
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy import displacy
from spacy.util import minibatch, compounding
import datetime as dt

Bajamos el modelo general para español de tamaño medio.
Deshabilitamos el componente NER de ese modelo y, en su lugar, añadimos las reglas que creamos anteriormente.

In [2]:
# Download the model beforehand with: python -m spacy download es_core_news_md
# Despite the .jsonl extension, the patterns file is parsed as a single JSON document.
with open("./data/antecedentes_patterns_CIE10.jsonl", encoding="utf8") as f:
    patterns = json.load(f)

# Load the medium Spanish pipeline without its statistical NER and append a
# pattern-based EntityRuler in its place.
nlp = spacy.load("es_core_news_md", disable=["ner"])
ruler = EntityRuler(nlp, overwrite_ents=True, validate=True)
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

Comprobamos los componentes de nuestro pipe

In [3]:
# Display the names of the components currently in the pipeline.
nlp.pipe_names

['tagger', 'parser', 'entity_ruler']

Funciones para cargar y guardar datos en formato JSON

In [4]:
def load_data(file):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def save_data(file, data):
    """Write ``data`` as JSON, indented by one space, to a UTF-8 file.

    Args:
        file: Destination path; opened in write mode, so existing content
            is replaced.
        data: JSON-serializable object to store.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(obj=data, fp=handle, indent=1)


Función para crear conjunto de entrenamiento.

El formato de los datos de entrenamiento debe ser así: **TRAIN_DATA = [(text, {"entities": [(start, end, label)]})]**

In [5]:
def create_train(model, text):
    """Build one training example from ``text`` using the entities ``model`` finds.

    Args:
        model: Callable pipeline; ``model(text)`` must return an object whose
            ``ents`` items expose ``start_char``, ``end_char`` and ``label_``.
        text: Raw text to annotate.

    Returns:
        ``[text, {"entities": [(start_char, end_char, label), ...]}]`` when at
        least one entity is found, otherwise ``None``.
    """
    # Use the pipeline passed in by the caller rather than a global one.
    doc = model(text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if not entities:
        return None
    return [text, {"entities": entities}]

Vamos a crear el set de entrenamiento utilizando el dataset de train

In [6]:
# Sort the paths so the concatenation order is reproducible; glob returns
# them in filesystem-dependent order.
files = sorted(glob("./data/train/text_files/*.txt"))

with open("./data/train/allfiles.txt", "w", encoding="utf-8") as result:
    for file in files:
        # The context manager closes each input file as soon as it is read.
        with open(file, "r", encoding="utf-8") as source:
            content = source.read()
        result.write(content)
        # Keep the last line of this file from fusing with the first line of
        # the next one when the file does not end with a newline.
        if content and not content.endswith("\n"):
            result.write("\n")

Creamos el conjunto de entrenamiento y lo guardamos en un archivo

In [7]:
# Read the concatenated training corpus in one go.
with open("./data/train/allfiles.txt", "r", encoding="utf8") as f:
    text_to_train = f.read()

# Each line is one candidate example; create_train returns None for lines
# where no entity is found, and those are dropped.
segments = text_to_train.split("\n")
annotated = (create_train(nlp, segment) for segment in segments)
TRAIN_DATA = [example for example in annotated if example is not None]

save_data("./data/antecedentes_training_data.jsonl", TRAIN_DATA)

Vemos los antecedentes entrenados

In [8]:
# Report how many training examples were collected.
print(f"Número de antecedente en datos de entrenamiento: {len(TRAIN_DATA)}\n")

Número de antecedente en datos de entrenamiento: 644



Función para entrenar el modelo

In [9]:
def train_model(data, iterations):
    """Train the "ner" component of the module-level ``nlp`` pipeline.

    Args:
        data: Sequence of ``(text, {"entities": [(start, end, label), ...]})``
            examples. The sequence itself is not reordered.
        iterations: Number of passes over the training data.

    Returns:
        The module-level ``nlp`` pipeline, updated in place.
    """
    # Shallow copy so the per-iteration shuffle does not reorder the caller's list.
    train_data = list(data)

    # Reuse the "ner" pipe if the pipeline already has one; otherwise create
    # it and append it at the end of the pipeline.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)

    # Register every entity label that appears in the training set.
    for _, example_annotations in train_data:
        for ent in example_annotations.get("entities", []):
            ner.add_label(ent[2])

    # Every other component is disabled while training so only "ner" is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            # Shuffle the examples at the start of each iteration.
            random.shuffle(train_data)
            losses = {}
            # Split the examples into mini-batches of compounding size.
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    sgd=optimizer,  # optimizer returned by begin_training above
                    drop=0.1,  # dropout: makes it harder to memorize the training data
                    losses=losses,
                )
            print(f"Losses at iteration {itn} - {dt.datetime.now()} {losses}")
    return nlp

Cargamos los datos de entrenamiento, entrenamos el modelo en 30 iteraciones y lo guardamos 

In [10]:
# Load the training examples from the JSON file on disk.
TRAIN_DATA = load_data("./data/antecedentes_training_data.jsonl")
# Train for 30 iterations; the pipeline returned by train_model is rebound to nlp.
nlp = train_model(TRAIN_DATA, 30)
# Save the pipeline to disk.
nlp.to_disk("./models/model_antecedentes_ner_es_md")

Losses at iteration 0 - 2021-07-15 14:27:28.511528 {'ner': 4831.141551545451}
Losses at iteration 1 - 2021-07-15 14:27:50.371501 {'ner': 1673.7956220201688}
Losses at iteration 2 - 2021-07-15 14:28:12.020400 {'ner': 1903.1977685644104}
Losses at iteration 3 - 2021-07-15 14:28:33.618714 {'ner': 1864.8738198465787}
Losses at iteration 4 - 2021-07-15 14:28:54.984361 {'ner': 917.7131908770169}
Losses at iteration 5 - 2021-07-15 14:29:16.926273 {'ner': 1095.4789427200953}
Losses at iteration 6 - 2021-07-15 14:29:38.445356 {'ner': 914.4035253696131}
Losses at iteration 7 - 2021-07-15 14:29:59.735336 {'ner': 525.0726797397605}
Losses at iteration 8 - 2021-07-15 14:30:21.843280 {'ner': 534.1464740445881}
Losses at iteration 9 - 2021-07-15 14:30:44.144245 {'ner': 467.09535934823157}
Losses at iteration 10 - 2021-07-15 14:31:06.939131 {'ner': 480.09305212971094}
Losses at iteration 11 - 2021-07-15 14:31:30.289077 {'ner': 379.7916839289244}
Losses at iteration 12 - 2021-07-15 14:31:53.347366 {'ne

In [11]:
# Display the pipeline component names again, now that training has run.
nlp.pipe_names

['tagger', 'parser', 'entity_ruler', 'ner']