In [2]:
import spacy
import json
from spacy.tokens import DocBin

## Datasets

In [7]:
# Locations of the training, validation and test JSON files that are loaded below.
# NOTE(review): the test file name has no trailing underscore, unlike the other
# two — confirm this is intended.
path_data_training= '/resources/datasets/unified/training_.json'
path_data_validation = '/resources/datasets/unified/validation_.json'
path_data_testing = '/resources/datasets/unified/test.json'

In [8]:
# Read each split from disk. The encoding is pinned to UTF-8 so that accented
# Spanish text is decoded the same way regardless of the platform's locale
# encoding (open() otherwise falls back to the locale default).
with open(path_data_training, encoding="utf-8") as f:
    training_data = json.load(f)

with open(path_data_validation, encoding="utf-8") as f:
    validation_data = json.load(f)

with open(path_data_testing, encoding="utf-8") as f:
    testing_data = json.load(f)

In [4]:
# Report how many records each loaded split contains.
print(f'Train: {len(training_data)}\nValidation: {len(validation_data)}\nTest: {len(testing_data)}')

Train: 357
Validation: 59
Test: 49


#### Create DocBin

In [None]:
def make_docbin(data, outputFile):
    """
    Convert annotated texts into a spaCy ``DocBin`` and write it to disk.

    data: iterable of ``(text, annot)`` pairs (as loaded from the .json
        files), where ``annot["entities"]`` holds ``(start, end, label)``
        character-offset triples.
    outputFile: str, destination path without extension; ``.spacy`` is
        appended when writing.

    Returns a summary string with the number of documents serialized.
    """
    nlp = spacy.blank("es")
    db = DocBin()
    for text, annot in data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            # "contract" aligns the character offsets to the tokens that lie
            # completely inside them; char_span returns None when it cannot
            # produce a span, and such annotations are skipped.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        # Assigning overlapping or duplicate spans to doc.ents raises, which
        # would abort the whole conversion. filter_spans keeps a
        # non-overlapping subset, preferring the longest span on conflict.
        kept = spacy.util.filter_spans(ents)
        if len(kept) != len(ents):
            print(f"Dropped {len(ents) - len(kept)} overlapping entities")
        doc.ents = kept
        db.add(doc)
    db.to_disk(f"{outputFile}.spacy")
    return f"Processed {len(db)}"

In [None]:
# Output path stems for the serialized splits; make_docbin appends ".spacy".
output_train = '/resources/datasets/docbin/train'
output_val = '/resources/datasets/docbin/val'
output_test = '/resources/datasets/docbin/test'

In [None]:
# Serialize the training split to a DocBin file at output_train + ".spacy".
make_docbin(training_data, output_train)

In [None]:
# Serialize the validation split to a DocBin file at output_val + ".spacy".
make_docbin(validation_data, output_val)

In [None]:
# Serialize the test split to a DocBin file at output_test + ".spacy".
make_docbin(testing_data, output_test)

## Train CPU

#### Download base model

In [None]:
# Download the large Spanish pipeline package through the spaCy CLI.
# NOTE(review): presumably needed for the word vectors referenced by the
# accuracy-optimized config generated below — confirm in config.cfg.
!python -m spacy download es_core_news_lg 

#### Create config

In [None]:
# Generate config.cfg for a Spanish pipeline containing only the NER component,
# using the CLI's "accuracy" optimization preset.
! python -m spacy init config config.cfg --lang es --pipeline ner --optimize accuracy

#### Training model

In [None]:
# Train with the generated config on the serialized train/validation DocBins.
# `spacy train` writes `model-best` and `model-last` directly inside --output,
# so the output directory is the `ner` folder from which this notebook later
# loads and evaluates `model-best`.
! python -m spacy train config.cfg --output /src/ia2/ia2/models/ner --paths.train /resources/datasets/docbin/train.spacy --paths.dev /resources/datasets/docbin/val.spacy

## Train GPU

#### Create config

In [None]:
# ! python -m spacy init config config_GPU.cfg --lang es --pipeline ner --gpu

#### Training model

In [None]:
# ! python -m spacy train config_GPU.cfg --output /src/ia2/ia2/models  --paths.train /resources/datasets/docbin/train.spacy  --paths.dev /resources/datasets/docbin/val.spacy --gpu-id 0

## Load Best Model

In [3]:
# Load the trained pipeline from its `model-best` directory.
nlp_ner = spacy.load('/src/ia2/ia2/models/ner/model-best')

In [4]:
# Entity labels known to the loaded NER component.
nlp_ner.get_pipe('ner').labels

('ARTÍCULO', 'DIRECCIÓN', 'LOC', 'PER')

In [5]:
# (name, component) pairs that make up the loaded pipeline.
nlp_ner.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f4a059ab940>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f4a0593f760>)]

## Evaluate

### Evaluate CLI

In [None]:
# Score the best model on the test DocBin with the spaCy CLI, writing the
# metrics to a JSON file and displaCy renderings (limit 10) to a directory.
# NOTE(review): "ouputs" and "metics.json" look like typos of "outputs" and
# "metrics.json" — confirm the intended paths before relying on them.
! python -m spacy evaluate /src/ia2/ia2/models/ner/model-best  /resources/datasets/docbin/test.spacy --output /resources/ouputs/metrics/metics.json --gold-preproc --displacy-path /resources/ouputs/displacy --displacy-limit 10

#### Evaluate old

In [None]:
from spacy.scorer import Scorer
from spacy.training import Example
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

def evaluate(model, data):
    """
    Score ``model`` against annotated reference data.

    model: loaded spaCy pipeline
    data: iterable of ``(text, annotations)`` pairs, with ``annotations`` in
        the form accepted by ``Example.from_dict``

    Returns the score dictionary produced by ``Scorer.score``.
    """
    evaluator = Scorer()
    scored_pairs = []
    for raw_text, gold in data:
        pair = Example.from_dict(model.make_doc(raw_text), gold)
        # Swap the tokenized-only document for the full pipeline output on
        # the same text, so the example carries the model's predictions.
        pair.predicted = model(str(pair.predicted))
        scored_pairs.append(pair)
    return evaluator.score(scored_pairs)

def metrics_per_ents(model,data):
    """
    Return the per-entity-type scores of ``model`` on ``data`` as a DataFrame.

    The frame is built from the ``'ents_per_type'`` entry of the dictionary
    returned by ``evaluate``.
    """
    per_type_scores = evaluate(model, data)['ents_per_type']
    return pd.DataFrame.from_dict(per_type_scores)

def metrics(model,data):
    """
    Return the overall entity precision, recall and F-score of ``model`` on ``data``.

    The values are read from the ``'ents_p'``, ``'ents_r'`` and ``'ents_f'``
    entries of the dictionary returned by ``evaluate``.
    """
    scores = evaluate(model, data)
    # Output key -> key in the score dictionary, in output order.
    wanted = (('precision', 'ents_p'), ('recall', 'ents_r'), ('f-score', 'ents_f'))
    return {label: scores[score_key] for label, score_key in wanted}

In [None]:
# Overall precision / recall / F-score of the loaded model on the test split.
metrics(nlp_ner,testing_data)

In [None]:
# Per-entity-type scores of the loaded model on the test split.
metrics_per_ents(nlp_ner,testing_data)

#### Display Text

In [9]:
# Take the text (first element) of the test record at index 3 as a sample and
# run the loaded pipeline on it.
text = testing_data[3][0]
doc = nlp_ner(text)

In [10]:
# Entities predicted for the sample document.
doc.ents

(Art. 52,
 art. 52- Hostigar, Maltratar, Intimar - CC,
 Estela Andrea Liotta,
 artículo 311 del Código Procesal Penal de la Ciudad Autónoma de Buenos Aires,
 Julio,
 Silvia Bermúdez,
 Rivadavia 11248,
 Marisa Nasimof,
 Tacuarí 138,
 Tacuarí 138)

In [None]:
# NOTE(review): bare dict literal with one integer per entity label — presumably
# pasted label counts; its origin is not shown in this notebook. Confirm or remove.
{'LOC': 1252, 'DIRECCIÓN': 1493, 'PER': 3729, 'ARTÍCULO': 4174}

In [22]:
# Color assigned to each entity label in the displaCy rendering.
colors = {
    "ARTÍCULO": "red",
    "PER": "blue",
    "DIRECCIÓN": "yellow",
    "LOC": "orange",
}
# Render exactly the labels that have a color, in the order they are listed above.
options = {"ents": list(colors), "colors": colors}
spacy.displacy.render(doc, style='ent', jupyter=True, options=options)