# Building Model Spacy
### By **Néstor Suat** in 2020

**Descripción:** Generando un modelo de ML con spaCy para la tarea de NER en tweets de accidentes, con las etiquetas `loc` y `time`, usando el estándar BIO.

**Input:**
* TSV con dataset etiquetado con BIO

**Output:**
* Model


    
    python 1\ Building\ Spacy\ NER.py -m es_core_news_lg -o /home/hat/code/traffic-accidents/model/data/v1/NER/spacy_model/ -n 500

***

### Importando librerías

In [1]:
import pandas as pd
import spacy

from sklearn.model_selection import train_test_split

In [2]:
class SentenceGetter(object):
    """Group a token-level BIO dataframe into per-sentence token lists.

    ``data`` must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
    and ``"Tag"``. Every sentence becomes a list of ``(word, pos, tag)``
    tuples. The result is exposed as ``grouped`` (a pandas Series indexed by
    the ``"Sentence #"`` value) and ``sentences`` (a plain list with the same
    items, in group-by order).
    """

    def __init__(self, data):
        # Label of the sentence that the next get_next() call will look up.
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # .tolist() yields native Python scalars rather than numpy ones.
            return list(zip(s["Word"].tolist(),
                            s["POS"].tolist(),
                            s["Tag"].tolist()))

        # Only the token columns are handed to apply(): the aggregation never
        # reads the grouping column, and recent pandas versions deprecate
        # passing it to the applied function.
        token_columns = self.data.groupby("Sentence #")[["Word", "POS", "Tag"]]
        self.grouped = token_columns.apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence stored under ``self.n_sent`` and advance.

        Returns:
            The list of ``(word, pos, tag)`` tuples for the current sentence,
            or ``None`` when ``self.grouped`` has no entry for ``self.n_sent``.
        """
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a failed lookup means "no more sentences"; any other error
            # is a real bug and must propagate instead of being swallowed.
            return None
        self.n_sent += 1
        return s

#### Importando modelo spacy construido localmente

In [3]:
# Directory of the locally built spaCy pipeline.
spacy_model = "../../../data/v1/NER/spacy_model_complete/"
# `nlp` is the pipeline object that predict_fn calls on each sentence.
nlp = spacy.load(spacy_model)

### Importando dataset anotado

El archivo `ner-crf-test-data.tsv` fue construido anteriormente transformando el formato de anotación Standoff a BIO.

In [4]:
# BIO-annotated test split, one token per row.
file = 'ner-crf-test-data.tsv'
dir_ = "../../../data/v1/NER/test/"
# Tab-separated; quoting=3 is csv.QUOTE_NONE, so quote characters inside
# tokens are kept as literal text. Column names are supplied explicitly.
data = pd.read_csv(dir_+file, delimiter = "\t", quoting = 3, names=['Sentence #','Word','POS','Tag'])

In [5]:
# Group the token rows into one list of (word, pos, tag) tuples per sentence.
getter = SentenceGetter(data)
sentences = getter.sentences
# Show the first sentence as a sanity check.
sentences[0]

[('Rt', 'NOUN', 'O'),
 ('accidente', 'NOUN', 'O'),
 ('de', 'ADP', 'O'),
 ('transito', 'NOUN', 'O'),
 ('de', 'ADP', 'O'),
 ('biarticulado', 'NOUN', 'O'),
 ('y', 'CCONJ', 'O'),
 ('bicitaxi', 'NOUN', 'O'),
 ('en', 'ADP', 'O'),
 ('la', 'DET', 'O'),
 ('cali', 'NOUN', 'B-loc'),
 ('con', 'ADP', 'I-loc'),
 ('villavicencio', 'NOUN', 'I-loc'),
 (',', 'PUNCT', 'O'),
 ('es', 'AUX', 'O'),
 ('en', 'ADP', 'O'),
 ('la', 'DET', 'O'),
 ('salida', 'NOUN', 'O'),
 ('de', 'ADP', 'O'),
 ('Portal', 'PROPN', 'B-loc'),
 ('Americas', 'PROPN', 'I-loc')]

### Construyendo X e Y.

In [6]:
# X: each sentence as a single space-joined string.
# y: the tag sequence of the same sentence, in the same order.
X = [" ".join(word for word, _pos, _tag in sent) for sent in sentences]
y = [[tag for _word, _pos, tag in sent] for sent in sentences]

print(X[1])
print(y[1])

movilidad bogota acueducto trancon accidente llevó 3 horas en el carro bajando de la calera y muchos Buses escolares con niños pequeños de los colegios , nada que quitan el camión del acueducto que se accidentó en la circunvalar con 85 , TERRIBLE !!
['O', 'O', 'O', 'O', 'O', 'B-time', 'I-time', 'I-time', 'O', 'O', 'O', 'O', 'O', 'O', 'B-loc', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-loc', 'I-loc', 'I-loc', 'O', 'O', 'O']


## Train and Test set

### Evaluando Modelo

In [7]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [8]:
def predict_fn(sentence):
    """Run the loaded pipeline on one sentence.

    Returns:
        A ``(words, labels)`` pair of equal-length lists: the text of every
        token the pipeline produced, and its entity type, with ``'O'``
        substituted for tokens whose entity type is the empty string.
    """
    parsed = nlp(sentence)
    words = [tok.text for tok in parsed]
    labels = [tok.ent_type_ or 'O' for tok in parsed]
    return words, labels

def predict_all_fn(test):
    """Apply ``predict_fn`` to every sentence in ``test``.

    Returns:
        A ``(words, pred)`` pair of lists, each holding one inner list per
        input sentence: the token texts and the predicted labels.
    """
    results = [predict_fn(sentence) for sentence in test]
    words = [sent_words for sent_words, _ in results]
    pred = [sent_labels for _, sent_labels in results]
    return words, pred
        

In [9]:
# Predict over every sentence string; `tokens` holds the pipeline's own
# tokenization and `y_pred` the predicted label sequence per sentence.
tokens, y_pred = predict_all_fn(X)

In [10]:
# Compare tokenization differences: print the gold tag count next to the
# predicted tag count for every sentence. The original indexed `y_test`,
# which is never defined in this notebook; the gold sequences live in `y`.
for gold_tags, pred_tags in zip(y, y_pred):
    print(len(gold_tags), len(pred_tags))

In [14]:
# F1 from seqeval's f1_score over the gold (y) and predicted (y_pred) tag
# sequences, printed as a percentage with six decimals.
print("F1-score: {:.6%}".format(f1_score(y, y_pred)))

F1-score: 79.100529%


In [15]:
# seqeval's report for the same gold (y) and predicted (y_pred) sequences.
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

         loc       0.80      0.81      0.81       354
        time       0.67      0.44      0.53        27

   micro avg       0.80      0.78      0.79       381
   macro avg       0.74      0.63      0.67       381
weighted avg       0.79      0.78      0.79       381



In [13]:
# Drop every sentence whose predicted sequence length differs from the gold
# one, so the two lists stay aligned for the metrics below.
# The mask is computed first and the lists are filtered afterwards: deleting
# while indexing would skip the element that follows each deletion, and a
# range that stops at len - 1 would never check the last sentence.
keep = [len(pred) == len(gold) for pred, gold in zip(y_pred, y)]
for i, ok in enumerate(keep):
    if not ok:
        print(i)  # index of the dropped sentence in the unfiltered lists
# Slice assignment keeps the existing list objects, like the original `del`.
y_pred[:] = [pred for pred, ok in zip(y_pred, keep) if ok]
y[:] = [gold for gold, ok in zip(y, keep) if ok]

In [16]:
from sklearn_crfsuite import metrics

labels = ['B-loc', 'I-loc', 'B-time', 'I-time']
# Sort by entity type first and by the B/I prefix second, so each B- row is
# immediately followed by its I- row in the report.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))
print(metrics.flat_classification_report(y, y_pred, labels=sorted_labels, digits=4))

              precision    recall  f1-score   support

       B-loc     0.8960    0.8782    0.8870       353
       I-loc     0.9386    0.9582    0.9483       862
      B-time     0.7778    0.5185    0.6222        27
      I-time     0.8636    0.6552    0.7451        29

   micro avg     0.9234    0.9197    0.9216      1271
   macro avg     0.8690    0.7525    0.8007      1271
weighted avg     0.9217    0.9197    0.9197      1271



In [17]:
# Weighted F1 restricted to the four B-/I- labels listed in `labels`.
print("F1-score: {:.4%}".format(metrics.flat_f1_score(y, y_pred, average='weighted', labels=labels)))

F1-score: 91.9730%


In [18]:
# Accuracy is computed over every tag, with no label filter.
print("Accuracy: " + str(round(metrics.flat_accuracy_score(y, y_pred), 6)))

# F1 / recall / precision for each averaging mode, restricted to `labels`.
# Output order: all three scores for micro, then macro, then weighted.
for _avg_title, _avg in (("Micro", "micro"), ("Macro", "macro"), ("Weighted", "weighted")):
    for _score_title, _score_fn in (
        ("F1", metrics.flat_f1_score),
        ("Recall", metrics.flat_recall_score),
        ("Precision", metrics.flat_precision_score),
    ):
        _value = _score_fn(y, y_pred, average=_avg, labels=labels)
        print(_score_title + "-" + _avg_title + ": " + str(round(_value, 6)))

Accuracy: 0.973344
F1-Micro: 0.921561
Recall-Micro: 0.919748
Precision-Micro: 0.923381
F1-Macro: 0.800659
Recall-Macro: 0.752529
Precision-Macro: 0.869001
F1-Weighted: 0.91973
Recall-Weighted: 0.919748
Precision-Weighted: 0.921654


### Evaluando con una muestra

In [None]:
# Display the raw token-level dataframe.
data

In [19]:
# Index of the sample sentence to inspect.
n = 64

test_text = X[n]
sent, y_predict = predict_fn(test_text)

# Gold tags are read from sentences[n], the very sentence X[n] was built
# from. The previous lookup by "Sentence #" == n + 1073 only lined up when
# sentence ids were contiguous and started at 1073.
tags = [tag for _word, _pos, tag in sentences[n]]
print("{:15} ({:8}): {}".format("Word", "True", "Pred"))
# zip() stops at the shortest sequence, so rows are only comparable when the
# pipeline tokenized the sentence the same way as the dataset.
for w, true, pred in zip(sent, tags, y_predict):
    print("{:15} ({:8}): {}".format(w, true, pred))

Word            (True    ): Pred
sectormovilidad (O       ): O
policia         (O       ): O
c               (O       ): O
marca           (O       ): O
transito        (O       ): O
policia         (O       ): O
supertrancon    (O       ): O
ingreso         (O       ): O
a               (O       ): O
Bogota          (B-loc   ): B-loc
por             (O       ): O
calle           (B-loc   ): B-loc
80              (I-loc   ): I-loc
va              (O       ): O
hasta           (O       ): O
el              (B-loc   ): B-loc
.               (I-loc   ): I-loc
peaje           (I-loc   ): I-loc
la              (I-loc   ): I-loc
vega            (I-loc   ): I-loc
por             (O       ): O
incidente       (O       ): O
en              (O       ): O
la              (O       ): O
kra.114         (B-loc   ): B-loc
con             (I-loc   ): I-loc
80              (I-loc   ): I-loc
no              (O       ): O
hay             (O       ): O
policia         (O       ): O
y               (O     

In [20]:
from spacy import displacy
n = 64
test_text = X[n]
print(test_text)
doc2 = nlp(test_text)

# Merge the token-level B-/I- spans into whole entities. `entity` collects
# the texts of the entity being built and `entity_label` its type (the part
# after the dash), so a flushed entity keeps its OWN type instead of taking
# the type of the B- span that triggered the flush.
entities = []
entity = []
entity_label = None
for ent in doc2.ents:
    parts = ent.label_.split("-")
    if parts[0] == 'B' and len(entity) > 0:
        print(entity)
        entities.append((' '.join(entity), entity_label))
        entity = []
    entity.append(ent.text)
    entity_label = parts[1]
    print(entity)
    print(ent.text, ent.label_)

# Flush the last entity; skipped when the document has no entities at all.
if entity:
    entities.append((' '.join(entity), entity_label))

print(entities)
# NOTE(review): these keys are upper-case while the labels printed above are
# lower-case (e.g. 'B-loc'); confirm displaCy normalizes the case.
colors = {"B-LOC": "#fc9ce7", "I-LOC": "#fc9ce7", "B-TIME": '#3371ff', "I-TIME": '#3371ff'}
options = {"ents": ["B-LOC", "I-LOC", "B-TIME", "I-TIME"], "colors": colors}
displacy.render(doc2, style="ent", options=options)

sectormovilidad policia c marca transito policia supertrancon ingreso a Bogota por calle 80 va hasta el . peaje la vega por incidente en la kra.114 con 80 no hay policia y no hay semaforos . . donde estan los de transito . . urgente . .
['Bogota']
Bogota B-loc
['Bogota']
['calle']
calle B-loc
['calle', '80']
80 I-loc
['calle', '80']
['el']
el B-loc
['el', '.']
. I-loc
['el', '.', 'peaje']
peaje I-loc
['el', '.', 'peaje', 'la']
la I-loc
['el', '.', 'peaje', 'la', 'vega']
vega I-loc
['el', '.', 'peaje', 'la', 'vega']
['kra.114']
kra.114 B-loc
['kra.114', 'con']
con I-loc
['kra.114', 'con', '80']
80 I-loc
[('Bogota', 'loc'), ('calle 80', 'loc'), ('el . peaje la vega', 'loc'), ('kra.114 con 80', 'loc')]


In [114]:
# (text, label) for every entity span the pipeline found in doc2.
ents = [(e.text, e.label_) for e in doc2.ents]
print(ents)

[('Bogota', 'B-loc'), ('calle', 'B-loc'), ('80', 'I-loc'), ('el', 'B-loc'), ('.', 'I-loc'), ('peaje', 'I-loc'), ('la', 'I-loc'), ('vega', 'I-loc'), ('kra.114', 'B-loc'), ('con', 'I-loc'), ('80', 'I-loc')]


In [21]:
from spacy import displacy


# Index of the sample sentence to render.
n=64
test_text = X[n]

doc2 = nlp(test_text)
# NOTE(review): these keys are upper-case while the labels in the recorded
# output are lower-case (e.g. 'B-loc'); confirm displaCy normalizes the case.
colors = {"B-LOC": "#fc9ce7", "I-LOC": "#fc9ce7","B-TIME":'#3371ff',"I-TIME":'#3371ff'}
options = {"ents": ["B-LOC","I-LOC","B-TIME","I-TIME"], "colors": colors}
displacy.render(doc2, style="ent",options=options)

# (text, label) pairs that get_entities() reads as a module-level name.
ents = [(e.text, e.label_) for e in doc2.ents]

def get_entities(spans=None):
    """Merge token-level B-/I- spans into whole entities.

    Args:
        spans: optional iterable of ``(text, label)`` pairs whose labels look
            like ``"B-loc"`` / ``"I-loc"``. When omitted, the module-level
            ``ents`` list is used, which is what the original zero-argument
            call did.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples. ``entity_text`` is
        the space-joined text of a run of spans and ``entity_type`` is the
        part after the dash in the label of the run's last span. A new run
        starts at every ``B-`` label.
    """
    source = ents if spans is None else spans
    entities = []
    entity = ''
    tokens = []
    for token, ner in source:
        parts = ner.split("-")
        ner_iob, ner_text = parts[0], parts[1]

        # A B- label closes the run collected so far, if there is one.
        if ner_iob == 'B' and tokens:
            entities.append((' '.join(tokens), entity))
            tokens = []

        entity = ner_text
        tokens.append(token)

    # Flush the final run; nothing to flush when the input was empty.
    if tokens:
        entities.append((' '.join(tokens), entity))

    return entities

# Merge the spans currently stored in `ents` and display the result.
entities = get_entities()
entities

[('Bogota', 'loc'),
 ('calle 80', 'loc'),
 ('el . peaje la vega', 'loc'),
 ('kra.114 con 80', 'loc')]

In [24]:
# Keep only the text of the entities typed 'loc'.
ent = [text for text, label in entities if label == 'loc']
ent

['Bogota', 'calle 80', 'el . peaje la vega', 'kra.114 con 80']

In [159]:
# All location texts as one space-separated string.
' '.join(ent)

'Bogota calle 80 el . peaje la vega kra.114 con 80'