In [1]:
import pandas as pd
from pprint import pprint
import random

import spacy
from spacy.gold import GoldParse

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ

import pycrfsuite

# Let's detect natural disasters!

https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data

## 0. Get the data

In [2]:
df_dataset = pd.read_csv("ner_dataset.csv", encoding="latin1")
df_dataset.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [3]:
last_sent_id= 0
for i, row in df_dataset.iterrows():  
    if not pd.isnull(row["Sentence #"]):
        last_sent_id = int(row["Sentence #"][10:])
        row["Sentence #"] = last_sent_id
    else:
        row["Sentence #"] = last_sent_id

### Find those with 'nat' tag:

In [4]:
df_dataset.dtypes

Sentence #    object
Word          object
POS           object
Tag           object
dtype: object

In [5]:
sent_id = df_dataset[df_dataset["Tag"].str.contains("nat")]["Sentence #"].unique()

In [6]:
df_dataset_nat = df_dataset[df_dataset["Sentence #"].isin(sent_id)]

### Remap tags

In [8]:
lst_tags = df_dataset_nat["Tag"].unique().tolist()

In [9]:
lst_tags.remove("I-nat")
lst_tags.remove("B-nat")

In [10]:
dict_tags = {}
for i in lst_tags:
    dict_tags[i] = "O"
    
dict_tags["I-nat"] = "I-NAT"
dict_tags["B-nat"] = "B-NAT"

In [11]:
df_dataset_nat["Tag remapped"] = df_dataset_nat["Tag"].map(dict_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [12]:
df_dataset_nat["Tag remapped"].unique().tolist()

['O', 'B-NAT', 'I-NAT']

## 1. Using Spacy

In [13]:
LABEL = 'NAT'
MAX_ITERATIONS = 50

### Training format

In [14]:
def join_space(values):
    return " ".join(values).strip()

In [15]:
df_sentences_1 = df_dataset_nat.groupby("Sentence #")["Word"].apply(list).reset_index()

In [16]:
df_sentences_2 = df_dataset_nat.groupby("Sentence #")["Tag remapped"].apply(list).reset_index()

In [17]:
df_sentences = pd.merge(left=df_sentences_1, right = df_sentences_2, on = "Sentence #")

In [18]:
df_sentences.head(4)

Unnamed: 0,Sentence #,Word,Tag remapped
0,121,"[Officials, say, the, 27-year, old, man, from,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,206,"[Humans, are, usually, infected, with, bird, f...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,227,"[One, of, the, 2008, Olympic, mascots, is, mod...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAT, I-..."
3,229,"[Sam, Beattie, reports, from, Jing, Jing, 's, ...","[O, O, O, O, B-NAT, I-NAT, O, O, O, O, O, O]"


In [19]:
train_data = []

for i, row in df_sentences.iterrows():
    raw_sent = " ".join(row["Word"]).replace(" ,", ",")
    
    tags = list(zip(row["Word"],row["Tag remapped"]))
    advance = 0

    new_ents = []

    for i in range(len(tags)):
        tag = tags[i]

        word = tag[0]
        ent = tag[1]

        ent = ent.replace("B-", "")
        ent = ent.replace("I-", "")
        ent = ent.replace("L-", "")
        ent = ent.replace("O-", "")
        ent = ent.replace("U-", "")

        ent_range = [advance, advance + len(word), ent]

        advance += len(word)
        if i < (len(tags) - 1):
            if tags[i + 1][0] != ',':
                advance += 1

        if not ent_range[2] == "O":
            new_ents.append(ent_range)

    new_ents_merged = []

    for j in range(len(new_ents)):
        if len(new_ents_merged) == 0:
            new_ents_merged.append(new_ents[j])

        if new_ents_merged[-1][2] == new_ents[j][2]:
            new_ents_merged[-1][1] = new_ents[j][1]
        else:
            new_ents_merged.append(new_ents[j])

    new_ents_merged_tuples = [tuple(item) for item in new_ents_merged]
    train_data.append((raw_sent, {"entities": new_ents_merged_tuples}))

In [20]:
pprint(train_data[:2])

[("Officials say the 27-year old man from Vietnam 's northern Ninh Binh "
  'province died late Thursday and tested positive for the H5N1 strain of bird '
  'flu .',
  {'entities': [(125, 129, 'NAT')]}),
 ('Humans are usually infected with bird flu by direct contact with infected '
  'poultry, but experts fear the H5N1 virus may mutate into a form easily '
  'transmitted between people .',
  {'entities': [(104, 108, 'NAT')]})]


### Split

In [21]:
test_data = train_data[155:]
train_data = train_data[:155]

### Train

In [22]:
nlp = spacy.load("en_core_web_sm")

In [23]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

else:
    ner = nlp.get_pipe('ner')

In [24]:
ner.add_label(LABEL)

In [25]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [26]:
%%time
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(2):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35,losses=losses)
        print(losses)

{'ner': 1151.5631708588069}
{'ner': 970.3055948485708}
Wall time: 1min 32s


In [27]:
nlp.meta['name'] = "en_core_web_sm_newlabel"

In [28]:
nlp.to_disk("models")

### Test it out

In [29]:
nlp2 = spacy.load("models")

In [30]:
y_true = []

for i, test in enumerate(test_data):
    y_true.append([test[0][j[0]:j[1]] for j in test[1]["entities"]])

In [31]:
y_predict = []
for test in test_data:
    doc = nlp2(test[0])
    y_predict.append([ent.text for ent in doc.ents if ent.label_ == "NAT"])

In [32]:
y_predict[0]

['Rita']

In [33]:
def evaluate(y_predict, y_true):
    correct = 0
    for j, val in enumerate(y_predict):
        if val == y_true[j]:
            correct += 1
            
    return correct / len(y_predict)

In [34]:
evaluate(y_predict=y_predict, y_true=y_true)

0.64

## 3. Using PyCRF

### Training format

In [35]:
lemmatizer = WordNetLemmatizer()

In [36]:
train_data = []
for index, row in df_sentences.iterrows():
    
    train_data_sentence = []
    
    raw_sent = row["Word"]
    tokens = nltk.pos_tag(raw_sent)

    for i, val in enumerate(tokens):
        train_data_word = []
        
        word = raw_sent[i]
        label = row["Tag remapped"][i]
        pos_tag = tokens[i][1]

        if pos_tag.startswith("N"):
            lemma = lemmatizer.lemmatize(word.lower(), pos=NOUN)
        elif pos_tag.startswith("V"):
            lemma = lemmatizer.lemmatize(word.lower(), pos=VERB)
        elif pos_tag.startswith("J"):
            lemma = lemmatizer.lemmatize(word.lower(), pos=ADJ)
        else:
            lemma = word
            
        train_data_word.append(word)
        train_data_word.append(pos_tag)
        train_data_word.append(lemma)
        train_data_word.append(label)
        
        train_data_sentence.append(train_data_word)
        
    train_data.append(train_data_sentence)

### Feature engineering

In [37]:
def word2features(sent, i, embed={}, use_gazetteers=False):
    word = sent[i][0]
    postag = sent[i][-3]
    lemma = sent[i][-2].lower()
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2]
    ]
    if embed != {}:
        features.extend(['word.embed=%s' % embed.get(word, len(embed))])
    if use_gazetteers:
        features.extend(['word.measures=%s' % str(word.lower() in UNIT_GAZETTEER or lemma in UNIT_GAZETTEER),
                        'word.products=%s' % str(word.lower() in PRODUCTS_GAZETTEER or lemma in PRODUCTS_GAZETTEER)])

    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][-3]
        lemma1 = sent[i - 1][-2].lower()
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2]
        ])
        if embed != {}:
            features.extend(['-1:word.embed=%s' % embed.get(word1, len(embed))])
        if use_gazetteers:
            features.extend(['-1:word.measures=%s' % str(word1.lower() in UNIT_GAZETTEER or lemma1 in UNIT_GAZETTEER),
                            '-1:word.products=%s' % str(word1.lower() in PRODUCTS_GAZETTEER or lemma1 in PRODUCTS_GAZETTEER)])

    else:
        features.append('BOS')

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][-3]
        lemma1 = sent[i + 1][-2].lower()
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2]
        ])
        if use_gazetteers:
            features.extend(['+1:word.measures=%s' % str(word1.lower() in UNIT_GAZETTEER or lemma1 in UNIT_GAZETTEER),
                            '+1:word.products=%s' % str(word1.lower() in PRODUCTS_GAZETTEER or lemma1 in PRODUCTS_GAZETTEER)])

    else:
        features.append('EOS')

    return features

In [38]:
def sent2features(sent, embed={}, use_gazetteers=False):

    return [word2features(sent, i, embed=embed, use_gazetteers=use_gazetteers) for i in range(len(sent))]

In [39]:
train_data_formatted = [sent2features(x) for x in train_data]

### Labels

In [40]:
y_data = df_sentences["Tag remapped"].tolist()

### Split

In [41]:
x_test = train_data_formatted[155:]
y_test = y_data[155:]

x_train = train_data_formatted[:155]
y_train = y_data[:155]

### Model training

In [42]:
def train(X_train, y_train, model_name):
    """ Trains a CRF on the given training data and saves the model. """
    print("Training", model_name)
    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 0.1,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'feature.possible_transitions': True
    })

    trainer.train(model_name)

In [43]:
%%time
train(x_train, y_train, 'pycrfmodel.model')

Training pycrfmodel.model
Wall time: 6.36 s


In [44]:
def tag(X_test,model_name):
    """ Labels test data with the model saved in model_name. """
    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)

    return [tagger.tag(seq) for seq in X_test]

In [45]:
tag(x_test, 'pycrfmodel.model')[0]

['O',
 'O',
 'O',
 'B-NAT',
 'I-NAT',
 'I-NAT',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [46]:
def evaluate(y_predict, y_true, ignore_bio = True):
    correct = 0
    total = 0
    for i, y_pred in enumerate(y_predict):
        for j, y in enumerate(y_pred):
            if ignore_bio:
                if y[2:] == y_true[i][j][2:]:
                    correct += 1
                
            else:
                if y == y_true[i][j]:
                    correct += 1
                
            
        
        total += len(y_pred)
        
    return correct / total
        

In [47]:
evaluate(tag(x_test, 'pycrfmodel.model'), y_test, ignore_bio=True)

0.9840213049267643