### Imports

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import scipy
import numpy as np
import pandas as pd
import os,sys,inspect

In [3]:
# The skseq library must be previously installed. Here we assume it is in '../skseq'
sys.path.insert(0,'../') 
import skseq

from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList

from skseq.sequences.label_dictionary import LabelDictionary

import skseq.sequences.structured_perceptron as spc

In [27]:
from utils.utils import evaluate_corpus

### Load data

In [5]:
df_train = pd.read_csv('data/train_data_ner.csv')
print(df_train.shape)

(839149, 3)


In [6]:
df_test = pd.read_csv('data/test_data_ner.csv')
print(df_test.shape)
df_test.head(5)

(837339, 3)


Unnamed: 0,sentence_id,words,tags
0,1,Iranian,B-gpe
1,1,officials,O
2,1,say,O
3,1,they,O
4,1,expect,O


### Dictionaries

In [7]:
# We assume that the training data includes all possible tags that may exist (17 tags)

tag_dict = LabelDictionary(label_names=set(df_train.tags))
print('Number of distinct tags:', len(tag_dict))
tag_dict

Number of distinct tags: 17


{'I-nat': 0,
 'B-per': 1,
 'I-geo': 2,
 'B-nat': 3,
 'O': 4,
 'I-org': 5,
 'B-art': 6,
 'I-art': 7,
 'I-tim': 8,
 'B-org': 9,
 'I-per': 10,
 'I-gpe': 11,
 'B-eve': 12,
 'I-eve': 13,
 'B-gpe': 14,
 'B-geo': 15,
 'B-tim': 16}

In [8]:
# For the word dictionary, we are including both the words in train and test. 
# Otherwise there will be an error when creating a SequenceList with the test dataset.
# This should not be a problem, since the model is trained with training data only, so
# it will not use those words.

# word_dict = LabelDictionary(label_names=set(df_train.words))
word_dict = LabelDictionary(label_names=(set(df_train.words) | set(df_test.words)))  # Union of train and test sets of words
print('Number of distinct words:', len(word_dict))

Number of distinct words: 55145


### Build train_seq and test_seq

In [9]:
%%time

# Build sequence_list object with train data

train_seq = SequenceList(word_dict, tag_dict)

for sentence_id, group in df_train.groupby('sentence_id'):
    seq_x = []
    seq_y = []
    for i in range(len(group)):
        seq_x.append(group.iloc[i].words)
        seq_y.append(group.iloc[i].tags)
    train_seq.add_sequence(seq_x, seq_y, train_seq.x_dict, train_seq.y_dict)


CPU times: user 53.4 s, sys: 353 ms, total: 53.7 s
Wall time: 53.6 s


In [10]:
%%time

# Build sequence_list object with test data

test_seq = SequenceList(word_dict, tag_dict)

for sentence_id, group in df_test.groupby('sentence_id'):
    seq_x = []
    seq_y = []
    for i in range(len(group)):
        seq_x.append(group.iloc[i].words)
        seq_y.append(group.iloc[i].tags)
    test_seq.add_sequence(seq_x, seq_y, test_seq.x_dict, test_seq.y_dict)


CPU times: user 53.4 s, sys: 222 ms, total: 53.6 s
Wall time: 53.5 s


In [11]:
# Number of sequences in train and test
len(train_seq), len(test_seq)

(38366, 38367)

In [12]:
# vocabulary size
len(train_seq.x_dict), len(test_seq.x_dict)

(55145, 55145)

In [13]:
train_seq[0]

9504/4 6457/4 34759/4 29132/4 4975/4 48564/4 34608/15 35297/4 16790/4 10114/4 48592/4 21341/4 28205/15 22707/4 24069/4 10114/4 14295/4 6457/4 1115/14 640/4 16736/4 966/4 53397/4 50892/4 

In [14]:
train_seq[0].to_words(sequence_list=train_seq)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

### Get feature mapper (from training data)

In [15]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

### Load saved model

In [16]:
sp = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper)

In [17]:
sp.load_model(dir="fitted_models/model1")

## Evaluate model

In [18]:
%%time
pred_train = sp.viterbi_decode_corpus(train_seq)

CPU times: user 2min 30s, sys: 1.22 s, total: 2min 31s
Wall time: 7min 8s


In [19]:
%%time
pred_test = sp.viterbi_decode_corpus(test_seq)

CPU times: user 2min 27s, sys: 582 ms, total: 2min 28s
Wall time: 2min 28s


In [28]:
# Evaluate train dataset
O_value = train_seq.y_dict['O']  # code of 'O' value, to be ignored
eval_train = evaluate_corpus(train_seq.seq_list, pred_train, train_seq.y_dict, ignore_tag_code=O_value)

Accuracy: 0.963
Weighted F1 score: 0.962
Accuracy (ignore tag excluded): 0.805
Weighted F1 score (ignore tag excluded): 0.839


In [29]:
# Evaluate test dataset
eval_test = evaluate_corpus(test_seq.seq_list, pred_test, test_seq.y_dict, ignore_tag_code=O_value)

Accuracy: 0.876
Weighted F1 score: 0.847
Accuracy (ignore tag excluded): 0.229
Weighted F1 score (ignore tag excluded): 0.353


## Run the "tiny test"

In [30]:
def predict_sentence(p):
    # Create a seq object with empty tags
    new_seq = skseq.sequences.sequence.Sequence(x=p.split(), y=[int(0) for w in p.split()])
    return sp.viterbi_decode(new_seq)[0].to_words(train_seq, only_tag_translation=True)

In [31]:
predict_sentence("The programmers from Barcelona might write a sentence without a spell checker.")

'The/O programmers/O from/O Barcelona/B-geo might/O write/O a/O sentence/O without/O a/O spell/B-art checker./I-art '

In [32]:
predict_sentence("The programmers from Barchelona cannot write a sentence without a spell checker.")

'The/O programmers/O from/O Barchelona/O cannot/O write/O a/O sentence/O without/O a/O spell/B-art checker./I-art '

In [33]:
predict_sentence("Jack London went to Parris.")

'Jack/B-per London/B-geo went/O to/O Parris./O '

In [34]:
predict_sentence("Jack London went to Paris.")

'Jack/B-per London/B-geo went/O to/O Paris./O '

In [35]:
predict_sentence("Bill gates and Steve jobs never though Microsoft would become such a big company.")

'Bill/B-per gates/O and/O Steve/B-per jobs/O never/O though/O Microsoft/B-org would/O become/O such/O a/O big/O company./O '

In [36]:
predict_sentence("Bill Gates and Steve Jobs never though Microsof would become such a big company.")

'Bill/B-per Gates/I-per and/O Steve/B-per Jobs/I-per never/O though/O Microsof/O would/O become/O such/O a/O big/O company./O '

In [37]:
predict_sentence("The president of U.S.A though they could win the war.")

'The/O president/O of/O U.S.A/O though/O they/O could/O win/O the/O war./O '

In [38]:
predict_sentence("The president of the United States of America though they could win the war.")

'The/O president/O of/O the/O United/B-geo States/I-geo of/O America/B-geo though/O they/O could/O win/O the/O war./O '

In [39]:
predict_sentence("The king of Saudi Arabia wanted total control.")

'The/O king/O of/O Saudi/B-geo Arabia/I-geo wanted/O total/O control./O '

In [40]:
predict_sentence("Robin does not want to go to Saudi Arabia.")

'Robin/O does/O not/O want/O to/O go/O to/O Saudi/B-art Arabia./I-art '

In [41]:
predict_sentence("Apple is a great company.")

'Apple/O is/O a/O great/O company./O '

In [42]:
predict_sentence("I really love apples and oranges.")

'I/O really/O love/O apples/O and/O oranges./O '

In [43]:
predict_sentence("Alice and Henry went to the Microsoft store to buy a new computer during their trip to New York.")

'Alice/O and/O Henry/B-per went/O to/O the/O Microsoft/B-org store/O to/O buy/O a/O new/O computer/O during/O their/O trip/O to/O New/B-org York./I-org '