### Imports

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import scipy
import numpy as np
import pandas as pd
import os,sys,inspect

In [3]:
# The skseq library must be previously installed. Here we assume it is in '../skseq'
sys.path.insert(0,'../') 
import skseq

from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList

from skseq.sequences.label_dictionary import LabelDictionary

import skseq.sequences.structured_perceptron as spc

In [4]:
from utils.utils import build_sequence_list, evaluate_corpus, generate_tiny_test

### Load data

In [5]:
df_train = pd.read_csv('data/train_data_ner.csv')
print(df_train.shape)

(839149, 3)


In [6]:
df_test = pd.read_csv('data/test_data_ner.csv')
print(df_test.shape)
df_test.head(5)

(837339, 3)


Unnamed: 0,sentence_id,words,tags
0,1,Iranian,B-gpe
1,1,officials,O
2,1,say,O
3,1,they,O
4,1,expect,O


In [7]:
tiny_test = generate_tiny_test()
tiny_test

['The programmers from Barcelona might write a sentence without a spell checker .',
 'The programmers from Barchelona cannot write a sentence without a spell checker .',
 'Jack London went to Parris .',
 'Jack London went to Paris .',
 'Bill gates and Steve jobs never though Microsoft would become such a big company .',
 'Bill Gates and Steve Jobs never though Microsof would become such a big company .',
 'The president of U.S.A though they could win the war .',
 'The president of the United States of America though they could win the war .',
 'The king of Saudi Arabia wanted total control .',
 'Robin does not want to go to Saudi Arabia .',
 'Apple is a great company .',
 'I really love apples and oranges .',
 'Alice and Henry went to the Microsoft store to buy a new computer during their trip to New York .']

### Dictionaries

In [8]:
# We assume that the training data includes all possible tags that may exist (17 tags)

tag_dict = LabelDictionary(label_names=set(df_train.tags))
print('Number of distinct tags:', len(tag_dict))
tag_dict

Number of distinct tags: 17


{'I-nat': 0,
 'I-geo': 1,
 'B-nat': 2,
 'B-eve': 3,
 'I-tim': 4,
 'B-per': 5,
 'B-geo': 6,
 'B-org': 7,
 'B-tim': 8,
 'O': 9,
 'I-art': 10,
 'I-eve': 11,
 'B-gpe': 12,
 'I-gpe': 13,
 'I-per': 14,
 'B-art': 15,
 'I-org': 16}

In [9]:
# Generate dictionary of words for training dataset
word_dict_train = LabelDictionary(label_names=set(df_train.words))
print('Number of distinct words in train set: ', len(word_dict_train))

Number of distinct words in train set:  31979


In [10]:
# Generate dictionary of words for test dataset
word_dict_test = LabelDictionary(label_names=set(df_test.words))
print('Number of distinct words in test set: ', len(word_dict_test))

Number of distinct words in test set:  43955


### Build train_seq and test_seq

In [11]:
%%time

# Build sequence_list object with train data

train_seq = build_sequence_list(df_train, word_dict_train, tag_dict)

CPU times: user 4.96 s, sys: 34.5 ms, total: 4.99 s
Wall time: 4.99 s


In [12]:
%%time

# Build sequence_list object with test data

test_seq = build_sequence_list(df_test, word_dict_test, tag_dict, use_labels=True)

CPU times: user 4.79 s, sys: 19.5 ms, total: 4.81 s
Wall time: 4.81 s


In [13]:
# Number of sequences in train and test
len(train_seq), len(test_seq)

(38366, 38367)

In [14]:
# vocabulary size
len(train_seq.x_dict), len(test_seq.x_dict)

(31979, 43955)

In [15]:
train_seq[0]

3298/9 11978/9 1565/9 11508/9 14682/9 25550/9 19553/6 28976/9 7421/9 31009/9 1960/9 23772/9 5971/6 16157/9 1861/9 31009/9 18631/9 11978/9 18273/12 29905/9 20283/9 4269/9 17062/9 11799/9 

In [16]:
train_seq[0].to_words(sequence_list=train_seq)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

### Get feature mapper (from training data)

In [17]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

In [19]:
from skseq.sequences.extended_feature import ExtendedFeatures
extended_feature_mapper = ExtendedFeatures(train_seq, "fitted_models/cluster_dict.joblib", 100)
extended_feature_mapper.build_features()

### Load saved model

In [20]:
sp = spc.StructuredPerceptron(word_dict_train, tag_dict, feature_mapper)

In [21]:
sp.load_model(dir="fitted_models/model1")

In [24]:
sp_features = spc.StructuredPerceptron(word_dict_train, tag_dict, extended_feature_mapper)
sp_features.load_model(dir="fitted_models/model_features_")

## Evaluate model

In [25]:
%%time
pred_train = sp.viterbi_decode_corpus(train_seq)

CPU times: user 2min 12s, sys: 126 ms, total: 2min 12s
Wall time: 2min 12s


In [26]:
%%time
pred_test = sp.viterbi_decode_corpus(test_seq)

CPU times: user 2min 11s, sys: 126 ms, total: 2min 11s
Wall time: 2min 11s


In [27]:
# Evaluate train dataset
O_value = train_seq.y_dict['O']  # code of 'O' value, to be ignored
eval_train = evaluate_corpus(train_seq.seq_list, pred_train, train_seq.y_dict, ignore_tag_code=O_value)

Accuracy: 0.963
Weighted F1 score: 0.963
Accuracy (ignore tag excluded): 0.806
Weighted F1 score (ignore tag excluded): 0.840


In [28]:
# Evaluate test dataset
eval_test = evaluate_corpus(test_seq.seq_list, pred_test, test_seq.y_dict, ignore_tag_code=O_value)

Accuracy: 0.876
Weighted F1 score: 0.847
Accuracy (ignore tag excluded): 0.230
Weighted F1 score (ignore tag excluded): 0.354


In [32]:
%%time
pred_train_features = sp_features.viterbi_decode_corpus(train_seq)

CPU times: user 3min 5s, sys: 110 ms, total: 3min 5s
Wall time: 3min 5s


In [33]:
%%time
pred_test_features = sp_features.viterbi_decode_corpus(test_seq)

CPU times: user 3min 22s, sys: 169 ms, total: 3min 22s
Wall time: 3min 22s


In [34]:
# Evaluate train dataset
O_value = train_seq.y_dict['O']  # code of 'O' value, to be ignored
eval_train_features = evaluate_corpus(train_seq.seq_list, pred_train_features, train_seq.y_dict, ignore_tag_code=O_value)

Accuracy: 0.962
Weighted F1 score: 0.963
Accuracy (ignore tag excluded): 0.830
Weighted F1 score (ignore tag excluded): 0.848


In [35]:
# Evaluate test dataset
eval_test_features = evaluate_corpus(test_seq.seq_list, pred_test_features, test_seq.y_dict, ignore_tag_code=O_value)

Accuracy: 0.894
Weighted F1 score: 0.899
Accuracy (ignore tag excluded): 0.478
Weighted F1 score (ignore tag excluded): 0.490


## Run the "tiny test"

In [40]:
tiny_predictions = []
tiny_predictions_features = []

for tiny_sentence in tiny_test:
    tiny_seq = skseq.sequences.sequence.Sequence(x=tiny_sentence.split(), y=[int(0) for w in tiny_sentence.split()])
    tiny_predictions.append(sp.viterbi_decode(tiny_seq)[0].to_words(train_seq, only_tag_translation=True))
    tiny_predictions_features.append(sp_features.viterbi_decode(tiny_seq)[0].to_words(train_seq, only_tag_translation=True))

In [41]:
for tiny_prediction in tiny_predictions:
    print(tiny_prediction)

The/O programmers/O from/O Barcelona/B-geo might/O write/O a/O sentence/O without/O a/O spell/O checker/O ./O 
The/O programmers/O from/O Barchelona/O cannot/O write/O a/O sentence/O without/O a/O spell/O checker/O ./O 
Jack/B-per London/B-geo went/O to/O Parris/O ./O 
Jack/B-per London/B-geo went/O to/O Paris/B-geo ./O 
Bill/B-per gates/O and/O Steve/B-per jobs/O never/O though/O Microsoft/B-org would/O become/O such/O a/O big/O company/O ./O 
Bill/B-per Gates/I-per and/O Steve/B-per Jobs/I-per never/O though/O Microsof/O would/O become/O such/O a/O big/O company/O ./O 
The/O president/O of/O U.S.A/O though/O they/O could/O win/O the/O war/O ./O 
The/O president/O of/O the/O United/B-geo States/I-geo of/O America/B-geo though/O they/O could/O win/O the/O war/O ./O 
The/O king/O of/O Saudi/B-geo Arabia/I-geo wanted/O total/O control/O ./O 
Robin/O does/O not/O want/O to/O go/O to/O Saudi/B-org Arabia/I-org ./O 
Apple/O is/O a/O great/O company/O ./O 
I/O really/O love/O apples/O and/O 

In [42]:
for tiny_prediction in tiny_predictions_features:
    print(tiny_prediction)

The/O programmers/O from/O Barcelona/B-org might/O write/O a/O sentence/O without/O a/O spell/B-tim checker/I-tim ./O 
The/O programmers/O from/O Barchelona/B-geo cannot/O write/O a/O sentence/O without/O a/O spell/B-tim checker/I-tim ./O 
Jack/B-per London/B-geo went/O to/O Parris/B-per ./O 
Jack/B-per London/B-geo went/O to/O Paris/B-geo ./O 
Bill/B-per gates/O and/O Steve/B-per jobs/O never/O though/O Microsoft/B-org would/O become/O such/O a/O big/O company/O ./O 
Bill/B-per Gates/I-per and/O Steve/B-per Jobs/I-per never/O though/O Microsof/B-art would/O become/O such/O a/O big/O company/O ./O 
The/O president/O of/O U.S.A/B-geo though/O they/O could/O win/O the/O war/O ./O 
The/O president/O of/O the/O United/B-org States/I-org of/I-org America/I-org though/O they/O could/O win/O the/O war/O ./O 
The/O king/O of/O Saudi/B-org Arabia/I-org wanted/O total/O control/O ./O 
Robin/B-per does/O not/O want/O to/O go/O to/O Saudi/B-org Arabia/I-org ./O 
Apple/B-per is/O a/O great/O compan