### Imports

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import scipy
import numpy as np
import pandas as pd
import os,sys,inspect

In [3]:
# The skseq library must be previously installed. Here we assume it is in '../skseq'
sys.path.insert(0,'../') 
import skseq

from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList

from skseq.sequences.label_dictionary import LabelDictionary

import skseq.sequences.structured_perceptron as spc

In [4]:
from utils.utils import build_sequence_list, evaluate_corpus

### Load data

In [5]:
df_train = pd.read_csv('data/train_data_ner.csv')
print(df_train.shape)

(839149, 3)


In [6]:
df_test = pd.read_csv('data/test_data_ner.csv')
print(df_test.shape)
df_test.head(5)

(837339, 3)


Unnamed: 0,sentence_id,words,tags
0,1,Iranian,B-gpe
1,1,officials,O
2,1,say,O
3,1,they,O
4,1,expect,O


### Dictionaries

In [7]:
# We assume that the training data includes all possible tags that may exist (17 tags)

tag_dict = LabelDictionary(label_names=set(df_train.tags))
print('Number of distinct tags:', len(tag_dict))
tag_dict

Number of distinct tags: 17


{'I-nat': 0,
 'B-art': 1,
 'B-nat': 2,
 'I-geo': 3,
 'I-per': 4,
 'I-gpe': 5,
 'I-org': 6,
 'B-gpe': 7,
 'O': 8,
 'I-art': 9,
 'I-eve': 10,
 'B-per': 11,
 'I-tim': 12,
 'B-org': 13,
 'B-eve': 14,
 'B-tim': 15,
 'B-geo': 16}

In [8]:
# Generate dictionary of words for training dataset
word_dict_train = LabelDictionary(label_names=set(df_train.words))
print('Number of distinct words in train set: ', len(word_dict_train))

Number of distinct words in train set:  31979


In [9]:
# Generate dictionary of words for test dataset
word_dict_test = LabelDictionary(label_names=set(df_test.words))
print('Number of distinct words in test set: ', len(word_dict_test))

Number of distinct words in test set:  43955


### Build train_seq and test_seq

In [10]:
%%time

# Build sequence_list object with train data

train_seq = build_sequence_list(df_train, word_dict_train, tag_dict)

CPU times: user 1.47 s, sys: 17.6 ms, total: 1.49 s
Wall time: 1.49 s


In [11]:
%%time

# Build sequence_list object with test data

test_seq = build_sequence_list(df_test, word_dict_test, tag_dict, use_labels=True)

CPU times: user 1.35 s, sys: 20.7 ms, total: 1.37 s
Wall time: 1.37 s


In [12]:
# Number of sequences in train and test
len(train_seq), len(test_seq)

(38366, 38367)

In [13]:
# vocabulary size
len(train_seq.x_dict), len(test_seq.x_dict)

(31979, 43955)

In [14]:
train_seq[0]

19374/8 2285/8 21686/8 12468/8 25447/8 18898/8 17647/16 25471/8 10856/8 9017/8 2890/8 8141/8 20800/16 5877/8 856/8 9017/8 3174/8 2285/8 3422/7 25366/8 8375/8 6862/8 15822/8 2558/8 

In [15]:
train_seq[0].to_words(sequence_list=train_seq)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

### Get feature mapper (from training data)

In [16]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

In [22]:
from skseq.sequences.extended_feature import ExtendedFeatures
clusters_dict = 'fitted_models/clusters_dict.joblib')
extended_feature_mapper = ExtendedFeatures(train_seq, clusters_dict, 100)
extended_feature_mapper.build_features()

NameError: name 'clusters_dict' is not defined

### Load saved model

In [17]:
sp = spc.StructuredPerceptron(word_dict_train, tag_dict, feature_mapper)

In [18]:
sp.load_model(dir="fitted_models/model1")

In [None]:
sp_features = spc.StructuredPerceptron(word_dict_train, tag_dict, feature_mapper)
sp_features.save_model("fitted_models/model_features_")

## Evaluate model

In [19]:
%%time
pred_train = sp.viterbi_decode_corpus(train_seq)

KeyboardInterrupt: 

In [20]:
%%time
pred_test = sp.viterbi_decode_corpus(test_seq)

CPU times: user 1min 13s, sys: 456 ms, total: 1min 13s
Wall time: 1min 14s


In [21]:
# Evaluate train dataset
O_value = train_seq.y_dict['O']  # code of 'O' value, to be ignored
eval_train = evaluate_corpus(train_seq.seq_list, pred_train, train_seq.y_dict, ignore_tag_code=O_value)

NameError: name 'pred_train' is not defined

In [None]:
# Evaluate test dataset
eval_test = evaluate_corpus(test_seq.seq_list, pred_test, test_seq.y_dict, ignore_tag_code=O_value)

## Run the "tiny test"

In [30]:
def predict_sentence(p):
    # Create a seq object with empty tags
    new_seq = skseq.sequences.sequence.Sequence(x=p.split(), y=[int(0) for w in p.split()])
    return sp.viterbi_decode(new_seq)[0].to_words(train_seq, only_tag_translation=True)

In [31]:
predict_sentence("The programmers from Barcelona might write a sentence without a spell checker.")

'The/O programmers/O from/O Barcelona/B-geo might/O write/O a/O sentence/O without/O a/O spell/B-art checker./I-art '

In [32]:
predict_sentence("The programmers from Barchelona cannot write a sentence without a spell checker.")

'The/O programmers/O from/O Barchelona/O cannot/O write/O a/O sentence/O without/O a/O spell/B-art checker./I-art '

In [33]:
predict_sentence("Jack London went to Parris.")

'Jack/B-per London/B-geo went/O to/O Parris./O '

In [34]:
predict_sentence("Jack London went to Paris.")

'Jack/B-per London/B-geo went/O to/O Paris./O '

In [35]:
predict_sentence("Bill gates and Steve jobs never though Microsoft would become such a big company.")

'Bill/B-per gates/O and/O Steve/B-per jobs/O never/O though/O Microsoft/B-org would/O become/O such/O a/O big/O company./O '

In [36]:
predict_sentence("Bill Gates and Steve Jobs never though Microsof would become such a big company.")

'Bill/B-per Gates/I-per and/O Steve/B-per Jobs/I-per never/O though/O Microsof/O would/O become/O such/O a/O big/O company./O '

In [37]:
predict_sentence("The president of U.S.A though they could win the war.")

'The/O president/O of/O U.S.A/O though/O they/O could/O win/O the/O war./O '

In [38]:
predict_sentence("The president of the United States of America though they could win the war.")

'The/O president/O of/O the/O United/B-geo States/I-geo of/O America/B-geo though/O they/O could/O win/O the/O war./O '

In [39]:
predict_sentence("The king of Saudi Arabia wanted total control.")

'The/O king/O of/O Saudi/B-geo Arabia/I-geo wanted/O total/O control./O '

In [40]:
predict_sentence("Robin does not want to go to Saudi Arabia.")

'Robin/O does/O not/O want/O to/O go/O to/O Saudi/B-art Arabia./I-art '

In [41]:
predict_sentence("Apple is a great company.")

'Apple/O is/O a/O great/O company./O '

In [42]:
predict_sentence("I really love apples and oranges.")

'I/O really/O love/O apples/O and/O oranges./O '

In [43]:
predict_sentence("Alice and Henry went to the Microsoft store to buy a new computer during their trip to New York.")

'Alice/O and/O Henry/B-per went/O to/O the/O Microsoft/B-org store/O to/O buy/O a/O new/O computer/O during/O their/O trip/O to/O New/B-org York./I-org '