### Imports

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
import scipy
import numpy as np
import pandas as pd

import os,sys,inspect

In [4]:
# The skseq library must be previously installed. Here we assume it is in '../skseq'
sys.path.insert(0,'../') 
import skseq

from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList

from skseq.sequences.label_dictionary import LabelDictionary

import skseq.sequences.structured_perceptron as spc

### Read data

In [7]:
df_train = pd.read_csv('data/train_data_ner.csv')
print(df_train.shape)
df_train.head(10)

(839149, 3)


Unnamed: 0,sentence_id,words,tags
0,0,Thousands,O
1,0,of,O
2,0,demonstrators,O
3,0,have,O
4,0,marched,O
5,0,through,O
6,0,London,B-geo
7,0,to,O
8,0,protest,O
9,0,the,O


In [9]:
df_test = pd.read_csv('data/test_data_ner.csv')
print(df_test.shape)

(837339, 3)


In [10]:
# We assume that the training data includes all possible tags that may exist (17 tags)

tag_dict = LabelDictionary(label_names=set(df_train.tags))
print('Number of distinct tags:', len(tag_dict))
tag_dict

Number of distinct tags: 17


{'I-nat': 0,
 'B-tim': 1,
 'I-geo': 2,
 'B-org': 3,
 'I-per': 4,
 'B-gpe': 5,
 'I-gpe': 6,
 'I-tim': 7,
 'O': 8,
 'B-nat': 9,
 'B-art': 10,
 'B-per': 11,
 'B-eve': 12,
 'I-org': 13,
 'B-geo': 14,
 'I-art': 15,
 'I-eve': 16}

In [11]:
# For the word dictionary, we are including both the set of words in train and test. 
# Otherwise there is an error when creating a SequenceList with the test dataset.
# The test dataset is only used to get the list of words.
# This should not be an issue, since the model is trained with training data only, so it 
# will not use any word that is not in training.

word_dict = LabelDictionary(label_names=(set(df_train.words) | set(df_test.words)))  # Union of train and test sets of words
print('Number of distinct words:', len(word_dict))

Number of distinct words: 55145


In [12]:
%%time

# Build sequence_list object with training data

train_seq = SequenceList(word_dict, tag_dict)

for sentence_id, group in df_train.groupby('sentence_id'):
    seq_x = []
    seq_y = []
    for i in range(len(group)):
        seq_x.append(group.iloc[i].words)
        seq_y.append(group.iloc[i].tags)
    train_seq.add_sequence(seq_x, seq_y, train_seq.x_dict, train_seq.y_dict)


Wall time: 2min 34s


# Training structured perceptron


In order to train a structured perceptron we need to construct a feature mapper that will translate Sequence objects to numerical features. Then the structured perceptron can be instantiated using

- The corpus dictionary of words
- The corpus dictionary of tags
- The feature mapper

In [35]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

In [36]:
sp = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper)

In [37]:
sp.get_num_states(), sp.get_num_observations()

(17, 55145)

In [38]:
%%time
num_epochs = 5
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.894375
Epoch: 1 Accuracy: 0.931975
Epoch: 2 Accuracy: 0.940628
Epoch: 3 Accuracy: 0.946124
Epoch: 4 Accuracy: 0.950197
Wall time: 25min 15s


## Saving model weights

In [39]:
len(sp.parameters)

39802

In [40]:
sp.parameters

array([ 6.4,  7.8, 11.4, ..., -3.4,  1. ,  1. ])

In [41]:
sp.save_model("model1")