### Imports

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import scipy
import numpy as np
import pandas as pd

import os,sys,inspect

In [3]:
# The skseq library must be previously installed. Here we assume it is in '../skseq'
sys.path.insert(0,'../') 
import skseq

from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList

from skseq.sequences.label_dictionary import LabelDictionary

import skseq.sequences.structured_perceptron as spc

from sklearn.cluster import KMeans
import joblib
import gensim.downloader as api
from utils.utils import generate_words_embeddings, generate_tiny_test

### Read data

In [4]:
df_train = pd.read_csv('data/train_data_ner.csv')
print(df_train.shape)
df_train.head(10)

(839149, 3)


Unnamed: 0,sentence_id,words,tags
0,0,Thousands,O
1,0,of,O
2,0,demonstrators,O
3,0,have,O
4,0,marched,O
5,0,through,O
6,0,London,B-geo
7,0,to,O
8,0,protest,O
9,0,the,O


In [5]:
# We assume that the training data includes all possible tags that may exist (17 tags)

tag_dict = LabelDictionary(label_names=set(df_train.tags))
print('Number of distinct tags:', len(tag_dict))
tag_dict

Number of distinct tags: 17


{'B-tim': 0,
 'B-geo': 1,
 'I-nat': 2,
 'I-gpe': 3,
 'I-art': 4,
 'O': 5,
 'B-art': 6,
 'B-per': 7,
 'I-geo': 8,
 'I-per': 9,
 'B-org': 10,
 'B-nat': 11,
 'I-tim': 12,
 'I-eve': 13,
 'I-org': 14,
 'B-gpe': 15,
 'B-eve': 16}

In [6]:
# Generate dictionary of training words
word_dict = LabelDictionary(label_names=set(df_train.words))
print('Number of distinct words:', len(word_dict))

Number of distinct words: 31979


In [7]:
%%time

# Build sequence_list object with training data

train_seq = SequenceList(word_dict, tag_dict)

for _, group in df_train.groupby('sentence_id'):
    train_seq.add_sequence(group.words, group.tags, train_seq.x_dict, train_seq.y_dict)


CPU times: user 5.52 s, sys: 46.4 ms, total: 5.57 s
Wall time: 5.58 s


# Generate cluster information of the words

We are going to create a dictionary that contains clustering information of the words that make up the training and test datasets in order to speedup the generation of some features afterwards.

First, we are going to generate the embeddings of the training corpus. With this information, we are going to trian a k-Means algorithm and create a bunch of clusters. Finally, we are going to find the closest centroids to all of the samples in the training dataset, as well as those in the test dataset.

In [24]:
# Load word2vec model
model = api.load('word2vec-google-news-300')

In [32]:
# Generate word embeddings for training data
embeddings, _ = generate_words_embeddings(set(df_train.words), model)

In [29]:
# Define k-Means model
k_means = KMeans(n_clusters=100, random_state=451)

In [34]:
# Create clusters and save model
k_means.fit(embeddings)

KMeans(n_clusters=100, random_state=451)

In [35]:
# Load test and tiny test data
df_test = pd.read_csv('data/test_data_ner.csv')
tiny_test = generate_tiny_test()

In [36]:
# Create vocabulary containing training, test and tiny test data
tiny_test_sentences = []

for sentence in tiny_test:
    tiny_test_sentences.extend(sentence)

all_words = set(df_train.words) | set(df_test.words) | set(tiny_test_sentences)

In [37]:
all_embeddings, embedded_words = generate_words_embeddings(all_words, model)
clusters = k_means.predict(all_embeddings)
all_words_clusters = {word: cluster for word, cluster in zip(embedded_words, clusters)}

clusters_dict = "fitted_models/cluster_dict.joblib"
joblib.dump(all_words_clusters, clusters_dict)

['fitted_models/cluster_dict.joblib']

# Training structured perceptron


In order to train a structured perceptron we need to construct a feature mapper that will translate Sequence objects to numerical features. Then the structured perceptron can be instantiated using

- The corpus dictionary of words
- The corpus dictionary of tags
- The feature mapper

In [None]:
%%time
skseq.sequences.id_feature.IDFeatures

feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

In [None]:
sp = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper)

In [None]:
sp.get_num_states(), sp.get_num_observations()

In [None]:
%%time
num_epochs = 5
sp.fit(feature_mapper.dataset, num_epochs)

In [42]:
from skseq.sequences.extended_feature import ExtendedFeatures

In [44]:
%%time

extended_feature_mapper = skseq.sequences.extended_feature.ExtendedFeatures(train_seq, clusters_dict, len(k_means.labels_))
extended_feature_mapper.build_features()

CPU times: user 7.77 s, sys: 71 ms, total: 7.84 s
Wall time: 7.85 s


In [45]:
sp_features = spc.StructuredPerceptron(word_dict, tag_dict, extended_feature_mapper)

In [46]:
sp_features.get_num_states(), sp_features.get_num_observations()

(17, 31979)

In [47]:
%%time
num_epochs = 5
sp_features.fit(extended_feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.939492
Epoch: 1 Accuracy: 0.949420
Epoch: 2 Accuracy: 0.952592


KeyboardInterrupt: 

## Saving model weights

In [None]:
len(sp.parameters)

In [None]:
sp.parameters

In [None]:
sp.save_model("model1")

In [None]:
len(sp_features.parameters)

In [None]:
sp_features.paramters

In [None]:
sp_features.save_model("model_features")