In [1]:
# Load the autoreload extension and reload imported modules before each cell
# runs, so edits to the local `skseq` sources are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import scipy
import numpy as np
import pandas as pd

import os,sys,inspect

sys.path.insert(0,'../') 
import skseq

from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList

from skseq.sequences.label_dictionary import LabelDictionary

import skseq.sequences.structured_perceptron as spc

# Import the feature module explicitly: `import skseq` alone does not load
# submodules, so `skseq.sequences.id_feature` would otherwise resolve only as
# a side effect of some other module importing it.
import skseq.sequences.id_feature

# Create the directory that the `%%prun -T profile_output/...` cells write
# their reports to, so those cells do not fail when it is missing.
os.makedirs('profile_output', exist_ok=True)

# Load the token-per-row NER data and build the label dictionaries.
df_train = pd.read_csv('data/train_data_ner.csv')
df_test = pd.read_csv('data/test_data_ner.csv')
tag_dict = LabelDictionary(label_names=set(df_train.tags))
word_dict = LabelDictionary(label_names=(set(df_train.words) | set(df_test.words)))  # Union of train and test sets of words

# Turn the training frame into one (words, tags) sequence per sentence.
train_seq = SequenceList(word_dict, tag_dict)

for sentence_id, group in df_train.groupby('sentence_id'):
    # Take each column as a whole list instead of materialising one row
    # Series per token with `group.iloc[i]`; values and order are unchanged.
    seq_x = group['words'].tolist()
    seq_y = group['tags'].tolist()
    train_seq.add_sequence(seq_x, seq_y, train_seq.x_dict, train_seq.y_dict)

# Build the ID feature mapper over the training sequences.
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

# Structured perceptron over the word/tag dictionaries and the feature mapper.
sp = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper)

In [2]:
%%prun -s cumulative
# Profile one call to `sp.fit` on the feature mapper's dataset, with the
# report sorted by cumulative time. The second argument is presumably the
# number of epochs (the output shows a single "Epoch: 0" line) - TODO confirm.
sp.fit(feature_mapper.dataset, 1)

Epoch: 0 Accuracy: 0.894070
          688672086 function calls in 518.480 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  518.480  518.480 {built-in method builtins.exec}
        1    0.000    0.000  518.480  518.480 <string>:1(<module>)
        1    0.000    0.000  518.480  518.480 structured_perceptron.py:25(fit)
        1    0.115    0.115  518.479  518.479 structured_perceptron.py:58(fit_epoch)
    38366    4.411    0.000  518.364    0.014 structured_perceptron.py:95(perceptron_update)
    38366    0.224    0.000  513.243    0.013 sequence_classifier.py:124(viterbi_decode)
    38366  164.527    0.004  306.158    0.008 discriminative_sequence_classifier.py:25(compute_scores)
    38366   69.183    0.002  206.636    0.005 sequence_classification_decoder.py:83(run_viterbi)
231708035  114.319    0.000  130.703    0.000 id_feature.py:113(get_transition_features)
 27456818   15.912    0.000  114.

In [5]:
%%prun -s cumulative -T profile_output/compute_scores.txt
np.random.seed(140)
for i in np.random.randint(low=0, high=len(train_seq), size=1000):
  sequence = train_seq[i]
  initial_scores, transition_scores, final_scores, emission_scores = \
    sp.compute_scores(sequence)

 
*** Profile printout saved to text file 'profile_output/compute_scores.txt'.


In [4]:
%%prun -s cumulative -T profile_output/run_viterbi.txt
for i in range(1000):
  best_states, total_score = sp.decoder.run_viterbi(initial_scores,
                                                      transition_scores,
                                                      final_scores,
                                                      emission_scores)

NameError: name 'initial_scores' is not defined

In [41]:
%%timeit
sp.decoder.run_viterbi(initial_scores,
                       transition_scores,
                       final_scores,
                       emission_scores)

327 µs ± 9.43 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [40]:
# First decode of the current score arrays; `run_viterbi` returns the pair
# that other cells unpack as (best_states, total_score).
a = sp.decoder.run_viterbi(
    initial_scores, transition_scores, final_scores, emission_scores
)

In [33]:
# Second decode with exactly the same arguments, kept under its own name so
# the result can be displayed separately.
b = sp.decoder.run_viterbi(
    initial_scores, transition_scores, final_scores, emission_scores
)

In [32]:
# Display the value bound to `a`: the (state path, score) pair from run_viterbi.
a

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.0)

In [36]:
# Display the value bound to `b`: the (state path, score) pair from run_viterbi.
b

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.0)

In [26]:
# `a` is bound to the (state path, score) tuple from run_viterbi, which has no
# `.shape`, so `a.shape` fails when the notebook is run top to bottom. The
# recorded (21, 17) output matches the emission scores - presumably
# (sequence length, number of tags); confirm against compute_scores.
emission_scores.shape

(21, 17)

In [27]:
# `b` is bound to the (state path, score) tuple from run_viterbi, which has no
# `.shape`, so `b.shape` fails when the notebook is run top to bottom. The
# recorded (20, 17, 17) output matches the transition scores - presumably
# (sequence length - 1, number of tags, number of tags); confirm against
# compute_scores.
transition_scores.shape

(20, 17, 17)

In [30]:
# With `a` and `b` bound to run_viterbi results, `a[0] + b[0]` adds two state
# paths and cannot give the recorded (17, 17). That output matches adding the
# first emission row to the first transition matrix, which is presumably what
# the cell originally checked - confirm.
(emission_scores[0] + transition_scores[0]).shape

(17, 17)