# class-based picklable sentence indexer

In [1]:
import numpy as np
from mltools.preprocessing import Tokenizer, Indexer, Pipeline

Using TensorFlow backend.
  return f(*args, **kwds)


## test on toy data

Toy corpus: the opening lines of the Aeneid, taken from http://classics.mit.edu/Virgil/aeneid.1.i.html

In [2]:
# Toy corpus: one verse line per list entry. Casing and punctuation are left
# uneven on purpose so the tokenizer has something to normalize.
text = [
    'arms and the man i sing who forced by fate,',
    "and haughty Juno's unrelenting hate.",
    'expelled and exiled left the trojan shore,',
    'long labors both by sea and land he bore.',
    'and in the doubtful war before he won',
    'the Latian realm and built the destined town.',
    'his banished gods restored to rites divine',
    'and settled sure succession in his line',
    'from whence the race of Alban fathers come',
    'and the long glories of majestic Rome.',
    'O, muse, the causes and the crimes relate',
    'what goddess was provoked and whence her hate',
    'for what offense the queen of heaven began',
    'to persecute so brave so just a man,',
    'involved his anxious life in endless cares,',
    'exposed to wants and hurried into wars?',
    'can heavenly minds such high resentment show',
    'or exercise their spite in human woe',
    "against the Tiber's mouth but far away",
    'an ancient town was seated on the sea',
    'a Tyrian colony the people made',
    'stout for the war and studious of their trade',
    'carthage the name beloved by Juno more',
    'than her own argos or the Samian shore',
    'here stood her chariot here if heaven were kind',
    'the seat of awful empire she designed',
    'yet she had heard an ancient rumor fly',
    'long cited by the people of the sky',
    'that times to come should see the trojan race',
    'her Carthage ruin and her towers deface',
    'nor thus confined the yoke of sovreign sway',
    'should on the necks of all the nations lay',
    'she pondered this and feared it was in fate',
    'nor could forget the war she waged of late',
    'for conquring greece against the Trojan state.'
]

# Fraction of the corpus, counted from the front of the list, that the later
# cells slice off with `text[:split_idx]` for fitting and transforming.
TRAIN_FRACTION = 0.85

# int() truncates, so a fractional boundary rounds down (35 lines -> index 29).
split_idx = int(len(text) * TRAIN_FRACTION)

## test pipelining with `Tokenizer` and pickling model

In [3]:
# NOTE: this rebinds the name `Pipeline` that the first cell imported from
# mltools.preprocessing; every later `Pipeline(...)` call in this notebook
# therefore uses scikit-learn's class.
from sklearn.pipeline import Pipeline

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back to the vendored copy so the
# cell still runs on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [4]:
# First stage: the mltools tokenizer. The keyword meanings are defined by
# mltools and are not visible here -- presumably a vocabulary cap, a minimum
# token count, lower-casing and regex-based splitting; confirm against mltools.
tokenizer = Tokenizer(
    max_vocab=100,
    min_count=1,
    lower=True,
    regex=True,
)

# Second stage: the mltools indexer. The transform output shown in this
# notebook has rows of length 10 with trailing 0s, which is consistent with
# max_len=10 and 'post' padding -- exact semantics to be confirmed in mltools.
indicizer = Indexer(
    max_len=10,
    pad='post',
    truncate='post',
    reverse=False,
    unk_name='UNK',
    pad_name='PAD',
)

In [5]:
# Chain the tokenizer and the indexer as two named steps, in that order.
pipeline = Pipeline(steps=[('tokenize', tokenizer), ('indicize', indicizer)])

In [6]:
# Fit on the leading `split_idx` lines only; the tail of `text` is not used here.
train_text = text[:split_idx]
pipeline.fit(train_text)

Pipeline(steps=[('tokenize', Tokenizer(char=False, lower=True, max_vocab=100, min_count=1, regex=True,
     stopwords=[], unk_name='UNK',
     vcounts={'the': 20, 'and': 12, 'of': 6, 'by': 4, 'in': 4, 'to': 4, 'long': 3, 'his': 3, 'her': 3, 'man': 2, 'hate': 2, 'trojan': 2, 'shore': 2, 'sea': 2, 'he': 2, 'wa...mor': 141, 'fly': 142, 'cited': 143, 'sky': 144, 'that': 145, 'times': 146, 'UNK': 147, 'PAD': 0}))])

In [7]:
# Run the fitted pipeline over the same slice it was fitted on and display the
# resulting array.
train_text = text[:split_idx]
vects = pipeline.transform(train_text)
vects

array([[ 34,   2,   1,  10,  35,  36,  37,  38,   4,  39],
       [  2,  40,  41,  42,  11,   0,   0,   0,   0,   0],
       [ 43,   2,  44,  45,   1,  12,  13,   0,   0,   0],
       [  7,  46,  47,   4,  14,   2,  48,  15,  49,   0],
       [  2,   5,   1,  50,  16,  51,  15,  52,   0,   0],
       [  1,  53,  54,   2,  55,   1,  56,  17,   0,   0],
       [  8,  57,  58,  59,   6,  60,  61,   0,   0,   0],
       [  2,  62,  63,  64,   5,   8,  65,   0,   0,   0],
       [ 66,  18,   1,  19,   3,  67,  68,  20,   0,   0],
       [  2,   1,   7,  69,   3,  70,  71,   0,   0,   0],
       [ 72,  73,   1,  74,   2,   1,  75,  76,   0,   0],
       [ 21,  77,  22,  78,   2,  18,   9,  11,   0,   0],
       [ 23,  21,  79,   1,  80,   3,  24,  81,   0,   0],
       [  6,  82,  25,  83,  25,  84,  26,  10,   0,   0],
       [ 85,   8,  86,  87,   5,  88,  89,   0,   0,   0],
       [ 90,   6,  91,   2,  92,  93,  94,   0,   0,   0],
       [ 95,  96,  97,  98,  99, 100, 101,   0,   0,   0

In [8]:
# Map the index array back through the pipeline and print the first five
# recovered sequences, one per line, with items separated by single spaces.
texts = pipeline.inverse_transform(vects)
preview = texts[:5]
for tokens in preview:
    line = ' '.join(tokens)
    print(line)

arms and the man i sing who forced by fate
and haughty junos unrelenting hate PAD PAD PAD PAD PAD
expelled and exiled left the trojan shore PAD PAD PAD
long labors both by sea and land he bore PAD
and in the doubtful war before he won PAD PAD


## pickle the pipeline, load the pickle into a new pipeline, and compare

In [9]:
# joblib is also imported by an earlier cell; it is re-imported here so this
# section can be read on its own.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so try the standalone package first and fall back for older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [10]:
# Serialize the fitted pipeline to a file in the current working directory.
# The call is the cell's last expression, so its return value is displayed.
joblib.dump(value=pipeline, filename='test-indexer.pkl')

['test-indexer.pkl']

In [11]:
# Read back the file written by the dump cell into a separate object.
# NOTE(review): joblib files are pickle-based; only load files you produced
# yourself or otherwise trust.
loaded_pipe = joblib.load(filename='test-indexer.pkl')

In [12]:
# Take the first (name, estimator) pair of the reloaded pipeline and display
# the estimator to inspect its restored state.
_step_name, loaded_tokenizer = loaded_pipe.steps[0]
loaded_tokenizer

Tokenizer(char=False, lower=True, max_vocab=100, min_count=1, regex=True,
     stopwords=[], unk_name='UNK',
     vcounts={'the': 20, 'and': 12, 'of': 6, 'by': 4, 'in': 4, 'to': 4, 'long': 3, 'his': 3, 'her': 3, 'man': 2, 'hate': 2, 'trojan': 2, 'shore': 2, 'sea': 2, 'he': 2, 'war': 2, 'town': 2, 'whence': 2, 'race': 2, 'come': 2, 'what': 2, 'was': 2, 'for': 2, 'heaven': 2, 'so': 2, 'a': 2, 'or': 2, 'their': 2,...heard': 1, 'rumor': 1, 'fly': 1, 'cited': 1, 'sky': 1, 'that': 1, 'times': 1, 'should': 1, 'see': 1})

In [13]:
# Transform the same slice with the reloaded pipeline. The result gets its own
# name so the array produced by the original pipeline (`vects`) is not
# overwritten and the two can actually be compared.
vects_loaded = loaded_pipe.transform(text[:split_idx])

# Fail loudly if the pickle round trip changed the output in any way.
np.testing.assert_array_equal(vects_loaded, vects)

vects_loaded

array([[ 34,   2,   1,  10,  35,  36,  37,  38,   4,  39],
       [  2,  40,  41,  42,  11,   0,   0,   0,   0,   0],
       [ 43,   2,  44,  45,   1,  12,  13,   0,   0,   0],
       [  7,  46,  47,   4,  14,   2,  48,  15,  49,   0],
       [  2,   5,   1,  50,  16,  51,  15,  52,   0,   0],
       [  1,  53,  54,   2,  55,   1,  56,  17,   0,   0],
       [  8,  57,  58,  59,   6,  60,  61,   0,   0,   0],
       [  2,  62,  63,  64,   5,   8,  65,   0,   0,   0],
       [ 66,  18,   1,  19,   3,  67,  68,  20,   0,   0],
       [  2,   1,   7,  69,   3,  70,  71,   0,   0,   0],
       [ 72,  73,   1,  74,   2,   1,  75,  76,   0,   0],
       [ 21,  77,  22,  78,   2,  18,   9,  11,   0,   0],
       [ 23,  21,  79,   1,  80,   3,  24,  81,   0,   0],
       [  6,  82,  25,  83,  25,  84,  26,  10,   0,   0],
       [ 85,   8,  86,  87,   5,  88,  89,   0,   0,   0],
       [ 90,   6,  91,   2,  92,  93,  94,   0,   0,   0],
       [ 95,  96,  97,  98,  99, 100, 101,   0,   0,   0