In [1]:
from pathlib import Path
import sentencepiece as spm
import pandas as pd

## Data

In [2]:
#stenograms of Riigikogu
df=pd.read_csv('data/raw/stenos.csv')
df.shape

(131337, 9)

In [3]:
df.head()

Unnamed: 0,heading,link,speaker,subpart_id,text,year,month,day,time
0,14:00 Istungi rakendamine,http://stenogrammid.riigikogu.ee/et/201712061400,Esimees Eiki Nestor,PKP-21949,"Austatud Riigikogu, tere päevast! Teate mis, s...",2017,12,6,14:00
1,14:00 Istungi rakendamine,http://stenogrammid.riigikogu.ee/et/201712061400,Priit Sibul,PKP-21949,Austatud Riigikogu esimees! Head kolleegid! An...,2017,12,6,14:00
2,14:00 Istungi rakendamine,http://stenogrammid.riigikogu.ee/et/201712061400,Esimees Eiki Nestor,PKP-21949,Rohkem soove ei ole. Olen vastu võtnud kaks ee...,2017,12,6,14:00
3,1.\n \t14:05 Kõrgemate riig...,http://stenogrammid.riigikogu.ee/et/201712061400,Esimees Eiki Nestor,PKP-21950,Alustame tänaste päevakorrapunktide menetlemis...,2017,12,6,14:05
4,1.\n \t14:05 Kõrgemate riig...,http://stenogrammid.riigikogu.ee/et/201712061400,Artur Talvik,PKP-21950,Hea esimees! Head Riigikogu liikmed! Vabaerako...,2017,12,6,14:05


#### Save text only

In [4]:
df[['text']].to_csv('data/interim/text.txt',index=False, header=False)

## Params to change

In [18]:
input_file='data/interim/text.txt'
model_path='tokenizers'
vocab_sizes=[1000, 5000, 10000, 20000]
model_types=['unigram', 'bpe']
normalization_rule_names=['nmt_nfkc','nfkc', 'nmt_nfkc_cf', 'nfkc_cf', 'identity']

## Helpers

In [19]:
def train_spt_model(input_file, vocab_size, model_type, normalization_rule_name, model_path):
    model_prefix=f'{model_path}/{model_type}_vocab_size_{vocab_size}_norm_{normalization_rule_name}'
    print(f'Training model {model_prefix}')
    spm.SentencePieceTrainer.Train(f'--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type} --normalization_rule_name={normalization_rule_name}')

In [20]:
def train_spt_models(input_file, vocab_sizes, model_types, normalization_rule_names, model_path):
    for vocab_size in vocab_sizes:
        for model_type in model_types:
            for normalization_rule_name in normalization_rule_names:
                train_spt_model(input_file, vocab_size, model_type, normalization_rule_name, model_path)

## Train models

In [21]:
train_spt_models(input_file, vocab_sizes, model_types, normalization_rule_names, model_path)

Training model tokenizers/unigram_vocab_size_1000_norm_nmt_nfkc
Training model tokenizers/unigram_vocab_size_1000_norm_nfkc
Training model tokenizers/unigram_vocab_size_1000_norm_nmt_nfkc_cf
Training model tokenizers/unigram_vocab_size_1000_norm_nfkc_cf
Training model tokenizers/unigram_vocab_size_1000_norm_identity
Training model tokenizers/bpe_vocab_size_1000_norm_nmt_nfkc
Training model tokenizers/bpe_vocab_size_1000_norm_nfkc
Training model tokenizers/bpe_vocab_size_1000_norm_nmt_nfkc_cf
Training model tokenizers/bpe_vocab_size_1000_norm_nfkc_cf
Training model tokenizers/bpe_vocab_size_1000_norm_identity
Training model tokenizers/unigram_vocab_size_5000_norm_nmt_nfkc
Training model tokenizers/unigram_vocab_size_5000_norm_nfkc
Training model tokenizers/unigram_vocab_size_5000_norm_nmt_nfkc_cf
Training model tokenizers/unigram_vocab_size_5000_norm_nfkc_cf
Training model tokenizers/unigram_vocab_size_5000_norm_identity
Training model tokenizers/bpe_vocab_size_5000_norm_nmt_nfkc
Traini

## Load trained model

In [8]:
st = spm.SentencePieceProcessor()

In [9]:
st.Load('models/m.model')

True

## Tokenize

In [10]:
st.SampleEncodeAsPieces('Kallid riigikogu liikmed.', -1, 0.1)

['▁K', 'al', 'li', 'd', '▁r', 'i', 'i', 'gi', 'kogu', '▁', 'liikme', 'd', '.']

In [11]:
st.SampleEncodeAsPieces('Kallid riigikogu liikmed', -1, 0.5)

['▁Ka', 'lli', 'd', '▁riigi', 'kogu', '▁liikmed']

In [12]:
st.SampleEncodeAsPieces('Kallid riigikogu liikmed', -1, 1)

['▁Ka', 'lli', 'd', '▁riigi', 'kogu', '▁liikmed']

In [21]:
st.SampleEncodeAsPieces('Kallid riigikogu liikmed', -1, 0.1)

['▁K', 'al', 'l', 'i', 'd', '▁riigi', 'ko', 'gu', '▁', 'liikme', 'd']

In [20]:
st.EncodeAsPieces('Kallid riigikogu liikmed')

['▁Ka', 'lli', 'd', '▁riigi', 'kogu', '▁liikmed']

In [32]:
type(st.EncodeAsPieces('Kallid riigikogu liikmed'))

list

In [33]:
type(st.EncodeAsPieces('Kallid riigikogu liikmed')[0])

str

In [22]:
len(st)

10000

In [25]:
list(st)

TypeError: not a string

<Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7ff6924faa20>