See example: https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb

In [3]:
import sentencepiece as spm
import pandas as pd

## Read data

In [8]:
# Load the tab-separated input. The file has no header row, so pandas labels
# the columns with integers starting at 0.
data_df = pd.read_csv(
    'test_pa.tsv',
    sep='\t',
    header=None,
)

In [14]:
# Preview intentionally disabled; run `data_df.head()` to inspect the first
# rows rather than echoing the whole frame.
#data_df

## Data cleaning

In [10]:
# Column 2 is used as the transcript text; convert it to a plain Python list.
transcript_column = data_df[2]
transcripts = transcript_column.tolist()

In [12]:
# Strip the danda sentence terminator '।' (not the ASCII pipe '|') from the
# transcripts. str.replace drops every occurrence, not only a trailing one.

# Coerce every entry to str first so .replace is always available.
transcripts_str = list(map(str, transcripts))
transcripts = [sentence.replace('।','') for sentence in transcripts_str]
print(len(transcripts))

574


In [13]:
# Spot-check the first transcript after cleaning (bare expression so the
# notebook echoes its value).
transcripts[0]

'ਰਿਪ ਕਰੰਟ ਸਮੁੰਦਰੀ ਤੱਟ ਨੂੰ ਤੋੜਦੀਆਂ ਲਹਿਰਾਂ ਤੋਂ ਵਾਪਸ ਆਉਣ ਵਾਲਾ ਵਹਾਅ ਹੁੰਦਾ ਹੈ, ਜੋ ਕਿ ਅਕਸਰ ਰੀਫ ਜਾਂ ਇਸਦੇ ਸਮਾਨ ਥਾਵਾਂ ‘ਤੇ ਦੇਖਣ ਨੂੰ ਮਿਲਦਾ ਹੈ'

## Save transcripts in a .txt file

In [16]:
# Wrap the cleaned transcripts in a single-column DataFrame (one row per
# sentence) so they can be written out with pandas.
df_transcripts = pd.DataFrame(data=transcripts)

In [18]:
# Write one transcript per line as the SentencePiece training corpus.
# header=False is required: with the default header=True pandas emits the
# column label ("0") as the first line, which would then be trained on as if
# it were a sentence.
# NOTE(review): to_csv still applies its default minimal quoting, so a line
# containing a double quote or a tab would be wrapped in quotes -- confirm the
# corpus has none, or switch to plain file writes.
df_transcripts.to_csv('data.txt', sep='\t', index=False, header=False)

## SentencePiece

In [31]:
# Train a SentencePiece model on data.txt. Training writes two files next to
# the notebook: `m.model` (the model itself) and `m.vocab` (a human-readable
# vocabulary listing that is only a reference and is not used for segmentation).
train_args = ' '.join([
    '--input=data.txt',
    '--model_prefix=m',
    '--vocab_size=1000',
])
spm.SentencePieceTrainer.train(train_args)

# Create the segmenter and load the freshly trained model file (m.model).
# sp.load(...) is left as the final expression so the cell echoes its result.
sp = spm.SentencePieceProcessor()
sp.load('m.model')


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=data.txt --model_prefix=m --vocab_size=1000
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: data.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_pie

True

/piece=5.92672
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=1 size=1160 obj=11.3024 num_tokens=6874 num_tokens/piece=5.92586
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=0 size=1100 obj=11.4582 num_tokens=7002 num_tokens/piece=6.36545
unigram_model_trainer.cc(580) LOG(INFO) EM sub_iter=1 size=1100 obj=11.4345 num_tokens=7003 num_tokens/piece=6.36636
trainer_interface.cc(686) LOG(INFO) Saving model: m.model
trainer_interface.cc(698) LOG(INFO) Saving vocabs: m.vocab


In [34]:
# Encode one sample sentence two ways: as subword pieces and as vocabulary ids.
sample_text = 'ਰਿਪ ਕਰੰਟ ਸਮੁੰਦਰੀ ਤੱਟ ਨੂੰ ਤੋੜਦੀਆਂ ਲਹਿਰਾਂ ਤੋਂ ਵਾਪਸ ਆਉਣ ਵਾਲਾ ਵਹਾਅ ਹੁੰਦਾ ਹੈ'
print(sp.encode_as_pieces(sample_text))
print(sp.encode_as_ids(sample_text))

['▁ਰਿ', 'ਪ', '▁ਕਰ', 'ੰ', 'ਟ', '▁ਸਮੁੰਦਰ', 'ੀ', '▁ਤ', 'ੱਟ', '▁ਨੂੰ', '▁ਤੋੜ', 'ਦ', 'ੀਆਂ', '▁', 'ਲਹਿ', 'ਰਾਂ', '▁ਤੋਂ', '▁ਵਾਪਸ', '▁ਆਉਣ', '▁ਵਾਲ', 'ਾ', '▁ਵ', 'ਹ', 'ਾਅ', '▁ਹੁੰਦਾ', '▁ਹੈ']
[301, 26, 90, 999, 40, 273, 6, 49, 106, 17, 585, 63, 29, 5, 967, 245, 28, 361, 473, 67, 4, 118, 61, 387, 119, 7]


In [33]:
# Decode back to text, first from a list of pieces and then from a list of ids.
sample_pieces = ['▁ਤੁਸੀ', '▁ਕਿਵੇਂ', '▁ਹੋ']
sample_ids = [209, 31, 9, 375, 586]
print(sp.decode_pieces(sample_pieces))
print(sp.decode_ids(sample_ids))

▁ਤੁਸੀ▁ਕਿਵੇਂ ਹੋ
ਦੌਰਾਨ ਇਹ ਦੇ ਲਿਖ ਰੱਖਿਆ


In [36]:
# Inspect the trained model: vocabulary size, id <-> piece lookups, the id
# returned for an unknown piece, and the three default special symbols.

# returns vocab size
print(sp.get_piece_size())

# id <=> piece conversion
print(sp.id_to_piece(301))
print(sp.piece_to_id('ਪ'))

# returns 0 for unknown tokens (we can change the id for UNK)
print(sp.piece_to_id('__MUST_BE_UNKNOWN__'))

# <unk>, <s>, </s> are defined by default. Their ids are (0, 1, 2)
# <s> and </s> are defined as 'control' symbols.
# The loop variable is named `piece_id` so the builtin `id()` is not shadowed
# for the rest of the kernel session.
for piece_id in range(3):
    print(sp.id_to_piece(piece_id), sp.is_control(piece_id))

1000
▁ਰਿ
26
0
<unk> False
<s> True
</s> True


In [39]:
# Materialise the whole vocabulary as a list of piece strings, ordered by id,
# and echo it.
n_pieces = sp.get_piece_size()
vocabs = [sp.id_to_piece(piece_id) for piece_id in range(n_pieces)]
vocabs

['<unk>',
 '<s>',
 '</s>',
 'ੇ',
 'ਾ',
 '▁',
 'ੀ',
 '▁ਹੈ',
 ',',
 '▁ਦੇ',
 '▁ਵਿੱਚ',
 'ਨ',
 'ਾਂ',
 'ਲ',
 '▁ਅਤ',
 '▁ਦੀ',
 'ਰ',
 '▁ਨੂੰ',
 'ਕ',
 'ਤ',
 '▁ਕ',
 'ਮ',
 '▁ਹਨ',
 'ਸ',
 'ਤੇ',
 '▁ਦਾ',
 'ਪ',
 '▁ਇੱਕ',
 '▁ਤੋਂ',
 'ੀਆਂ',
 '▁ਲਈ',
 '▁ਇਹ',
 '▁ਕਿ',
 'ਂ',
 '▁ਸੀ',
 '▁ਨਾਲ',
 '-',
 'ਵ',
 'ਆਂ',
 'ਣ',
 'ਟ',
 '▁ਨੇ',
 '▁ਪ',
 '▁ਵੀ',
 '▁ਅ',
 '▁ਉਹ',
 'ੋਂ',
 '▁ਇਸ',
 'ਗ',
 '▁ਤ',
 'ਦਾ',
 '▁ਨਹੀਂ',
 'ੋ',
 '▁ਆ',
 '▁ਜੋ',
 'ਨਾਂ',
 '▁ਪ੍ਰ',
 "▁'",
 '‘',
 '▁ਕਰਨ',
 'ੂ',
 'ਹ',
 '▁ਸ',
 'ਦ',
 '▁ਮ',
 '▁ਜਾਂ',
 'ਿਤ',
 '▁ਵਾਲ',
 'ਬ',
 'ਿਆਂ',
 'ਈ',
 '▁ਬ',
 'ਡ',
 '▁ਕੀਤਾ',
 'ਵਾਂ',
 'ਚ',
 '▁ਗਿਆ',
 '▁ਜ',
 '▁ਹੋ',
 '▁ਲੋਕ',
 '▁ਬਹੁਤ',
 'ਜ',
 '▁ਜਿਸ',
 '▁ਨ',
 'ਿਆ',
 '▁ਜਾਂਦ',
 'ਜ਼',
 'ਿ',
 '▁ਜਾ',
 'ਤਾ',
 '▁ਕਰ',
 '▁ਹ',
 'ਾਈ',
 '▁ਸਾਰ',
 'ੌਰ',
 'ਿਕ',
 'ੱ',
 ')',
 '▁(',
 '▁ਪਰ',
 'ਭ',
 '▁ਸਕਦਾ',
 '▁ਦੁ',
 'ਸ਼',
 '▁ਹਰ',
 '▁ਕੀਤੀ',
 'ੱਟ',
 '▁ਤੱਕ',
 '▁ਜਾਣ',
 '▁ਦ',
 'ਲੀ',
 'ਲਾ',
 'ਨਾ',
 '▁ਆਪਣ',
 '▁ਖ',
 '▁ਸਭ',
 'ੈ',
 'ਦੀ',
 '▁ਵ',
 '▁ਹੁੰਦਾ',
 'ਨੂੰ',
 '▁ਰੂ',
 '▁ਵਿਸ਼',
 'ਇ',
 'ਫ਼',
 '▁ਬਾਅਦ',
 '▁ਤਰ੍ਹਾਂ',
 '▁ਬਣ',
 '▁ਕਰਦ',
 'ਿੰਗ',
 '0',
 '▁ਹੀ',
 'ੁ',
 