In [1]:
import numpy as np
import pandas as pd
import gymnasium as gym

import torch
import sentencepiece as spm

In [5]:
with open('D:/GFlowNet/Datasets/wat-2021/dev.en', encoding='UTF-8') as f:
    text = f.read()

In [None]:
print(len(text))
print(len(text.split()))        #   Approx 16k words

101716
16322


In [11]:
help(spm.SentencePieceTrainer.train)

Help on function Train in module sentencepiece:

Train(arg=None, logstream=None, **kwargs)



In [20]:
train_size = 0.2
n = int(len(text)*train_size)
train_text = text[:n]
test_text = text[n:]

In [23]:
with open('train.txt', 'w', encoding='utf-8') as f:
    f.write(train_text)

with open('test.txt', 'w', encoding='utf-8') as f:
    f.write(test_text)

In [24]:
import os
os.getcwd()

'd:\\GFlowNet\\My_Experiments'

In [42]:
spm.SentencePieceTrainer.train(
    input = 'train.txt',
    model_prefix = 'tokenizer',
    vocab_size = 1000,
    model_type = 'char'
    # charecter_coverage = 1.0
)

In [43]:
sp = spm.SentencePieceProcessor()
sp.load('tokenizer.model')

True

In [44]:
k = 100
print(test_text[:k])

pieces = sp.encode(test_text[:k], out_type=str)
print("Pieces:", pieces)

tasks which seem extremely difficult.
He said organizations such as the WHO have appreciated the pro
Pieces: ['▁', 't', 'a', 's', 'k', 's', '▁', 'w', 'h', 'i', 'c', 'h', '▁', 's', 'e', 'e', 'm', '▁', 'e', 'x', 't', 'r', 'e', 'm', 'e', 'l', 'y', '▁', 'd', 'i', 'f', 'f', 'i', 'c', 'u', 'l', 't', '.', '▁', 'H', 'e', '▁', 's', 'a', 'i', 'd', '▁', 'o', 'r', 'g', 'a', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', 's', '▁', 's', 'u', 'c', 'h', '▁', 'a', 's', '▁', 't', 'h', 'e', '▁', 'W', 'H', 'O', '▁', 'h', 'a', 'v', 'e', '▁', 'a', 'p', 'p', 'r', 'e', 'c', 'i', 'a', 't', 'e', 'd', '▁', 't', 'h', 'e', '▁', 'p', 'r', 'o']


In [None]:
from typing import Literal

class SentencePiece_Tokenizer:
    def __init__(self):
        super().__init__()
        self.sp = spm.SentencePieceProcessor()


    def token_to_ids(self, tokens):
        return [self.sp.piece_to_id(token) for token in tokens]


    def ids_to_tokens(self, ids):
        return [self.sp.id_to_piece(id_) for id_ in ids]


    def train_tokenizer(self, file_path: str, vocab_size: int, token_type: Literal['bpe', 'unigram', 'char', 'word'] = 'bpe', model_name: str='tokenizer', **kwargs):

        spm.SentencePieceTrainer.train(
            input = file_path,
            model_prefix = str(token_type + '_' + model_name),
            vocab_size = vocab_size,
            model_type = token_type
        )            

    
    def encode(self, model_path, text: str, out_type: str | int):
        self.sp.load(model_path)

        tokens = self.sp.encode(text, out_type=out_type)
        return tokens
    

    def join_token(self, token):
        if isinstance(token[0], int):
            return self.sp.decode(token)
        return self.sp.decode_pieces(token)


    def disjoin_token(self):
        pass

Refer to this [Colab_Link](https://colab.research.google.com/github/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb) for detailed examples of SentencePiece

In [None]:
#   A Basic GPT code :)

class TokenizeEnv(gym.Env):
    def __init__(self, text):
        super().__init__()
        self.text = text.strip()
        self.reset()

    def reset(self):
        self.state = list(self.text)  # start with character-level tokens
        return self.state

    def step(self, action):
        i, j = action
        if j != i + 1 or i < 0 or j >= len(self.state):
            raise ValueError("Invalid merge action")

        merged_token = self.state[i] + self.state[j]
        self.state = self.state[:i] + [merged_token] + self.state[j+1:]

        done = self.is_terminal()
        reward = self.compute_reward() if done else 0

        return self.state, reward, done, {}

    def is_terminal(self):
        return len(self.state) <= 3  # or some custom stopping condition

    def compute_reward(self):
        return -len(self.state)  # toy reward: fewer tokens = better

In [54]:
env = TokenizeEnv(test_text)
print(env.state)

['t', 'a', 's', 'k', 's', ' ', 'w', 'h', 'i', 'c', 'h', ' ', 's', 'e', 'e', 'm', ' ', 'e', 'x', 't', 'r', 'e', 'm', 'e', 'l', 'y', ' ', 'd', 'i', 'f', 'f', 'i', 'c', 'u', 'l', 't', '.', '\n', 'H', 'e', ' ', 's', 'a', 'i', 'd', ' ', 'o', 'r', 'g', 'a', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', 's', ' ', 's', 'u', 'c', 'h', ' ', 'a', 's', ' ', 't', 'h', 'e', ' ', 'W', 'H', 'O', ' ', 'h', 'a', 'v', 'e', ' ', 'a', 'p', 'p', 'r', 'e', 'c', 'i', 'a', 't', 'e', 'd', ' ', 't', 'h', 'e', ' ', 'p', 'r', 'o', 'g', 'r', 'e', 's', 's', ' ', 'm', 'a', 'd', 'e', ' ', 'i', 'n', ' ', 't', 'h', 'e', ' ', 'S', 'w', 'a', 'c', 'h', 'h', ' ', 'B', 'h', 'a', 'r', 'a', 't', ' ', 'M', 'i', 's', 's', 'i', 'o', 'n', '.', '\n', 'S', 'h', 'r', 'i', ' ', 'N', 'a', 'r', 'e', 'n', 'd', 'r', 'a', ' ', 'M', 'o', 'd', 'i', ' ', 'a', 'n', 'n', 'o', 'u', 'n', 'c', 'e', 'd', ' ', 't', 'h', 'e', ' ', 'l', 'a', 'u', 'n', 'c', 'h', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'P', 'r', 'a', 'd', 'h', 'a', 'n', ' ', 'M', 'a', 'n', 't

In [55]:
env.step((5,6))

(['t',
  'a',
  's',
  'k',
  's',
  ' w',
  'h',
  'i',
  'c',
  'h',
  ' ',
  's',
  'e',
  'e',
  'm',
  ' ',
  'e',
  'x',
  't',
  'r',
  'e',
  'm',
  'e',
  'l',
  'y',
  ' ',
  'd',
  'i',
  'f',
  'f',
  'i',
  'c',
  'u',
  'l',
  't',
  '.',
  '\n',
  'H',
  'e',
  ' ',
  's',
  'a',
  'i',
  'd',
  ' ',
  'o',
  'r',
  'g',
  'a',
  'n',
  'i',
  'z',
  'a',
  't',
  'i',
  'o',
  'n',
  's',
  ' ',
  's',
  'u',
  'c',
  'h',
  ' ',
  'a',
  's',
  ' ',
  't',
  'h',
  'e',
  ' ',
  'W',
  'H',
  'O',
  ' ',
  'h',
  'a',
  'v',
  'e',
  ' ',
  'a',
  'p',
  'p',
  'r',
  'e',
  'c',
  'i',
  'a',
  't',
  'e',
  'd',
  ' ',
  't',
  'h',
  'e',
  ' ',
  'p',
  'r',
  'o',
  'g',
  'r',
  'e',
  's',
  's',
  ' ',
  'm',
  'a',
  'd',
  'e',
  ' ',
  'i',
  'n',
  ' ',
  't',
  'h',
  'e',
  ' ',
  'S',
  'w',
  'a',
  'c',
  'h',
  'h',
  ' ',
  'B',
  'h',
  'a',
  'r',
  'a',
  't',
  ' ',
  'M',
  'i',
  's',
  's',
  'i',
  'o',
  'n',
  '.',
  '\n',
  'S',
  'h',
  '

In [57]:
env.state[-5:]

['e', 'l', 'h', 'i', '.']