In [None]:
# default_exp data.management.tokenizers

# Data Management Tokenizers Core

> This module contains all of the tokenizers supported by `ds4se`.

In [None]:
# export
from pathlib import Path
from typing import List, Optional

import pandas as pd
import sentencepiece as sp

import ds4se

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
def pandas_to_txt_file(df: pd.DataFrame, output_path: Path, cols=None) -> Path:
    """Write the values of selected DataFrame columns to a text file.

    The chosen columns are concatenated one after another (all of the first
    column, then all of the second, ...) and written one value per line to
    `output_path / "ds4se_data.txt"`.

    Args:
        df: DataFrame holding the data to dump.
        output_path: Existing directory in which `ds4se_data.txt` is created.
        cols: Column names to include. `None` (the default) selects every
            column of `df`.

    Returns:
        Path of the written text file.
    """
    output_path = Path(output_path)
    if cols is None:
        cols = list(df.columns)
    merged = pd.concat([df[col] for col in cols])

    # `str.join` only accepts strings: drop missing cells and stringify the
    # rest so NaN/None or numeric values do not raise a TypeError.
    lines = merged.dropna().astype(str)

    txt_path = output_path / "ds4se_data.txt"
    # Explicit encoding keeps the file identical across platforms.
    txt_path.write_text("\n".join(lines), encoding="utf-8")

    return txt_path

In [None]:
# export
class DS4SETokenizer:
    """Base wrapper around a tokenizer object.

    `tokenize` and `from_pretrained` are placeholders here: they do no work
    and return `None`. The subclasses in this module override them.
    """

    def __init__(self, tokenizer):
        """Store the wrapped tokenizer object on the instance."""
        self.tokenizer = tokenizer

    def tokenize(self, example: str) -> List[str]:
        """Placeholder; does nothing and returns `None`."""
        return None

    @staticmethod
    def from_pretrained(path: Path):
        """Placeholder; does nothing and returns `None`."""
        return None

In [None]:
# export
class SentencePieceTokenizer(DS4SETokenizer):
    """Tokenizer that delegates to a `sentencepiece.SentencePieceProcessor`."""

    def __init__(self, tokenizer):
        """Wrap an already-loaded SentencePiece processor."""
        # Zero-argument `super()` already binds `self`; passing it again
        # raises a TypeError for too many positional arguments.
        super().__init__(tokenizer)

    def tokenize(self, example: str) -> List[str]:
        """Return the pieces produced by the wrapped processor for `example`."""
        return self.tokenizer.EncodeAsPieces(example)

    @staticmethod
    def from_pandas(
        df: pd.DataFrame, output_path: Path, cols: Optional[List[str]] = None
    ) -> "SentencePieceTokenizer":
        """Train a SentencePiece model on DataFrame columns and load it.

        The selected columns are dumped to a text file inside `output_path`,
        a model is trained on that file with the prefix
        `<output_path>/sentencepiece`, and the resulting
        `sentencepiece.model` is loaded.

        Args:
            df: DataFrame holding the training text.
            output_path: Directory for the text dump and the model files.
            cols: Columns to train on; `None` is forwarded to
                `pandas_to_txt_file`, which then uses every column.

        Returns:
            A tokenizer wrapping the freshly trained model.
        """
        output_path = Path(output_path)
        # Argument order must match pandas_to_txt_file(df, output_path, cols).
        f_path = pandas_to_txt_file(df, output_path, cols)
        sp.SentencePieceTrainer.train(
            f"--input={f_path} --model_prefix={output_path}/sentencepiece --hard_vocab_limit=false"
        )

        return SentencePieceTokenizer.from_pretrained(
            output_path / "sentencepiece.model"
        )

    # The return annotation is a string because the class name is not bound
    # yet while its own body is being executed.
    @staticmethod
    def from_pretrained(f_path: Path) -> "SentencePieceTokenizer":
        """Load a trained SentencePiece model file from `f_path`."""
        tokenizer = sp.SentencePieceProcessor()
        tokenizer.Load(str(f_path))

        return SentencePieceTokenizer(tokenizer)

In [None]:
# export
class ByteLevelTokenizer(DS4SETokenizer):
    """Tokenizer that delegates to a `sentencepiece.SentencePieceProcessor`.

    NOTE(review): training and loading are identical to
    `SentencePieceTokenizer`; no byte-level option is passed to the trainer
    and the same `sentencepiece` model prefix is used, so both classes write
    the same model files when given the same `output_path`. Confirm the
    intended byte-level configuration.
    """

    def __init__(self, tokenizer):
        """Wrap an already-loaded SentencePiece processor."""
        # Zero-argument `super()` already binds `self`; passing it again
        # raises a TypeError for too many positional arguments.
        super().__init__(tokenizer)

    def tokenize(self, example: str) -> List[str]:
        """Return the pieces produced by the wrapped processor for `example`."""
        return self.tokenizer.EncodeAsPieces(example)

    @staticmethod
    def from_pandas(
        df: pd.DataFrame, output_path: Path, cols: Optional[List[str]] = None
    ) -> "ByteLevelTokenizer":
        """Train a SentencePiece model on DataFrame columns and load it.

        The selected columns are dumped to a text file inside `output_path`,
        a model is trained on that file with the prefix
        `<output_path>/sentencepiece`, and the resulting
        `sentencepiece.model` is loaded.

        Args:
            df: DataFrame holding the training text.
            output_path: Directory for the text dump and the model files.
            cols: Columns to train on; `None` is forwarded to
                `pandas_to_txt_file`, which then uses every column.

        Returns:
            A `ByteLevelTokenizer` wrapping the freshly trained model.
        """
        output_path = Path(output_path)
        # Argument order must match pandas_to_txt_file(df, output_path, cols).
        f_path = pandas_to_txt_file(df, output_path, cols)
        sp.SentencePieceTrainer.train(
            f"--input={f_path} --model_prefix={output_path}/sentencepiece --hard_vocab_limit=false"
        )

        return ByteLevelTokenizer.from_pretrained(
            output_path / "sentencepiece.model"
        )

    # The return annotation is a string because the class name is not bound
    # yet while its own body is being executed.
    @staticmethod
    def from_pretrained(f_path: Path) -> "ByteLevelTokenizer":
        """Load a trained SentencePiece model file from `f_path`.

        Returns an instance of this class rather than
        `SentencePieceTokenizer`, so the factory yields the type it is
        called on.
        """
        tokenizer = sp.SentencePieceProcessor()
        tokenizer.Load(str(f_path))

        return ByteLevelTokenizer(tokenizer)