In [1]:
from transformers import GPT2TokenizerFast
import pandas as pd

model_name = 'skt/kogpt2-base-v2'

class CustomTokenizer:
    def __init__(self, base_model_name):
        self.tokenizer = GPT2TokenizerFast.from_pretrained(base_model_name)
    
    # 토큰 초기화 함수
    def initialize_tokens(self, new_vocab):
        self.tokenizer = GPT2TokenizerFast()  # 새로운 토크나이저 생성
        self.tokenizer.add_tokens(new_vocab)
        print("토크나이저가 초기화되었습니다.")
    
    # 새로운 토큰 추가 함수
    def add_new_tokens(self, new_tokens):
        self.tokenizer.add_tokens(new_tokens)
        print("새로운 토큰이 추가된 후 단어 집합 크기:", len(self.tokenizer))
    
    # 토큰 저장 함수
    def save_tokenizer(self, save_path):
        self.tokenizer.save_pretrained(save_path)
        print(f"토크나이저가 {save_path}에 저장되었습니다.")
    
    # 문장 토큰화 함수
    def tokenize_sentence(self, sentence):
        tokens = self.tokenizer(sentence, return_tensors='pt')
        return tokens

In [10]:
from transformers import PreTrainedTokenizerFast
import pandas as pd

model_name = 'skt/kogpt2-base-v2'

class CustomTokenizer:
    def __init__(self, base_model_name):
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(base_model_name)
    
    # 토큰 초기화 함수
    def initialize_tokens(self, new_vocab):
        # 새로운 토크나이저 생성
        self.tokenizer = PreTrainedTokenizerFast()  
        self.tokenizer.add_special_tokens({'pad_token': '<pad>'})
        self.tokenizer.add_tokens(new_vocab)
        print("토크나이저가 초기화되었습니다.")
    
    # 새로운 토큰 추가 함수
    def add_new_tokens(self, new_tokens):
        self.tokenizer.add_tokens(new_tokens)
        print("새로운 토큰이 추가된 후 단어 집합 크기:", len(self.tokenizer))
    
    # 토큰 저장 함수
    def save_tokenizer(self, save_path):
        self.tokenizer.save_pretrained(save_path)
        print(f"토크나이저가 {save_path}에 저장되었습니다.")
    
    # 문장 토큰화 함수
    def tokenize_sentence(self, sentence):
        tokens = self.tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
        return tokens

In [11]:
# 사용 예시
new_vocab = ["Hello", "world", "I", "am", "a", "new", "tokenizer", 
             "specific", "domain", "specialized", "tokens"]
new_tokens = ["<NEW_TOKEN_1>", "<NEW_TOKEN_2>", "<DOMAIN_SPECIFIC_TOKEN>"]

In [12]:
custom_tokenizer = CustomTokenizer(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [13]:
sentence = 'hello world i am a new tokenizer specific domain specialized tokens'

In [15]:
custom_tokenizer.initialize_tokens(new_vocab)

ValueError: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.

In [None]:
custom_tokenizer.add_new_tokens(new_tokens)
custom_tokenizer.save_tokenizer('./custom_tokenizer')

In [None]:
# 예시 데이터프레임
data = {'sentence': ["Hello, I am testing the new tokenizer.",
                     "This is another example sentence.",
                     "Let's see how this works with multiple sentences."]}
df = pd.DataFrame(data)

# 데이터프레임의 각 문장을 토큰화
df['tokens'] = df['sentence'].apply(lambda x: custom_tokenizer.tokenize_sentence(x))

# 결과 출력
print(df)
