In [26]:
import pandas as pd
from tokenizers import Tokenizer
from transformers import AutoTokenizer
from tokenizers.processors import TemplateProcessing
from tokenizers.normalizers import (Sequence, NFD)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder
from tokenizers.trainers import BpeTrainer

In [27]:
# Read the two parquet sources into DataFrames (paths are machine-specific).
train_law_par = pd.read_parquet(
    r"c:\Users\siren\Desktop\1507\dataset\hukuk_raw_dataset\train.parquet",
    engine="pyarrow",
)
wiki_law_par = pd.read_parquet(
    r"C:\Users\siren\Desktop\1507\dataset\ts_wikipedia\bpe_support.parquet"
)
# Stack them row-wise, law rows first; each frame keeps its own index labels here.
concat_par = pd.concat([train_law_par, wiki_law_par])

In [28]:
# Rebind concat_par to itself minus its final row.
# NOTE(review): this slices the current value, so every re-run of the cell removes one more row.
concat_par = concat_par.iloc[:-1]
# Last expression of the cell, so the frame without the "source" column is what gets displayed.
# The result is not assigned: concat_par itself still carries "source" afterwards.
concat_par.drop(columns=["source"])

Unnamed: 0,text
0,Kötü kadın tiplemesi derken Bellerophontes efs...
1,vatandaşlık kavramına yeni bir yaklaşım getiri...
2,ALİ YEŞİLIRMAKIN TEBLİĞİ Arabuluculuk şartı ve...
3,Deniz İş Kanununda özel bir düzenleme bulunmay...
4,3.3. Sosyal Hukuk Devleti lkesi Anlayıına Duyu...
...,...
2372004,Ancak bipolar koordinatlarda bu ifade burada a...
2372005,Bipolar koordinatların klasik uygulamaları kıs...
2372006,Tipik bir örnek olarak iki paralel silindirik ...
2372007,Bipolar üç boyutlu Ortogonal koordinatların bi...


In [29]:
# Renumber the rows from 0 upward; drop=True discards the old index labels instead of
# turning them into a column. The fresh frame is bound back to the same name.
concat_par = concat_par.reset_index(drop=True)

In [30]:
# Tokens handed to the trainer as special tokens: the unknown marker and a literal
# single space (the encoded samples further down show ' ' emitted as its own token).
special_tokens = ["[UNK]", " "]

# BPE model that maps out-of-vocabulary pieces to "[UNK]".
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

# Pipeline stages around the model; the three assignments are independent of each other.
tokenizer.decoder = BPEDecoder()
tokenizer.pre_tokenizer = Whitespace()
tokenizer.normalizer = Sequence([NFD()])

In [31]:
# Trainer settings: target vocabulary of 52000 entries, no minimum pair frequency
# beyond 1, learned tokens capped at 6 characters, progress bar enabled.
trainer = BpeTrainer(
    vocab_size=52000,
    min_frequency=1,
    special_tokens=special_tokens,
    max_token_length=6,
    show_progress=True,
)

# Every value of the "text" column is cast to str before training, so non-string
# entries (if any exist) are stringified rather than skipped.
training_texts = concat_par["text"].astype(str)
tokenizer.train_from_iterator(training_texts, trainer=trainer)

In [32]:
# Persist the trained artifacts into the current working directory.
# First the bare BPE model files (the later PreTrainedTokenizerFast cell refers to
# vocab.json and merges.txt) ...
tokenizer.model.save('.')
# ... then the complete tokenizer as a single JSON file, which the next cell reloads
# with Tokenizer.from_file.
tokenizer.save("MyBPETokenizerWikiLaw.json")

In [33]:
# Load the serialized tokenizer back from disk and tokenize a sample sentence with it.
tokenizerFromFile = Tokenizer.from_file("MyBPETokenizerWikiLaw.json")
sen_enc3 = tokenizerFromFile.encode(
    "Bırakın adalet yerini bulsun, isterse kıyamet kopsun."
)
# The token list is interpolated directly; its default formatting is the list's str().
print(f"Output: {sen_enc3.tokens}")

Output: ['Bırak', 'ın', ' ', 'ada', 'let', ' ', 'yer', 'ini', ' ', 'bul', 'sun', ',', ' ', 'ister', 'se', ' ', 'kıya', 'met', ' ', 'kop', 'sun', '.']


In [34]:
# Second sample sentence through the reloaded tokenizer (sen_enc3 is overwritten).
sen_enc3 = tokenizerFromFile.encode(
    "Adalet önce devletten gelir. Çünkü hukuk, devletin toplumsal düzenidir."
)
# The token list is interpolated directly; its default formatting is the list's str().
print(f"Output: {sen_enc3.tokens}")

Output: ['Ada', 'let', ' ', 'önce', ' ', 'dev', 'let', 'ten', ' ', 'gelir', '.', ' ', 'Çün', 'kü', ' ', 'hukuk', ',', ' ', 'dev', 'letin', ' ', 'top', 'lum', 'sal', ' ', 'düz', 'eni', 'dir', '.']


In [35]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer in the transformers fast-tokenizer interface so it can be
# used through encode/decode and pushed to the Hub. Note that this rebinds `tokenizer`
# from the raw tokenizers.Tokenizer to a PreTrainedTokenizerFast.
#
# - The files are addressed relative to the working directory: they are exactly the
#   ones the save cell wrote there (and that Tokenizer.from_file reads back), so the
#   cell no longer depends on one machine's absolute folder layout.
# - unk_token is declared explicitly: the wrapper cannot infer special-token roles
#   from the tokenizer file, so without it the wrapper (and the config it publishes)
#   has no unknown token even though "[UNK]" is in the trained vocabulary.
# NOTE(review): merges_file / vocab_file are kept from the original call; with
# tokenizer_file given they look redundant — confirm before removing.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="MyBPETokenizerWikiLaw.json",
    merges_file="merges.txt",
    vocab_file="vocab.json",
    unk_token="[UNK]",
)

In [36]:
# Round-trip sanity check: text -> token ids -> text, then show the reconstructed string.
encoded = tokenizer.encode(
    "Adalet önce devletten gelir. Çünkü hukuk, devletin toplumsal düzenidir."
)
decoded = tokenizer.decode(encoded)
print(decoded)

Adalet önce devletten gelir. Çünkü hukuk, devletin toplumsal düzenidir.


In [37]:
# Authenticate this notebook session against the Hugging Face Hub; the call renders
# an interactive login widget as the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [45]:
# Upload the current `tokenizer` object to the Hub repository "sergeantson/1507_Law_Tokenizer".
# NOTE(review): execution counts jump from 37 to 45 before this cell, and the reload cell
# further down downloads a vocab.txt and prints '##'-prefixed pieces. Confirm that
# `tokenizer` here is the BPE-based PreTrainedTokenizerFast built earlier in this notebook.
tokenizer.push_to_hub("sergeantson/1507_Law_Tokenizer")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sergeantson/1507_Law_Tokenizer/commit/dff3a162f66fdd7db17c7c2625e72b8514c46c19', commit_message='Upload tokenizer', commit_description='', oid='dff3a162f66fdd7db17c7c2625e72b8514c46c19', pr_url=None, pr_revision=None, pr_num=None)

In [46]:
# Sample text; defined first since it does not depend on the download.
sequence = "Adalet Mülkün Temelidir"
# Fetch the published tokenizer back from the Hub and split the sample into tokens.
push_tokenizer = AutoTokenizer.from_pretrained("sergeantson/1507_Law_Tokenizer")
tokens = push_tokenizer.tokenize(sequence)
print(tokens)

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

['Ada', '##let', 'M', '##ü', '##lk', '##ü', '##n', 'Te', '##mel', '##id', '##ir']


In [47]:
# Map the token strings from the previous cell to their vocabulary ids and show them.
ids = push_tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[19222, 5765, 150, 17176, 10493, 17176, 1179, 12008, 10212, 2386, 3161]


In [48]:
# Turn the ids back into text with the Hub tokenizer to check the round trip.
decoded_string = push_tokenizer.decode(ids)
print(decoded_string)

Adalet Mülkün Temelidir
