In [73]:
import sentencepiece as spm
import transformers
from tokenizers import SentencePieceBPETokenizer, BertWordPieceTokenizer
from tokenizers import SentencePieceUnigramTokenizer, ByteLevelBPETokenizer
from typing import List
from transformers import BertTokenizer ## to create the config files for tokenizer wordpiece only
from transformers import RobertaTokenizer ## to create the config for sentence piece/ BPE
import os
import string
from typing import List
import glob
import re
from data_prep_utils import get_txt_from_dir

In [67]:
def bpe_tokenizer_trainer(
                list_of_files : list[str],
                vocab_size : int,
                save_path : str,
                min_frequency : int = 2,
                special_tokens : list[str] = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
                model_max_length : int = 512,
                show_progress : bool = True,
                save_model=False,
                dropout = 0
                ):
    if (dropout != 0):
        bpe = ByteLevelBPETokenizer(dropout=dropout)
    else:
        bpe = ByteLevelBPETokenizer()
        
    bpe.train_from_iterator(
        list_of_files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=show_progress,
        special_tokens=special_tokens
    )

    bpe_fast = transformers.PreTrainedTokenizerFast(tokenizer_object=bpe, model_max_length=model_max_length,
                special_tokens=special_tokens)

    bpe.unk_token_="[UNK]"
    bpe.sep_token_="[SEP]"
    bpe.pad_token_="[PAD]"
    bpe.cls_token_="[CLS]"
    bpe.mask_token="[MASK]"


    bpe_fast.pad_token_id = bpe.token_to_id("[PAD]")
    bpe_fast.unk_token_id = bpe.token_to_id("[UNK]")
    bpe_fast.cls_token_id = bpe.token_to_id("[CLS]")
    bpe_fast.sep_token_id = bpe.token_to_id("[SEP]")
    bpe_fast.mask_token_id = bpe.token_to_id("[MASK]")
    
    if save_model:
        bpe_fast.save_pretrained(f"{save_path}/1_BPE_1_batch_size_8_vocab_{vocab_size}_not")
    
    return bpe_fast
"""
path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/dup_texts"
list_of_files = get_txt_from_dir(path_to_data_files)
save_path = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/BPE_tokenizer_dropout"
for i in [1000, 1200, 2000,4000]:#[1500,1800,2000,4000, 8000, 16000]:
    print(i)
    bpe_tokenizer_trainer(list_of_files, i,
                          save_path, save_model=True,
                    dropout = 0) """

'\npath_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/dup_texts"\nlist_of_files = get_txt_from_dir(path_to_data_files)\nsave_path = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/BPE_tokenizer_dropout"\nfor i in [1000, 1200, 2000,4000]:#[1500,1800,2000,4000, 8000, 16000]:\n    print(i)\n    bpe_tokenizer_trainer(list_of_files, i,\n                          save_path, save_model=True,\n                    dropout = 0) '

In [68]:
## adjust with RobertaTokenizer
def sentencepiece_tk_unigram(data : List[str], vocab_size : int, save_path : str,
                 special_tokens : list = None, min_frequence=2, show_progress=True,
                 model_max_length : int = 512):
    if (special_tokens == None):
        special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<cls>", "<sep>", "<mask>"]

    tk_tokenizer = SentencePieceUnigramTokenizer()
    tk_tokenizer.train_from_iterator(
        data,
        vocab_size=vocab_size,
        show_progress=show_progress,
        special_tokens=special_tokens,
        unk_token = "<unk>"
    )
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    tk_tokenizer.save(save_path + "/tokenizer.json")
    # convert
    tokenizer = transformers.PreTrainedTokenizerFast(tokenizer_object=tk_tokenizer, model_max_length=model_max_length, special_tokens=special_tokens)
    tokenizer.bos_token = "<s>"
    tokenizer.bos_token_id = tk_tokenizer.token_to_id("<s>")
    tokenizer.pad_token = "<pad>"
    tokenizer.pad_token_id = tk_tokenizer.token_to_id("<pad>")
    tokenizer.eos_token = "</s>"
    tokenizer.eos_token_id = tk_tokenizer.token_to_id("</s>")
    tokenizer.unk_token = "<unk>"
    tokenizer.unk_token_id = tk_tokenizer.token_to_id("<unk>")
    tokenizer.cls_token = "<cls>"
    tokenizer.cls_token_id = tk_tokenizer.token_to_id("<cls>")
    tokenizer.sep_token = "<sep>"
    tokenizer.sep_token_id = tk_tokenizer.token_to_id("<sep>")
    tokenizer.mask_token = "<mask>"
    tokenizer.mask_token_id = tk_tokenizer.token_to_id("<mask>")

    # and save for later!
    tokenizer.save_pretrained(save_path)     

In [84]:
## adjust with RobertaTokenizer
def sentencepiece_tk(data : List[str], vocab_size : int, save_path : str,
                 special_tokens : list = None, min_frequence=2, show_progress=True,
                 model_max_length : int = 512, dropout=0):
    if (special_tokens == None):
        special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<cls>", "<sep>", "<mask>"]
    
    if(dropout != 0):
        
        tk_tokenizer = SentencePieceBPETokenizer(dropout = dropout)
    else:
        tk_tokenizer = SentencePieceBPETokenizer()
        
    tk_tokenizer.train_from_iterator(
        data,
        vocab_size=vocab_size,
        min_frequency=min_frequence,
        show_progress=show_progress,
        special_tokens=special_tokens
    )
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    tk_tokenizer.save(save_path + "/tokenizer.json")
    # convert
    tokenizer = transformers.PreTrainedTokenizerFast(tokenizer_object=tk_tokenizer, model_max_length=model_max_length, special_tokens=special_tokens)
    tokenizer.bos_token = "<s>"
    tokenizer.bos_token_id = tk_tokenizer.token_to_id("<s>")
    tokenizer.pad_token = "<pad>"
    tokenizer.pad_token_id = tk_tokenizer.token_to_id("<pad>")
    tokenizer.eos_token = "</s>"
    tokenizer.eos_token_id = tk_tokenizer.token_to_id("</s>")
    tokenizer.unk_token = "<unk>"
    tokenizer.unk_token_id = tk_tokenizer.token_to_id("<unk>")
    tokenizer.cls_token = "<cls>"
    tokenizer.cls_token_id = tk_tokenizer.token_to_id("<cls>")
    tokenizer.sep_token = "<sep>"
    tokenizer.sep_token_id = tk_tokenizer.token_to_id("<sep>")
    tokenizer.mask_token = "<mask>"
    tokenizer.mask_token_id = tk_tokenizer.token_to_id("<mask>")

    # and save for later!
    tokenizer.save_pretrained(save_path)
    return tokenizer
"""
    tokenizer = RobertaTokenizer(vocab_file = save_path,
                tokenizer_file=tokenizer_file,
                errors=errors,
                bos_token=bos_token,
                eos_token=eos_token,
                sep_token=sep_token,
                cls_token=cls_token,
                unk_token=unk_token,
                pad_token=pad_token,
                mask_token=mask_token,
                add_prefix_space=add_prefix_space,
                trim_offsets=trim_offsets,
                model_max_length=model_max_length)
    
   """      

'\n    tokenizer = RobertaTokenizer(vocab_file = save_path,\n                tokenizer_file=tokenizer_file,\n                errors=errors,\n                bos_token=bos_token,\n                eos_token=eos_token,\n                sep_token=sep_token,\n                cls_token=cls_token,\n                unk_token=unk_token,\n                pad_token=pad_token,\n                mask_token=mask_token,\n                add_prefix_space=add_prefix_space,\n                trim_offsets=trim_offsets,\n                model_max_length=model_max_length)\n    \n   '

In [87]:
path_to_txt = "/home/fb198/BA/DataNephroTexts/train_tokenizer_data/numbers_punt_filter"
list_of_files = get_txt_from_dir(path_to_txt)
for i in [1000, 1200, 1500, 1800, 2000, 4000, 8000]:# , 
    save_path = f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/tokenizers/sentencepiece_filtered_data/tokenizer_sp_duplicated/sentencepiece_vocab_{i}/"
    sentencepiece_tk(data=list_of_files, vocab_size=i, save_path=save_path, dropout=0.1)
    print("finished training with ", i, " vocab size")




finished training with  1000  vocab size



finished training with  1200  vocab size



finished training with  1500  vocab size



finished training with  1800  vocab size



finished training with  2000  vocab size



finished training with  4000  vocab size



finished training with  8000  vocab size


   bertTokenizer = BertTokenizer(vocab_file=save_path+"/tokenizer.json",
        do_lower_case=False,
        do_basic_tokenize=False,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None, 
        max_model_length=512)
 

In [1]:
def wordpiece_tokenizer(files : list[str], vocab_size : int, save_path : str,
                        min_frequency=2, special_tokens : list[str] = None, max_length = 512,
                        show_progress = True):
    tk_tokenizer = BertWordPieceTokenizer()
    print("about to train")

    if(special_tokens == None):
        tk_tokenizer.train(files = files, vocab_size = vocab_size, min_frequency=min_frequency)
    else:
        tk_tokenizer.train(files = files, vocab_size = vocab_size,
                           min_frequency=min_frequency, special_tokens=special_tokens,
                           show_progress=show_progress)
    tk_tokenizer.enable_truncation(max_length=max_length) 
    print("done training.. \n")
    tk_tokenizer.save(save_path+"tokenizer.json")

    wp_fast = transformers.PreTrainedTokenizerFast(tokenizer_object=tk_tokenizer, model_max_length=512,
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

    bpe.unk_token_="[UNK]"
    bpe.sep_token_="[SEP]"
    bpe.pad_token_="[PAD]"
    bpe.cls_token_="[CLS]"
    bpe.mask_token="[MASK]"


    wp_fast.pad_token_id = tk_tokenizer.token_to_id("[PAD]")
    wp_fast.unk_token_id = tk_tokenizer.token_to_id("[UNK]")
    wp_fast.cls_token_id = tk_tokenizer.token_to_id("[CLS]")
    wp_fast.sep_token_id = tk_tokenizer.token_to_id("[SEP]")
    wp_fast.mask_token_id = tk_tokenizer.token_to_id("[MASK]")    

    wp_fast.save_pretrained(save_path)    # and save for later!
    #tokenizer.save_pretrained(save_path)
    #return bertTokenizer
    
#save_path = f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/tokenizer_WordPiece/Wordpiece_vocab_3000_temp"
#tt = wordpiece_tokenizer(files = list_of_files, vocab_size = 3000,
#                    save_path = save_path, min_frequency=2)
#print("done with the first word piece")

In [74]:
path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/dup_texts"
list_of_files = get_txt_from_dir(get_txt_from_dir)

for i in [860]:# , 
    save_path = f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/tokenizer_WordPiece/Wordpiece_vocab_860"

    wordpiece_tokenizer(files = list_of_files, vocab_size = i,
                        save_path = save_path, min_frequency=2)
    print("done with the first word piece")

NameError: name 'wordpiece_tokenizer' is not defined

In [4]:
def main():
    
    i = 3
    
    if(i == 0):
        path_file = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/prepare_data/sentences_for_tokenizer_duplicated.txt"
        with open(path_file, mode="r", encoding="utf-8") as f:
            data_dup = f.read()
        data_dup = data_dup.split("\n")

        for i in [1000, 1500, 1800, 2000, 2500, 3000, 4000, 8000,16000, 32000]:# , 
            save_path = f"tokenizer_sp_duplicated/sentence_piece_vocab_{i}/"
            sentencepiece_tk(data=data_dup, vocab_size=i, save_path=save_path)
            print("finished training with ", i, " vocab size")
            
    if(i == 1):
        path_file = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/prepare_data/sentences_for_tokenizer.txt"

        with open(path_file, mode="r", encoding="utf-8") as f:
            sen_txts = f.read()
        sen_txts = sen_txts.split("\n")

        #for i in [1000, 1500, 1800, 2000, 2500, 3000, 4000, 8000,16000, 32000]:# , 
        #    save_path = f"tokenizer_sp_unique/sentence_piece_vocab_{i}/"
        #    sentencepiece_tk(data=sen_txts, vocab_size=i, save_path=save_path)
        #    print("finished training with ", i, " vocab size")
    
    i=2
    if(i == 2):
        path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/txt_unique"
        list_of_files = get_txt_from_dir(path_to_data_files)
        
        for i in [1000, 1500, 1800, 2000, 2500, 3000, 4000, 8000,16000, 32000]:# , 
            save_path = f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/tokenizer_WordPiece/WordPiece_vocab_{i}/"
            
            wordpiece_tokenizer(files = list_of_files, vocab_size = i,
                                save_path = save_path, min_frequency=2)
            print("done with the first word piece")
    
    i=3
    if(i == 3):
        path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/dup_texts"
        list_of_files = get_txt_from_dir(path_to_data_files)
        
        for i in [1000, 1500, 1800, 2000, 2500, 3000, 4000, 8000,16000, 32000]:# , 
            save_path = f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/tokenizer_WordPiece_duplicated/WordPiece_vocab_{i}/"
            
            wordpiece_tokenizer(files = list_of_files, vocab_size = i,
                                save_path = save_path, min_frequency=2)
            print("done with the first word piece")

In [5]:
main()

about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word piece
about to train



done training.. 

done with the first word p

In [12]:
pp= f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/LanguageModelling/LanguageModelling/sentencepiece_duplicatedTk/ger-patho-bert-v1_sp_1000_epochs_20_batch_size_4/trainer_state.json"
title = "v1_sp_1000_epochs_20_batch_size_4"
path_to_save_plot = "/".join(pp.split("/")[:-1]) +"/"+title +".png"
print(path_to_save_plot)
    #plt.savefig(path_to_save_plot+"/"+title)

/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/LanguageModelling/LanguageModelling/sentencepiece_duplicatedTk/ger-patho-bert-v1_sp_1000_epochs_20_batch_size_4/v1_sp_1000_epochs_20_batch_size_4.png


## clamp try 

In [19]:
import datasets 
import json

In [20]:
path_classi_data = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/DataNephroTexts/classification_data/data_files_synthetic_labeled_hf_dataset.json"
data = datasets.Dataset.from_json(path_classi_data)

Using custom data configuration default-516ada5b713e5b01


Downloading and preparing dataset json/default to /home/fb198/.cache/huggingface/datasets/json/default-516ada5b713e5b01/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /home/fb198/.cache/huggingface/datasets/json/default-516ada5b713e5b01/0.0.0. Subsequent calls will reuse this data.


In [24]:
data[0]

{'text': 'Klinische Angaben: multifokales Urothelkarzinom Harnblase pT1 G3 Harnleiterabsetzungsrand tumorfrei? Eingesandt wurde: 1 (HL-Absetzungsrand li) Zur Schnellschnittuntersuchung eingesandt wurde ein 18 x 13 x 7 cm messendes Harnblasenresektat mit anhängender Prostata von 6 x 5 x 5 cm. Im Bereich von Blasenboden/linker Seitenwand/Vorderwand ausgedehnte landkartenartige Ulzerationen von 7,5 x 3,8 cm ohne eindeutig abgrenzbaren Tumor. 1.1 SS Harnröhrenabsetzungsrand 1.2 Blasenpfeiler rechts 1.3 Blasenpfeiler links 1.4-1.9 Prostata rechts 1.10-1.15 Prostata links 1.16-1.25 Ulcusanteile, subtotal 1.16-1.31 Mapping Blasenschleimhaut Vorläufige Beurteilung gemäß der Gefrierschnittführung: Harnröhrenabsetzung tumorfrei. Auf die telefonische Schnellschnittbefunddurchsage darf verwiesen werden (Tel.nr. 8218, 11:18 Uhr). Beurteilung nach Paraffineinbettung: Harnblasenresektat mit an mehreren Stellen Residuen des vorbekannten, geringgradig differenzierten Urothelkarzinoms, welche beginnend 

## train test BPE

In [3]:
## importing the tokenizer and subword BPE trainer
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer

from transformers import PreTrainedTokenizerFast


## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace
from data_prep_utils import get_txt_from_dir
import os

In [91]:
unk_token = "[UNK]" # token for unknown words
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]  # special tokens

def prepare_tokenizer_trainer(alg, vocab_size):
    """
    Prepares the tokenizer and trainer with unknown & special tokens.
    """
    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token=unk_token))
        trainer = BpeTrainer(vocab_size = vocab_size, show_progress=True,
                             continuing_subword_prefix = "##", special_tokens=special_tokens)
    elif alg == 'UNI':
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(vocab_size = vocab_size, show_progress=True,
                                 continuing_subword_prefix = "##", unk_token=unk_token,
                                 special_tokens=special_tokens)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
        trainer = WordPieceTrainer(vocab_size = vocab_size, show_progress=True,
                             continuing_subword_prefix = "##", special_tokens=special_tokens)
    else:
        tokenizer = Tokenizer(WordLevel(unk_token=unk_token))
        trainer = WordLevelTrainer(vocab_size = vocab_size, show_progress=True,
                             continuing_subword_prefix = "##", special_tokens=special_tokens)

    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer

#‘WLV’ - Word Level Algorithm
#‘WPC’ - WordPiece Algorithm
#‘BPE’ - Byte Pair Encoding
#‘UNI’ - Unigram
def train_tokenizer(files, save_path, vocab_size, alg='WLV'):
    """
    Takes the files and trains the tokenizer.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg, vocab_size)
    tokenizer.train(files, trainer) # training the tokenzier
    
    _save_path = f"{save_path}/{alg}_{vocab_size}"
    if not os.path.exists(_save_path):
        os.makedirs(_save_path)
    
    tokenizer.save(f"{_save_path}/tokenizer.json")
    tokenizer_ = Tokenizer.from_file(f"{_save_path}/tokenizer.json")
    return tokenizer

def tokenize(input_string, tokenizer):
    """
    Tokenizes the input string using the tokenizer provided.
    """
    output = tokenizer.encode(input_string)
    return output

def fasttokenizer_wrapper(tokenizer_object, path_to_tk_dir, 
                          special_tokens, model_max_length=512):
    
    fast_tokenizer = PreTrainedTokenizerFast(
                        tokenizer_object=tokenizer_object,
                        name_or_path= path_to_tk_dir,
                        model_max_length=model_max_length,
                        special_tokens = special_tokens)

    tokenizer_object.mask_token="[MASK]"
    tokenizer_object.unk_token ="[UNK]"
    tokenizer_object.sep_token="[SEP]"
    tokenizer_object.pad_token="[PAD]"
    tokenizer_object.cls_token="[CLS]"

    fast_tokenizer.pad_token_id = tokenizer_object.token_to_id("[PAD]")
    fast_tokenizer.unk_token_id = tokenizer_object.token_to_id("[UNK]")
    fast_tokenizer.cls_token_id = tokenizer_object.token_to_id("[CLS]")
    fast_tokenizer.sep_token_id = tokenizer_object.token_to_id("[SEP]")
    fast_tokenizer.mask_token_id = tokenizer_object.token_to_id("[MASK]")

    if save_tokenizer:
        fast_tokenizer.save_pretrained(path_to_tk_dir)
        
    return fast_tokenizer

def main():
    save_path_web = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web"

    path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/dup_texts"
    list_of_files = get_txt_from_dir(path_to_data_files)
    print(len(list_of_files))

    input_string = ["Klinische Angaben: Adipositas rb-wb Wir erhielten: 1 (Resektat Magen): Ein schlauchförmiges,"
                   " klammernahtverschlossenes, 16,8 cm langes und max. 3,8 cm durchmessendes Magenteilresektat."
                   " Auf der Schleimhautoberfläche sowie den Schnittflächen kein Herdbefund abgrenzbar. 1.1: Einbettung"
                   " eines Tangentialschnitts auf die unterhalb der Klammernahtliegende Schleimhautabsetzung."
                   " 1.2: Einbettung eines exemplarischen zentralen Querschnitts. Beurteilung: Magenteilresektat mit "
                   "regelrechter, weitgehend entzündungsfreier Schleimhaut ohne Herdbefund. Im vorliegenden"
                   " Gewebematerial kein Anhalt für Malignität."
                   " Mit freundlichen kollegialen Grüßen Prof.Dr.med.A.Marx Dr.med.M.Hahn Tel.: 0621/383-4091"]
    print(input_string)

    tokens_dict = {}
    for files in [list_of_files]:
        #print(f"========Using vocabulary from {files}=======")
        print("number of training file: ", len(files))
        for alg in [ 'WPC']: #,'UNI','BPE']: #, 
            for vocab_size in [1000, 1200, 1500, 1800, 2000, 4000]:
                trained_tokenizer = train_tokenizer(files, save_path_web, vocab_size, alg)

                ## wrap it with transformers.PreTrainedTokenizerFast

                fasttokenizer_wrapper(trained_tokenizer, f"{save_path_web}/{alg}_{vocab_size}",
                                      special_tokens, model_max_length=512)

                output = tokenize(input_string[0], trained_tokenizer)
                tokens_dict[alg] = output.tokens
                print("----", alg, "----")
                print(output.tokens, "->", len(output.tokens))

In [None]:
main()

In [8]:
save_path_web = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web"

path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/txt_unique"
list_of_files = get_txt_from_dir(path_to_data_files)
print(len(list_of_files))

input_string = ["Klinische Angaben: Adipositas rb-wb Wir erhielten: 1 (Resektat Magen): Ein schlauchförmiges,"
               " klammernahtverschlossenes, 16,8 cm langes und max. 3,8 cm durchmessendes Magenteilresektat."
               " Auf der Schleimhautoberfläche sowie den Schnittflächen kein Herdbefund abgrenzbar. 1.1: Einbettung"
               " eines Tangentialschnitts auf die unterhalb der Klammernahtliegende Schleimhautabsetzung."
               " 1.2: Einbettung eines exemplarischen zentralen Querschnitts. Beurteilung: Magenteilresektat mit "
               "regelrechter, weitgehend entzündungsfreier Schleimhaut ohne Herdbefund. Im vorliegenden"
               " Gewebematerial kein Anhalt für Malignität."
               " Mit freundlichen kollegialen Grüßen Prof.Dr.med.A.Marx Dr.med.M.Hahn Tel.: 0621/383-4091"]
print(input_string)

tokens_dict = {}
for files in [list_of_files]:
    #print(f"========Using vocabulary from {files}=======")
    print("number of training file: ", len(files))
    for alg in ['WPC']: #, 
        for vocab_size in [1000, 1200, 1500, 1800, 2000, 4000]:
            trained_tokenizer = train_tokenizer(files, save_path_web, vocab_size, alg)


161295
['Klinische Angaben: Adipositas rb-wb Wir erhielten: 1 (Resektat Magen): Ein schlauchförmiges, klammernahtverschlossenes, 16,8 cm langes und max. 3,8 cm durchmessendes Magenteilresektat. Auf der Schleimhautoberfläche sowie den Schnittflächen kein Herdbefund abgrenzbar. 1.1: Einbettung eines Tangentialschnitts auf die unterhalb der Klammernahtliegende Schleimhautabsetzung. 1.2: Einbettung eines exemplarischen zentralen Querschnitts. Beurteilung: Magenteilresektat mit regelrechter, weitgehend entzündungsfreier Schleimhaut ohne Herdbefund. Im vorliegenden Gewebematerial kein Anhalt für Malignität. Mit freundlichen kollegialen Grüßen Prof.Dr.med.A.Marx Dr.med.M.Hahn Tel.: 0621/383-4091']
number of training file:  161295





In [92]:
path_to_txt = "/home/fb198/BA/DataNephroTexts/train_tokenizer_data/numbers_punt_filter"
list_of_files = get_txt_from_dir(path_to_txt)
save_path = f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/tokenizers/wordpiece_filtered_data"

tokens_dict = {}
for files in [list_of_files]:
    # print(f"========Using vocabulary from {files}=======")
    print("number of training file: ", len(files))
    for alg in ['WPC']:#, 'UNI', 'WPC']:  # ,
        for vocab_size in [1000, 1200, 1500, 1800, 2000, 4000]:
            trained_tokenizer = train_tokenizer(files, save_path, vocab_size, alg)

            ## wrap it with transformers.PreTrainedTokenizerFast

            fasttokenizer_wrapper(trained_tokenizer, f"{save_path}/{alg}_{vocab_size}_MAN",
                                  special_tokens, model_max_length=512)

number of training file:  338991





NameError: name 'save_tokenizer' is not defined

In [89]:
path_to_txt = "/home/fb198/BA/DataNephroTexts/train_tokenizer_data/numbers_punt_filter"
list_of_files = get_txt_from_dir(path_to_txt)
save_path = f"/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/tokenizers/wordpiece_filtered_data"

tokens_dict = {}
for files in [list_of_files]:
    #print(f"========Using vocabulary from {files}=======")
    print("number of training file: ", len(files))
    for alg in ['WPC']: #, 
        for vocab_size in [1000, 1200, 1500, 1800, 2000, 4000]:
            trained_tokenizer = train_tokenizer(files, save_path, vocab_size, alg)


number of training file:  338991




















In [61]:
trained_tokenizer = train_tokenizer(files, save_path_web, 1000, 'BPE')






In [87]:
output = tokenize(input_string[0], trained_tokenizer)
tokens_dict[alg] = output.tokens
print("----", alg, "----")
print(output.tokens, "->", len(output.tokens))

---- WPC ----
['Klinische', 'Angaben', ':', 'A', '##d', '##ip', '##osit', '##as', 'r', '##b', '-', 'wb', 'Wir', 'erhielten', ':', '1', '(', 'Resekt', '##at', 'Magen', '):', 'Ein', 's', '##chl', '##a', '##uch', '##f', '##ör', '##m', '##ig', '##es', ',', 'kl', '##amm', '##ern', '##ah', '##t', '##vers', '##chlossen', '##es', ',', '1', '##6', ',', '8', 'cm', 'langes', 'und', 'max', '<UNK>', '3', ',', '8', 'cm', 'durchmessendes', 'Ma', '##g', '##ent', '##eil', '##resektat', '<UNK>', 'Auf', 'der', 'Schleimhaut', '##o', '##ber', '##fläche', 'sowie', 'den', 'Schnitt', '##fläch', '##en', 'kein', 'H', '##erd', '##be', '##fund', 'abgrenzbar', '<UNK>', '1', '<UNK>', '1', ':', 'Einbettung', 'eines', 'T', '##angential', '##s', '##chnitt', '##s', 'auf', 'die', 'unter', '##hal', '##b', 'der', 'K', '##l', '##amm', '##ern', '##ah', '##t', '##lieg', '##ende', 'Schleimhaut', '##absetzung', '<UNK>', '1', '<UNK>', '2', ':', 'Einbettung', 'eines', 'ex', '##emplar', '##ischen', 'z', '##entr', '##alen', 'Quers

In [50]:
tk_vocab_path = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web/BPE_1200/tokenizer.json"
tokenizer = Tokenizer.from_file(tk_vocab_path)
tokenizer.model

<tokenizers.models.BPE at 0x7feed58384f0>

In [1]:
## importing the tokenizer and subword BPE trainer
import os
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer
from data_prep_utils import get_txt_from_dir
## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
unk_token = "[UNK]"  # token for unknown words
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]  # special tokens


def prepare_tokenizer_trainer(alg, vocab_size):
    """
    Prepares the tokenizer and trainer with unknown & special tokens.
    """
    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token=unk_token))
        trainer = BpeTrainer(vocab_size=vocab_size, show_progress=True,
                             continuing_subword_prefix="##", special_tokens=special_tokens)
    elif alg == 'UNI':
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(vocab_size=vocab_size, show_progress=True,
                                 unk_token=unk_token, special_tokens=special_tokens)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
        trainer = WordPieceTrainer(vocab_size=vocab_size, show_progress=True,
                                   continuing_subword_prefix="##", special_tokens=special_tokens)
    else:
        tokenizer = Tokenizer(WordLevel(unk_token=unk_token))
        trainer = WordLevelTrainer(vocab_size=vocab_size, show_progress=True,
                                   continuing_subword_prefix="##", special_tokens=special_tokens)

    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer


# ‘WLV’ - Word Level Algorithm
# ‘WPC’ - WordPiece Algorithm
# ‘BPE’ - Byte Pair Encoding
# ‘UNI’ - Unigram
def train_tokenizer(files, save_path, vocab_size, alg='WLV'):
    """
    Takes the files and trains the tokenizer.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg, vocab_size)
    tokenizer.train(files, trainer)  # training the tokenzier

    _save_path = f"{save_path}/{alg}_{vocab_size}_dup"
    if not os.path.exists(_save_path):
        os.makedirs(_save_path)

    tokenizer.save(f"{_save_path}/tokenizer.json")
    tokenizer_ = Tokenizer.from_file(f"{_save_path}/tokenizer.json")
    return tokenizer


def tokenize(input_string, tokenizer):
    """
    Tokenizes the input string using the tokenizer provided.
    """
    output = tokenizer.encode(input_string)
    return output


def fasttokenizer_wrapper(tokenizer_object, path_to_tk_dir,
                          special_tokens, model_max_length=512, save_tokenizer=True):
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer_object,
        name_or_path=path_to_tk_dir,
        model_max_length=model_max_length,
        special_tokens=special_tokens)

    tokenizer_object.mask_token = "[MASK]"
    tokenizer_object.unk_token = "[UNK]"
    tokenizer_object.sep_token = "[SEP]"
    tokenizer_object.pad_token = "[PAD]"
    tokenizer_object.cls_token = "[CLS]"

    fast_tokenizer.pad_token_id = tokenizer_object.token_to_id("[PAD]")
    fast_tokenizer.unk_token_id = tokenizer_object.token_to_id("[UNK]")
    fast_tokenizer.cls_token_id = tokenizer_object.token_to_id("[CLS]")
    fast_tokenizer.sep_token_id = tokenizer_object.token_to_id("[SEP]")
    fast_tokenizer.mask_token_id = tokenizer_object.token_to_id("[MASK]")

    if save_tokenizer:
        fast_tokenizer.save_pretrained(path_to_tk_dir)

    return fast_tokenizer


def main():
    save_path_web = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web"

    path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/dup_texts"
    list_of_files = get_txt_from_dir(path_to_data_files)
    print(len(list_of_files))

    input_string = ["Klinische Angaben: Adipositas rb-wb Wir erhielten: 1 (Resektat Magen): Ein schlauchförmiges,"
                    " klammernahtverschlossenes, 16,8 cm langes und max. 3,8 cm durchmessendes Magenteilresektat."
                    " Auf der Schleimhautoberfläche sowie den Schnittflächen kein Herdbefund abgrenzbar. 1.1: Einbettung"
                    " eines Tangentialschnitts auf die unterhalb der Klammernahtliegende Schleimhautabsetzung."
                    " 1.2: Einbettung eines exemplarischen zentralen Querschnitts. Beurteilung: Magenteilresektat mit "
                    "regelrechter, weitgehend entzündungsfreier Schleimhaut ohne Herdbefund. Im vorliegenden"
                    " Gewebematerial kein Anhalt für Malignität."
                    " Mit freundlichen kollegialen Grüßen Prof.Dr.med.A.Marx Dr.med.M.Hahn Tel.: 0621/383-4091"]
    print(input_string)

    tokens_dict = {}
    for files in [list_of_files]:
        # print(f"========Using vocabulary from {files}=======")
        print("number of training file: ", len(files))
        for alg in ['BPE']:  # ,'UNI', 'WPC'
            for vocab_size in [1000, 2000, 4000]: #1200, 1500, 1800
                trained_tokenizer = train_tokenizer(files, save_path_web, vocab_size, alg)

                ## wrap it with transformers.PreTrainedTokenizerFast

                fasttokenizer_wrapper(trained_tokenizer, f"{save_path_web}/{alg}_{vocab_size}",
                                      special_tokens, model_max_length=512)

                output = tokenize(input_string[0], trained_tokenizer)
                tokens_dict[alg] = output.tokens
                print("----", alg, "----")
                print(output.tokens, "->", len(output.tokens))
#if __name__ == '__main__':
#    main()

In [3]:
import torch
torch.cuda.empty_cache()


In [4]:
save_path_web = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web"

path_to_data_files = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/txt_unique"
list_of_files = get_txt_from_dir(path_to_data_files)
print(len(list_of_files))

input_string = ["Klinische Angaben: Adipositas rb-wb Wir erhielten: 1 (Resektat Magen): Ein schlauchförmiges,"
                " klammernahtverschlossenes, 16,8 cm langes und max. 3,8 cm durchmessendes Magenteilresektat."
                " Auf der Schleimhautoberfläche sowie den Schnittflächen kein Herdbefund abgrenzbar. 1.1: Einbettung"
                " eines Tangentialschnitts auf die unterhalb der Klammernahtliegende Schleimhautabsetzung."
                " 1.2: Einbettung eines exemplarischen zentralen Querschnitts. Beurteilung: Magenteilresektat mit "
                "regelrechter, weitgehend entzündungsfreier Schleimhaut ohne Herdbefund. Im vorliegenden"
                " Gewebematerial kein Anhalt für Malignität."
                " Mit freundlichen kollegialen Grüßen Prof.Dr.med.A.Marx Dr.med.M.Hahn Tel.: 0621/383-4091"]
print(input_string)


161295
['Klinische Angaben: Adipositas rb-wb Wir erhielten: 1 (Resektat Magen): Ein schlauchförmiges, klammernahtverschlossenes, 16,8 cm langes und max. 3,8 cm durchmessendes Magenteilresektat. Auf der Schleimhautoberfläche sowie den Schnittflächen kein Herdbefund abgrenzbar. 1.1: Einbettung eines Tangentialschnitts auf die unterhalb der Klammernahtliegende Schleimhautabsetzung. 1.2: Einbettung eines exemplarischen zentralen Querschnitts. Beurteilung: Magenteilresektat mit regelrechter, weitgehend entzündungsfreier Schleimhaut ohne Herdbefund. Im vorliegenden Gewebematerial kein Anhalt für Malignität. Mit freundlichen kollegialen Grüßen Prof.Dr.med.A.Marx Dr.med.M.Hahn Tel.: 0621/383-4091']


In [94]:
for i in range(10):
    if( i not in [3,4,5]):
        continue
    print(i)

3
4
5


In [5]:
tokens_dict = {}
for files in [list_of_files]:
    # print(f"========Using vocabulary from {files}=======")
    print("number of training file: ", len(files))
    for alg in ['WPC']:#, 'UNI', 'WPC']:  # ,
        for vocab_size in [1000]:#, 1200, 1500, 1800, 2000, 4000]:
            trained_tokenizer = train_tokenizer(files, save_path_web, vocab_size, alg)

            ## wrap it with transformers.PreTrainedTokenizerFast

            fasttokenizer_wrapper(trained_tokenizer, f"{save_path_web}/{alg}_{vocab_size}_MAN",
                                  special_tokens, model_max_length=512)

number of training file:  161295





In [3]:
tokenizer = Tokenizer(BPE(unk_token=unk_token))
trainer = BpeTrainer(vocab_size=1000, show_progress=True,
continuing_subword_prefix="##", special_tokens=special_tokens)

   

In [4]:
tokenizer.pre_tokenizer = Whitespace()


In [None]:
tokenizer.train(list_of_files, trainer)  # training the tokenzier


In [None]:
_save_path = f"{save_path}/{alg}_1000_dup"
if not os.path.exists(_save_path):
    os.makedirs(_save_path)

tokenizer.save(f"{_save_path}/tokenizer.json")


## wordpiece example

In [None]:
"Klinische Angaben: Adipositas rb-wb Wir erhielten: 1 (Resektat Magen): Ein schlauchförmiges,"
                " klammernahtverschlossenes, 16,8 cm langes und max. 3,8 cm durchmessendes Magenteilresektat."
                " Auf der Schleimhautoberfläche sowie den Schnittflächen kein Herdbefund abgrenzbar. 1.1:Einbettung"
                " eines Tangentialschnitts auf die unterhalb der Klammernahtliegende Schleimhautabsetzung."
            " 1.2: Einbettung eines exemplarischen zentralen Querschnitts. Beurteilung:"
            ". Im vorliegenden"
                " Gewebematerial kein Anhalt für Malignität."

In [63]:
input_string = [ 
                " Magenteilresektat mit "
                "regelrechter, weitgehend entzündungsfreier Schleimhaut ohne Herdbefund."]

In [30]:
path_wp_1000 = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web/WPC_1000"
wp_1k = PreTrainedTokenizerFast.from_pretrained(path_wp_1000)
wp_1k
print(wp_1k.tokenize(input_string[0]))

['Magen', '##teil', '##resektat', 'mit', 're', '##ge', '##l', '##rech', '##ter', ',', 'wei', '##t', '##ge', '##h', '##end', 'ent', '##zünd', '##ung', '##s', '##freie', '##r', 'Schleimhaut', 'ohne', 'H', '##erd', '##be', '##fund']


In [35]:
path_wp_1200 = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web/WPC_1200"
wp_1200 = PreTrainedTokenizerFast.from_pretrained(path_wp_1200)
wp_1200
print(wp_1200.tokenize(input_string[0]))

['Magen', '##teil', '##resektat', 'mit', 'regelrecht', '##er', ',', 'wei', '##t', '##ge', '##hend', 'ent', '##zünd', '##ungs', '##freie', '##r', 'Schleimhaut', 'ohne', 'H', '##erd', '##befund']


In [38]:
path_wp_4000 = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web/WPC_4000"
wp_4k = PreTrainedTokenizerFast.from_pretrained(path_wp_4000)
wp_4k
print(wp_4k.tokenize(input_string[0]))

['Magenteilresektat', 'mit', 'regelrechter', ',', 'weitgehend', 'entzündungsfreie', '##r', 'Schleimhaut', 'ohne', 'Herdbefund']


In [49]:
path_bpe_drop_1200 = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/BPE_tokenizer_dropout/1_BPE_1_batch_size_8_vocab_4000_"
bpe_drop_1200 = PreTrainedTokenizerFast.from_pretrained(path_bpe_drop_1200)
bpe_drop_1200
print(bpe_drop_1200.tokenize(input_string[0]))

['Ġ', 'M', 'ag', 'en', 'te', 'i', 'l', 'r', 'e', 's', 'e', 'k', 't', 'at', 'Ġ', 'm', 'i', 't', 'Ġ', 'r', 'e', 'g', 'e', 'l', 'r', 'e', 'c', 'h', 'ter', ',', 'Ġ', 'w', 'e', 'i', 't', 'g', 'e', 'h', 'en', 'd', 'Ġ', 'en', 't', 'z', 'Ã', '¼', 'n', 'd', 'u', 'n', 'g', 's', 'f', 'r', 'e', 'i', 'er', 'Ġ', 'S', 'c', 'h', 'l', 'e', 'i', 'm', 'h', 'a', 'u', 't', 'Ġ', 'o', 'h', 'n', 'e', 'Ġ', 'H', 'er', 'd', 'b', 'e', 'f', 'u', 'n', 'd']


In [54]:
path_bpe_drop_1200 = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web/BPE_1200"
bpe_drop_1200 = PreTrainedTokenizerFast.from_pretrained(path_bpe_drop_1200)
bpe_drop_1200
print(bpe_drop_1200.tokenize(input_string[0]))

['Ma', '##g', '##ent', '##eil', '##resektat', 'mit', 'regel', '##rech', '##ter', ',', 'w', '##eit', '##ge', '##hend', 'e', '##ntzündung', '##s', '##frei', '##er', 'Schleimhaut', 'ohne', 'H', '##erd', '##befund']


In [59]:
path_bpe_drop_1500 = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/jupyter_stuff/tokenizers_vs4_12_10/test_toenizer_from_web/BPE_1000"
bpe_drop_1500 = PreTrainedTokenizerFast.from_pretrained(path_bpe_drop_1500)
bpe_drop_1500
print(bpe_drop_1500.tokenize(input_string[0]))

['Magen', '##teil', '##resektat', 'mit', 'r', '##egelre', '##ch', '##ter', ',', 'w', '##eit', '##ge', '##h', '##end', 'e', '##ntzündung', '##s', '##frei', '##er', 'Schleimhaut', 'ohne', 'H', '##erd', '##be', '##fund']


In [64]:
sp_path = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/LanguageModelling/LanguageModelling/mlm_evaluation_2/bert-1_sp_1_batch_size_8"
sp_tk = PreTrainedTokenizerFast.from_pretrained(sp_path)
print(sp_tk.tokenize(input_string[0]))

['▁Magen', 'teil', 'resektat', '▁mit', '▁regel', 'rech', 'ter', ',', '▁w', 'eit', 'ge', 'h', 'end', '▁en', 'tzündung', 's', 'frei', 'er', '▁Schleimhaut', '▁ohne', '▁H', 'erd', 'be', 'fund', '.']
