# Imports

In [9]:
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers

from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer

# Configs and hyperparameters

In [10]:

# HYPERPARAMS
SEED_SPLIT = 0
SEED_TRAIN = 0

MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 2#16
EVAL_BATCH_SIZE = 2#16
LEARNING_RATE = 2e-5 
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

wandb: Network error (ConnectionError), entering retry loop.


# Load the dataset

In [11]:

# load data
dtf_mlm = pd.read_csv('../data/raw/mtsamples_cleaned.csv')
#dtf_mlm = dtf_mlm.rename(columns={"review_content": "text"})

# Train/Valid Split
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

len(df_train), len(df_valid)

# Convert to Dataset object
train_dataset = Dataset.from_pandas(df_train[['transcription']].dropna())
valid_dataset = Dataset.from_pandas(df_valid[['transcription']].dropna())

# chose the model

In [12]:
'''
bert-base-uncased  # 12-layer, 768-hidden, 12-heads, 109M parameters
distilbert-base-uncased  # 6-layer, 768-hidden, 12-heads, 65M parameters
'''

MODEL = 'bert'
bert_type = 'bert-base-cased'
checkpoint = 'emilyalsentzer/Bio_ClinicalBERT' #"bert-base-cased"
#tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
#model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

#tokenizer = AutoTokenizer.from_pretrained(checkpoint)

if MODEL == 'distilbert':
    TokenizerClass = DistilBertTokenizer 
    ModelClass = DistilBertForMaskedLM 
elif MODEL == 'bert':
    TokenizerClass = BertTokenizer
    ModelClass = BertForMaskedLM 
elif MODEL == 'roberta':
    TokenizerClass = RobertaTokenizer
    ModelClass = RobertaForMaskedLM
elif MODEL == 'scibert':
    TokenizerClass = AutoTokenizer
    ModelClass = AutoModelForMaskedLM


tokenizer = TokenizerClass.from_pretrained(
            bert_type, use_fast=True, do_lower_case=False, max_len=MAX_SEQ_LEN
            )
model = ModelClass.from_pretrained(bert_type)

loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at /Users/tara-sophiatumbraegel/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json from cache at /Users/tara-sophiatumbraegel/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /Users/tara-sophiatumbraegel/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee9

# tokenize the  data and train test

In [13]:
def tokenize_function(row):
    return tokenizer(
        row['transcription'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True)
  
column_names = train_dataset.column_names

train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=column_names,
)

valid_dataset = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=column_names,
)

      

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

       

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

# Train the model

In [14]:
import wandb


# 1. Start a W&B run
wandb.init(project='keyberto2')
#!pip install wandb
#!wandb login

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


steps_per_epoch = int(len(train_dataset) / TRAIN_BATCH_SIZE)

training_args = TrainingArguments(
    output_dir='./bert-news',
    logging_dir='./LMlogs',             
    num_train_epochs=2,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    save_steps=steps_per_epoch,
    save_total_limit=3,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE, 
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss', 
    greater_is_better=False,
    seed=SEED_TRAIN, 
    report_to = 'wandb'
)
small_train_df = train_dataset.select(range(80))
small_valid_df = valid_dataset.select(range(20))
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=small_train_df,
    eval_dataset=small_valid_df,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("../model/modelUnsupervised") #save your custom model

0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁██
train/global_step,▁██
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁

0,1
eval/loss,4.22527
eval/runtime,6.9124
eval/samples_per_second,2.893
eval/steps_per_second,0.289
train/epoch,2.0
train/global_step,10.0
train/total_flos,10528004997120.0
train/train_loss,4.54433
train/train_runtime,413.9781
train/train_samples_per_second,0.386


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667044899998776, max=1.0)…

PyTorch: setting up devices
The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 80
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 80
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/80 [00:00<?, ?it/s]

The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 2


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to ./bert-news/checkpoint-40
Configuration saved in ./bert-news/checkpoint-40/config.json


{'eval_loss': 3.954698085784912, 'eval_runtime': 5.5139, 'eval_samples_per_second': 3.627, 'eval_steps_per_second': 1.814, 'epoch': 1.0}


Model weights saved in ./bert-news/checkpoint-40/pytorch_model.bin
tokenizer config file saved in ./bert-news/checkpoint-40/tokenizer_config.json
Special tokens file saved in ./bert-news/checkpoint-40/special_tokens_map.json
Deleting older checkpoint [bert-news/checkpoint-208] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 2


  0%|          | 0/10 [00:00<?, ?it/s]

Saving model checkpoint to ./bert-news/checkpoint-80
Configuration saved in ./bert-news/checkpoint-80/config.json


{'eval_loss': 3.921980381011963, 'eval_runtime': 5.364, 'eval_samples_per_second': 3.729, 'eval_steps_per_second': 1.864, 'epoch': 2.0}


Model weights saved in ./bert-news/checkpoint-80/pytorch_model.bin
tokenizer config file saved in ./bert-news/checkpoint-80/tokenizer_config.json
Special tokens file saved in ./bert-news/checkpoint-80/special_tokens_map.json
Deleting older checkpoint [bert-news/checkpoint-5] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./bert-news/checkpoint-80 (score: 3.921980381011963).


{'train_runtime': 671.7744, 'train_samples_per_second': 0.238, 'train_steps_per_second': 0.119, 'train_loss': 4.2029365539550785, 'epoch': 2.0}


Saving model checkpoint to ../model/modelUnsupervised
Configuration saved in ../model/modelUnsupervised/config.json
Model weights saved in ../model/modelUnsupervised/pytorch_model.bin
tokenizer config file saved in ../model/modelUnsupervised/tokenizer_config.json
Special tokens file saved in ../model/modelUnsupervised/special_tokens_map.json


--- Logging error ---
Traceback (most recent call last):
  File "/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 28] No space left on device
Call stack:
  File "/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/threading.py", line 966, in _bootstrap
    self._bootstrap_inner()
  File "/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 50, in run
    self._run()
  File "/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 101, in _run
    self._process(record)
  File "/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/wandb/sdk/internal/in

# Keybert

In [15]:
from transformers import BertTokenizer, BertForMaskedLM, KeyBERT, pipeline
model = BertForMaskedLM.from_pretrained('../models/mtsamples') #/model
tokenizer = BertTokenizer.from_pretrained('../models/mtsamples')
hf_model = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

example_text = "I am a sentence that I want embeddings for. I am so sick my neck hurts"


kw_model = KeyBERT(model=hf_model)
keywords = kw_model.extract_keywords(example_text, keyphrase_ngram_range=(1, 2), 
                                     stop_words='english', 
                                     use_maxsum=True, 
                                     nr_candidates=10, 
                                     top_n=5, 
                                     use_mmr=True, 
                                     diversity=0.7, 
                                     only_keywords=True)

keywords

ImportError: cannot import name 'KeyBERT' from 'transformers' (/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/transformers/__init__.py)

# backups

In [None]:

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast = False, do_lower_case=True)
model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
import torch
import math

trainer = Trainer(
  model=model,
  data_collator=data_collator,
  #train_dataset=tokenized_dataset_2['train'],
  eval_dataset=valid_dataset,
  tokenizer=tokenizer,
  )

eval_results = trainer.evaluate()

print('Evaluation results: ', eval_results)
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.3f}")
print('-'*30)

In [None]:
# 2. Save model inputs and hyperparameters
config = trainer.model.config
config.save_pretrained(wandb.run.dir)
tokenizer.save_pretrained(wandb.run.dir)
        
