# Imports

In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers

from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer

# Configs and hyperparameters

In [2]:

# HYPERPARAMS
SEED_SPLIT = 0
SEED_TRAIN = 0

MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 2#16
EVAL_BATCH_SIZE = 2#16
LEARNING_RATE = 2e-5 
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

# Load the dataset

In [3]:

# load data
dtf_mlm = pd.read_csv('../data/processed/nlp/mtsamples/mtsamples_cleaned.csv')
#dtf_mlm = dtf_mlm.rename(columns={"review_content": "text"})

# Train/Valid Split
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

len(df_train), len(df_valid)

# Convert to Dataset object
train_dataset = Dataset.from_pandas(df_train[['transcription']].dropna())
valid_dataset = Dataset.from_pandas(df_valid[['transcription']].dropna())

# chose the model

In [4]:
'''
bert-base-uncased  # 12-layer, 768-hidden, 12-heads, 109M parameters
distilbert-base-uncased  # 6-layer, 768-hidden, 12-heads, 65M parameters
'''

MODEL = 'bert'
bert_type = 'bert-base-cased'
checkpoint = 'emilyalsentzer/Bio_ClinicalBERT' #"bert-base-cased"
#tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
#model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

#tokenizer = AutoTokenizer.from_pretrained(checkpoint)

if MODEL == 'distilbert':
    TokenizerClass = DistilBertTokenizer 
    ModelClass = DistilBertForMaskedLM 
elif MODEL == 'bert':
    TokenizerClass = BertTokenizer
    ModelClass = BertForMaskedLM 
elif MODEL == 'roberta':
    TokenizerClass = RobertaTokenizer
    ModelClass = RobertaForMaskedLM
elif MODEL == 'scibert':
    TokenizerClass = AutoTokenizer
    ModelClass = AutoModelForMaskedLM


tokenizer = TokenizerClass.from_pretrained(
            bert_type, use_fast=True, do_lower_case=False, max_len=MAX_SEQ_LEN
            )
model = ModelClass.from_pretrained(bert_type)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# tokenize the  data and train test

In [7]:

from datasets import load_dataset, load_metric, Dataset

In [5]:
def tokenize_function(row):
    return tokenizer(
        row['transcription'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True)
  
column_names = train_dataset.column_names

train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=column_names,
)

valid_dataset = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=column_names,
)

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

# Train the model

In [None]:
# ## train dataset with hyperparameter tuning
# import os
# output_direction=os.path.join("models", "nlp", "unsupervised", "model")
# from transformers import DataCollatorForLanguageModeling, AdamW
# optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# training_args = TrainingArguments(
#     output_dir=output_direction, 
#      per_device_train_batch_size=16,
#     num_train_epochs=2
# )
# from transformers import Trainer

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=dataset
# )
# trainer.train()

In [9]:
import wandb


# 1. Start a W&B run
wandb.init(project='keyberto2')
#!pip install wandb
#!wandb login

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


steps_per_epoch = int(len(train_dataset) / TRAIN_BATCH_SIZE)

training_args = TrainingArguments(
    output_dir='./bert-news',
    logging_dir='./LMlogs',             
    num_train_epochs=2,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    save_steps=steps_per_epoch,
    save_total_limit=3,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE, 
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss', 
    greater_is_better=False,
    seed=SEED_TRAIN, 
    report_to = 'wandb'
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01684075043328145, max=1.0)…

PyTorch: setting up devices


In [10]:


def compute_metrics(eval_pred):
    metric1 = load_metric("precision")
    metric2 = load_metric("recall")
    
    logits, labels = eval_pred
    print('*******')
    print('eval pred',eval_pred)
    predictions = np.argmax(logits, axis=-1)
    precision = metric1.compute(predictions=predictions, references=labels)["precision"]
    recall = metric2.compute(predictions=predictions, references=labels)["recall"]
    print('precision',precision)
    print('recall',recall)
    return {"precision": precision, "recall": recall}


model.config.id2label


small_train_df = train_dataset.select(range(80))
small_valid_df = valid_dataset.select(range(20))
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=small_train_df,
    eval_dataset=small_valid_df,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
print
trainer.train()
trainer.save_model("../model/modelUnsupervisedtest") #save your custom model

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 80
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 80
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/80 [00:00<?, ?it/s]

The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 2


  0%|          | 0/10 [00:00<?, ?it/s]

*******
eval pred EvalPrediction(predictions=array([[[ -7.304184  ,  -7.282538  ,  -7.3578033 , ...,  -6.2020125 ,
          -5.956771  ,  -6.417085  ],
        [ -8.039103  ,  -8.308696  ,  -8.096915  , ...,  -6.8145266 ,
          -6.5551534 ,  -7.140764  ],
        [ -4.1957045 ,  -4.116461  ,  -4.1545744 , ...,  -2.784484  ,
          -1.8554838 ,  -1.774858  ],
        ...,
        [ -2.9543304 ,  -3.5675569 ,  -3.922279  , ...,  -3.9190314 ,
          -3.2979405 ,  -3.7784524 ],
        [ -2.70647   ,  -3.4541452 ,  -3.729055  , ...,  -3.6568832 ,
          -3.1170309 ,  -3.6506884 ],
        [ -3.191443  ,  -3.918459  ,  -4.1557612 , ...,  -3.7523413 ,
          -3.1288614 ,  -3.7408836 ]],

       [[ -6.9575644 ,  -6.914566  ,  -7.0810657 , ...,  -6.0459113 ,
          -5.996975  ,  -6.19377   ],
        [ -4.233566  ,  -4.627155  ,  -4.5597925 , ...,  -2.8972208 ,
          -3.383095  ,  -4.1050816 ],
        [ -4.4258666 ,  -4.838659  ,  -4.744806  , ...,  -2.876425  ,
      

TypeError: only size-1 arrays can be converted to Python scalars

# Keybert

In [15]:
from transformers import BertTokenizer, BertForMaskedLM, KeyBERT, pipeline
model = BertForMaskedLM.from_pretrained('../models/mtsamples') #/model
tokenizer = BertTokenizer.from_pretrained('../models/mtsamples')
hf_model = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

example_text = "I am a sentence that I want embeddings for. I am so sick my neck hurts"


kw_model = KeyBERT(model=hf_model)
keywords = kw_model.extract_keywords(example_text, keyphrase_ngram_range=(1, 2), 
                                     stop_words='english', 
                                     use_maxsum=True, 
                                     nr_candidates=10, 
                                     top_n=5, 
                                     use_mmr=True, 
                                     diversity=0.7, 
                                     only_keywords=True)

keywords

ImportError: cannot import name 'KeyBERT' from 'transformers' (/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/transformers/__init__.py)

# backups

In [None]:

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast = False, do_lower_case=True)
model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
import torch
import math

trainer = Trainer(
  model=model,
  data_collator=data_collator,
  #train_dataset=tokenized_dataset_2['train'],
  eval_dataset=valid_dataset,
  tokenizer=tokenizer,
  )

eval_results = trainer.evaluate()

print('Evaluation results: ', eval_results)
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.3f}")
print('-'*30)

In [None]:
# 2. Save model inputs and hyperparameters
config = trainer.model.config
config.save_pretrained(wandb.run.dir)
tokenizer.save_pretrained(wandb.run.dir)
        
