In [2]:
#!pip install transformers
#!pip install datasets

In [3]:
import numpy as np
import torch
import pandas as pd
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

In [4]:
####    import BERT uncased    #####
checkpoint = "bert-base-uncased"

In [5]:
ds = pd.read_csv("archive/pretrain_subset/valid.csv")

In [21]:
from datasets import Dataset
medal_train = Dataset.from_pandas(ds)



```
# Ce texte est au format code
```

# Fine tuning BERT with MLM

https://huggingface.co/course/chapter7/3

In [9]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [11]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## Preprocessing the data

In [17]:
train = medal_train.select(range(10000))

In [18]:
train

Dataset({
    features: ['ABSTRACT_ID', 'TEXT', 'LOCATION', 'LABEL'],
    num_rows: 10000
})

In [22]:
def tokenize_function(examples):
    result = tokenizer(examples["TEXT"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = train.map(
    tokenize_function, batched=True, remove_columns=["TEXT", "LABEL"]
)
tokenized_datasets = tokenized_datasets.remove_columns(['ABSTRACT_ID','LOCATION'])
tokenized_datasets

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
    num_rows: 10000
})

group text and resplit them into chunks of equal length

In [23]:
chunk_size = 128

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 18775
})

## Fine tuning BERT

In [24]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
#TODO: whole word masking data collator

In [25]:
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] different electrocardiographic teaches have been described during thrombolytic therapy for aim to indicate successful reperfusion the occluded coronary artery also can be [MASK] by per [MASK]aneous tca ptca this study was performed to compare electro [MASK]io [MASK] changes during primary or rescue ptca and thrombolytic therapy the electrocardiographic changes were studied directly at the moment of reperfusion during ptca [SEP] [CLS] there are limited data [MASK] cdx expression in rectal carcinoma the ckck immunoprofile of [MASK]c has been described [MASK] studies which have [MASK] lumped tc [MASK] rectal'

'>>> pt together [MASK] this study we investigated the diagnostic utility of immunoh [MASK]chemical stains for ck ck and cdx in a series of rec [MASK] adenocarcinoma fifty [MASK]ve [MASK] of rec [MASK] adenocarcinomas were retrieved and immunostained for ck [MASK]ko [MASK] ck novo releasingra ncllck and cd [MASK] novocastra nclcdx thirty cases of pancreatic adenocarcinom

In [26]:
train_size = 1500
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 150
    })
})

In [27]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-MeDAL",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    report_to="none"
)



In [28]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [18]:
import os
os.environ["WANDB_DISABLED"] = "True"

In [19]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 64


>>> Perplexity: 47.43


In [20]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1500
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 72


Epoch,Training Loss,Validation Loss
1,No log,3.093573
2,No log,3.075434
3,No log,2.977961


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/mo

TrainOutput(global_step=72, training_loss=3.3853123982747397, metrics={'train_runtime': 67.0149, 'train_samples_per_second': 67.149, 'train_steps_per_second': 1.074, 'total_flos': 296105414400000.0, 'train_loss': 3.3853123982747397, 'epoch': 3.0})

In [21]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 64


>>> Perplexity: 19.31


# Preprocessing
Extracting the various abrevaiation

https://www.kaggle.com/code/jackshutler/medical-disambig-data-processing-models-and-cnn

In [22]:
#ds_short['location'] = [l[0] for l in ds_short['location']]
#ds_short['label'] = [l[0] for l in ds_short['label']]

In [23]:
import string

In [24]:
# Lets create a function to create a new feature 'ABV' from dataset
def createFeature(df):    
    return [x.split(' ')[y] for x,y in zip(df['text'], df['location'])]


# Lets create a function to remove all the Punctuations from Text
def removePunctuation(df):
    return [t.translate(str.maketrans('','',string.punctuation)) for t in df['text']]


# Lets create a function to Tokenize the Text column of dataset
def createTokens(df):
    return df['text'].apply(lambda x: x.split(' '))


#Lets create a function to drop "Abstract_id", "Location" and "TEXT" columns from dataset
def dropCols(df):
    return df.drop(columns=['abstract_id', 'location'])


# Lets create a function to remove stop words from the Text column
#def removeStop(df):
#    stopWords = spacy.lang.en.stop_words.STOP_WORDS
#    # Remove any stopwords which appear to be an Abbreviation
#    [stopWords.remove(t) for t in df['ABV'].str.lower() if t in stopWords]
#    return df['TOKEN'].apply(lambda x: [item for item in x if not item in stopWords])


def tolower(df):
    return [t.lower() for t in df['text']]



def preProcessData(df): 
    df['text'] = tolower(df)
    df['text'] = removePunctuation(df)
    #df['TOKEN'] = createTokens(df)
    df = dropCols(df)
    #df['TOKEN'] = removeStop(df)
    return df

train = ds_short

train['ABV'] = createFeature(train)
grouped = train.groupby(by=['ABV', 'label'], as_index = False, sort = False).count()
grouped = grouped.sort_values(by='text', ascending = False)

topAbv = grouped['ABV'][:20]
train = train[train['ABV'].isin(topAbv)]

train = preProcessData(train)

NameError: name 'ds_short' is not defined

In [None]:
train.head()

# classify text with abreviation

In [None]:
np.unique(train['ABV'])

In [None]:
ABBREVIATION_LIST = np.unique(train['ABV'])
print(ABBREVIATION_LIST)
def mapping(x):
    return np.where(ABBREVIATION_LIST == x)[0][0] +1

mapping('AUR')

In [None]:
train['ABV'] = train['ABV'].apply(mapping)

In [None]:
model

In [None]:
batch = train['TOKEN'].tolist()