In [1]:
import os
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
from sklearn.model_selection import train_test_split
import epitran
from functools import lru_cache
from difflib import SequenceMatcher

In [2]:
is_phonetic = False
epi = epitran.Epitran("eng-Latn")

@lru_cache(maxsize=None)
def xsampa_list(word: str) -> list:
    return epi.xsampa_list(word)

In [3]:
pd_dataset = pd.read_csv('/home/toure215/BERT_phonetic/DATASETS/verses/verse_dataset.csv')
pd_dataset.head()

Unnamed: 0,id,sentence1,sentence2,label
0,0,ah why this boding start this sudden pain,that wings my pulse and shoots from vein to vein,1
1,1,ah why this boding start this sudden pain,those parts of thee that the worlds eye doth view,0
2,2,what mean regardless of yon midnight bell,these earthborn visions saddening o'er my cell,1
3,3,what mean regardless of yon midnight bell,to save their matrons from the brutal rape,0
4,4,what strange disorder prompts these thoughts t...,these sighs to murmur and these tears to flow,1


In [4]:
pd_dataset = pd_dataset.loc[pd_dataset["label"] == 1].drop(columns=["label", "id"])
pd_dataset.head()

Unnamed: 0,sentence1,sentence2
0,ah why this boding start this sudden pain,that wings my pulse and shoots from vein to vein
2,what mean regardless of yon midnight bell,these earthborn visions saddening o'er my cell
4,what strange disorder prompts these thoughts t...,these sighs to murmur and these tears to flow
6,'tis she 'tis eloisa's form restor'd,strike the soft sweet harmonic chord
8,she comes in all her killing charms confest,glares thro' the gloom and pours upon my breast


In [5]:
def get_last_word(text):
    return text.split()[-1]

pd_dataset["label"] = pd_dataset["sentence1"].apply(get_last_word)
pd_dataset.head()

Unnamed: 0,sentence1,sentence2,label
0,ah why this boding start this sudden pain,that wings my pulse and shoots from vein to vein,pain
2,what mean regardless of yon midnight bell,these earthborn visions saddening o'er my cell,bell
4,what strange disorder prompts these thoughts t...,these sighs to murmur and these tears to flow,glow
6,'tis she 'tis eloisa's form restor'd,strike the soft sweet harmonic chord,restor'd
8,she comes in all her killing charms confest,glares thro' the gloom and pours upon my breast,confest


In [6]:
train, test = train_test_split(pd_dataset, test_size=0.1, random_state=42, shuffle=True)
train, val = train_test_split(train, test_size=0.1, random_state=42, shuffle=True)

In [7]:
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(val)
test_dataset = Dataset.from_pandas(test)

In [8]:
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})
dataset = dataset.remove_columns(column_names=['__index_level_0__'])
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 80595
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8955
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 9951
    })
})

In [9]:
model_path = ['bert-base-uncased','psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1','psktoure/BERT_WordPiece_wikitext-103-raw-v1']

if is_phonetic:
    model = AutoModelForMaskedLM.from_pretrained(model_path[1])
    tokenizer = AutoTokenizer.from_pretrained(model_path[1])
else:
    model = AutoModelForMaskedLM.from_pretrained(model_path[-1])
    tokenizer = AutoTokenizer.from_pretrained(model_path[-1])

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of BertForMaskedLM were not initialized from the model checkpoint at psktoure/BERT_WordPiece_wikitext-103-raw-v1 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.tr

In [10]:
def translate_sentence(sentence: str) -> str:
    words = sentence.split()
    for i in range(len(words)):
        words[i] = ''.join(xsampa_list(words[i]))
    return ' '.join(words)

def translate_function(examples):
    examples['sentence1'] = [translate_sentence(sentence) for sentence in examples['sentence1']]
    examples['sentence2'] = [translate_sentence(sentence) for sentence in examples['sentence2']]
    examples['label'] = [''.join(xsampa_list(word)) for word in examples['label']]
    return examples

In [11]:
if is_phonetic:
    dataset = dataset.map(translate_function, batched=True, num_proc=15)

In [12]:
#dataset.save_to_disk('/home/toure215/BERT_phonetic/DATASETS/verses/rhyming_verses')

In [13]:
import torch   
from transformers import PreTrainedTokenizerBase

class CustomDataCollator:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, padding=True, max_length=256):
        self.tokenizer = tokenizer
        self.mask_token_id = tokenizer.mask_token_id
        self.padding = padding
        self.max_length = max_length

    def __call__(self, examples):
       
        sentence1 = [example["sentence1"] for example in examples]
        sentence2 = [example["sentence2"] for example in examples]
        targets = [example["label"] for example in examples]

        encoded_targets = self.tokenizer(targets, add_special_tokens=False)
            
        batch = self.tokenizer(
            sentence1,
            sentence2,
            padding=self.padding,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        input_ids = batch["input_ids"]
        labels = input_ids.clone()

        for i, idx in enumerate(input_ids):
            sep_token_indices = torch.where(idx == self.tokenizer.sep_token_id)[0]
            start = sep_token_indices[0] - len(encoded_targets[i])
            end = sep_token_indices[0]
            input_ids[i, start:end] = self.mask_token_id 
            labels[i, :start] = -100 
            labels[i, end:] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": batch["attention_mask"],
            "labels": labels,
        }


In [14]:
sample = dataset['train'][:1]
for key, value in sample.items():
    print(f'{key}: {value}')

sentence1: ["its native fierceness still the face retain'd"]
sentence2: ["sleep conscience sleep each awful thought be drown'd"]
label: ["retain'd"]


In [15]:
data_collator = CustomDataCollator(tokenizer)
sample_list = [{key: sample[key][i] for key in sample} for i in range(len(sample['sentence1']))]
c = data_collator(sample_list)
for key in c:
    print(key ,":", c[key])
e = tokenizer.encode("r\\Itejnd", add_special_tokens=False)
print(e) 
print(tokenizer.sep_token_id)

input_ids : tensor([[    1,  7267, 10378, 18782,  8302,  8185,  7057,  9646,     4,     4,
             4,     2, 12858, 22081, 12858,  7878, 28059,  8765,  7117, 15694,
            11,    46,     2]])
attention_mask : tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
labels : tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 15075,    11,
            46,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100]])
[60, 38, 7123, 4042, 4046, 7062]
2


In [16]:
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="mean")
    
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [17]:
training_args = TrainingArguments(
    output_dir="/tmp/fine_tuned_bert",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy='no',
    remove_unused_columns=False,
    fp16=True,
)

# Initialize the Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [18]:
trainer.train()

  0%|          | 0/3780 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

{'eval_loss': 3.3619561195373535, 'eval_runtime': 2.252, 'eval_samples_per_second': 3976.467, 'eval_steps_per_second': 62.167, 'epoch': 1.0}


  0%|          | 0/140 [00:00<?, ?it/s]

{'eval_loss': 2.938920736312866, 'eval_runtime': 2.248, 'eval_samples_per_second': 3983.594, 'eval_steps_per_second': 62.278, 'epoch': 2.0}


  0%|          | 0/140 [00:00<?, ?it/s]

{'eval_loss': 2.7815353870391846, 'eval_runtime': 2.2336, 'eval_samples_per_second': 4009.31, 'eval_steps_per_second': 62.68, 'epoch': 3.0}
{'train_runtime': 167.5698, 'train_samples_per_second': 1442.891, 'train_steps_per_second': 22.558, 'train_loss': 3.4605228484623014, 'epoch': 3.0}


TrainOutput(global_step=3780, training_loss=3.4605228484623014, metrics={'train_runtime': 167.5698, 'train_samples_per_second': 1442.891, 'train_steps_per_second': 22.558, 'total_flos': 3838217597329500.0, 'train_loss': 3.4605228484623014, 'epoch': 3.0})

In [19]:
def rhyme_score(word1: str, word2: str) -> int:
    if not is_phonetic:    
        end1 = xsampa_list(word1)
        end2 = xsampa_list(word2)
    else:
        end1 = word1
        end2 = word2
    length = min(len(end1), len(end2), 3)
    end1 = end1[-length:]
    end2 = end2[-length:]
    return SequenceMatcher(None, end1, end2).ratio()

In [20]:
def evaluate_rhyme(model, dataset, tokenizer):
    model = model.to("cuda")
    model.eval()
    rhyme_scores = []
    batch_size = 256

    for i in range(0, len(dataset), batch_size):
        print(f"Processing example {i}/{len(dataset)} ...", end="\r")
        batch = dataset[i : i + batch_size]
        batch_sequence = [{key: batch[key][j] for key in batch} for j in range(len(batch["sentence1"]))]
        inputs = data_collator(batch_sequence)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

        for j in range(len(batch["sentence1"])):
            masked_token_index = torch.where(inputs["input_ids"][j] == tokenizer.mask_token_id)[0]
            predicted_index = logits[j, masked_token_index].argmax(-1)
            predicted_word = tokenizer.decode(predicted_index)
            target = tokenizer.decode(inputs["labels"][j, masked_token_index])
            if i < 16 and j < 8:
                print('predicted_word:', predicted_word, '-- target_word:', target)
            rhyme_scores.append(rhyme_score(predicted_word, target))

    return {"score": np.mean(rhyme_scores)}

In [21]:
def evaluate_rhyme_indices(model, dataset, tokenizer, k=5):
    import numpy as np
    from torch.utils.data import DataLoader
    
    model = model.to("cuda")
    model.eval()
    res = []
    batch_size = 256

    for i in range(0, len(dataset), batch_size):
        print(f"Processing batch {i}/{len(dataset)}...", end="\r")
        batch = dataset[i : i + batch_size]
        batch_sequence = [{key: batch[key][j] for key in batch} for j in range(len(batch["sentence1"]))]
        inputs = data_collator(batch_sequence)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            labels = inputs["labels"]

        count = 0

        for j in range(len(batch["sentence1"])):
            # Identify the position of the masked token
            masked_token_index = (inputs["input_ids"][j] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]

            # Get the target index
            # target_index = inputs["labels"][j, masked_token_index]
            
            targets = labels[j, masked_token_index]
            # Get the top-k predictions
            top_k_indices = logits[j, masked_token_index].topk(k).indices.squeeze(0)
            if i < 16 and j < 8:
                print('targets:', targets, '-- top_k_indices:', top_k_indices)

            # Check if the target index is in the top-k predictions
            ok = True
            for idx, target in enumerate(targets):
                if target not in top_k_indices[idx]:
                    ok = False
            if ok:
                count += 1

        # Append accuracy for this batch
        res.append(count / len(batch["sentence1"]))

    # Return the mean Top-k Accuracy
    return {"score": np.mean(res)}

In [22]:
evaluate_rhyme_indices(model, dataset['test'], tokenizer)

targets: tensor([7272], device='cuda:0') -- top_k_indices: tensor([7272, 7597, 8663, 8575, 7958], device='cuda:0')
targets: tensor([12407], device='cuda:0') -- top_k_indices: tensor([ 9905,  7702, 10258, 12407,  7762], device='cuda:0')
targets: tensor([9255], device='cuda:0') -- top_k_indices: tensor([ 9595, 12129, 13672,  9255, 21120], device='cuda:0')
targets: tensor([8788, 4040], device='cuda:0') -- top_k_indices: tensor([[ 8788,  7924, 27959, 16311, 14205],
        [ 4040,  7881,  7065, 10824,  7309]], device='cuda:0')
targets: tensor([ 7141, 11049, 10895], device='cuda:0') -- top_k_indices: tensor([[ 7314,  7750,  7141,  7937,  8002],
        [ 9422,    11,  7282,  7397, 13393],
        [ 7317,  8274,  4052,    11,  4042]], device='cuda:0')
targets: tensor([9266], device='cuda:0') -- top_k_indices: tensor([ 9266, 15699,  8800, 15638,  8370], device='cuda:0')
targets: tensor([11263], device='cuda:0') -- top_k_indices: tensor([11263, 12708, 11978, 16210, 10549], device='cuda:0')
tar

{'score': np.float64(0.38042572151316545)}

In [23]:
targets = torch.tensor([ 7141, 11049, 10895])
targets2 = torch.tensor([ 7750, 11, 4042])
top_k_indices = torch.tensor([[30141,  7750,  8958, 21849,  9905],
        [ 9422,    11, 11381,  4052,  7401],
        [ 7317,  4052,  8274,    46,  4042]])

for idx, target in enumerate(targets):
    print(target, target in top_k_indices[idx])

for idx, target in enumerate(targets2):
    print(target, target in top_k_indices[idx])

tensor(7141) False
tensor(11049) False
tensor(10895) False
tensor(7750) True
tensor(11) True
tensor(4042) True


In [24]:
targets = torch.tensor([7272])
top_k_indices = torch.tensor([7272, 8663, 7597, 8575, 8632])
for idx, target in enumerate(targets):
    print(target, target in top_k_indices)

tensor(7272) True
