In [34]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

In [35]:
pd_data_frame = pd.read_csv('/home/toure215/BERT_phonetic/DATASETS/verses/verses.csv')
pd_data_frame.head(10)

Unnamed: 0,id,Verse,Meter,char_count
0,0,ah why this boding start this sudden pain,iambic,6
1,1,that wings my pulse and shoots from vein to vein,iambic,6
2,2,what mean regardless of yon midnight bell,iambic,6
3,3,these earthborn visions saddening o'er my cell,iambic,6
4,4,what strange disorder prompts these thoughts t...,iambic,6
5,5,these sighs to murmur and these tears to flow,iambic,6
6,6,'tis she 'tis eloisa's form restor'd,iambic,6
7,7,once a pure saint and more than saints ador'd,iambic,6
8,8,she comes in all her killing charms confest,iambic,6
9,9,glares thro' the gloom and pours upon my breast,iambic,6


In [36]:
print(pd_data_frame['Meter'].nunique())
print(pd_data_frame.Meter.unique())
print(pd_data_frame.char_count.unique())


4
['iambic' 'anapaestic' 'trochaic' 'dactyl']
[ 6 10  8]


In [37]:
label_to_idx = {label: i for i, label in enumerate(pd_data_frame.Meter.unique())}
idx_to_label = {i: label for i, label in enumerate(pd_data_frame.Meter.unique())}
print(label_to_idx)
print(idx_to_label)

{'iambic': 0, 'anapaestic': 1, 'trochaic': 2, 'dactyl': 3}
{0: 'iambic', 1: 'anapaestic', 2: 'trochaic', 3: 'dactyl'}


In [38]:
pd_data_frame["label"] = pd_data_frame["Meter"].map(lambda x: label_to_idx[x])

In [39]:
print(pd_data_frame["label"].nunique())
pd_data_frame.head()

4


Unnamed: 0,id,Verse,Meter,char_count,label
0,0,ah why this boding start this sudden pain,iambic,6,0
1,1,that wings my pulse and shoots from vein to vein,iambic,6,0
2,2,what mean regardless of yon midnight bell,iambic,6,0
3,3,these earthborn visions saddening o'er my cell,iambic,6,0
4,4,what strange disorder prompts these thoughts t...,iambic,6,0


In [40]:
pd_data_frame = pd_data_frame.drop(columns="char_count")

In [41]:
pd_data_frame.head()

Unnamed: 0,id,Verse,Meter,label
0,0,ah why this boding start this sudden pain,iambic,0
1,1,that wings my pulse and shoots from vein to vein,iambic,0
2,2,what mean regardless of yon midnight bell,iambic,0
3,3,these earthborn visions saddening o'er my cell,iambic,0
4,4,what strange disorder prompts these thoughts t...,iambic,0


In [43]:
n0 = pd_data_frame["label"][pd_data_frame["label"] == 0].count()
n1 = pd_data_frame["label"][pd_data_frame["label"] == 1].count()
n2 = pd_data_frame["label"][pd_data_frame["label"] == 2].count()
n3 = pd_data_frame["label"][pd_data_frame["label"] == 3].count()
print("n0 :", n0)
print("n1 :", n1)
print("n2 :", n2)
print("n3 :", n3)

n0 : 186809
n1 : 5378
n2 : 5418
n3 : 1397


In [44]:
186809/ 5378

34.7357753811826

In [10]:
d0 = pd_data_frame[pd_data_frame["label"] == 0]
d1 = pd_data_frame[pd_data_frame["label"] == 1]
d2 = pd_data_frame[pd_data_frame["label"] == 2]
d3 = pd_data_frame[pd_data_frame["label"] == 3]

d1_duplicated = pd.concat([d1] * (n0 // n1), ignore_index=True)
d2_duplicated = pd.concat([d2] * (n0 // n2), ignore_index=True)
d3_duplicated = pd.concat([d3] * (n0 // n3), ignore_index=True)

pd_data_frame_duplicated = pd.concat([d0, d1_duplicated, d2_duplicated, d3_duplicated], ignore_index=True)
pd_data_frame_duplicated = pd_data_frame_duplicated.sample(frac=1).reset_index(drop=True)
pd_data_frame_duplicated.head()

Unnamed: 0,id,Verse,Meter,label
0,201577,while from its rocky caverns the deep voiced n...,dactyl,3
1,201474,day after day in the gray of the dawn as slow ...,dactyl,3
2,149202,less on exterior things than most suppose,iambic,0
3,200792,suddenly rose from the south a light as in aut...,dactyl,3
4,192587,the village vices drive them from the plain,iambic,0


In [12]:
print(len(pd_data_frame_duplicated))
print(pd_data_frame_duplicated.groupby("label").count())

739674
           id   Verse   Meter
label                        
0      186809  186809  186809
1      182852  182852  182852
2      184212  184212  184212
3      185801  185801  185801


In [45]:
pd_data_frame = pd_data_frame.sample(frac=1).reset_index(drop=True)

In [None]:
train, test = train_test_split(pd_data_frame_duplicated, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

In [47]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
val = Dataset.from_pandas(val)

In [48]:
verses_dataset = DatasetDict({"train": train, "test": test, "validation": val}).remove_columns(['__index_level_0__', 'Meter', 'id'])
verses_dataset

DatasetDict({
    train: Dataset({
        features: ['Verse', 'label'],
        num_rows: 161190
    })
    test: Dataset({
        features: ['Verse', 'label'],
        num_rows: 19901
    })
    validation: Dataset({
        features: ['Verse', 'label'],
        num_rows: 17911
    })
})

In [None]:
verses_dataset.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/verses/verses_dup_hf")

Saving the dataset (0/1 shards):   0%|          | 0/161190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19901 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17911 [00:00<?, ? examples/s]

In [17]:
bert_model = AutoModelForSequenceClassification.from_pretrained(
    "psktoure/BERT_WordPiece_wikitext-103-raw-v1",
    num_labels=4,
    id2label=idx_to_label,
    label2id=label_to_idx,
    ignore_mismatched_sizes=True
)
tokenizer = AutoTokenizer.from_pretrained(
    "psktoure/BERT_WordPiece_wikitext-103-raw-v1"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at psktoure/BERT_WordPiece_wikitext-103-raw-v1 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def tokenize_function(examples):
	return tokenizer(examples['Verse'], padding=False, truncation=True, max_length=128)

dataset_hf_tokenized = verses_dataset.map(tokenize_function, remove_columns=['Verse'], num_proc=15)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(dataset_hf_tokenized)

Map (num_proc=15):   0%|          | 0/599135 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/73968 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/66571 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 599135
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 73968
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 66571
    })
})


In [19]:
from torch.utils.data import WeightedRandomSampler, DataLoader
import torch

# labels = dataset_hf_tokenized["train"]["label"]
# class_count = torch.bincount(torch.tensor(labels))
# class_weights = 1.0 / class_count.float()
# sample_weight = torch.tensor([class_weights[t] for t in labels])
# sampler = WeightedRandomSampler(weights=sample_weight, num_samples=len(sample_weight), replacement=True)

# train_loader = DataLoader(dataset_hf_tokenized["train"], batch_size=8, sampler=sampler, collate_fn=data_collator)

# for i, batch in enumerate(train_loader):
#     print(batch["labels"])
#     if i == 2:
#         break


In [20]:
from sklearn.utils.class_weight import compute_class_weight
import torch


class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.y_train = self.train_dataset["label"]
        class_count = torch.bincount(torch.tensor(self.y_train))
        class_weights = 1.0 / class_count.float()
        self.sample_weight = torch.tensor([class_weights[t] for t in self.y_train])
        self.sampler = WeightedRandomSampler(
            weights=self.sample_weight, num_samples=len(self.sample_weight), replacement=True
        )
    
    def get_train_dataloader(self):
        def collate_and_move_to_device(batch):
            batch = self.data_collator(batch)
            return {k: v.to("cuda") if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=self.sampler,
            collate_fn=collate_and_move_to_device,
        )


    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weight = torch.tensor(
            compute_class_weight(
                class_weight="balanced",
                classes=np.unique(self.y_train),
                y=self.y_train,
            ),
            device="cuda",
            dtype=torch.float,
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
training_args = TrainingArguments(
    output_dir="/tmp/ety_bert",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    eval_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to   ="none",
    fp16=True,
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=dataset_hf_tokenized["train"],
    eval_dataset=dataset_hf_tokenized["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [22]:
trainer.train()

  0%|          | 0/7023 [00:00<?, ?it/s]

  0%|          | 0/261 [00:00<?, ?it/s]

{'eval_loss': 0.03399595618247986, 'eval_runtime': 5.217, 'eval_samples_per_second': 12760.358, 'eval_steps_per_second': 50.029, 'epoch': 1.0}


  0%|          | 0/261 [00:00<?, ?it/s]

{'eval_loss': 0.022733153775334358, 'eval_runtime': 5.2319, 'eval_samples_per_second': 12724.137, 'eval_steps_per_second': 49.887, 'epoch': 2.0}


  0%|          | 0/261 [00:00<?, ?it/s]

{'eval_loss': 0.021089188754558563, 'eval_runtime': 5.5761, 'eval_samples_per_second': 11938.711, 'eval_steps_per_second': 46.807, 'epoch': 3.0}
{'train_runtime': 407.7416, 'train_samples_per_second': 4408.197, 'train_steps_per_second': 17.224, 'train_loss': 0.07677837655905151, 'epoch': 3.0}


TrainOutput(global_step=7023, training_loss=0.07677837655905151, metrics={'train_runtime': 407.7416, 'train_samples_per_second': 4408.197, 'train_steps_per_second': 17.224, 'total_flos': 1.8939668945203344e+16, 'train_loss': 0.07677837655905151, 'epoch': 3.0})

In [23]:
predictions = trainer.predict(dataset_hf_tokenized["test"])
preds, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(preds, axis=-1)
print(np.mean(preds == labels))

  0%|          | 0/289 [00:00<?, ?it/s]

0.9953628596149686


In [25]:
from torchmetrics.classification import MulticlassAccuracy

accuracy = MulticlassAccuracy(num_classes=4, average="macro")
acc = accuracy(torch.tensor(preds), torch.tensor(labels))
print(acc)

tensor(0.9954)


In [26]:
phonetic_bert = AutoModelForSequenceClassification.from_pretrained('psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1', num_labels=4)
tokenizer = AutoTokenizer.from_pretrained('psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from functools import lru_cache
import epitran

epi = epitran.Epitran("eng-Latn")


@lru_cache(maxsize=None)
def cahed_xsampa(word):
    return "".join(epi.xsampa_list(word))

def translate_verse_to_phonetic(verse):
    return " ".join([cahed_xsampa(word) for word in verse.split()])


def translate_to_phonetic(examples):
    return {"Verse": [translate_verse_to_phonetic(verse) for verse in examples["Verse"]]}


In [28]:
phonetic_dataset = verses_dataset.map(translate_to_phonetic, num_proc=15, batched=True)
phonetic_dataset.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/verses/phonetic_verses_dup_hf")

Map (num_proc=15):   0%|          | 0/599135 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/73968 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/66571 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/599135 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/73968 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/66571 [00:00<?, ? examples/s]

In [29]:
phonetic_dataset_tokenized = phonetic_dataset.map(tokenize_function, remove_columns=['Verse'], num_proc=15, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map (num_proc=15):   0%|          | 0/599135 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/73968 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/66571 [00:00<?, ? examples/s]

In [30]:
training_args = TrainingArguments(
    output_dir="/tmp/ety_bert_phonetic",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=True,
)

trainer = CustomTrainer(
    model=phonetic_bert,
    args=training_args,
    train_dataset=phonetic_dataset_tokenized["train"],
    eval_dataset=phonetic_dataset_tokenized["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [31]:
trainer.train()

  0%|          | 0/7023 [00:00<?, ?it/s]

  0%|          | 0/261 [00:00<?, ?it/s]

{'eval_loss': 0.03673446550965309, 'eval_runtime': 22.4962, 'eval_samples_per_second': 2959.212, 'eval_steps_per_second': 11.602, 'epoch': 1.0}


  0%|          | 0/261 [00:00<?, ?it/s]

{'eval_loss': 0.024806290864944458, 'eval_runtime': 22.7104, 'eval_samples_per_second': 2931.302, 'eval_steps_per_second': 11.493, 'epoch': 2.0}


  0%|          | 0/261 [00:00<?, ?it/s]

{'eval_loss': 0.019678546115756035, 'eval_runtime': 22.5567, 'eval_samples_per_second': 2951.27, 'eval_steps_per_second': 11.571, 'epoch': 3.0}
{'train_runtime': 975.9155, 'train_samples_per_second': 1841.763, 'train_steps_per_second': 7.196, 'train_loss': 0.05656841353577086, 'epoch': 3.0}


TrainOutput(global_step=7023, training_loss=0.05656841353577086, metrics={'train_runtime': 975.9155, 'train_samples_per_second': 1841.763, 'train_steps_per_second': 7.196, 'total_flos': 3.087853102232309e+16, 'train_loss': 0.05656841353577086, 'epoch': 3.0})

In [32]:
predictions = trainer.predict(phonetic_dataset_tokenized["test"])
preds, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(preds, axis=-1) 
print(np.mean(preds == labels))

  0%|          | 0/289 [00:00<?, ?it/s]

0.9965390439108803


In [33]:
accuracy = MulticlassAccuracy(num_classes=4, average="macro")
acc = accuracy(torch.tensor(preds), torch.tensor(labels))
print(acc)

tensor(0.9966)
