In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [4]:
print(os.getcwd())
pd_data_frame = pd.read_csv("/home/toure215/BERT_phonetic/DATASETS/verses/verses.csv")
pd_data_frame.head(10)

/home/toure215/BERT_phonetic/test


Unnamed: 0,id,Verse,Meter,char_count
0,0,ah why this boding start this sudden pain,iambic,6
1,1,that wings my pulse and shoots from vein to vein,iambic,6
2,2,what mean regardless of yon midnight bell,iambic,6
3,3,these earthborn visions saddening o'er my cell,iambic,6
4,4,what strange disorder prompts these thoughts t...,iambic,6
5,5,these sighs to murmur and these tears to flow,iambic,6
6,6,'tis she 'tis eloisa's form restor'd,iambic,6
7,7,once a pure saint and more than saints ador'd,iambic,6
8,8,she comes in all her killing charms confest,iambic,6
9,9,glares thro' the gloom and pours upon my breast,iambic,6


In [5]:
print(pd_data_frame["Meter"].nunique())
print(pd_data_frame.Meter.unique())
print(pd_data_frame.char_count.unique())

4
['iambic' 'anapaestic' 'trochaic' 'dactyl']
[ 6 10  8]


In [6]:
label_to_idx = {label: i for i, label in enumerate(pd_data_frame.Meter.unique())}
idx_to_label = {i: label for i, label in enumerate(pd_data_frame.Meter.unique())}
print(label_to_idx)
print(idx_to_label)

{'iambic': 0, 'anapaestic': 1, 'trochaic': 2, 'dactyl': 3}
{0: 'iambic', 1: 'anapaestic', 2: 'trochaic', 3: 'dactyl'}


In [7]:
pd_data_frame["label"] = pd_data_frame["Meter"].map(lambda x: label_to_idx[x])

In [8]:
print(pd_data_frame["label"].nunique())
pd_data_frame.head()

4


Unnamed: 0,id,Verse,Meter,char_count,label
0,0,ah why this boding start this sudden pain,iambic,6,0
1,1,that wings my pulse and shoots from vein to vein,iambic,6,0
2,2,what mean regardless of yon midnight bell,iambic,6,0
3,3,these earthborn visions saddening o'er my cell,iambic,6,0
4,4,what strange disorder prompts these thoughts t...,iambic,6,0


In [9]:
pd_data_frame = pd_data_frame.drop(columns="char_count")

In [13]:
pd_data_frame.head()
pd_train, pd_test = train_test_split(pd_data_frame, test_size=0.2, random_state=42)
pd_train, pd_val = train_test_split(pd_train, test_size=0.2, random_state=42)

In [14]:
n0 = pd_train["label"][pd_train["label"] == 0].count()
n1 = pd_train["label"][pd_train["label"] == 1].count()
n2 = pd_train["label"][pd_train["label"] == 2].count()
n3 = pd_train["label"][pd_train["label"] == 3].count()
print("n0 :", n0)
print("n1 :", n1)
print("n2 :", n2)
print("n3 :", n3)

n0 : 119497
n1 : 3488
n2 : 3489
n3 : 886


In [19]:
d0 = pd_train[pd_train["label"] == 0]
d1 = pd_train[pd_train["label"] == 1]
d2 = pd_train[pd_train["label"] == 2]
d3 = pd_train[pd_train["label"] == 3]

d1_duplicated = pd.concat([d1] * (n0 // n1), ignore_index=True)
d2_duplicated = pd.concat([d2] * (n0 // n2), ignore_index=True)
d3_duplicated = pd.concat([d3] * (n0 // n3), ignore_index=True)

pd_train = pd.concat(
    [d0, d1_duplicated, d2_duplicated, d3_duplicated], ignore_index=True
)
pd_train = pd_train.sample(frac=1).reset_index(drop=True)
pd_train.head()

Unnamed: 0,id,Verse,Meter,label
0,55872,the windy summit wild and high,trochaic,2
1,200358,then came the shepherd back with his bleating ...,dactyl,3
2,181192,pale as i lay beneath thy ebon wand,iambic,0
3,201875,hedividedwithhispeople,trochaic,2
4,3126,pastora by chance hasten'd by,anapaestic,1


In [22]:
print(len(pd_train))
print(pd_train.groupby("label").count())

475439
           id   Verse   Meter
label                        
0      119497  119497  119497
1      118592  118592  118592
2      118626  118626  118626
3      118724  118724  118724


In [29]:
pd_train = pd_train.sample(frac=1).reset_index(drop=True)
pd_val = pd_val.sample(frac=1).reset_index(drop=True)
pd_test = pd_test.sample(frac=1).reset_index(drop=True)

In [30]:
train = Dataset.from_pandas(pd_train)
val = Dataset.from_pandas(pd_val)
test = Dataset.from_pandas(pd_test)

In [32]:
verses_dataset = DatasetDict({"train": train, "test": test, "validation": val})
verses_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'Verse', 'Meter', 'label'],
        num_rows: 475439
    })
    test: Dataset({
        features: ['id', 'Verse', 'Meter', 'label'],
        num_rows: 39801
    })
    validation: Dataset({
        features: ['id', 'Verse', 'Meter', 'label'],
        num_rows: 31841
    })
})

In [33]:
verses_dataset.save_to_disk(
    "/home/toure215/BERT_phonetic/DATASETS/verses/verses_dup_hf"
)

Saving the dataset (0/1 shards):   0%|          | 0/475439 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/39801 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/31841 [00:00<?, ? examples/s]

In [66]:
bert_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
    id2label=idx_to_label,
    label2id=label_to_idx,
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
def tokenize_function(examples):
    return tokenizer(examples["Verse"], padding=False, truncation=True, max_length=128)


dataset_hf_tokenized = verses_dataset.map(
    tokenize_function, remove_columns=["Verse"], num_proc=15
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(dataset_hf_tokenized)

Map (num_proc=15):   0%|          | 0/475439 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/39801 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/31841 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'Meter', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 475439
    })
    test: Dataset({
        features: ['id', 'Meter', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 39801
    })
    validation: Dataset({
        features: ['id', 'Meter', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 31841
    })
})


In [68]:
from torch.utils.data import WeightedRandomSampler, DataLoader
import torch

# labels = dataset_hf_tokenized["train"]["label"]
# class_count = torch.bincount(torch.tensor(labels))
# class_weights = 1.0 / class_count.float()
# sample_weight = torch.tensor([class_weights[t] for t in labels])
# sampler = WeightedRandomSampler(weights=sample_weight, num_samples=len(sample_weight), replacement=True)

# train_loader = DataLoader(dataset_hf_tokenized["train"], batch_size=8, sampler=sampler, collate_fn=data_collator)

# for i, batch in enumerate(train_loader):
#     print(batch["labels"])
#     if i == 2:
#         break

In [69]:
from sklearn.utils.class_weight import compute_class_weight
import torch


class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.y_train = self.train_dataset["label"]
        class_count = torch.bincount(torch.tensor(self.y_train))
        class_weights = 1.0 / class_count.float()
        self.sample_weight = torch.tensor([class_weights[t] for t in self.y_train])
        self.sampler = WeightedRandomSampler(
            weights=self.sample_weight,
            num_samples=len(self.sample_weight),
            replacement=True,
        )

    def get_train_dataloader(self):
        def collate_and_move_to_device(batch):
            batch = self.data_collator(batch)
            return {
                k: v.to("cuda") if isinstance(v, torch.Tensor) else v
                for k, v in batch.items()
            }

        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=self.sampler,
            collate_fn=collate_and_move_to_device,
        )

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weight = torch.tensor(
            compute_class_weight(
                class_weight="balanced",
                classes=np.unique(self.y_train),
                y=self.y_train,
            ),
            device="cuda",
            dtype=torch.float,
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [70]:
training_args = TrainingArguments(
    output_dir="/tmp/ety_bert",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    eval_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=dataset_hf_tokenized["train"],
    eval_dataset=dataset_hf_tokenized["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [71]:
trainer.train()

  0%|          | 0/5574 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3059937059879303, 'eval_runtime': 3.0041, 'eval_samples_per_second': 10599.034, 'eval_steps_per_second': 41.609, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.36933380365371704, 'eval_runtime': 2.8626, 'eval_samples_per_second': 11123.168, 'eval_steps_per_second': 43.667, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.40441596508026123, 'eval_runtime': 2.9692, 'eval_samples_per_second': 10723.63, 'eval_steps_per_second': 42.098, 'epoch': 3.0}
{'train_runtime': 388.6662, 'train_samples_per_second': 3669.774, 'train_steps_per_second': 14.341, 'train_loss': 0.05718245709887704, 'epoch': 3.0}


TrainOutput(global_step=5574, training_loss=0.05718245709887704, metrics={'train_runtime': 388.6662, 'train_samples_per_second': 3669.774, 'train_steps_per_second': 14.341, 'total_flos': 1.4802463655568264e+16, 'train_loss': 0.05718245709887704, 'epoch': 3.0})

In [72]:
predictions = trainer.predict(dataset_hf_tokenized["test"])
preds, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(preds, axis=-1)
print(np.mean(preds == labels))

  0%|          | 0/156 [00:00<?, ?it/s]

0.9490967563629055


In [73]:
from torchmetrics.classification import MulticlassAccuracy

accuracy = MulticlassAccuracy(num_classes=4, average="macro")
acc = accuracy(torch.tensor(preds), torch.tensor(labels))
print(acc)

tensor(0.6524)


In [53]:
phonetic_bert = AutoModelForSequenceClassification.from_pretrained(
    "psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1", num_labels=4
)
tokenizer = AutoTokenizer.from_pretrained(
    "psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1"
)

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [34]:
from functools import lru_cache
import epitran

epi = epitran.Epitran("eng-Latn")


@lru_cache(maxsize=None)
def cahed_xsampa(word):
    return "".join(epi.xsampa_list(word))


def translate_verse_to_phonetic(verse):
    return " ".join([cahed_xsampa(word) for word in verse.split()])


def translate_to_phonetic(examples):
    return {
        "Verse": [translate_verse_to_phonetic(verse) for verse in examples["Verse"]],
        "Original": examples["Verse"],
    }

In [35]:
phonetic_dataset = verses_dataset.map(translate_to_phonetic, num_proc=15, batched=True)
phonetic_dataset.save_to_disk(
    "/home/toure215/BERT_phonetic/DATASETS/verses/phonetic_verses_dup_hf"
)

Map (num_proc=15):   0%|          | 0/475439 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/39801 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/31841 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/475439 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/39801 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/31841 [00:00<?, ? examples/s]

In [55]:
phonetic_dataset_tokenized = phonetic_dataset.map(
    tokenize_function,
    remove_columns=["id", "Verse", "Meter", "Original"],
    num_proc=15,
    batched=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map (num_proc=15):   0%|          | 0/475439 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/39801 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/31841 [00:00<?, ? examples/s]

In [56]:
training_args = TrainingArguments(
    output_dir="/tmp/ety_bert_phonetic",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=phonetic_bert,
    args=training_args,
    train_dataset=phonetic_dataset_tokenized["train"],
    eval_dataset=phonetic_dataset_tokenized["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [57]:
trainer.train()

  0%|          | 0/5574 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.2672980725765228, 'eval_runtime': 3.5802, 'eval_samples_per_second': 8893.713, 'eval_steps_per_second': 34.915, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.30624884366989136, 'eval_runtime': 3.6791, 'eval_samples_per_second': 8654.478, 'eval_steps_per_second': 33.975, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3388756215572357, 'eval_runtime': 3.4305, 'eval_samples_per_second': 9281.766, 'eval_steps_per_second': 36.438, 'epoch': 3.0}
{'train_runtime': 520.2915, 'train_samples_per_second': 2741.381, 'train_steps_per_second': 10.713, 'train_loss': 0.05331154814064182, 'epoch': 3.0}


TrainOutput(global_step=5574, training_loss=0.05331154814064182, metrics={'train_runtime': 520.2915, 'train_samples_per_second': 2741.381, 'train_steps_per_second': 10.713, 'total_flos': 2.459148172263276e+16, 'train_loss': 0.05331154814064182, 'epoch': 3.0})

In [58]:
predictions = trainer.predict(phonetic_dataset_tokenized["test"])
preds, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(preds, axis=-1)
print(np.mean(preds == labels))

  0%|          | 0/156 [00:00<?, ?it/s]

0.9554533805683274


In [59]:
accuracy = MulticlassAccuracy(num_classes=4, average="macro")
acc = accuracy(torch.tensor(preds), torch.tensor(labels))
print(acc)

tensor(0.6824)


In [38]:
train, test = train_test_split(pd_data_frame, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

train = train.sample(frac=1).reset_index(drop=True)
val = val.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)

In [39]:
train = Dataset.from_pandas(train)
val = Dataset.from_pandas(val)
test = Dataset.from_pandas(test)

In [40]:
verse_hf = DatasetDict({"train": train, "test": test, "validation": val})
verse_hf

DatasetDict({
    train: Dataset({
        features: ['id', 'Verse', 'Meter', 'label'],
        num_rows: 161190
    })
    test: Dataset({
        features: ['id', 'Verse', 'Meter', 'label'],
        num_rows: 19901
    })
    validation: Dataset({
        features: ['id', 'Verse', 'Meter', 'label'],
        num_rows: 17911
    })
})

In [41]:
verse_hf.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/verses/verse_hf")

Saving the dataset (0/1 shards):   0%|          | 0/161190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19901 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17911 [00:00<?, ? examples/s]

In [42]:
phonetic_verse_hf = verse_hf.map(translate_to_phonetic, num_proc=15, batched=True)
phonetic_verse_hf

Map (num_proc=15):   0%|          | 0/161190 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/19901 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/17911 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'Verse', 'Meter', 'label', 'Original'],
        num_rows: 161190
    })
    test: Dataset({
        features: ['id', 'Verse', 'Meter', 'label', 'Original'],
        num_rows: 19901
    })
    validation: Dataset({
        features: ['id', 'Verse', 'Meter', 'label', 'Original'],
        num_rows: 17911
    })
})

In [43]:
phonetic_verse_hf.save_to_disk(
    "/home/toure215/BERT_phonetic/DATASETS/verses/phonetic_verse_hf"
)

Saving the dataset (0/1 shards):   0%|          | 0/161190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19901 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/17911 [00:00<?, ? examples/s]

In [79]:
import torch

del bert_model
del phonetic_bert

NameError: name 'bert_model' is not defined

In [80]:
import gc

torch.cuda.empty_cache()
gc.collect()

489