In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

In [2]:
pd_data_frame = pd.read_csv('/home/toure215/BERT_phonetic/DATASETS/verses/verses.csv')
pd_data_frame.head(10)

Unnamed: 0,id,Verse,Meter,char_count
0,0,ah why this boding start this sudden pain,iambic,6
1,1,that wings my pulse and shoots from vein to vein,iambic,6
2,2,what mean regardless of yon midnight bell,iambic,6
3,3,these earthborn visions saddening o'er my cell,iambic,6
4,4,what strange disorder prompts these thoughts t...,iambic,6
5,5,these sighs to murmur and these tears to flow,iambic,6
6,6,'tis she 'tis eloisa's form restor'd,iambic,6
7,7,once a pure saint and more than saints ador'd,iambic,6
8,8,she comes in all her killing charms confest,iambic,6
9,9,glares thro' the gloom and pours upon my breast,iambic,6


In [3]:
print(pd_data_frame['Meter'].nunique())
print(pd_data_frame.Meter.unique())
print(pd_data_frame.char_count.unique())


4
['iambic' 'anapaestic' 'trochaic' 'dactyl']
[ 6 10  8]


In [4]:
label_to_idx = {label: i for i, label in enumerate(pd_data_frame.Meter.unique())}
idx_to_label = {i: label for i, label in enumerate(pd_data_frame.Meter.unique())}
print(label_to_idx)
print(idx_to_label)

{'iambic': 0, 'anapaestic': 1, 'trochaic': 2, 'dactyl': 3}
{0: 'iambic', 1: 'anapaestic', 2: 'trochaic', 3: 'dactyl'}


In [5]:
pd_data_frame["label"] = pd_data_frame["Meter"].map(lambda x: label_to_idx[x])

In [6]:
print(pd_data_frame["label"].nunique())
pd_data_frame.head()

4


Unnamed: 0,id,Verse,Meter,char_count,label
0,0,ah why this boding start this sudden pain,iambic,6,0
1,1,that wings my pulse and shoots from vein to vein,iambic,6,0
2,2,what mean regardless of yon midnight bell,iambic,6,0
3,3,these earthborn visions saddening o'er my cell,iambic,6,0
4,4,what strange disorder prompts these thoughts t...,iambic,6,0


In [7]:
pd_data_frame = pd_data_frame.drop(columns="char_count")

In [8]:
pd_data_frame.head()

Unnamed: 0,id,Verse,Meter,label
0,0,ah why this boding start this sudden pain,iambic,0
1,1,that wings my pulse and shoots from vein to vein,iambic,0
2,2,what mean regardless of yon midnight bell,iambic,0
3,3,these earthborn visions saddening o'er my cell,iambic,0
4,4,what strange disorder prompts these thoughts t...,iambic,0


In [9]:
n0 = pd_data_frame["label"][pd_data_frame["label"] == 0].count()
n1 = pd_data_frame["label"][pd_data_frame["label"] == 1].count()
n2 = pd_data_frame["label"][pd_data_frame["label"] == 2].count()
n3 = pd_data_frame["label"][pd_data_frame["label"] == 3].count()
print("n0 :", n0)
print("n1 :", n1)
print("n2 :", n2)
print("n3 :", n3)

n0 : 186809
n1 : 5378
n2 : 5418
n3 : 1397


In [10]:
186809/ 5378

34.7357753811826

In [11]:
data_frames = [pd_data_frame[pd_data_frame["label"] == i][:n2] for i in range(4)]
balanced_pd = pd.concat(data_frames, axis=0)
print(len(balanced_pd))

17611


In [12]:
balanced_pd = balanced_pd.sample(frac=1)

In [13]:
train, test = train_test_split(balanced_pd, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state=42)

In [14]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
val = Dataset.from_pandas(val)

In [15]:
verses_dataset = DatasetDict({"train": train, "test": test, "validation": val}).remove_columns(['__index_level_0__', 'Meter', 'id'])
verses_dataset

DatasetDict({
    train: Dataset({
        features: ['Verse', 'label'],
        num_rows: 14264
    })
    test: Dataset({
        features: ['Verse', 'label'],
        num_rows: 1762
    })
    validation: Dataset({
        features: ['Verse', 'label'],
        num_rows: 1585
    })
})

In [16]:
verses_dataset.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/verses/verses_hf")

Saving the dataset (0/1 shards):   0%|          | 0/14264 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1762 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1585 [00:00<?, ? examples/s]

In [17]:
bert_model = AutoModelForSequenceClassification.from_pretrained(
    "psktoure/BERT_WordPiece_wikitext-103-raw-v1",
    num_labels=4,
    id2label=idx_to_label,
    label2id=label_to_idx,
    ignore_mismatched_sizes=True
)
tokenizer = AutoTokenizer.from_pretrained(
    "psktoure/BERT_WordPiece_wikitext-103-raw-v1"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at psktoure/BERT_WordPiece_wikitext-103-raw-v1 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def tokenize_function(examples):
	return tokenizer(examples['Verse'], padding=False, truncation=True, max_length=128)

dataset_hf_tokenized = verses_dataset.map(tokenize_function, remove_columns=['Verse'], num_proc=15)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(dataset_hf_tokenized)

Map (num_proc=15):   0%|          | 0/14264 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1762 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1585 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14264
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1762
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1585
    })
})


In [19]:
training_args = TrainingArguments(
    output_dir="/tmp/ety_bert",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=dataset_hf_tokenized["train"],
    eval_dataset=dataset_hf_tokenized["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [20]:
trainer.train()

  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.9194332361221313, 'eval_runtime': 0.1368, 'eval_samples_per_second': 11583.972, 'eval_steps_per_second': 51.159, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.8907392621040344, 'eval_runtime': 0.1358, 'eval_samples_per_second': 11667.98, 'eval_steps_per_second': 51.531, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.8690196871757507, 'eval_runtime': 0.134, 'eval_samples_per_second': 11827.216, 'eval_steps_per_second': 52.234, 'epoch': 3.0}
{'train_runtime': 9.4349, 'train_samples_per_second': 4535.493, 'train_steps_per_second': 17.806, 'train_loss': 0.8502011072067988, 'epoch': 3.0}


TrainOutput(global_step=168, training_loss=0.8502011072067988, metrics={'train_runtime': 9.4349, 'train_samples_per_second': 4535.493, 'train_steps_per_second': 17.806, 'total_flos': 440040586290240.0, 'train_loss': 0.8502011072067988, 'epoch': 3.0})

In [21]:
predictions = trainer.predict(dataset_hf_tokenized["test"])
preds, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(preds, axis=-1)
print(np.mean(preds == labels))

  0%|          | 0/7 [00:00<?, ?it/s]

0.6458569807037458


In [22]:
phonetic_bert = AutoModelForSequenceClassification.from_pretrained('psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1', num_labels=4)
tokenizer = AutoTokenizer.from_pretrained('psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at psktoure/BERT_BPE_phonetic_wikitext-103-raw-v1 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from functools import lru_cache
import epitran

epi = epitran.Epitran("eng-Latn")


@lru_cache(maxsize=None)
def cahed_xsampa(word):
    return "".join(epi.xsampa_list(word))

def translate_verse_to_phonetic(verse):
    return " ".join([cahed_xsampa(word) for word in verse.split()])


def translate_to_phonetic(examples):
    return {"Verse": [translate_verse_to_phonetic(verse) for verse in examples["Verse"]]}


In [24]:
phonetic_dataset = verses_dataset.map(translate_to_phonetic, num_proc=15, batched=True)
phonetic_dataset.save_to_disk("/home/toure215/BERT_phonetic/DATASETS/verses/phonetic_verses_hf")

Map (num_proc=15):   0%|          | 0/14264 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1762 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1585 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14264 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1762 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1585 [00:00<?, ? examples/s]

In [25]:
phonetic_dataset_tokenized = phonetic_dataset.map(tokenize_function, remove_columns=['Verse'], num_proc=15, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map (num_proc=15):   0%|          | 0/14264 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1762 [00:00<?, ? examples/s]

Map (num_proc=15):   0%|          | 0/1585 [00:00<?, ? examples/s]

In [26]:
training_args = TrainingArguments(
    output_dir="/tmp/ety_bert_phonetic",
    num_train_epochs=3,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    evaluation_strategy="epoch",
    logging_strategy="no",
    save_strategy="no",
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=phonetic_bert,
    args=training_args,
    train_dataset=phonetic_dataset_tokenized["train"],
    eval_dataset=phonetic_dataset_tokenized["validation"],
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [27]:
trainer.train()

  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.8154968619346619, 'eval_runtime': 0.1652, 'eval_samples_per_second': 9593.601, 'eval_steps_per_second': 42.369, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.7156322598457336, 'eval_runtime': 0.1697, 'eval_samples_per_second': 9342.616, 'eval_steps_per_second': 41.261, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.6925850510597229, 'eval_runtime': 0.1759, 'eval_samples_per_second': 9011.34, 'eval_steps_per_second': 39.798, 'epoch': 3.0}
{'train_runtime': 12.0926, 'train_samples_per_second': 3538.702, 'train_steps_per_second': 13.893, 'train_loss': 0.6957679476056781, 'epoch': 3.0}


TrainOutput(global_step=168, training_loss=0.6957679476056781, metrics={'train_runtime': 12.0926, 'train_samples_per_second': 3538.702, 'train_steps_per_second': 13.893, 'total_flos': 697084147523712.0, 'train_loss': 0.6957679476056781, 'epoch': 3.0})

In [28]:
predictions = trainer.predict(phonetic_dataset_tokenized["test"])
preds, labels = predictions.predictions, predictions.label_ids
preds = np.argmax(preds, axis=-1) 
print(np.mean(preds == labels))

  0%|          | 0/7 [00:00<?, ?it/s]

0.7094211123723042


In [29]:
phonetic_dataset.column_names

{'train': ['Verse', 'label'],
 'test': ['Verse', 'label'],
 'validation': ['Verse', 'label']}