In [None]:
# Define import
import os
import re
import epitran
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizerFast, DataCollatorWithPadding
from tokenizers import Tokenizer
from transformers import DataCollatorWithPadding

In [None]:
# Build the Epitran transliterator for the 'eng-Latn' language/script code.
# `epi` is the notebook-level instance that xsampa() relies on.
epi = epitran.Epitran('eng-Latn')
# Smoke test: transliterate a single word and print the result.
translated = epi.transliterate('hello')
print(translated)

həlow


In [None]:
# Maps each GLUE configuration name to the text column(s) holding its input:
# a 1-tuple for single-sentence tasks, a 2-tuple for sentence-pair tasks.
task_to_fields = {
    'ax': ('premise', 'hypothesis'),
    'rte': ('sentence1', 'sentence2'),
    'sst2': ('sentence',),
    'qqp': ('question1', 'question2'),
    # 'mnli' is the configuration that carries the MNLI training split; the
    # matched/mismatched configurations below only hold evaluation splits.
    'mnli': ('premise', 'hypothesis'),
    'mnli_mismatched': ('premise', 'hypothesis'),
    'mnli_matched': ('premise', 'hypothesis'),
    'qnli': ('question', 'sentence'),
    'cola': ('sentence',),
    'mrpc': ('sentence1', 'sentence2'),
    'stsb': ('sentence1', 'sentence2'),
    'wnli': ('sentence1', 'sentence2'),
}

In [None]:
def xsampa(sentence):
    """Return `sentence` transcribed token by token with ``epi.xsampa_list``.

    The sentence is split into runs of word characters and runs of
    non-word, non-space characters (whitespace itself is dropped). Each
    token is passed to ``epi.xsampa_list`` and the returned segments are
    concatenated; the transcribed tokens are then joined with single spaces.
    """
    tokens = re.findall( r'\w+|[^\s\w]+', sentence)
    transcribed = ["".join(epi.xsampa_list(token)) for token in tokens]
    return " ".join(transcribed)

def translate_to_phonetic(dataset_dict, task_name):
    """Transcribe the text field(s) of one example for the given task.

    Args:
        dataset_dict: mapping from column name to value for a single example.
        task_name: key into ``task_to_fields`` naming the text columns to read.

    Returns:
        A dict with one entry per text column, holding the ``xsampa``
        transcription of that column's value.

    Raises:
        ValueError: if `task_name` has no (non-empty) entry in ``task_to_fields``.
    """
    fields = task_to_fields.get(task_name)
    if fields:
        return {name: xsampa(dataset_dict[name]) for name in fields}

    raise ValueError(f"Task {task_name} not found in task_to_fields dictionary.")


In [None]:
# GLUE task used throughout the notebook; it must be a key of task_to_fields,
# otherwise translate_to_phonetic / tokenize_function raise ValueError.
task = "rte"
# Load the chosen GLUE configuration.
dataset = load_dataset("glue", task)
# Display the loaded splits and their columns.
dataset

train-00000-of-00001.parquet:   0%|          | 0.00/584k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/621k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [None]:
# Transcribe the task's text columns for every example in every split;
# translate_to_phonetic returns the same column names, so they are overwritten.
phonetic_dataset = dataset.map(lambda example: translate_to_phonetic(example, task))

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Load the classification model from a checkpoint under ./results, plus the
# tokenizer saved in ./tokenizer.
# os.listdir() returns names in arbitrary order and may include entries that
# are not checkpoints, so indexing [0] is not reproducible. Pick the
# "checkpoint-<step>" directory with the highest step number instead.
results_dir = './results'
checkpoint_names = [
    name for name in os.listdir(results_dir)
    if re.fullmatch(r'checkpoint-\d+', name)
    and os.path.isdir(os.path.join(results_dir, name))
]
if not checkpoint_names:
    raise FileNotFoundError(f"No checkpoint-<step> directory found in {results_dir}")
file_path = os.path.join(
    results_dir,
    max(checkpoint_names, key=lambda name: int(name.rsplit('-', 1)[1])),
)
print(file_path)
model = BertForSequenceClassification.from_pretrained(file_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained('tokenizer')

./results/checkpoint-500


In [None]:
def tokenize_function(examples, task_name):
    """Tokenize the text column(s) of `examples` for the given task.

    Single-field tasks pass one column to the tokenizer; tasks with more
    fields pass the first two columns as a text pair. Truncation is enabled
    with a maximum length of 512.

    Args:
        examples: mapping from column name to value(s), as supplied by ``map``.
        task_name: key into ``task_to_fields`` naming the text columns to read.

    Raises:
        ValueError: if `task_name` has no (non-empty) entry in ``task_to_fields``.
    """
    fields = task_to_fields.get(task_name)
    if not fields:
        raise ValueError(f"Task {task_name} not found in task_to_fields dictionary.")

    # One positional argument for a single field, two for a pair; any
    # fields beyond the second are ignored.
    text_columns = [examples[name] for name in fields[:2]]
    return tokenizer(*text_columns, truncation=True, max_length=512)

# Tokenize every split in batches; `task` selects which text columns are read.
tokenized_dataset = phonetic_dataset.map(lambda example: tokenize_function(example, task), batched=True)
# tokenize_function requests no padding, so padding is left to this collator,
# which is built with the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Disabled alternative: start from the stock 'bert-base-uncased' weights
# instead of the local checkpoint loaded earlier.
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# Resize the model's token embeddings to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

Embedding(17737, 768, padding_idx=0)

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule: three epochs, evaluating at the end of each one.
    num_train_epochs=3,
    eval_strategy='epoch',
    # Optimisation.
    learning_rate=5e-5,  # default 5e-5
    weight_decay=3e-5,
    # Batching.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

# Wire the model, the data splits, the tokenizer and the padding collator together.
train_split = tokenized_dataset['train']
validation_split = tokenized_dataset['validation']
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

# Run fine-tuning; as the cell's last expression, the returned value is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7141,0.720279
2,0.7059,0.694235
3,0.7034,0.695984


TrainOutput(global_step=234, training_loss=0.6993148469517374, metrics={'train_runtime': 675.2972, 'train_samples_per_second': 11.062, 'train_steps_per_second': 0.347, 'total_flos': 1948781364846720.0, 'train_loss': 0.6993148469517374, 'epoch': 3.0})

In [None]:
# Run the fine-tuned model over the validation split.
predictions = trainer.predict(tokenized_dataset['validation'])
# The result is a (predictions, label_ids, metrics) named tuple; read its
# fields by name rather than by position.
labels = predictions.label_ids
metrics = predictions.metrics
# Index of the highest score along the last axis, i.e. the predicted class.
preds = predictions.predictions.argmax(-1)
print(preds)
print(labels)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 0 1 1 0 1 1
 0 1 1 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 0 0 0 0 0 0 1 0
 0 1 0 1 1 0 1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 1 0 0 1
 0 0 1 1 0 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0
 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 1

In [None]:
# Evaluate: score the validation predictions with the GLUE metric for `task`.
import evaluate
metric = evaluate.load("glue", task)
# Last expression of the cell, so the computed metric dict is displayed.
metric.compute(predictions=preds, references=labels)


{'accuracy': 0.4729241877256318}

In [None]:
# Bundle every ./results/checkpoint-* directory, the ./tokenizer directory and
# phonetic_wiki.txt into a single archive, ./models.zip.
!zip -r ./models.zip ./results/checkpoint-* ./tokenizer/ ./phonetic_wiki.txt

  adding: results/checkpoint-345/ (stored 0%)
  adding: results/checkpoint-345/optimizer.pt (deflated 20%)
  adding: results/checkpoint-345/trainer_state.json (deflated 78%)
  adding: results/checkpoint-345/special_tokens_map.json (deflated 36%)
  adding: results/checkpoint-345/training_args.bin (deflated 51%)
  adding: results/checkpoint-345/config.json (deflated 49%)
  adding: results/checkpoint-345/model.safetensors (deflated 7%)
  adding: results/checkpoint-345/tokenizer.json (deflated 73%)
  adding: results/checkpoint-345/rng_state.pth (deflated 25%)
  adding: results/checkpoint-345/tokenizer_config.json (deflated 96%)
  adding: results/checkpoint-345/scheduler.pt (deflated 56%)
  adding: tokenizer/ (stored 0%)
  adding: tokenizer/special_tokens_map.json (deflated 36%)
  adding: tokenizer/tokenizer.json (deflated 73%)
  adding: tokenizer/tokenizer_config.json (deflated 96%)
  adding: phonetic_wiki.txt (deflated 67%)


In [None]:
# Report the archive size. Use the same relative path that the zip command
# writes to (./models.zip) instead of a hard-coded absolute /content/ path, so
# the cell also works when the working directory is not /content.
!du -sh ./models.zip

966M	/content/models.zip


In [None]:
# Free memory: drop the notebook's reference to the model, force a garbage
# collection pass, then ask PyTorch to empty its CUDA cache.
import gc
import torch

# NOTE(review): `trainer` was constructed with model=model, so it presumably
# still references the model and the memory may not actually be released
# until `trainer` is dropped as well — confirm.
del model
gc.collect()
torch.cuda.empty_cache()