### Sonny

t5-small 128
MarianMTModel - 64

In [None]:
import os
from accelerate.utils import write_basic_config
write_basic_config() # Write a config file
os._exit(00) # Restart the notebook

In [1]:
!pip install sacremoses -q

[0m

In [24]:
!pip install sacrebleu -q

  pid, fd = os.forkpty()


[0m

In [None]:
from datasets import load_dataset, DatasetDict, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, get_scheduler
from transformers import MarianMTModel, MarianTokenizer

from torch.utils.data import DataLoader
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from torch.optim import AdamW
from time import time
import torch

In [2]:
dataset = load_dataset('Helsinki-NLP/opus_books', 'en-hu')

In [3]:
dataset['train'][0]

{'id': '0',
 'translation': {'en': 'Source: Project GutenbergAudiobook available here',
  'hu': 'Source: mek.oszk.huTranslation: Szenczi MiklósAudiobook available here'}}

In [4]:
val_test_set = dataset['train'].train_test_split(test_size=0.2, seed=42)
test_set = val_test_set['test'].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    'train': val_test_set['train'],
    'val': test_set['test'],
    'test': test_set['train']
})

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 109720
    })
    val: Dataset({
        features: ['id', 'translation'],
        num_rows: 13716
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 13715
    })
})

In [6]:
def get_config():
    config = {
        'model_name': 'Helsinki-NLP/opus-mt-en-hu', # 't5-small', 
        'max_length': 128,
        'batch_size': 64,
        'lr': 10 ** -5,
        'epochs': 10,
        'seed': 42,
        'metric_name': 'sacrebleu',
        'save_model': '/kaggle/working/model.pth',
        
    }
    return config

In [7]:
config = get_config()

### DataLoader

In [8]:
def get_dataloader(dataset, config, is_train):
    if is_train == True:
        return DataLoader(dataset, batch_size=config['batch_size'], shuffle=True)
    else:
        return DataLoader(dataset, batch_size=config['batch_size'], drop_last=False, shuffle=False)

In [10]:
train_loader = get_dataloader(dataset['train'], config, True)
val_loader = get_dataloader(dataset['val'], config, False)
test_loader = get_dataloader(dataset['test'], config, False)

### Training

In [9]:
import wandb

# Replace 'your-api-key' with your actual wandb API key
wandb.login(key='06cc95a1f4faf48400aa0bf5e162b3ace6237a45')

[34m[1mwandb[0m: Currently logged in as: [33msonnyinkai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [13]:
def train():
    set_seed(config['seed'])
    accelerator = Accelerator(mixed_precision='fp16', log_with='wandb')

    # Initialize tokenizer and model
    tokenizer = MarianTokenizer.from_pretrained(config['model_name'])
    model = MarianMTModel.from_pretrained(config['model_name'])

    # Initialize dataloaders
    train_loader = get_dataloader(dataset['train'], config, True)
    val_loader = get_dataloader(dataset['val'], config, False)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=config['lr'])

    # Initialize lr scheduler
    num_training_steps = len(train_loader) * config['epochs']
    lr_scheduler = get_scheduler('cosine', optimizer=optimizer, num_training_steps=num_training_steps, num_warmup_steps=0)

    # Prepare with accelerator
    model, optimizer, train_loader, val_loader, lr_scheduler = accelerator.prepare(model, optimizer, train_loader, val_loader, lr_scheduler)

    accelerator.init_trackers('Machine Translation')

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    accelerator.print(f'Number of training parameters: {n_parameters}')
    accelerator.print(f"Batch size: {config['batch_size']}")
    start_time = time()
    accelerator.print('======== Start training ======== ')
    
    # Training loop
    for epoch in range(config['epochs']):
        model.train()
        training_loss = 0
        for data in train_loader:
            optimizer.zero_grad()
            src = tokenizer(data['translation']['en'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(accelerator.device)
            labels = tokenizer(data['translation']['hu'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt')

            outputs = model(**src, labels=labels.input_ids.to(accelerator.device))
            loss = outputs.loss

            accelerator.backward(loss)
            optimizer.step()
            
            lr_scheduler.step()
            optimizer.zero_grad()

            training_loss += loss.item()
            
            accelerator.log({"Training loss": training_loss})

        training_loss = training_loss / len(train_loader)
        accelerator.log({"Training loss": training_loss})

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data in val_loader:
                src = tokenizer(data['translation']['en'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(accelerator.device)
                labels = tokenizer(data['translation']['hu'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt')

                outputs = model(**src, labels=labels.input_ids.to(accelerator.device))
                loss = outputs.loss

                val_loss += loss.item()

        val_loss = val_loss / len(val_loader)
        accelerator.log({"Val loss": val_loss})
        accelerator.print(f"Epoch {epoch + 1}: Training loss = {training_loss}, Val loss = {val_loss}")

    accelerator.print('======== End training ========')
    total_training_time = time() - start_time
    accelerator.print(f"Total training time: {total_training_time}")

    # Save model
    accelerator.wait_for_everyone()
    model = accelerator.unwrap_model(model)
    accelerator.save(model, config['save_model'])
    accelerator.end_training()

In [12]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [14]:
notebook_launcher(train, num_processes=2)

Launching training on 2 GPUs.


  self.pid = os.fork()
  self.pid = os.fork()
  return self.fget.__get__(instance, owner)()
  return self.fget.__get__(instance, owner)()


Number of training parameters: 76149760
Batch size: 64
Epoch 1: Training loss = 1.7197015921135883, Val loss = 1.2082806748372537
Epoch 2: Training loss = 1.1870478390952646, Val loss = 1.0266323757392388
Epoch 3: Training loss = 1.0526310902236502, Val loss = 0.9404577066500982
Epoch 4: Training loss = 0.9819862646656436, Val loss = 0.890795718188639
Epoch 5: Training loss = 0.9354820445284143, Val loss = 0.8604172170162201
Epoch 6: Training loss = 0.9096408127904771, Val loss = 0.8410754987487087
Epoch 7: Training loss = 0.8874246104336008, Val loss = 0.8301579163030341
Epoch 8: Training loss = 0.8770192352347163, Val loss = 0.8242287288109461
Epoch 9: Training loss = 0.8730430450889614, Val loss = 0.8222033762269549
Epoch 10: Training loss = 0.8676336767512324, Val loss = 0.8218545957847878
Total training time: 5034.0282661914825


VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded\r'), FloatProgress(value=0.07578827073918239, max=1.…

0,1
Training loss,█▄▃▂▂▁▁▁▁▁
Val loss,█▅▃▂▂▁▁▁▁▁

0,1
Training loss,0.86763
Val loss,0.82185


### Evaluation

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
model = torch.load('/kaggle/working/model.pth', map_location='cpu')

In [25]:
metric = load_metric("sacrebleu")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [26]:
test_loader = get_dataloader(dataset['test'], config, False)

In [27]:
tokenizer = MarianTokenizer.from_pretrained(config['model_name'])

In [33]:
def evaluate(model, tokenizer, test_loader, metric, device):
    test_loss = 0 
    bleu_score = 0 

    model.to(device)
    model.eval()
    
    with torch.no_grad():
        for data in test_loader:
            # Tokenize the input and labels
            src = tokenizer(data['translation']['en'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(device)
            labels = tokenizer(data['translation']['hu'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(device)
            
            # Forward pass
            outputs = model(**src, labels=labels.input_ids)
            loss = outputs.loss

            test_loss += loss.item()

            # Generate predictions
            predictions = model.generate(input_ids=src.input_ids, attention_mask=src.attention_mask)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels.input_ids, skip_special_tokens=True)

            # Calculate BLEU score
            decoded_labels = [[label] for label in decoded_labels]
            result = metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleu_score += result['score']

    # Calculate average loss and BLEU score
    test_loss = test_loss / len(test_loader)
    bleu_score = bleu_score / len(test_loader)
    print(f"Test loss is: {test_loss} and BLEU score is: {bleu_score}")

    return test_loss, bleu_score


In [34]:
evaluate(model, tokenizer, test_loader, metric, device)

Test loss is: 0.8217921686726947 and BLEU score is: 9.359375810821396


(0.8217921686726947, 9.359375810821396)