### Sonny

t5-small 128
MarianMTModel - 64

In [None]:
import os
from accelerate.utils import write_basic_config
write_basic_config() # Write a config file
os._exit(00) # Restart the notebook

In [1]:
!pip install sacremoses -q

[0m

In [2]:
!pip install sacrebleu -q

[0m

In [3]:
from datasets import load_dataset, DatasetDict, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, get_scheduler
from transformers import MarianMTModel, MarianTokenizer

from torch.utils.data import DataLoader
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed
from torch.optim import AdamW
from time import time
import torch

2024-06-03 04:44:06.641236: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-03 04:44:06.641356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-03 04:44:06.774549: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
dataset = load_dataset('Helsinki-NLP/opus_books', 'en-hu')

Downloading readme:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/137151 [00:00<?, ? examples/s]

In [5]:
dataset['train'][0]

{'id': '0',
 'translation': {'en': 'Source: Project GutenbergAudiobook available here',
  'hu': 'Source: mek.oszk.huTranslation: Szenczi MiklósAudiobook available here'}}

In [6]:
val_test_set = dataset['train'].train_test_split(test_size=0.2, seed=42)
test_set = val_test_set['test'].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    'train': val_test_set['train'],
    'val': test_set['test'],
    'test': test_set['train']
})

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 109720
    })
    val: Dataset({
        features: ['id', 'translation'],
        num_rows: 13716
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 13715
    })
})

In [8]:
def get_config():
    config = {
        'model_name': 'Helsinki-NLP/opus-mt-en-hu', # 't5-small', 
        'max_length': 128,
        'batch_size': 64,
        'lr': 10 ** -5,
        'epochs': 10,
        'seed': 42,
        'metric_name': 'sacrebleu',
        'save_model': '/kaggle/working/model.pth',
        
    }
    return config

In [9]:
config = get_config()

### DataLoader

In [10]:
def get_dataloader(dataset, config, is_train):
    if is_train == True:
        return DataLoader(dataset, batch_size=config['batch_size'], shuffle=True)
    else:
        return DataLoader(dataset, batch_size=config['batch_size'], drop_last=False, shuffle=False)

In [11]:
train_loader = get_dataloader(dataset['train'], config, True)
val_loader = get_dataloader(dataset['val'], config, False)
test_loader = get_dataloader(dataset['test'], config, False)

### Training

In [12]:
import wandb

# Replace 'your-api-key' with your actual wandb API key
wandb.login(key='06cc95a1f4faf48400aa0bf5e162b3ace6237a45')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [13]:
def train():
    set_seed(config['seed'])
    accelerator = Accelerator(mixed_precision='fp16', log_with='wandb')

    # Initialize tokenizer and model
    tokenizer = MarianTokenizer.from_pretrained(config['model_name'])
    model = MarianMTModel.from_pretrained(config['model_name'])

    # Initialize dataloaders
    train_loader = get_dataloader(dataset['train'], config, True)
    val_loader = get_dataloader(dataset['val'], config, False)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=config['lr'])

    # Initialize lr scheduler
    num_training_steps = len(train_loader) * config['epochs']
    lr_scheduler = get_scheduler('cosine', optimizer=optimizer, num_training_steps=num_training_steps, num_warmup_steps=0)

    # Prepare with accelerator
    model, optimizer, train_loader, val_loader, lr_scheduler = accelerator.prepare(model, optimizer, train_loader, val_loader, lr_scheduler)

    accelerator.init_trackers('Machine Translation')

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    accelerator.print(f'Number of training parameters: {n_parameters}')
    accelerator.print(f"Batch size: {config['batch_size']}")
    start_time = time()
    accelerator.print('======== Start training ======== ')
    
    # Training loop
    for epoch in range(config['epochs']):
        model.train()
        training_loss = 0
        for data in train_loader:
            optimizer.zero_grad()
            src = tokenizer(data['translation']['en'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(accelerator.device)
            labels = tokenizer(data['translation']['hu'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt')

            outputs = model(**src, labels=labels.input_ids.to(accelerator.device))
            loss = outputs.loss

            accelerator.backward(loss)
            optimizer.step()
            
            lr_scheduler.step()
            optimizer.zero_grad()

            training_loss += loss.item()
            

        training_loss = training_loss / len(train_loader)
        accelerator.log({"Training loss": training_loss})

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data in val_loader:
                src = tokenizer(data['translation']['en'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(accelerator.device)
                labels = tokenizer(data['translation']['hu'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt')

                outputs = model(**src, labels=labels.input_ids.to(accelerator.device))
                loss = outputs.loss

                val_loss += loss.item()

        val_loss = val_loss / len(val_loader)
        accelerator.log({"Val loss": val_loss})
        accelerator.print(f"Epoch {epoch + 1}: Training loss = {training_loss}, Val loss = {val_loss}")

    accelerator.print('======== End training ========')
    total_training_time = time() - start_time
    accelerator.print(f"Total training time: {total_training_time}")

    # Save model
    accelerator.wait_for_everyone()
    model = accelerator.unwrap_model(model)
    accelerator.save(model, config['save_model'])
    accelerator.end_training()

In [14]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

In [15]:
notebook_launcher(train, num_processes=1)

Launching training on one GPU.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/792k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/850k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.57M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33msonnyinkai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Number of training parameters: 76149760
Batch size: 64
Epoch 1: Training loss = 1.5007214059287535, Val loss = 1.0807931608931962
Epoch 2: Training loss = 1.0589040026720342, Val loss = 0.9280746623527172
Epoch 3: Training loss = 0.9452528844769426, Val loss = 0.8577722413595332
Epoch 4: Training loss = 0.8840221967015948, Val loss = 0.8187148321506589
Epoch 5: Training loss = 0.8456609952554063, Val loss = 0.794529776240504
Epoch 6: Training loss = 0.8214690263695341, Val loss = 0.7794745822285497
Epoch 7: Training loss = 0.8062448008414955, Val loss = 0.7708109947138054
Epoch 8: Training loss = 0.7972726668977876, Val loss = 0.7664283968681513
Epoch 9: Training loss = 0.792685797541204, Val loss = 0.7649582776912424
Epoch 10: Training loss = 0.7915419358206213, Val loss = 0.7646468841752341
Total training time: 8829.482492923737


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Training loss,▃▄▆█▁▃▄▅▁▃▃▅▁▃▃▅▂▂▄▅▁▂▄▄▁▂▄▄▁▃▃▅▂▂▃▅▁▂▄▅
Val loss,█▅▃▂▂▁▁▁▁▁

0,1
Training loss,0.79154
Val loss,0.76465


### Evaluation

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
model = torch.load('/kaggle/working/model.pth', map_location='cpu')

In [18]:
metric = load_metric("sacrebleu")

  metric = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [19]:
test_loader = get_dataloader(dataset['test'], config, False)

In [20]:
tokenizer = MarianTokenizer.from_pretrained(config['model_name'])

In [21]:
def evaluate(model, tokenizer, test_loader, metric, device):
    test_loss = 0 
    bleu_score = 0 

    model.to(device)
    model.eval()
    
    with torch.no_grad():
        for data in test_loader:
            # Tokenize the input and labels
            src = tokenizer(data['translation']['en'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(device)
            labels = tokenizer(data['translation']['hu'], max_length=config['max_length'], truncation=True, padding='longest', return_tensors='pt').to(device)
            
            # Forward pass
            outputs = model(**src, labels=labels.input_ids)
            loss = outputs.loss

            test_loss += loss.item()

            # Generate predictions
            predictions = model.generate(input_ids=src.input_ids, attention_mask=src.attention_mask)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels.input_ids, skip_special_tokens=True)

            # Calculate BLEU score
            decoded_labels = [[label] for label in decoded_labels]
            result = metric.compute(predictions=decoded_preds, references=decoded_labels)
            bleu_score += result['score']

    # Calculate average loss and BLEU score
    test_loss = test_loss / len(test_loader)
    bleu_score = bleu_score / len(test_loader)
    print(f"Test loss is: {test_loss} and BLEU score is: {bleu_score}")

    return test_loss, bleu_score


In [22]:
evaluate(model, tokenizer, test_loader, metric, device)

Test loss is: 0.7575763915860376 and BLEU score is: 10.135359818478044


(0.7575763915860376, 10.135359818478044)