First Training with the GPT-2 decoder 🤖 (bayes search)
-----------------------------------

In this notebook, we will fine-tune the pre-trained GPT-2 model provided by OpenAI. The goal is only to test how accurate the model can be on the corpora we extracted, so it is undoubtedly a partial model. We will add hyperparameter search with the `wandb` library.


We will make this with and without augmentation and see where we obtain better results. 

In [3]:
# Make the project data folder and the `wolof-translate` package importable.
import sys

# Root folder of the project (the trailing slash is relied upon by every f-string below).
path = "/content/drive/MyDrive/Memoire/subject2/"

for sub_directory in ("new_data", "wolof-translate"):
  sys.path.append(f"{path}{sub_directory}")

In [4]:
# define environment
%env WANDB_LOG_MODEL=true
%env WANDB_NOTEBOOK_NAME=training_gpt2_2.ipynb
%env WANDB_API_KEY=237a8450cd2568ea1c8e1f8e0400708e79b6b4ee 

env: WANDB_LOG_MODEL=true
env: WANDB_NOTEBOOK_NAME=training_gpt2_2.ipynb
env: WANDB_API_KEY=<redacted>


In [5]:
# Install (or upgrade to the latest release of) the Weights & Biases client, in quiet mode.
!pip install -qq wandb --upgrade

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.7/201.7 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [6]:
# Install the remaining dependencies in quiet mode (-qq).
# `evaluate` + `sacrebleu` back the BLEU metric, `transformers` provides GPT-2
# and the Trainer, `tokenizers` the Tokenizer type and `nlpaug` the augmenters.
!pip install evaluate -qq
!pip install sacrebleu -qq
# !pip install optuna -qq
!pip install transformers -qq 
!pip install tokenizers -qq
!pip install nlpaug -qq
# NOTE(review): `ray[tune]` and the spaCy model below are not referenced by the
# cells visible in this notebook section; presumably needed by `wolof_translate` — confirm.
!pip install ray[tune] -qq
!python -m spacy download fr_core_news_lg 

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [7]:
# let us import all necessary libraries
from wolof_translate.utils.sent_transformers import TransformerSequences
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer
from wolof_translate.data.dataset_v1 import SentenceDataset
from wolof_translate.utils.sent_corrections import *
from sklearn.model_selection import train_test_split
from nlpaug.augmenter import char as nac
from torch.utils.data import DataLoader
# from datasets  import load_metric # make pip install evaluate instead
# and pip install sacrebleu for instance
from functools import partial
from tqdm import tqdm
import pandas as pd
import numpy as np
import evaluate
import torch


We will create two models: 

- One translating the french corpus to a wolof corpus [french_to_wolof](#french-to-wolof)
- One translating the wolof corpus to a french corpus [wolof_to_french](#wolof-to-french)

--------------

## French to wolof

### Configure dataset 🔠

We can use the same custom dataset that we created in [text_augmentation](text_augmentation.ipynb). But we need to split the data between train and test sets and save them.

In [None]:
def split_data(random_state: int = 50):
  """Split the extracted sentence corpus into train and test CSV files.

  Reads `sent_extraction.csv` from `{path}new_data/`, holds out 10% of the
  rows as the test set and overwrites `train_set.csv` and `test_set.csv`
  in that same folder.

  Args:
    random_state: seed handed to `train_test_split` to control the shuffle.
  """
  sentences = pd.read_csv(f"{path}new_data/sent_extraction.csv")

  # 90% train / 10% test
  subsets = train_test_split(sentences, test_size=0.1, random_state=random_state)

  destinations = (f"{path}new_data/train_set.csv", f"{path}new_data/test_set.csv")

  # persist both subsets without the pandas index column
  for subset, destination in zip(subsets, destinations):
    subset.to_csv(destination, index=False)

Let us build the datasets: an augmented training set and a non-augmented test set.

In [None]:
def recuperate_datasets(fr_char_p: float, fr_word_p: float):
  """Build the augmented training dataset and the plain test dataset.

  Both datasets read the CSV files written by `split_data` and share the
  same tokenizer file, truncation setting and maximum length (579).

  Args:
    fr_char_p: value forwarded to `nac.KeyboardAug` as `aug_char_p`.
    fr_word_p: value forwarded to `nac.KeyboardAug` as `aug_word_p`.

  Returns:
    tuple: `(train_dataset, test_dataset)`; only the training dataset
    receives the augmentation pipeline (through `cp1_transformer`).
  """
  tokenizer_file = f"{path}wolof-translate/wolof_translate/tokenizers/tokenizer_v1.json"

  # keyboard augmentation followed by the two sentence-correction helpers
  keyboard_augmenter = nac.KeyboardAug(aug_char_p=fr_char_p, aug_word_p=fr_word_p)
  train_transformer = TransformerSequences(keyboard_augmenter,
                                           remove_mark_space,
                                           delete_guillemet_space)

  augmented_train = SentenceDataset(
      f"{path}new_data/train_set.csv",
      tokenizer_path=tokenizer_file,
      cp1_transformer=train_transformer,
      truncation=True,
      max_len=579,
  )

  # the test sentences are left untouched (no transformer)
  plain_test = SentenceDataset(
      f"{path}new_data/test_set.csv",
      tokenizer_path=tokenizer_file,
      truncation=True,
      max_len=579,
  )

  return augmented_train, plain_test

### Configure hyperparameter search ⚙️

We have to configure the search space and the search method ("bayes" in our case).

In [None]:
import os

import wandb

# Authenticate with the key exported in the WANDB_API_KEY environment variable
# instead of embedding the secret in the notebook. When the variable is unset,
# `key=None` lets `wandb.login` fall back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# hyperparameters: Bayesian search minimizing the evaluation loss
sweep_config = {
    'method': 'bayes',
    'metric': {
        'goal': 'minimize',
        'name': 'eval_loss'
    },
    'parameters': {
        # fixed for every run
        'epochs': {
            'value': 1
        },
        'batch_size': {
            'values': [2, 3, 5]
        },
        # sampled log-uniformly between 1e-5 and 1e-3
        'learning_rate': {
            'distribution': 'log_uniform_values',
            'min': 1e-5,
            'max': 1e-3
        },
        'weight_decay': {
            'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
        },
        # augmentation probabilities forwarded to `recuperate_datasets`
        'fr_char_p': {
            'min': 0.0,
            'max': 0.7
        },
        'fr_word_p': {
            'min': 0.0,
            'max': 0.7
        },
        # seed of the train/test split forwarded to `split_data`
        'random_state': {
            'values': [0, 10, 20, 30, 40, 50, 60, 70, 80, 100]
        }
    }
}

# Initialize the hyperparameter search
sweep_id = wandb.sweep(sweep_config, project = "gpt2-wolof-french-translation_bayes1")



[34m[1mwandb[0m: Currently logged in as: [33moumar-kane[0m ([33moumar-kane-team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: uy1shq94
Sweep URL: https://wandb.ai/oumar-kane-team/gpt2-wolof-french-translation_bayes1/sweeps/uy1shq94


### Configure the model and the evaluation function ⚙️

Let us recuperate the model and resize the token embeddings.

In [None]:
def gpt2_model_init(tokenizer):
  """Load the pre-trained GPT-2 language model and fit its embeddings to `tokenizer`.

  Args:
    tokenizer: the tokenizer of the dataset; only `len(tokenizer)` is used,
      as the new size of the token embedding matrix.

  Returns:
    GPT2LMHeadModel: the "gpt2" checkpoint with resized token embeddings,
    placed on the GPU when one is available.
  """
  # set the model name
  model_name = "gpt2"

  # configure the model
  model = GPT2LMHeadModel.from_pretrained(model_name)

  # Move to the GPU only when there is one: an unconditional `.cuda()` raises
  # on a CPU-only runtime. The move stays before the resize, as before, so the
  # new embedding rows are still initialized on the same device.
  if torch.cuda.is_available():
    model = model.cuda()

  # resize the token embeddings
  model.resize_token_embeddings(len(tokenizer))

  return model

Let us evaluate the predictions with the `bleu` metric.

In [None]:
# %%writefile wolof-translate/wolof_translate/utils/evaluation.py
from tokenizers import Tokenizer
from typing import *
import numpy as np
import evaluate

class TranslationEvaluation:
    """Score translation predictions with a BLEU-style metric.

    The instance decodes predicted and reference token ids back to text,
    normalizes the texts and hands them to `metric.compute`. Its
    `compute_metrics` method has the `(predictions, labels)` input expected
    from a trainer's `compute_metrics` callback.

    Args:
        tokenizer: tokenizer used for decoding when no `decoder` is given.
            NOTE(review): the code calls `tokenizer.batch_decode(...)` and reads
            `tokenizer.pad_token_id`; confirm the tokenizer object passed in
            actually exposes this interface.
        decoder: optional callable mapping a batch of ids to an iterable of
            strings; when provided it replaces the tokenizer for decoding.
        metric: object exposing `compute(predictions=..., references=...)` and
            returning a mapping with a "score" entry. Defaults to the
            `sacrebleu` metric of the `evaluate` library.
    """

    def __init__(self,
                 tokenizer: "Tokenizer",
                 decoder: Union[Callable, None] = None,
                 metric = None,
                 ):

        self.tokenizer = tokenizer

        self.decoder = decoder

        # The metric is loaded here rather than in the signature: a default
        # argument is evaluated once, when the class is defined, which forced
        # the metric to be fetched even when the caller supplies its own.
        self.metric = evaluate.load('sacrebleu') if metric is None else metric

    def postprocess_text(self, preds, labels):
        """Strip the texts and wrap each reference in a list.

        Args:
            preds: iterable of predicted sentences.
            labels: iterable of reference sentences.

        Returns:
            tuple: `(preds, labels)` where `preds` is a list of stripped strings
            and `labels` a list of one-element lists (one reference per prediction).
        """
        preds = [pred.strip() for pred in preds]

        labels = [[label.strip()] for label in labels]

        return preds, labels

    @staticmethod
    def _to_cpu(values):
        """Detach tensor-like values and move them to the CPU; return anything else unchanged."""
        if hasattr(values, "detach"):

            return values.detach().cpu()

        return values

    def compute_metrics(self, eval_preds):
        """Compute the BLEU score (and the mean prediction length) of a batch.

        Args:
            eval_preds: indexable pair `(predictions, labels)`; the predictions
                may themselves be a tuple, in which case its first item is used.

        Returns:
            dict: `{"bleu": ...}` rounded to 4 decimals, plus `"gen_len"` (mean
            number of non-padding tokens per prediction) when the tokenizer is
            used for decoding.
        """
        # Indexing works for a plain tuple as well as for a prediction container.
        preds, labels = eval_preds[0], eval_preds[1]

        # Unwrap before any tensor handling: a tuple has no `detach`.
        if isinstance(preds, tuple):

            preds = preds[0]

        preds, labels = self._to_cpu(preds), self._to_cpu(labels)

        if self.decoder is None:

            decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)

            decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        else:

            decoded_preds = list(self.decoder(preds))

            decoded_labels = list(self.decoder(labels))

        # Both branches must score the post-processed texts (stripped
        # predictions, references wrapped in lists).
        decoded_preds, decoded_labels = self.postprocess_text(decoded_preds, decoded_labels)

        result = self.metric.compute(predictions=decoded_preds, references=decoded_labels)

        result = {"bleu": result["score"]}

        if self.decoder is None:

            # The generation length is only computed when the padding id is known.
            prediction_lens = [np.count_nonzero(pred != self.tokenizer.pad_token_id) for pred in preds]

            result["gen_len"] = np.mean(prediction_lens)

        result = {k:round(v, 4) for k, v in result.items()}

        # `wandb.log` takes a dictionary; only log while a run is active.
        if wandb.run is not None:

            wandb.log({"bleu": result["bleu"]})

        return result

In [None]:
# %run wolof-translate/wolof_translate/utils/evaluation.py

Let us initialize the evaluation object.

In [None]:
# translation_eval = TranslationEvaluation(test_dataset.tokenizer)

### Searching for the best parameters 🕖

Let us define the data collator.

In [None]:
def data_collator(batch):
    """Assemble dataset samples into the tensors expected by the trainer.

    Each sample is indexed positionally: item 0 holds the token ids and
    item 1 the attention mask. The labels are a separate stack of the
    token ids (same values as `input_ids`).

    Args:
        batch: sequence of samples, each one indexable with tensors at
            positions 0 and 1.

    Returns:
        dict: `input_ids`, `attention_mask` and `labels`, each stacked along a
        new first dimension.
    """
    token_ids, masks = [], []

    for sample in batch:
        token_ids.append(sample[0])
        masks.append(sample[1])

    return {
        'input_ids': torch.stack(token_ids),
        'attention_mask': torch.stack(masks),
        # stacked a second time so the labels do not alias `input_ids`
        'labels': torch.stack(token_ids),
    }

Let us initialize the training arguments and run the Bayesian search.

In [None]:
# %%wandb

def train(config = None):
  """Run one trial of the sweep: split the data, build the datasets and train GPT-2.

  Args:
    config: optional configuration forwarded to `wandb.init`; the
      hyperparameters are then read back from `wandb.config`.
  """
  with wandb.init(config = config):

    # fixed torch seed for every trial
    torch.manual_seed(50)

    # hyperparameters of the current trial
    run_config = wandb.config

    # re-split the corpus with the seed of the trial, then load both datasets
    split_data(run_config.random_state)

    train_dataset, test_dataset = recuperate_datasets(run_config.fr_char_p,
                                                      run_config.fr_word_p)

    # keyword settings of the trainer: evaluate, save and log once per epoch
    training_options = dict(
        report_to="wandb",
        num_train_epochs=run_config.epochs,
        load_best_model_at_end=True,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        logging_strategy='epoch',
        per_device_train_batch_size=run_config.batch_size,
        per_device_eval_batch_size=5,
        learning_rate=run_config.learning_rate,
        weight_decay=run_config.weight_decay,
        logging_dir=f'{path}gpt2_training_logs2',
        remove_unused_columns=False,
        fp16=True,
    )

    training_args = TrainingArguments(f"{path}training2/results1", **training_options)

    # a fresh model is created by the trainer through `model_init`
    model_factory = partial(gpt2_model_init, tokenizer = train_dataset.tokenizer)

    # no `compute_metrics` is passed: only the evaluation loss is tracked
    trainer = Trainer(model_init=model_factory,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=test_dataset,
                      data_collator=data_collator)

    # start training loop
    trainer.train()

# Launch a sweep agent in this process: it runs `train` for the sweep
# `sweep_id`, for at most 25 trials (`count`).
agent = wandb.agent(sweep_id, train, count = 25)


[34m[1mwandb[0m: Agent Starting Run: 9e51e1u9 with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.5581741048902115
[34m[1mwandb[0m: 	fr_word_p: 0.6578266163044363
[34m[1mwandb[0m: 	learning_rate: 3.805299002293826e-05
[34m[1mwandb[0m: 	random_state: 100
[34m[1mwandb[0m: 	weight_decay: 0.4




Epoch,Training Loss,Validation Loss
1,1.622,0.906943


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.90694
eval/runtime,2.6736
eval/samples_per_second,30.671
eval/steps_per_second,6.359
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.622
train/total_flos,216590170752000.0
train/train_loss,1.62203


[34m[1mwandb[0m: Agent Starting Run: 77xbcu8n with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.04780608618206691
[34m[1mwandb[0m: 	fr_word_p: 0.2934642666867476
[34m[1mwandb[0m: 	learning_rate: 0.0004265318428674258
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.4




Epoch,Training Loss,Validation Loss
1,1.2504,0.778923


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.77892
eval/runtime,2.7289
eval/samples_per_second,30.049
eval/steps_per_second,6.23
train/epoch,1.0
train/global_step,367.0
train/learning_rate,1e-05
train/loss,1.2504
train/total_flos,216590170752000.0
train/train_loss,1.25037


[34m[1mwandb[0m: Agent Starting Run: 1iwqpkgm with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.16737852032273567
[34m[1mwandb[0m: 	fr_word_p: 0.28649966586939685
[34m[1mwandb[0m: 	learning_rate: 0.00013263986254450776
[34m[1mwandb[0m: 	random_state: 10
[34m[1mwandb[0m: 	weight_decay: 0.2




Epoch,Training Loss,Validation Loss
1,1.6627,0.840447


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.84045
eval/runtime,2.7528
eval/samples_per_second,29.787
eval/steps_per_second,6.175
train/epoch,1.0
train/global_step,147.0
train/learning_rate,1e-05
train/loss,1.6627
train/total_flos,216590170752000.0
train/train_loss,1.66269


[34m[1mwandb[0m: Agent Starting Run: 735lugar with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.526513857153737
[34m[1mwandb[0m: 	fr_word_p: 0.49282705038844304
[34m[1mwandb[0m: 	learning_rate: 0.0002043984784509529
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0




Epoch,Training Loss,Validation Loss
1,1.3835,0.877133


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.87713
eval/runtime,2.7615
eval/samples_per_second,29.694
eval/steps_per_second,6.156
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.3835
train/total_flos,216590170752000.0
train/train_loss,1.38353


[34m[1mwandb[0m: Agent Starting Run: 7kglau67 with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.6531041380804302
[34m[1mwandb[0m: 	fr_word_p: 0.23760096478570283
[34m[1mwandb[0m: 	learning_rate: 0.000718198892880793
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.4




Epoch,Training Loss,Validation Loss
1,1.3971,0.776812


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.77681
eval/runtime,2.7758
eval/samples_per_second,29.541
eval/steps_per_second,6.124
train/epoch,1.0
train/global_step,245.0
train/learning_rate,2e-05
train/loss,1.3971
train/total_flos,216590170752000.0
train/train_loss,1.39714


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4m2eka1h with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.6167489331342644
[34m[1mwandb[0m: 	fr_word_p: 0.24656203270287985
[34m[1mwandb[0m: 	learning_rate: 0.000451149817879716
[34m[1mwandb[0m: 	random_state: 0
[34m[1mwandb[0m: 	weight_decay: 0.3




Epoch,Training Loss,Validation Loss
1,1.3258,0.719152


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.71915
eval/runtime,2.7706
eval/samples_per_second,29.597
eval/steps_per_second,6.136
train/epoch,1.0
train/global_step,367.0
train/learning_rate,1e-05
train/loss,1.3258
train/total_flos,216590170752000.0
train/train_loss,1.32579


[34m[1mwandb[0m: Agent Starting Run: 9mvh0lnh with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.2264991217570484
[34m[1mwandb[0m: 	fr_word_p: 0.5839893860446403
[34m[1mwandb[0m: 	learning_rate: 0.00030585180789287445
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.3




Epoch,Training Loss,Validation Loss
1,1.6349,0.898007


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.89801
eval/runtime,2.7625
eval/samples_per_second,29.683
eval/steps_per_second,6.154
train/epoch,1.0
train/global_step,147.0
train/learning_rate,2e-05
train/loss,1.6349
train/total_flos,216590170752000.0
train/train_loss,1.63492


[34m[1mwandb[0m: Agent Starting Run: o5wp33w1 with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.2879121617388648
[34m[1mwandb[0m: 	fr_word_p: 0.2953003698700635
[34m[1mwandb[0m: 	learning_rate: 7.077734501713909e-05
[34m[1mwandb[0m: 	random_state: 0
[34m[1mwandb[0m: 	weight_decay: 0.4




Epoch,Training Loss,Validation Loss
1,1.5222,0.813414


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.81341
eval/runtime,2.7541
eval/samples_per_second,29.774
eval/steps_per_second,6.173
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.5222
train/total_flos,216590170752000.0
train/train_loss,1.5222


[34m[1mwandb[0m: Agent Starting Run: vlw3r86q with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.35615767375446367
[34m[1mwandb[0m: 	fr_word_p: 0.5877794126953338
[34m[1mwandb[0m: 	learning_rate: 8.97269044032181e-05
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.4




Epoch,Training Loss,Validation Loss
1,1.7636,0.962839


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.96284
eval/runtime,2.7583
eval/samples_per_second,29.729
eval/steps_per_second,6.163
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.7636
train/total_flos,216590170752000.0
train/train_loss,1.7636


[34m[1mwandb[0m: Agent Starting Run: zwrcu8nw with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.20880201195327625
[34m[1mwandb[0m: 	fr_word_p: 0.27812413200334757
[34m[1mwandb[0m: 	learning_rate: 0.0006676348627454577
[34m[1mwandb[0m: 	random_state: 20
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss
1,1.2909,0.818876


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.81888
eval/runtime,2.7529
eval/samples_per_second,29.786
eval/steps_per_second,6.175
train/epoch,1.0
train/global_step,367.0
train/learning_rate,1e-05
train/loss,1.2909
train/total_flos,216590170752000.0
train/train_loss,1.29091


[34m[1mwandb[0m: Agent Starting Run: 8tairrrb with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.15050441182036742
[34m[1mwandb[0m: 	fr_word_p: 0.6911094810657563
[34m[1mwandb[0m: 	learning_rate: 4.216059735391623e-05
[34m[1mwandb[0m: 	random_state: 80
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss
1,1.7536,0.924779


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.92478
eval/runtime,2.7592
eval/samples_per_second,29.719
eval/steps_per_second,6.161
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.7536
train/total_flos,216590170752000.0
train/train_loss,1.75356


[34m[1mwandb[0m: Agent Starting Run: 0m1wbxrq with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.39337281602009766
[34m[1mwandb[0m: 	fr_word_p: 0.2975121495884152
[34m[1mwandb[0m: 	learning_rate: 0.00025823365169873195
[34m[1mwandb[0m: 	random_state: 70
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss
1,1.4459,0.830113


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.83011
eval/runtime,2.7731
eval/samples_per_second,29.57
eval/steps_per_second,6.13
train/epoch,1.0
train/global_step,245.0
train/learning_rate,1e-05
train/loss,1.4459
train/total_flos,216590170752000.0
train/train_loss,1.44591


[34m[1mwandb[0m: Agent Starting Run: arpwebjc with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.3862316743502598
[34m[1mwandb[0m: 	fr_word_p: 0.47220106624955194
[34m[1mwandb[0m: 	learning_rate: 7.925827727510921e-05
[34m[1mwandb[0m: 	random_state: 70
[34m[1mwandb[0m: 	weight_decay: 0.5




Epoch,Training Loss,Validation Loss
1,1.7408,0.915819


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.91582
eval/runtime,2.7545
eval/samples_per_second,29.77
eval/steps_per_second,6.172
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.7408
train/total_flos,216590170752000.0
train/train_loss,1.74081


[34m[1mwandb[0m: Agent Starting Run: wcpwhyq3 with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.11800051372162204
[34m[1mwandb[0m: 	fr_word_p: 0.4564208025649316
[34m[1mwandb[0m: 	learning_rate: 0.00018349848970390976
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.2




Epoch,Training Loss,Validation Loss
1,1.6381,0.927052


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.92705
eval/runtime,2.7581
eval/samples_per_second,29.731
eval/steps_per_second,6.164
train/epoch,1.0
train/global_step,147.0
train/learning_rate,1e-05
train/loss,1.6381
train/total_flos,216590170752000.0
train/train_loss,1.63813


[34m[1mwandb[0m: Agent Starting Run: ftqg9xyt with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.4681800871709539
[34m[1mwandb[0m: 	fr_word_p: 0.40108962433453255
[34m[1mwandb[0m: 	learning_rate: 0.00030905704557136456
[34m[1mwandb[0m: 	random_state: 20
[34m[1mwandb[0m: 	weight_decay: 0.3




Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,1.4648,0.849067


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.84907
eval/runtime,2.7454
eval/samples_per_second,29.868
eval/steps_per_second,6.192
train/epoch,1.0
train/global_step,245.0
train/learning_rate,1e-05
train/loss,1.4648
train/total_flos,216590170752000.0
train/train_loss,1.46477


[34m[1mwandb[0m: Agent Starting Run: qjf7ecgz with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.0942741739833867
[34m[1mwandb[0m: 	fr_word_p: 0.11077105731033872
[34m[1mwandb[0m: 	learning_rate: 9.195377985553066e-05
[34m[1mwandb[0m: 	random_state: 0
[34m[1mwandb[0m: 	weight_decay: 0.4




Epoch,Training Loss,Validation Loss
1,1.2511,0.763528


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.76353
eval/runtime,2.7505
eval/samples_per_second,29.813
eval/steps_per_second,6.181
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.2511
train/total_flos,216590170752000.0
train/train_loss,1.25109


[34m[1mwandb[0m: Agent Starting Run: f1cid8te with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.6729225034492003
[34m[1mwandb[0m: 	fr_word_p: 0.5852055032457493
[34m[1mwandb[0m: 	learning_rate: 8.41693533259522e-05
[34m[1mwandb[0m: 	random_state: 20
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss
1,1.423,0.910818


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.91082
eval/runtime,2.7587
eval/samples_per_second,29.724
eval/steps_per_second,6.162
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.423
train/total_flos,216590170752000.0
train/train_loss,1.42299


[34m[1mwandb[0m: Agent Starting Run: kgcnthew with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.17242885520656936
[34m[1mwandb[0m: 	fr_word_p: 0.3844696616235247
[34m[1mwandb[0m: 	learning_rate: 5.146643120298594e-05
[34m[1mwandb[0m: 	random_state: 0
[34m[1mwandb[0m: 	weight_decay: 0.1




Epoch,Training Loss,Validation Loss
1,1.5024,0.824268


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.82427
eval/runtime,2.7579
eval/samples_per_second,29.733
eval/steps_per_second,6.164
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.5024
train/total_flos,216590170752000.0
train/train_loss,1.5024


[34m[1mwandb[0m: Agent Starting Run: 9hhnfdly with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.5524365189614215
[34m[1mwandb[0m: 	fr_word_p: 0.1326794146861514
[34m[1mwandb[0m: 	learning_rate: 7.757229279228031e-05
[34m[1mwandb[0m: 	random_state: 70
[34m[1mwandb[0m: 	weight_decay: 0.3




Epoch,Training Loss,Validation Loss
1,1.3292,0.863907


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.86391
eval/runtime,2.7623
eval/samples_per_second,29.685
eval/steps_per_second,6.154
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.3292
train/total_flos,216590170752000.0
train/train_loss,1.32921


[34m[1mwandb[0m: Agent Starting Run: 0iaeouhb with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.16205938534015293
[34m[1mwandb[0m: 	fr_word_p: 0.6189463501369112
[34m[1mwandb[0m: 	learning_rate: 4.122207999993984e-05
[34m[1mwandb[0m: 	random_state: 10
[34m[1mwandb[0m: 	weight_decay: 0.4




Epoch,Training Loss,Validation Loss
1,1.5444,0.865255


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.86525
eval/runtime,2.7603
eval/samples_per_second,29.707
eval/steps_per_second,6.159
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.5444
train/total_flos,216590170752000.0
train/train_loss,1.54436


[34m[1mwandb[0m: Agent Starting Run: r0cxa419 with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.4361037612688261
[34m[1mwandb[0m: 	fr_word_p: 0.5717770674686344
[34m[1mwandb[0m: 	learning_rate: 0.00016029380823855152
[34m[1mwandb[0m: 	random_state: 100
[34m[1mwandb[0m: 	weight_decay: 0




Epoch,Training Loss,Validation Loss
1,1.389,0.825492


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.82549
eval/runtime,2.8054
eval/samples_per_second,29.229
eval/steps_per_second,6.06
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.389
train/total_flos,216590170752000.0
train/train_loss,1.38902


[34m[1mwandb[0m: Agent Starting Run: i6yhiihu with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.16560673137459947
[34m[1mwandb[0m: 	fr_word_p: 0.12730656857678593
[34m[1mwandb[0m: 	learning_rate: 2.1355786362382953e-05
[34m[1mwandb[0m: 	random_state: 10
[34m[1mwandb[0m: 	weight_decay: 0.4


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669043583321278, max=1.0…



Epoch,Training Loss,Validation Loss
1,1.3835,0.856237


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.85624
eval/runtime,2.7556
eval/samples_per_second,29.758
eval/steps_per_second,6.169
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.3835
train/total_flos,216590170752000.0
train/train_loss,1.38352


[34m[1mwandb[0m: Agent Starting Run: 69peb7ro with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.019731407124009847
[34m[1mwandb[0m: 	fr_word_p: 0.44993573909979334
[34m[1mwandb[0m: 	learning_rate: 0.0004290223514951012
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.2




Epoch,Training Loss,Validation Loss
1,1.5844,0.809303


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.8093
eval/runtime,2.744
eval/samples_per_second,29.883
eval/steps_per_second,6.195
train/epoch,1.0
train/global_step,147.0
train/learning_rate,2e-05
train/loss,1.5844
train/total_flos,216590170752000.0
train/train_loss,1.58441


[34m[1mwandb[0m: Agent Starting Run: sajbsm4i with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.3672702769661469
[34m[1mwandb[0m: 	fr_word_p: 0.06541575658197718
[34m[1mwandb[0m: 	learning_rate: 0.00021593199008898315
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.2




Epoch,Training Loss,Validation Loss
1,1.1961,0.773625


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.77363
eval/runtime,2.7907
eval/samples_per_second,29.383
eval/steps_per_second,6.092
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.1961
train/total_flos,216590170752000.0
train/train_loss,1.19612


[34m[1mwandb[0m: Agent Starting Run: cwc9cly1 with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	fr_char_p: 0.10116489699879488
[34m[1mwandb[0m: 	fr_word_p: 0.2114979131069878
[34m[1mwandb[0m: 	learning_rate: 0.000117418264348541
[34m[1mwandb[0m: 	random_state: 70
[34m[1mwandb[0m: 	weight_decay: 0




Epoch,Training Loss,Validation Loss
1,1.4136,0.863103


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.8631
eval/runtime,2.7521
eval/samples_per_second,29.795
eval/steps_per_second,6.177
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.4136
train/total_flos,216590170752000.0
train/train_loss,1.41359


------------------

## Wolof to french

The only thing that we will change is the order of sentences. The wolof sentence is the first one to write.

### Configure dataset 🔠

We can use the same custom dataset that we created in [text_augmentation](text_augmentation.ipynb). But we need to split the data between train and test sets and save them.

In [8]:
def split_data(random_state: int = 50):
  """Split the extracted corpora into a train set and a test set and save both.

  Reads `new_data/sent_extraction.csv` under the global `path`, holds out 10%
  of the rows for testing and writes the two parts to `train_set.csv` and
  `test_set.csv` in the same directory (without the index column).

  Args:
    random_state: Seed forwarded to `train_test_split`.
  """
  # read the full corpora
  sentences = pd.read_csv(f"{path}new_data/sent_extraction.csv")

  # 90% / 10% split controlled by the seed
  subsets = train_test_split(sentences, test_size=0.1, random_state=random_state)

  # destination files, in the same order as the subsets (train first, test second)
  destinations = (f"{path}new_data/train_set.csv", f"{path}new_data/test_set.csv")

  # let us save the sets
  for subset, destination in zip(subsets, destinations):
    subset.to_csv(destination, index=False)

Let us load the datasets.

In [9]:
def recuperate_datasets(wf_char_p: float, wf_word_p):
  """Create the augmented training dataset and the test dataset.

  Both datasets read their CSV file from `new_data/` under the global `path`,
  share the same tokenizer file and use `wolof_corpus` as first corpus and
  `french_corpus` as second corpus. Only the training dataset receives the
  augmentation pipeline (as `cp1_transformer`).

  Args:
    wf_char_p: Value passed to `nac.KeyboardAug` as `aug_char_p`.
    wf_word_p: Value passed to `nac.KeyboardAug` as `aug_word_p`
      (presumably a float like `wf_char_p`).

  Returns:
    A tuple `(train_dataset, test_dataset)`.
  """
  # settings that are identical for the train and the test datasets
  shared_options = dict(
      tokenizer_path = f"{path}wolof-translate/wolof_translate/tokenizers/tokenizer_v1.json",
      corpus_1="wolof_corpus",
      corpus_2="french_corpus",
      truncation=True,
      max_len=579,
  )

  # augmentation pipeline: keyboard augmenter chained with the two clean-up callables
  keyboard_augmenter = nac.KeyboardAug(aug_char_p=wf_char_p, aug_word_p=wf_word_p)
  augmentation_pipeline = TransformerSequences(keyboard_augmenter,
                                               remove_mark_space,
                                               delete_guillemet_space)

  # training data goes through the augmentation pipeline
  augmented_train = SentenceDataset(f"{path}new_data/train_set.csv",
                                    cp1_transformer=augmentation_pipeline,
                                    **shared_options)

  # test data is built without any transformer
  plain_test = SentenceDataset(f"{path}new_data/test_set.csv", **shared_options)

  return augmented_train, plain_test

### Configure hyperparameter search ⚙️

We have to configure the search space and the search method ("bayes", i.e. Bayesian optimization, in our case).

In [10]:
import wandb
# Authenticate with Weights & Biases. No key is passed on purpose: when called
# without arguments, wandb.login() uses the WANDB_API_KEY environment variable
# (set in the environment cell at the top of the notebook), so the secret is
# not repeated in the code.
wandb.login()

# hyperparameters
sweep_config = {
    # Bayesian optimization over the search space below
    'method': 'bayes',
    'metric': {
        'goal': 'minimize',
        # The Trainer's W&B integration reports the validation loss as
        # "eval/loss" (see the run summaries printed by the sweep), so the
        # sweep must target that exact name to be able to optimize it.
        'name': 'eval/loss'
    },
    'parameters': {
        # fixed: every trial trains for a single epoch
        'epochs': {
            'value': 1
        },
        'batch_size': {
            'values': [2, 3, 5]
        },
        # sampled log-uniformly between the two bounds
        'learning_rate': {
            'distribution': 'log_uniform_values',
            'min': 1e-5,
            'max': 1e-3
        },
        'weight_decay': {
            'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
        },
        # augmentation probabilities forwarded to recuperate_datasets
        'wf_char_p': {
            'min': 0.0,
            'max': 0.7
        },
        'wf_word_p': {
            'min': 0.0,
            'max': 0.7
        },
        # seed of the train/test split
        'random_state': {
            'values': [0, 10, 20, 30, 40, 50, 60, 70, 80, 100]
        }
    }
}

# Initialize the hyperparameter search
sweep_id = wandb.sweep(sweep_config, project = "gpt2-wolof-french-translation_bayes1_1")



[34m[1mwandb[0m: Currently logged in as: [33moumar-kane[0m ([33moumar-kane-team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: alygo14y
Sweep URL: https://wandb.ai/oumar-kane-team/gpt2-wolof-french-translation_bayes1_1/sweeps/alygo14y


### Configure the model and the evaluation function ⚙️

Let us load the pre-trained model and resize its token embeddings.

In [11]:
def gpt2_model_init(tokenizer):
  """Load the pre-trained GPT-2 language model and fit its embeddings to `tokenizer`.

  Args:
    tokenizer: Tokenizer of the dataset; only its length (`len(tokenizer)`) is
      used, as the new size of the token embedding matrix.

  Returns:
    The `GPT2LMHeadModel` with resized token embeddings, moved to the GPU when
    one is available.
  """
  # set the model name
  model_name = "gpt2"

  # configure the model
  model = GPT2LMHeadModel.from_pretrained(model_name)

  # move the model to the GPU only if there is one, instead of failing on a
  # CPU-only runtime
  if torch.cuda.is_available():
    model = model.cuda()

  # resize the token embeddings
  model.resize_token_embeddings(len(tokenizer))

  return model

Let us evaluate the predictions with the `bleu` metric.

In [12]:
# %%writefile wolof-translate/wolof_translate/utils/evaluation.py
from tokenizers import Tokenizer
from typing import *
import numpy as np
import evaluate

class TranslationEvaluation:
    """Score translation predictions with a BLEU metric and log the result to W&B.

    Token ids are converted back to text either with the tokenizer's
    `batch_decode` method or, when provided, with a custom `decoder` callable.
    """

    def __init__(self,
                 tokenizer: Tokenizer,
                 decoder: Union[Callable, None] = None,
                 metric = None,
                 ):
        """Store the decoding tools and the metric.

        Args:
            tokenizer: Tokenizer used to decode ids when no `decoder` is given.
                NOTE(review): `compute_metrics` calls `batch_decode` and reads
                `pad_token_id` on this object; confirm that the tokenizer passed
                here provides both.
            decoder: Optional callable taking a batch of ids and returning an
                iterable of strings. When set, it replaces the tokenizer for
                decoding.
            metric: Object with a `compute(predictions=..., references=...)`
                method whose result contains a "score" entry. When omitted,
                `evaluate.load('sacrebleu')` is loaded at instantiation, so
                defining or importing the class no longer triggers a download.
        """
        self.tokenizer = tokenizer

        self.decoder = decoder

        self.metric = evaluate.load('sacrebleu') if metric is None else metric

    @staticmethod
    def _to_cpu(values):
        """Detach tensor-like objects and move them to the CPU; return others unchanged."""
        if hasattr(values, "detach"):

            return values.detach().cpu()

        return values

    def postprocess_text(self, preds, labels):
        """Strip surrounding whitespace and wrap each label in its own list.

        Args:
            preds: Iterable of predicted sentences (strings).
            labels: Iterable of reference sentences (strings).

        Returns:
            A tuple `(preds, labels)` where `preds` is a list of stripped strings
            and `labels` is a list of one-element lists of stripped strings.
        """
        preds = [pred.strip() for pred in preds]

        labels = [[label.strip()] for label in labels]

        return preds, labels

    def compute_metrics(self, eval_preds):
        """Decode predictions and labels, compute BLEU and log it to W&B.

        Args:
            eval_preds: Pair `(predictions, labels)` of token ids, or an object
                that unpacks to such a pair. `predictions` may itself be a
                tuple, in which case its first element is used.

        Returns:
            A dict with the rounded "bleu" score and, when the tokenizer is used
            for decoding, the mean prediction length under "gen_len".
        """
        # unpack first: the previous code read `labels` before it was assigned
        preds, labels = eval_preds

        if isinstance(preds, tuple):

            preds = preds[0]

        # tensors are detached and moved to the CPU, other arrays are kept as they are
        preds, labels = self._to_cpu(preds), self._to_cpu(labels)

        if self.decoder is None:

            decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)

            decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = self.postprocess_text(decoded_preds, decoded_labels)

            result = self.metric.compute(predictions=decoded_preds, references=decoded_labels)

            result = {"bleu": result["score"]}

            # number of non-padding tokens in each prediction
            prediction_lens = [np.count_nonzero(pred != self.tokenizer.pad_token_id) for pred in preds]

            result["gen_len"] = np.mean(prediction_lens)

        else:

            decoded_preds = list(self.decoder(preds))

            decoded_labels = list(self.decoder(labels))

            # score the post-processed sentences, exactly as in the tokenizer branch
            decoded_preds, decoded_labels = self.postprocess_text(decoded_preds, decoded_labels)

            result = self.metric.compute(predictions=decoded_preds, references=decoded_labels)

            result = {"bleu": result["score"]}

        result = {k: round(v, 4) for k, v in result.items()}

        # wandb.log expects a dictionary of metrics
        wandb.log({"bleu": result["bleu"]})

        return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [13]:
# %run wolof-translate/wolof_translate/utils/evaluation.py

Let us initialize the evaluation object.

In [14]:
# translation_eval = TranslationEvaluation(test_dataset.tokenizer)

### Searching for the best parameters 🕖

Let us define the data collator.

In [15]:
def data_collator(batch):
    """Assemble dataset samples into the batch dictionary given to the trainer.

    Args:
        batch: Sequence of samples; element 0 of each sample is stacked as the
            token ids and element 1 as the attention mask (both must be
            tensors that `torch.stack` can combine).

    Returns:
        dict: `input_ids`, `attention_mask` and `labels`; `labels` is a second,
        separate stack of the token ids.
    """
    token_ids = [sample[0] for sample in batch]

    masks = [sample[1] for sample in batch]

    return {
        'input_ids': torch.stack(token_ids),
        'attention_mask': torch.stack(masks),
        # stacked again so that labels and input_ids are distinct tensors
        'labels': torch.stack(token_ids),
    }

Let us initialize the training arguments and run the Bayesian hyperparameter search.

In [16]:
# %%wandb

def train(config = None):
  """Run one trial of the sweep: rebuild the data, then fine-tune and evaluate GPT-2.

  The function opens a W&B run, reads the hyperparameters of the trial from
  `wandb.config`, regenerates the train/test split and the datasets with them,
  and trains with the Hugging Face `Trainer`. It is the function handed to
  `wandb.agent`.

  Args:
    config: Optional configuration forwarded to `wandb.init`.
  """
  with wandb.init(config = config):

    # seed
    torch.manual_seed(50)

    # hyperparameters of the current trial
    run_config = wandb.config

    # write the train/test CSV files for the split seed of this trial
    split_data(run_config.random_state)

    # build the datasets with the augmentation probabilities of this trial
    train_dataset, test_dataset = recuperate_datasets(run_config.wf_char_p, run_config.wf_word_p)

    # training options: evaluation, checkpointing and logging happen once per epoch
    training_options = dict(
        report_to = "wandb",
        num_train_epochs=run_config.epochs,
        load_best_model_at_end=True,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        logging_strategy = 'epoch',
        per_device_train_batch_size=run_config.batch_size,
        per_device_eval_batch_size=5,
        learning_rate=run_config.learning_rate,
        weight_decay=run_config.weight_decay,
        remove_unused_columns = False,
        fp16 = True,
    )

    training_args = TrainingArguments(f"{path}training2/Results1", **training_options)

    # the model is created by the trainer from the tokenizer of the training set;
    # no compute_metrics is given, so only the losses are reported
    trainer = Trainer(
        model_init=partial(gpt2_model_init, tokenizer = train_dataset.tokenizer),
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=data_collator,
    )

    # start training loop
    trainer.train()

# Launch the sweep agent: it calls `train` once per trial of the sweep
# identified by `sweep_id`, for at most 25 trials.
agent = wandb.agent(sweep_id, train, count = 25)


[34m[1mwandb[0m: Agent Starting Run: a0u0t6k2 with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 6.702369179262155e-05
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.4
[34m[1mwandb[0m: 	wf_char_p: 0.2819068695463206
[34m[1mwandb[0m: 	wf_word_p: 0.3271474379445852


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,1.3314,0.972646


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.97265
eval/runtime,2.7358
eval/samples_per_second,29.973
eval/steps_per_second,6.214
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.3314
train/total_flos,216590170752000.0
train/train_loss,1.33135


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8p97mqyj with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.000687378518112751
[34m[1mwandb[0m: 	random_state: 20
[34m[1mwandb[0m: 	weight_decay: 0.2
[34m[1mwandb[0m: 	wf_char_p: 0.10242928904824668
[34m[1mwandb[0m: 	wf_word_p: 0.3761238934195836




Epoch,Training Loss,Validation Loss
1,1.2118,0.902809


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.90281
eval/runtime,2.7187
eval/samples_per_second,30.162
eval/steps_per_second,6.253
train/epoch,1.0
train/global_step,367.0
train/learning_rate,1e-05
train/loss,1.2118
train/total_flos,216590170752000.0
train/train_loss,1.21175


[34m[1mwandb[0m: Agent Starting Run: cufx9n8t with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1.226951404890168e-05
[34m[1mwandb[0m: 	random_state: 70
[34m[1mwandb[0m: 	weight_decay: 0.2
[34m[1mwandb[0m: 	wf_char_p: 0.23759176734591644
[34m[1mwandb[0m: 	wf_word_p: 0.13227163155096622




Epoch,Training Loss,Validation Loss
1,1.5713,0.956762


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.95676
eval/runtime,2.7129
eval/samples_per_second,30.226
eval/steps_per_second,6.266
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.5713
train/total_flos,216590170752000.0
train/train_loss,1.57131


[34m[1mwandb[0m: Agent Starting Run: y2zo0avu with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0007890211017920526
[34m[1mwandb[0m: 	random_state: 20
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	wf_char_p: 0.2153084724244564
[34m[1mwandb[0m: 	wf_word_p: 0.03750263309251056




Epoch,Training Loss,Validation Loss
1,1.3947,0.852979


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.85298
eval/runtime,2.7128
eval/samples_per_second,30.227
eval/steps_per_second,6.267
train/epoch,1.0
train/global_step,147.0
train/learning_rate,4e-05
train/loss,1.3947
train/total_flos,216590170752000.0
train/train_loss,1.39468


[34m[1mwandb[0m: Agent Starting Run: 6zl4nshz with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.00013060682353049685
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: 	wf_char_p: 0.6581358201308465
[34m[1mwandb[0m: 	wf_word_p: 0.010288732393246668




Epoch,Training Loss,Validation Loss
1,1.1252,0.843749


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.84375
eval/runtime,2.7146
eval/samples_per_second,30.207
eval/steps_per_second,6.262
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.1252
train/total_flos,216590170752000.0
train/train_loss,1.12516


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lmlh43bi with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 4.6544061486102166e-05
[34m[1mwandb[0m: 	random_state: 20
[34m[1mwandb[0m: 	weight_decay: 0.4
[34m[1mwandb[0m: 	wf_char_p: 0.601761705072325
[34m[1mwandb[0m: 	wf_word_p: 0.5485382582443594
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,1.5112,0.959907


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.95991
eval/runtime,2.7127
eval/samples_per_second,30.228
eval/steps_per_second,6.267
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.5112
train/total_flos,216590170752000.0
train/train_loss,1.5112


[34m[1mwandb[0m: Agent Starting Run: b3709lrr with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 8.01812368676995e-05
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: 	wf_char_p: 0.3414514505517595
[34m[1mwandb[0m: 	wf_word_p: 0.06590909422021479




Epoch,Training Loss,Validation Loss
1,1.2984,0.943387


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.94339
eval/runtime,2.7141
eval/samples_per_second,30.212
eval/steps_per_second,6.264
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.2984
train/total_flos,216590170752000.0
train/train_loss,1.2984


[34m[1mwandb[0m: Agent Starting Run: ixh1wkga with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 7.665404061522188e-05
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	wf_char_p: 0.2025899023149373
[34m[1mwandb[0m: 	wf_word_p: 0.30403171896970477




Epoch,Training Loss,Validation Loss
1,1.6004,0.898786


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.89879
eval/runtime,2.7076
eval/samples_per_second,30.285
eval/steps_per_second,6.279
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.6004
train/total_flos,216590170752000.0
train/train_loss,1.60037


[34m[1mwandb[0m: Agent Starting Run: yqxwql6m with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0009940796140095907
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.4
[34m[1mwandb[0m: 	wf_char_p: 0.5740543904824967
[34m[1mwandb[0m: 	wf_word_p: 0.2462419315938406




Epoch,Training Loss,Validation Loss
1,1.2845,0.848199


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.8482
eval/runtime,2.7189
eval/samples_per_second,30.159
eval/steps_per_second,6.253
train/epoch,1.0
train/global_step,367.0
train/learning_rate,2e-05
train/loss,1.2845
train/total_flos,216590170752000.0
train/train_loss,1.28448


[34m[1mwandb[0m: Agent Starting Run: 6z8kvttx with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 3.136396657280805e-05
[34m[1mwandb[0m: 	random_state: 80
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: 	wf_char_p: 0.496992386293468
[34m[1mwandb[0m: 	wf_word_p: 0.09128048050662484


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668863250000263, max=1.0…



Epoch,Training Loss,Validation Loss
1,1.6337,0.925104


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.9251
eval/runtime,2.7135
eval/samples_per_second,30.22
eval/steps_per_second,6.265
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.6337
train/total_flos,216590170752000.0
train/train_loss,1.63375


[34m[1mwandb[0m: Agent Starting Run: m9sb4aqc with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 2.292925960874299e-05
[34m[1mwandb[0m: 	random_state: 80
[34m[1mwandb[0m: 	weight_decay: 0.2
[34m[1mwandb[0m: 	wf_char_p: 0.1392332134258406
[34m[1mwandb[0m: 	wf_word_p: 0.33276367556881936




Epoch,Training Loss,Validation Loss
1,1.3772,0.921299


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.9213
eval/runtime,2.7224
eval/samples_per_second,30.12
eval/steps_per_second,6.244
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.3772
train/total_flos,216590170752000.0
train/train_loss,1.37722


[34m[1mwandb[0m: Agent Starting Run: jbput8ui with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 6.444484556126347e-05
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	wf_char_p: 0.29893712569494857
[34m[1mwandb[0m: 	wf_word_p: 0.26117783521919574




Epoch,Training Loss,Validation Loss
1,1.4321,0.896062


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.89606
eval/runtime,2.708
eval/samples_per_second,30.281
eval/steps_per_second,6.278
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.4321
train/total_flos,216590170752000.0
train/train_loss,1.43209


[34m[1mwandb[0m: Agent Starting Run: 4fwhdrrr with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 2.5593238990374552e-05
[34m[1mwandb[0m: 	random_state: 0
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	wf_char_p: 0.2943749543935819
[34m[1mwandb[0m: 	wf_word_p: 0.12398175089457102




Epoch,Training Loss,Validation Loss
1,1.6999,0.85214


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.85214
eval/runtime,2.7191
eval/samples_per_second,30.157
eval/steps_per_second,6.252
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.6999
train/total_flos,216590170752000.0
train/train_loss,1.69991


[34m[1mwandb[0m: Agent Starting Run: 1bz3dbic with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 3.890480473611398e-05
[34m[1mwandb[0m: 	random_state: 10
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: 	wf_char_p: 0.2378445346533837
[34m[1mwandb[0m: 	wf_word_p: 0.4209643755950993




Epoch,Training Loss,Validation Loss
1,1.4832,0.875917


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.87592
eval/runtime,2.7525
eval/samples_per_second,29.792
eval/steps_per_second,6.176
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.4832
train/total_flos,216590170752000.0
train/train_loss,1.48315


[34m[1mwandb[0m: Agent Starting Run: ho6ta14m with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0002774553527447285
[34m[1mwandb[0m: 	random_state: 40
[34m[1mwandb[0m: 	weight_decay: 0.2
[34m[1mwandb[0m: 	wf_char_p: 0.6714497099264989
[34m[1mwandb[0m: 	wf_word_p: 0.03656663081592687




Epoch,Training Loss,Validation Loss
1,1.1247,0.822917


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.82292
eval/runtime,2.7172
eval/samples_per_second,30.178
eval/steps_per_second,6.256
train/epoch,1.0
train/global_step,367.0
train/learning_rate,1e-05
train/loss,1.1247
train/total_flos,216590170752000.0
train/train_loss,1.12474


[34m[1mwandb[0m: Agent Starting Run: wmlfiw0r with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.0003923791647223955
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.2
[34m[1mwandb[0m: 	wf_char_p: 0.06807537344840917
[34m[1mwandb[0m: 	wf_word_p: 0.5604330614348828




Epoch,Training Loss,Validation Loss
1,1.5006,0.949637


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.94964
eval/runtime,2.7103
eval/samples_per_second,30.255
eval/steps_per_second,6.272
train/epoch,1.0
train/global_step,147.0
train/learning_rate,2e-05
train/loss,1.5006
train/total_flos,216590170752000.0
train/train_loss,1.5006


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: r8gcrole with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 1.821341363881095e-05
[34m[1mwandb[0m: 	random_state: 50
[34m[1mwandb[0m: 	weight_decay: 0.3
[34m[1mwandb[0m: 	wf_char_p: 0.4377052667800509
[34m[1mwandb[0m: 	wf_word_p: 0.28337028938356845




Epoch,Training Loss,Validation Loss
1,1.8462,0.981831


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.98183
eval/runtime,2.7136
eval/samples_per_second,30.218
eval/steps_per_second,6.265
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.8462
train/total_flos,216590170752000.0
train/train_loss,1.8462


[34m[1mwandb[0m: Agent Starting Run: phwfj18n with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 7.811279212646426e-05
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	wf_char_p: 0.2317434997460037
[34m[1mwandb[0m: 	wf_word_p: 0.4361895012572056




Epoch,Training Loss,Validation Loss
1,1.6396,0.980084


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.98008
eval/runtime,2.767
eval/samples_per_second,29.635
eval/steps_per_second,6.144
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.6396
train/total_flos,216590170752000.0
train/train_loss,1.63955


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jlqkadbj with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.00010056956001033137
[34m[1mwandb[0m: 	random_state: 10
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	wf_char_p: 0.6390337876205812
[34m[1mwandb[0m: 	wf_word_p: 0.23792329132405943




Epoch,Training Loss,Validation Loss
1,1.6119,0.873367


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.87337
eval/runtime,2.7126
eval/samples_per_second,30.23
eval/steps_per_second,6.267
train/epoch,1.0
train/global_step,147.0
train/learning_rate,1e-05
train/loss,1.6119
train/total_flos,216590170752000.0
train/train_loss,1.61191


[34m[1mwandb[0m: Agent Starting Run: 6p270685 with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 7.129742665577746e-05
[34m[1mwandb[0m: 	random_state: 20
[34m[1mwandb[0m: 	weight_decay: 0.5
[34m[1mwandb[0m: 	wf_char_p: 0.39725467091195105
[34m[1mwandb[0m: 	wf_word_p: 0.5004309074101329




Epoch,Training Loss,Validation Loss
1,1.4593,0.955068


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.95507
eval/runtime,2.7243
eval/samples_per_second,30.099
eval/steps_per_second,6.24
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.4593
train/total_flos,216590170752000.0
train/train_loss,1.45931


[34m[1mwandb[0m: Agent Starting Run: gcv6axp9 with config:
[34m[1mwandb[0m: 	batch_size: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 6.252438217932992e-05
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: 	wf_char_p: 0.29046645972247725
[34m[1mwandb[0m: 	wf_word_p: 0.2600785813543463




Epoch,Training Loss,Validation Loss
1,1.3169,0.971379


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.97138
eval/runtime,2.7496
eval/samples_per_second,29.823
eval/steps_per_second,6.183
train/epoch,1.0
train/global_step,367.0
train/learning_rate,0.0
train/loss,1.3169
train/total_flos,216590170752000.0
train/train_loss,1.31693


[34m[1mwandb[0m: Agent Starting Run: yhyy9e23 with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 3.647196100197014e-05
[34m[1mwandb[0m: 	random_state: 10
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: 	wf_char_p: 0.1717366963251063
[34m[1mwandb[0m: 	wf_word_p: 0.1316624948710261




Epoch,Training Loss,Validation Loss
1,1.3969,0.856822


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.85682
eval/runtime,2.7137
eval/samples_per_second,30.217
eval/steps_per_second,6.265
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.3969
train/total_flos,216590170752000.0
train/train_loss,1.3969


[34m[1mwandb[0m: Agent Starting Run: xup68ew4 with config:
[34m[1mwandb[0m: 	batch_size: 5
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 3.637652455538814e-05
[34m[1mwandb[0m: 	random_state: 60
[34m[1mwandb[0m: 	weight_decay: 0.3
[34m[1mwandb[0m: 	wf_char_p: 0.677355252029728
[34m[1mwandb[0m: 	wf_word_p: 0.16163588455481578




Epoch,Training Loss,Validation Loss
1,1.6871,0.952795


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.9528
eval/runtime,2.7376
eval/samples_per_second,29.953
eval/steps_per_second,6.21
train/epoch,1.0
train/global_step,147.0
train/learning_rate,0.0
train/loss,1.6871
train/total_flos,216590170752000.0
train/train_loss,1.68711


[34m[1mwandb[0m: Agent Starting Run: wa2re6yd with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 0.00015372099844614283
[34m[1mwandb[0m: 	random_state: 0
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: 	wf_char_p: 0.4593404124892092
[34m[1mwandb[0m: 	wf_word_p: 0.5384324544438147




Epoch,Training Loss,Validation Loss
1,1.4028,0.823874


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.82387
eval/runtime,2.7275
eval/samples_per_second,30.064
eval/steps_per_second,6.233
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.4028
train/total_flos,216590170752000.0
train/train_loss,1.40277


[34m[1mwandb[0m: Agent Starting Run: i305ffth with config:
[34m[1mwandb[0m: 	batch_size: 3
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	learning_rate: 4.284698923411304e-05
[34m[1mwandb[0m: 	random_state: 30
[34m[1mwandb[0m: 	weight_decay: 0.4
[34m[1mwandb[0m: 	wf_char_p: 0.6295786463189464
[34m[1mwandb[0m: 	wf_word_p: 0.6968639258681786




Epoch,Training Loss,Validation Loss
1,1.5093,0.986804


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁
train/global_step,▁▁▁
train/learning_rate,▁
train/loss,▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.9868
eval/runtime,2.7162
eval/samples_per_second,30.189
eval/steps_per_second,6.259
train/epoch,1.0
train/global_step,245.0
train/learning_rate,0.0
train/loss,1.5093
train/total_flos,216590170752000.0
train/train_loss,1.5093


-----------

## Colab download and remove step

In [2]:
import shutil

# shutil.rmtree('/content/drive/MyDrive/Memoire/subject2/training2/Results1')
# shutil.rmtree('wandb')
# shutil.make_archive('wandb', 'zip', 'wandb')