# 🤗 HuggingFace Neural Network Notebook - Hyperparameter Tuning

## Version History

### 1.0
Copied from CV Split training notebook.

## Setup

### Environment Variables

In [1]:
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


### Import Libraries

In [2]:
import os
import random
from typing import Tuple, List
import warnings
import pickle as pkl

from dotenv import load_dotenv
import wandb
from tqdm import tqdm
import torch
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from datasets import Dataset
from tokenizers import AddedToken
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    cohen_kappa_score,
)

In [3]:
os.chdir("../../")
warnings.simplefilter('ignore')
load_dotenv()

True

In [4]:
from lib.criterion.metrics import log_metrics
from lib.utils.find_threshold import find_thresholds

### Paths

In [5]:
class Paths:
    # Competition data with added topic column
    train_csv: str = "data/processed/train.csv"
    test_csv: str = "data/processed/test.csv"

    # Output path
    output_path: str = "output/model_dir_ht"
    model_path: str = os.path.join(output_path, "{model_name}")
    tokenizer_path: str =  os.path.join(model_path, "{model_name}_tokenizer")
    threshold_path: str = os.path.join(model_path, "threshold.pkl")
    logging_path: str = os.path.join(model_path, "logging")

### Configurations

In [6]:
DEBUG = False
DO_SLIDING_WINDOW = True
DO_REGRESSION = True

In [7]:
class CFG:
    backbone_model: str = 'microsoft/deberta-v3-xsmall'
    max_length: int = 512
    num_labels: int = 6
    num_workers: int = 6
    seed: int = 20
    stride_length: int = 384

In [8]:
if DEBUG:
    CFG.num_epochs = 1

In [9]:
MODEL_NAME = CFG.backbone_model.split("/")[-1]

### Setting Random Seed

In [10]:
def seed_everything() -> None:
    """Seed everything to ensure reproducibility

    Sources:
    1. https://www.kaggle.com/code/alejopaullier/aes-2-multi-class-classification-train
    2. https://www.kaggle.com/code/hashidoyuto/deberta-baseline-aes2-0-train
    """
    random.seed(CFG.seed)
    os.environ["PYTHONHASHCFG.SEED"] = str(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)
    torch.cuda.manual_seed(CFG.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything()

### Sweep Configuration

In [11]:
sweep_config = {
    # How to perform hyperparameter tuning
    "method": "random",
    # How to evaluate which hyperparameter combination is good
    "metric": {
        "name": "QWK",
        "goal": "maximize",
    },
    # Hyperparameters to tune
    "parameters": {
        # Hyperparameters that will change
        "lr": {"distribution": "uniform", "min": 1e-5, "max": 1e-3},
        "weight_decay": {"distribution": "uniform", "min": 0.01, "max": 0.1},
        "num_epochs": {"values": [3, 4, 5]},
        "warmup_ratio": {"distribution": "uniform", "min": 0.01, "max": 0.1},
        "lr_scheduler_type": {"values": ["cosine", "linear"]},
        "batch_size": {"values": [8, 16, 32]},
    },
}

### WandB setup

In [12]:
WANDB_PROJECT = "Kaggle_ASE_2.0"
EXPERIMENT = f"ASE-sweep-001"

In [13]:
wandb.login(key=os.environ.get('WANDB_API_KEY'))
sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m ([33mlaplacesdemon43[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ishfar/.netrc


Create sweep with ID: norrpezc
Sweep URL: https://wandb.ai/laplacesdemon43/Kaggle_ASE_2.0/sweeps/norrpezc


## Data Preparation

### Tokenizer

In [14]:
class Tokenize(object):
    def __init__(self, train, valid, test, tokenizer):
        self.tokenizer = tokenizer
        self.train = train
        self.valid = valid
        self.test = test

    def get_dataset(self, df):
        ds = Dataset.from_dict(
            {
                "essay_id": [e for e in df["essay_id"]],
                "full_text": [ft for ft in df["full_text"]],
                "label": [s for s in df["label"]],
            }
        )
        return ds

    def tokenize_function(self, example):
        tokenized_inputs = self.tokenizer(
            example["full_text"],
            truncation=True,
            max_length=CFG.max_length,
            padding="max_length",
            
        )
        return tokenized_inputs

    def __call__(self):
        train_ds = self.get_dataset(self.train)
        valid_ds = self.get_dataset(self.valid)
        test_ds = self.get_dataset(self.test)

        tokenized_train = train_ds.map(self.tokenize_function, batched=True)
        tokenized_valid = valid_ds.map(self.tokenize_function, batched=True)
        tokenized_test = test_ds.map(self.tokenize_function, batched=True)

        return tokenized_train, tokenized_valid, tokenized_test, self.tokenizer

In [15]:
tokenizer = AutoTokenizer.from_pretrained(CFG.backbone_model)
tokenizer.add_tokens([AddedToken("\n", normalized=False)])
tokenizer.add_tokens([AddedToken(" " * 2, normalized=False)])
tokenizer.save_pretrained(Paths.tokenizer_path.format(model_name=MODEL_NAME))

('output/model_dir_ht/deberta-v3-xsmall/deberta-v3-xsmall_tokenizer/tokenizer_config.json',
 'output/model_dir_ht/deberta-v3-xsmall/deberta-v3-xsmall_tokenizer/special_tokens_map.json',
 'output/model_dir_ht/deberta-v3-xsmall/deberta-v3-xsmall_tokenizer/spm.model',
 'output/model_dir_ht/deberta-v3-xsmall/deberta-v3-xsmall_tokenizer/added_tokens.json',
 'output/model_dir_ht/deberta-v3-xsmall/deberta-v3-xsmall_tokenizer/tokenizer.json')

### Data Preprocessing

1. Convert `label` to be in `scores` which are processed to be in range $[0-5]$.
2. Convert `label` data type based on whether we are doing regression or classification.

In [16]:
data = pd.read_csv(Paths.train_csv, low_memory=False)
print("Training dataset has shape:", data.shape)

data["label"] = data["score"].map(lambda x: x- 1)

if DO_REGRESSION:
    data["label"] = data["label"].astype(pd.Float32Dtype())
else:
    data["label"] = data["label"].astype(pd.Int32Dtype())

data.head(3)

Training dataset has shape: (17307, 4)


Unnamed: 0,essay_id,full_text,score,topic,label
0,000d118,Many people have car where they live. The thin...,3,5,2.0
1,000fe60,I am a scientist at NASA that is discussing th...,3,3,2.0
2,001ab80,People always wish they had the same technolog...,4,0,3.0


### Train-Valid-Test Splitting

In [17]:
train_data, test_data  = train_test_split(
    data,
    test_size=0.1,
    shuffle=True,
    stratify=data["label"],
    random_state=CFG.seed,
)
train_data, valid_data  = train_test_split(
    train_data,
    test_size=0.1,
    shuffle=True,
    stratify=train_data["label"],
    random_state=CFG.seed,
)

# Only use 20% of training dataset ofr hyper parameter tuning
train_data, _ = train_test_split(
    data,
    test_size=0.8,
    shuffle=True,
    stratify=data["label"],
    random_state=CFG.seed,
)

print("Shapes of train, valid and test data:", train_data.shape, valid_data.shape, test_data.shape)

Shapes of train, valid and test data: (3461, 5) (1558, 5) (1731, 5)


### Sliding Window

Essays can have varying lengths. Instead of truncating, see the entire essay as windows of length `CFG.max_length` which are strided with `CFG.stride_length`.

In [18]:
def construct_new_row(old_row, text):
    new_row = {key: old_row[key] for key in old_row.keys() if key != "index"}
    new_row["full_text"] = text
    return new_row

In [19]:
def split_tokens(tokens, stride):
    """Splits `tokens` into multiple sequences that have at most
    `CFG.max_length` tokens. Uses `CFG.stride` for sliding
    window.

    Args:
        tokens (List): List of tokens.
        stride (int): Stride length.

    Returns:
        List[List[int]]: List of split token sequences.
    """
    start = 0
    sequence_list = []

    while start < len(tokens):
        remaining_tokens = len(tokens) - start

        if remaining_tokens < CFG.max_length and start > 0:
            start = max(0, len(tokens) - CFG.max_length)

        end = min(start + CFG.max_length, len(tokens))
        sequence_list.append(tokens[start:end])

        if remaining_tokens >= CFG.max_length:
            start += stride
        else:
            break

    return sequence_list

In [20]:
def sliding_window(df, tokenizer):
    """Splits rows of `df` so that each row's text has at most
    `CFG.max_length` number of tokens.

    Args:
        df (pd.DataFrame): Input data frame.
        tokenizer (_type_): Tokenizer used to encode and decode text.

    Returns:
        pd.DataFrame: Newly constructed dataframe.
    """

    new_df = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        tokens = tokenizer.encode(row["full_text"], add_special_tokens=False)

        if len(tokens) <= CFG.max_length:
            new_df.append(construct_new_row(row, row["full_text"]))
        else:
            sequence_list = split_tokens(tokens, CFG.stride_length)

            for seq in sequence_list:
                new_df.append(
                    construct_new_row(
                        row,
                        tokenizer.decode(seq, skip_special_tokens=True),
                    )
                )

    return pd.DataFrame(new_df)

In [21]:
if DO_SLIDING_WINDOW:
    train_data = sliding_window(train_data, tokenizer)
    valid_data = sliding_window(valid_data, tokenizer)
    test_data = sliding_window(test_data, tokenizer)
    print("Shapes of train, valid and test data:", train_data.shape, valid_data.shape, test_data.shape)

100%|██████████| 3461/3461 [00:06<00:00, 569.83it/s]
100%|██████████| 1558/1558 [00:02<00:00, 576.64it/s]
100%|██████████| 1731/1731 [00:03<00:00, 571.57it/s]

Shapes of train, valid and test data: (4568, 5) (2054, 5) (2267, 5)





## Metrics

### QWK Score Calculation

In [22]:
def compute_metrics_for_regression(eval_pred):
    predictions, labels = eval_pred

    qwk = cohen_kappa_score(
        labels,
        predictions.clip(0, 5).round(0),
        weights="quadratic",
    )

    return {"qwk": qwk}

In [23]:
def compute_metrics_for_classification(eval_pred):
    predictions, labels = eval_pred

    qwk = cohen_kappa_score(
        labels,
        predictions.argmax(-1),
        weights="quadratic",
    )
    
    return {"qwk": qwk}

In [24]:
if DO_REGRESSION:
    compute_metrics = compute_metrics_for_regression
else:
    compute_metrics = compute_metrics_for_classification

## Training

### Utility Functions

In [25]:
def tokenize_data(
    train: pd.DataFrame,
    valid: pd.DataFrame,
    test: pd.DataFrame,
) -> Tuple[Dataset, Dataset, Dataset]:
    tokenize = Tokenize(train, valid, test, tokenizer)
    tokenized_train, tokenized_valid, tokenized_test, _ = tokenize()
    return tokenized_train, tokenized_valid, tokenized_test

In [26]:
def configure():
    config = AutoConfig.from_pretrained(CFG.backbone_model)
    
    if DO_REGRESSION:
        config.attention_probs_dropout_prob = 0.0
        config.hidden_dropout_prob = 0.0
        config.num_labels = 1
    else:
        config.num_labels = CFG.num_labels

    return config

In [27]:
def get_model(config):
    backbone_model = AutoModelForSequenceClassification.from_pretrained(
        CFG.backbone_model,
        config=config,
    )
    backbone_model.resize_token_embeddings(len(tokenizer))
    return backbone_model

In [28]:
def post_process_predictions(predictions0):
    if DO_REGRESSION:
        predictions = predictions0.clip(0, 5).round(0)
    else:
        predictions = predictions0.argmax(axis=1)

    return predictions

In [29]:
def score_model(trainer, tokenized_test, test_data):
    y_true = test_data["label"].values
    logits = trainer.predict(tokenized_test).predictions
    y_pred = post_process_predictions(logits)

    score = compute_metrics((y_pred, y_true))["qwk"]
    wandb.log({"QWK": score})

### Training Model

In [30]:
tokenized_train, tokenized_valid, tokenized_test = tokenize_data(
    train_data, valid_data, test_data
)

Map:   0%|          | 0/4568 [00:00<?, ? examples/s]

Map:   0%|          | 0/2054 [00:00<?, ? examples/s]

Map:   0%|          | 0/2267 [00:00<?, ? examples/s]

In [31]:
def main(sweep_config=None):
    with wandb.init(config=sweep_config):
        sweep_config = wandb.config
        seed_everything()

        backbone_model = get_model(configure())

        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        training_args = TrainingArguments(
            data_seed=CFG.seed,
            dataloader_num_workers=CFG.num_workers,
            evaluation_strategy="epoch",
            fp16=True,
            learning_rate=sweep_config.lr,
            load_best_model_at_end=True,
            logging_first_step=True,
            logging_steps=250,
            logging_dir=Paths.logging_path,
            lr_scheduler_type=sweep_config.lr_scheduler_type,
            metric_for_best_model="qwk",
            num_train_epochs=sweep_config.num_epochs,
            output_dir=Paths.output_path,
            optim="adamw_torch",
            per_device_eval_batch_size=sweep_config.batch_size,
            per_device_train_batch_size=sweep_config.batch_size,
            report_to="wandb",
            seed=CFG.seed,
            save_total_limit=1,
            save_strategy="epoch",
            weight_decay=sweep_config.weight_decay,
            warmup_ratio=sweep_config.warmup_ratio,
        )

        trainer = Trainer(
            model=backbone_model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        score_model(trainer, tokenized_test, test_data)

### Initiate Sweep

In [32]:
wandb.agent(sweep_id, main, count=3, project=WANDB_PROJECT)

[34m[1mwandb[0m: Agent Starting Run: uayh872x with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	gradient_checkpointing: False
[34m[1mwandb[0m: 	lr: 0.00047506985383954784
[34m[1mwandb[0m: 	lr_scheduler_type: linear
[34m[1mwandb[0m: 	num_epochs: 3
[34m[1mwandb[0m: 	warmup_ratio: 0.09660615857749676
[34m[1mwandb[0m: 	weight_decay: 0.019338687399429386
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/213 [00:00<?, ?it/s]

{'loss': 7.4293, 'grad_norm': inf, 'learning_rate': 0.0, 'epoch': 0.01}


  0%|          | 0/65 [00:00<?, ?it/s]

{'eval_loss': 0.7312950491905212, 'eval_qwk': 0.6626908640961982, 'eval_runtime': 6.0628, 'eval_samples_per_second': 338.785, 'eval_steps_per_second': 10.721, 'epoch': 0.99}


  0%|          | 0/65 [00:00<?, ?it/s]

{'eval_loss': 0.44686359167099, 'eval_qwk': 0.7349368723919223, 'eval_runtime': 6.158, 'eval_samples_per_second': 333.548, 'eval_steps_per_second': 10.555, 'epoch': 2.0}


  0%|          | 0/65 [00:00<?, ?it/s]

{'eval_loss': 0.3670250475406647, 'eval_qwk': 0.8148729032104691, 'eval_runtime': 6.1966, 'eval_samples_per_second': 331.473, 'eval_steps_per_second': 10.49, 'epoch': 2.98}
{'train_runtime': 122.1398, 'train_samples_per_second': 112.199, 'train_steps_per_second': 1.744, 'train_loss': 0.7531390884112864, 'epoch': 2.98}


  0%|          | 0/71 [00:00<?, ?it/s]

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
QWK,▁
eval/loss,█▃▁
eval/qwk,▁▄█
eval/runtime,▁▆█
eval/samples_per_second,█▃▁
eval/steps_per_second,█▃▁
train/epoch,▁▃▆██
train/global_step,▁▃▆███
train/learning_rate,▁
train/loss,▁

0,1
QWK,0.78819
eval/loss,0.36703
eval/qwk,0.81487
eval/runtime,6.1966
eval/samples_per_second,331.473
eval/steps_per_second,10.49
total_flos,896939113463808.0
train/epoch,2.98
train/global_step,213.0
train/grad_norm,inf


[34m[1mwandb[0m: Agent Starting Run: o2feec7g with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	gradient_checkpointing: True
[34m[1mwandb[0m: 	lr: 0.0007194249121713805
[34m[1mwandb[0m: 	lr_scheduler_type: linear
[34m[1mwandb[0m: 	num_epochs: 5
[34m[1mwandb[0m: 	warmup_ratio: 0.07441723883852884
[34m[1mwandb[0m: 	weight_decay: 0.027249254922248597
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/355 [00:00<?, ?it/s]

{'loss': 7.4293, 'grad_norm': inf, 'learning_rate': 0.0, 'epoch': 0.01}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.267929196357727, 'eval_qwk': 0.0, 'eval_runtime': 6.5418, 'eval_samples_per_second': 313.981, 'eval_steps_per_second': 19.719, 'epoch': 0.99}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.3124651908874512, 'eval_qwk': 0.0, 'eval_runtime': 6.552, 'eval_samples_per_second': 313.491, 'eval_steps_per_second': 19.689, 'epoch': 2.0}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.2289841175079346, 'eval_qwk': 0.0, 'eval_runtime': 6.4543, 'eval_samples_per_second': 318.236, 'eval_steps_per_second': 19.987, 'epoch': 2.99}
{'loss': 1.3705, 'grad_norm': 9.521703720092773, 'learning_rate': 0.00023469044390956621, 'epoch': 3.5}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.2290771007537842, 'eval_qwk': 0.0, 'eval_runtime': 6.4997, 'eval_samples_per_second': 316.015, 'eval_steps_per_second': 19.847, 'epoch': 4.0}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.2292695045471191, 'eval_qwk': 0.0, 'eval_runtime': 6.5284, 'eval_samples_per_second': 314.624, 'eval_steps_per_second': 19.76, 'epoch': 4.97}
{'train_runtime': 280.4352, 'train_samples_per_second': 81.445, 'train_steps_per_second': 1.266, 'train_loss': 1.3453393586924378, 'epoch': 4.97}


  0%|          | 0/142 [00:00<?, ?it/s]

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
QWK,▁
eval/loss,▄█▁▁▁
eval/qwk,▁▁▁▁▁
eval/runtime,▇█▁▄▆
eval/samples_per_second,▂▁█▅▃
eval/steps_per_second,▂▁█▅▃
train/epoch,▁▂▄▅▆▇██
train/global_step,▁▂▄▅▆▇███
train/grad_norm,▁
train/learning_rate,▁█

0,1
QWK,0.0
eval/loss,1.22927
eval/qwk,0.0
eval/runtime,6.5284
eval/samples_per_second,314.624
eval/steps_per_second,19.76
total_flos,1494547194937344.0
train/epoch,4.97
train/global_step,355.0
train/grad_norm,9.5217


[34m[1mwandb[0m: Agent Starting Run: au9eykhx with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	gradient_accumulation_steps: 1
[34m[1mwandb[0m: 	gradient_checkpointing: False
[34m[1mwandb[0m: 	lr: 0.0005739866427008236
[34m[1mwandb[0m: 	lr_scheduler_type: cosine
[34m[1mwandb[0m: 	num_epochs: 3
[34m[1mwandb[0m: 	warmup_ratio: 0.09401645418933266
[34m[1mwandb[0m: 	weight_decay: 0.04653827753363589
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/858 [00:00<?, ?it/s]

{'loss': 7.0427, 'grad_norm': inf, 'learning_rate': 0.0, 'epoch': 0.0}
{'loss': 1.361, 'grad_norm': 3.515922784805298, 'learning_rate': 0.0005117350193690799, 'epoch': 0.87}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.244022011756897, 'eval_qwk': 0.0, 'eval_runtime': 6.5348, 'eval_samples_per_second': 314.315, 'eval_steps_per_second': 19.74, 'epoch': 1.0}
{'loss': 1.2449, 'grad_norm': 5.379490852355957, 'learning_rate': 0.0002551485520621797, 'epoch': 1.75}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.2349462509155273, 'eval_qwk': 0.0, 'eval_runtime': 6.6247, 'eval_samples_per_second': 310.05, 'eval_steps_per_second': 19.472, 'epoch': 2.0}
{'loss': 1.2411, 'grad_norm': 2.4494705200195312, 'learning_rate': 2.8421273517776134e-05, 'epoch': 2.62}


  0%|          | 0/129 [00:00<?, ?it/s]

{'eval_loss': 1.2291325330734253, 'eval_qwk': 0.0, 'eval_runtime': 6.4925, 'eval_samples_per_second': 316.366, 'eval_steps_per_second': 19.869, 'epoch': 3.0}
{'train_runtime': 136.4966, 'train_samples_per_second': 100.398, 'train_steps_per_second': 6.286, 'train_loss': 1.2802293094999584, 'epoch': 3.0}


  0%|          | 0/142 [00:00<?, ?it/s]

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
QWK,▁
eval/loss,█▄▁
eval/qwk,▁▁▁
eval/runtime,▃█▁
eval/samples_per_second,▆▁█
eval/steps_per_second,▆▁█
train/epoch,▁▃▃▅▆▇██
train/global_step,▁▃▃▅▆▇███
train/grad_norm,▄█▁
train/learning_rate,▁█▄▁

0,1
QWK,0.0
eval/loss,1.22913
eval/qwk,0.0
eval/runtime,6.4925
eval/samples_per_second,316.366
eval/steps_per_second,19.869
total_flos,902736017252352.0
train/epoch,3.0
train/global_step,858.0
train/grad_norm,2.44947


## Wrapping up

In [34]:
wandb.finish()