In [13]:
import argparse
import os
import torch
import wandb
import datetime
import pytorch_lightning as L

from glue_transfomer import GLUETransformer
from data_module import GLUEDataModule
from wandb_utils import setup_wandb

#from datetime import datetime



In [14]:
wandb.login()

hyperparams = {
    'learning_rate' : 1e-5,
    'warmup_steps': 0,
    'weight_decay': 0.0,
    'train_batch_size': 32,
    'eval_batch_size': 32,
    'use_cyclic_lr': True,
    'base_lr': 1e-5,
    'max_lr': 1e-1,
    'step_size_up': 100000,
    'step_size_down': 100000
}

model_name = "distilbert-base-uncased"
task_name = 'mrpc'
project_name = 'mlops'

epochs = 3

wandb_logger, checkpoint_callback, log_dir, checkpoint_dir = setup_wandb(hyperparams, model_name, task_name, project_name)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
L.seed_everything(42)

dm = GLUEDataModule(
    model_name_or_path=model_name,
    task_name=task_name,
)
dm.setup("fit")
model = GLUETransformer(
    model_name_or_path=model_name,
    num_labels=dm.num_labels,
    eval_splits=dm.eval_splits,
    task_name=dm.task_name,
    **hyperparams
)

trainer = L.Trainer(
    max_epochs=epochs,
    accelerator="auto",
    devices=1,
    logger=wandb_logger,
    log_every_n_steps=10,
    default_root_dir=log_dir, 
    callbacks=[checkpoint_callback],
)
trainer.fit(model, datamodule=dm)
wandb.finish()

Seed set to 42


Map:   0%|          | 0/408 [00:00<?, ? examples/s]



Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/miniconda3/envs/mlops/lib/python3.9/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]



Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]


  | Name  | Type                                | Params | Mode
---------------------------------------------------------------------
0 | model | DistilBertForSequenceClassification | 67.0 M | eval
---------------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)
0         Modules in train mode
96        Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/miniconda3/envs/mlops/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/opt/miniconda3/envs/mlops/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁██
epoch,▁▅█
f1,▁▇█
trainer/global_step,▁▅█
training_loss,█▇▇▇▆█▆█▆▇▆▇▆▆▅▅▅▆▆▅▅▇▅▆▄▄▆▄▃▄▂▄▄▂▁▃▂▃▃▂
val_loss,██▄▅▅▅▆▆▆▆▆▅▆▅▅▃▃▄▃▃▅▄▄▄▃▄▅▄▂▃▁▅▅▄▄▃▄▆▆▄

0,1
accuracy,0.83578
epoch,2.0
f1,0.88388
trainer/global_step,344.0
training_loss,0.15354
val_loss,0.38154


In [None]:
wandb.login()

hyperparams = {
    'use_cyclic_lr':True,
    'base_lr': 1e-5,
    'max_lr': 1e-1,
    'step_size_up': 100000,
    'step_size_down':  100000
}

task_name = 'mrpc'
model_name = "distilbert-base-uncased"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
project_name = 'mlops'

hyperparam_names = "-".join(hyperparams.keys())
hyperparam_values = "-".join([str(value) for value in hyperparams.values()])
hyperparam_string = f"{hyperparam_names}/{hyperparam_values}"

experiment_name = f'project_1/week_2/{hyperparam_string}-{timestamp}'
folder_structure = f'mlops/project_1/week_2/{hyperparam_string}/'

wandb_experiment_name = experiment_name.replace("/", "-")
wandb_hyperparam_string = hyperparam_string.replace("/", "-")

epochs = 3  # do not change this

log_dir = os.path.join(folder_structure, 'logs')
checkpoint_dir = os.path.join(folder_structure, 'checkpoints')
os.makedirs(log_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

wandb.init(
    project=project_name,
    name=wandb_experiment_name,
    #name = experiment_name,
    config={  
        **hyperparams,  # Pass all hyperparameters dynamically
        "model_name": model_name,
        "task_name": task_name,
    },
    tags=[task_name, model_name],  # Optional tags
    dir=log_dir,  # Use custom log directory for WandB logs
    id=f"{wandb_hyperparam_string}",  # Custom ID using hyperparameter string
)

wandb_logger = WandbLogger(
    project=project_name,
    name=experiment_name,
    log_model=True,  # Log model checkpoints to W&B
    save_dir=log_dir,  # Path for logs
    id=f"{hyperparam_string}"
)

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',  
    dirpath=checkpoint_dir, 
    filename=hyperparam_string + '-{epoch:02d}-{val_loss:.2f}',  
    save_top_k=1,  
    mode='min',  
)

In [None]:
hyperparam_string

In [None]:
wandb_experiment_name