In [1]:
import os
import sys

import torch
from datasets import DatasetDict, load_from_disk
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_int8_training,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)


# Limit this process to the GPUs numbered 2 and 3 on the host.
# NOTE(review): presumably this has to be set before anything initialises CUDA;
# placing it ahead of `import torch` would be the safest order — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

In [2]:
# Record the Python interpreter version this notebook was run with.
print(sys.version)

3.9.18 (main, Sep 11 2023, 13:41:44) 
[GCC 11.2.0]


In [3]:

# List every CUDA device this process can see, or say that none is available.
if not torch.cuda.is_available():
    print("No GPU available, using CPU instead.")
else:
    num_gpus = torch.cuda.device_count()
    i = 0
    while i < num_gpus:
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        i += 1


GPU 0: NVIDIA GeForce RTX 3090
GPU 1: NVIDIA GeForce RTX 3090


In [None]:
# Load the dataset with the raw 'text' / 'label' columns that the tokenization
# cells read from.
dataset = load_from_disk('./dataset/hf-dataset')

In [6]:
# Reload the tokenized training split; the preprocessing cell writes it to
# "data/train" with save_to_disk.
training = load_from_disk('./data/train')

In [7]:
# Reload the tokenized validation split; the preprocessing cell writes it to
# "data/validation" with save_to_disk.
validation = load_from_disk('./data/validation')

In [8]:
# Reload the tokenized test split; the preprocessing cell writes it to
# "data/eval" with save_to_disk.
testing = load_from_disk('./data/eval')

In [9]:
# Bundle the three splits reloaded from disk under the split names that the
# trainer cells index into ("train", "validation", "test").
tokenized_dataset = DatasetDict(
    dict(train=training, validation=validation, test=testing)
)

In [31]:
# Display the splits, column names and row counts of the raw dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2000
    })
})

In [11]:
# Checkpoint identifier passed to both AutoTokenizer and AutoModelForSeq2SeqLM.
model_id="google/mt5-xl"

In [12]:
# Load the tokenizer that belongs to `model_id`; it is used for the length
# check, for preprocessing and by the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_id)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [25]:
def count_tokens_over_512(split, max_length=512, data=None, tok=None):
    """Count the examples of one split whose tokenized text exceeds a length limit.

    Args:
        split: Key of the split to scan (e.g. "train").
        max_length: Token count above which an example is counted. Defaults
            to 512, the source length used when preprocessing.
        data: Mapping from split name to an iterable of examples that expose a
            "text" field. Defaults to the notebook-level `dataset`.
        tok: Callable that tokenizes one string and returns a mapping with an
            "input_ids" sequence. Defaults to the notebook-level `tokenizer`.

    Returns:
        int: Number of examples with strictly more than `max_length` tokens.
    """
    # Fall back to the notebook globals only when nothing was injected, so the
    # helper can also be exercised without loading the real dataset/tokenizer.
    data = dataset if data is None else data
    tok = tokenizer if tok is None else tok

    # Plain Python id lists are enough for counting; no tensor is needed.
    return sum(
        len(tok(example["text"])["input_ids"]) > max_length
        for example in data[split]
    )
# Count the over-length examples of each split first, then print the totals.
train_count, validation_count, test_count = (
    count_tokens_over_512(split_name)
    for split_name in ("train", "validation", "test")
)
for heading, total in zip(
    ("Train:", "Validation:", "Test:"),
    (train_count, validation_count, test_count),
):
    print(heading, total)

Train: 0
Validation: 0
Test: 0


In [39]:
# Display the raw dataset once more before tokenizing it.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2000
    })
})

In [41]:
def preprocess_function(sample, padding="max_length", max_source_length=512,
                        max_target_length=200, tok=None):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        sample: Batch with a "text" column (inputs) and a "label" column
            (target strings).
        padding: Padding strategy forwarded to the tokenizer. With
            "max_length", padded label positions are replaced by -100.
        max_source_length: Truncation/padding length for the inputs.
        max_target_length: Truncation/padding length for the labels.
        tok: Tokenizer to use. Defaults to the notebook-level `tokenizer`.

    Returns:
        The tokenizer output for the inputs with an extra "labels" entry
        holding the label token ids.
    """
    # Only touch the notebook global when no tokenizer was injected.
    tok = tokenizer if tok is None else tok

    model_inputs = tok(
        sample["text"], max_length=max_source_length, padding=padding, truncation=True
    )
    labels = tok(
        text_target=sample["label"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )

    label_ids = labels["input_ids"]
    if padding == "max_length":
        # Overwrite padding ids with -100, the same value the notebook passes
        # to the collator as label_pad_token_id.
        pad_id = tok.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in row]
            for row in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs
# Tokenize every split in batches and drop the raw string columns.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text", "label"],
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Write each split to disk so later sessions can reload it without re-tokenizing.
for split_name, target_dir in (
    ("train", "data/train"),
    ("validation", "data/validation"),
    ("test", "data/eval"),
):
    tokenized_dataset[split_name].save_to_disk(target_dir)


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [14]:
# Load the base model in 8-bit and let `device_map="auto"` place it.
# The quantization settings go through `quantization_config`, because passing
# `load_in_8bit=` directly to `from_pretrained` is deprecated.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [16]:
# Display the module tree of the loaded model.
model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 2048)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 2048)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
              (k): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
              (v): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
              (o): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
              (relative_attention_bias): Embedding(32, 32)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear8bitLt(in_features=2048, out_features=5120, bias=False)
              (wi_1): Linear8bitLt(in_feature

In [17]:
# LoRA adapter settings for the sequence-to-sequence model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    # Let PEFT select the target layers instead of naming them one by one.
    target_modules="all-linear",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [18]:
# Prepare the quantized base model for adapter training.
# `prepare_model_for_kbit_training` is the current name of this PEFT helper;
# `prepare_model_for_int8_training` is its deprecated alias.
model = prepare_model_for_kbit_training(model)



In [19]:
# Wrap the prepared model with the adapters described by `lora_config`.
# Note: this rebinds `model`, so re-running the cell wraps an already wrapped model.
model = get_peft_model(model, lora_config)

In [20]:
# Print how many parameters are trainable compared with the total.
model.print_trainable_parameters()

trainable params: 17,694,720 || all params: 3,760,314,368 || trainable%: 0.47056491208769063


In [22]:
# Padding value for label positions; the same -100 that preprocess_function
# writes over padded label tokens.
label_pad_token_id = -100

In [23]:
# Collator handed to the Seq2SeqTrainer to assemble batches.
# Labels are padded with `label_pad_token_id` (-100).
# NOTE(review): pad_to_multiple_of=8 is presumably a hardware-efficiency
# choice — confirm it is still wanted with inputs already padded to max length.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [25]:
# Directory passed to the training arguments as `output_dir`; the logging
# directory is derived from it as "<output_dir>/logs".
output_dir="lora-mt5-xl"

In [26]:
# Import Weights & Biases and authenticate; the training arguments report to it.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mtanmay-k[0m ([33mteam-tk[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [27]:
# W&B project name; read back from the environment when calling wandb.init.
os.environ["WANDB_PROJECT"]="thesis-lora-mt5-xl"

In [28]:
# NOTE(review): presumably this is what makes saved checkpoints get uploaded as
# W&B artifacts (the training logs show "Adding directory to artifact") — confirm.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [29]:
# Echo the project name to confirm the environment variable is set.
os.environ["WANDB_PROJECT"]

'thesis-lora-mt5-xl'

In [30]:
# Id of the W&B run to resume; passed to wandb.init(id=...) in the resume cell.
last_run_id = "u4l9ezqa"

In [31]:
# Query the W&B public API and print the id of every run in the project.
api = wandb.Api()
runs = api.runs('thesis-lora-mt5-xl')
for run in runs:
  print(run.id)

u4l9ezqa


In [32]:
# For each run, print its id and the model name stored in its config, if any.
for run in runs:
    print(f"Run ID: {run.id}")
    # The run config is a mapping, so nested entries are read by key; attribute
    # access (`run.config.model`) would raise as soon as a "model" key existed.
    model_cfg = (run.config or {}).get("model")
    if isinstance(model_cfg, dict) and "name" in model_cfg:
        model_name = model_cfg["name"]
        print(f"  Model Name: {model_name}")
    else:
        print("  Model Name: Not found in run config")

Run ID: u4l9ezqa
  Model Name: Not found in run config


In [33]:
# Re-open the interrupted W&B run and continue training from the checkpoint
# artifact it uploaded.
with wandb.init(
    project=os.environ["WANDB_PROJECT"],
    id=last_run_id,
    resume="must",
) as run:
    # Build the artifact reference from the project and run id instead of
    # repeating them in a literal; version v0 is pinned as before.
    my_checkpoint_name = (
        f"team-tk/{os.environ['WANDB_PROJECT']}/checkpoint-{last_run_id}:v0"
    )
    my_checkpoint_artifact = run.use_artifact(my_checkpoint_name, type="model")
    checkpoint_dir = my_checkpoint_artifact.download()

    # NOTE(review): an eval_dataset is supplied below but no evaluation strategy
    # is set here — confirm whether evaluation is meant to run during training.
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        auto_find_batch_size=True,
        learning_rate=1e-3,
        num_train_epochs=5,
        logging_dir=f"{output_dir}/logs",
        logging_strategy="steps",
        logging_steps=500,
        save_strategy="epoch",
        report_to="wandb",
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
    )
    # Continue from the downloaded checkpoint directory.
    trainer.train(resume_from_checkpoint=checkpoint_dir)

[34m[1mwandb[0m: Downloading large artifact checkpoint-u4l9ezqa:v0, 203.34MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:4.1
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
3000,2.539
3500,2.5602
4000,2.5589
4500,3.1481
5000,3.1431
5500,2.9737
6000,2.916
6500,2.8256
7000,2.8725
7500,2.8696


[34m[1mwandb[0m: Adding directory to artifact (./lora-mt5-xl/checkpoint-5000)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./lora-mt5-xl/checkpoint-7500)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./lora-mt5-xl/checkpoint-10000)... Done. 2.1s
[34m[1mwandb[0m: Adding directory to artifact (./lora-mt5-xl/checkpoint-12500)... Done. 2.1s
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


VBox(children=(Label(value='880.987 MB of 880.987 MB uploaded (0.040 MB deduped)\r'), FloatProgress(value=1.0,…



0,1
train/epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/grad_norm,▄▃▂▁▁▁▁▄▁▁▁▁▁▂▆█▃▇▁▄
train/learning_rate,██▇▇▇▆▆▅▅▅▄▄▄▃▃▂▂▂▁▁
train/loss,▃▃▃██▇▆▅▆▆▄▄▄▃▃▂▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,5.0
train/global_step,12500.0
train/grad_norm,33.09787
train/learning_rate,0.0
train/loss,2.2812
train/total_flos,9.978110803968e+17
train/train_loss,2.11291
train/train_runtime,36791.1907
train/train_samples_per_second,2.718
train/train_steps_per_second,0.34


In [None]:
# NOTE(review): this cell has no execution count, so it was never run; confirm
# that `wandb.sync` is callable in the installed wandb version — syncing is
# usually done with the `wandb sync` command line tool.
wandb.sync()

In [53]:
# Close the currently active W&B run.
wandb.finish()

In [60]:
# Training configuration for a run without a checkpoint to resume from.
# NOTE(review): an eval_dataset is supplied below but no evaluation strategy is
# set here — confirm whether evaluation is meant to run during training.
training_args = Seq2SeqTrainingArguments(
    # Optimisation
    learning_rate=1e-3,
    num_train_epochs=5,
    auto_find_batch_size=True,
    # Outputs and checkpoints
    output_dir=output_dir,
    save_strategy="epoch",
    # Logging
    report_to="wandb",
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run training (no checkpoint is passed, so nothing is resumed here).
# The W&B run is closed in `finally` so that it is also finished when training
# raises or is interrupted.
try:
    trainer.train()
finally:
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mtanmay-k[0m ([33mteam-tk[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,2.8968
1000,2.718
1500,2.7318
2000,2.7005
2500,2.6826
3000,2.5709
3500,2.6332
4000,2.6167
4500,2.6115


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Adding directory to artifact (./lora-mt5-xl/checkpoint-2500)... Done. 2.1s


In [34]:
# Save `model` and the tokenizer side by side in one backup directory.
backup_dir = "./model_backup/lora_mt5_xl"
model.save_pretrained(backup_dir)
tokenizer.save_pretrained(backup_dir)

('./model_backup/lora_mt5_xl/tokenizer_config.json',
 './model_backup/lora_mt5_xl/special_tokens_map.json',
 './model_backup/lora_mt5_xl/spiece.model',
 './model_backup/lora_mt5_xl/added_tokens.json',
 './model_backup/lora_mt5_xl/tokenizer.json')

In [None]:
# Save what `trainer.model.base_model` holds into a separate backup directory.
# NOTE(review): this cell has no execution count, so it was never run; confirm
# what `base_model` refers to for the adapter-wrapped, 8-bit model and that it
# can be saved this way.
trainer.model.base_model.save_pretrained('./model_backup/base_mt5_xl')