In [1]:
# Switch into the t-few checkout; the `src.*` imports further down are
# presumably resolved relative to this directory — confirm.
# NOTE(review): the recorded output shows "[Errno 2] No such file or directory"
# while the working directory already ends in /t-few, i.e. re-running this cell
# from inside the checkout fails, so the cell is not idempotent.
%cd t-few

[Errno 2] No such file or directory: 't-few'
/content/drive/MyDrive/drive_workspace/t-few


In [2]:
# !pip install omegaconf
# !pip install -r requirements.txt 

In [3]:
import os
import torch
import argparse
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

from src.data import FinetuneDataModule, get_dataset_reader, PretrainDataModule
from src.models.EncoderDecoder import EncoderDecoder
from src.models.modify_model import modify_transformer
from src.utils.Config import Config
from src.utils.util import ParseKwargs, set_seeds
from omegaconf import OmegaConf
from datasets import load_dataset
import re
from src.models.lora import LoRALinear
import torch.nn as nn
import json


In [4]:



def maybe_modify(config, module, c_name, m_name, layer, debug=False):
    """Wrap one child layer in ``LoRALinear`` if it matches the LoRA config.

    A child is selected when the parent's module path ``m_name`` fully matches
    the regex ``config.lora_modules`` AND the child's attribute name ``c_name``
    fully matches the regex ``config.lora_layers``.

    Args:
        config: Object exposing ``lora_modules``, ``lora_layers``,
            ``lora_rank``, ``lora_scaling_rank`` and ``lora_init_scale``.
        module: Parent module that owns the child (mutated via ``setattr``).
        c_name: Attribute name of the child inside ``module``.
        m_name: Name/path of ``module`` inside the whole model.
        layer: The child layer itself.
        debug: When True nothing is modified; a report dict is returned instead.

    Returns:
        ``None`` in normal mode. In debug mode, a dict with the keys
        ``module``, ``c_name``, ``layer`` (the layer's type),
        ``should_modify`` (result of the two name regexes only) and
        ``is_linear``.

    Raises:
        TypeError: In normal mode, when the names match but ``layer`` is not an
            ``nn.Linear``. ``LoRALinear`` is handed the layer as-is, so the
            mismatch is reported here with the offending path rather than
            surfacing later (LoRALinear presumably requires a Linear layer —
            its implementation is not visible from this notebook).
    """
    should_modify = bool(re.fullmatch(config.lora_modules, m_name)) and bool(
        re.fullmatch(config.lora_layers, c_name)
    )
    is_linear = isinstance(layer, nn.Linear)

    if debug:
        # Dry run: report what would happen without touching the model.
        return dict(
            module=m_name,
            c_name=c_name,
            layer=type(layer),
            should_modify=should_modify,
            is_linear=is_linear,
        )

    if not should_modify:
        return None

    if not is_linear:
        # Checked before LoRALinear is constructed so the model is left untouched.
        raise TypeError(
            f"LoRA can only be applied to torch.nn.Linear, "
            f"but {m_name}.{c_name} is {type(layer)}."
        )

    # Replace the child in place on its parent with the LoRA-wrapped version.
    setattr(
        module,
        c_name,
        LoRALinear(layer, config.lora_rank, config.lora_scaling_rank, config.lora_init_scale),
    )
    return None

def lora_modify(transformer, config, debug=False):
    """Apply ``maybe_modify`` to every (module, child) pair of a model.

    Args:
        transformer: Model to walk; matching children are replaced in place
            by ``maybe_modify`` unless ``debug`` is set.
        config: LoRA configuration forwarded unchanged to ``maybe_modify``.
        debug: Forwarded to ``maybe_modify``; when True the model is only
            inspected and one report dict per child is collected.

    Returns:
        A tuple ``(transformer, d_list)``. ``d_list`` holds the report dicts
        produced in debug mode and is empty otherwise (``maybe_modify``
        returns ``None`` in normal mode, and those placeholders are no longer
        accumulated).
    """
    d_list = []
    # Both traversals are snapshotted with dict(...) because maybe_modify may
    # replace children via setattr while the loops are running.
    for m_name, module in dict(transformer.named_modules()).items():
        for c_name, layer in dict(module.named_children()).items():
            report = maybe_modify(config, module, c_name, m_name, layer, debug)
            if report is not None:
                d_list.append(report)

    return transformer, d_list

In [7]:

# Load the "poem_sentiment" dataset by name and print its structure; the
# recorded output lists train/validation/test splits with the columns
# id, verse_text and label.
dataset = load_dataset("poem_sentiment")
print(dataset)

Using custom data configuration default
Reusing dataset poem_sentiment (/root/.cache/huggingface/datasets/poem_sentiment/default/1.0.0/4e44428256d42cdde0be6b3db1baa587195e91847adabf976e4f9454f6a82099)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 892
    })
    validation: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 105
    })
    test: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 104
    })
})


In [8]:

def encode_batch(batch):
  """Tokenize the ``verse_text`` entries of a batch with the global tokenizer.

  The tokenizer output is returned unchanged. No ``labels`` field is attached
  here; ``group_texts`` adds it after the token streams have been chunked.
  """
  return tokenizer(batch["verse_text"])

# Load the tokenizer published with the facebook/opt-1.3b checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): this step was originally justified by "the GPT-2 tokenizer has
# no padding token", but the tokenizer loaded above is OPT's — confirm the
# override is still needed for this checkpoint.
tokenizer.pad_token = tokenizer.eos_token
# Tokenize every split; the original columns (taken from the train split) are
# removed so that only the tokenizer's output fields remain.
column_names = dataset["train"].column_names
dataset = dataset.map(encode_batch, remove_columns=column_names, batched=True)



Downloading:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/441 [00:00<?, ?B/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
block_size = 50
# Main data processing function: concatenates all tokenized texts of a batch
# and cuts the result into chunks of block_size tokens.
def group_texts(examples):
  """Flatten a batch of tokenized rows and re-split it into fixed-size blocks.

  Args:
    examples: Mapping from column name to a list of per-row token sequences.
      Must contain ``"input_ids"``; all columns are expected to have the same
      total number of tokens (the length of the first column is used for all).

  Returns:
    A dict with the same columns, each a list of ``block_size``-long lists,
    plus ``"labels"``, a shallow copy of the ``"input_ids"`` chunk list.
    Tokens that do not fill a final complete block are dropped.
  """
  # Local import keeps this cell self-contained.
  from itertools import chain

  # Concatenate all rows of each column. chain.from_iterable is linear in the
  # number of tokens, whereas sum(rows, []) copies the growing accumulator once
  # per row; it also accepts rows that are not plain lists.
  concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  # Drop the small remainder so every chunk has exactly block_size tokens.
  total_length = (total_length // block_size) * block_size
  # Split each column into consecutive block_size-long chunks.
  result = {
    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated_examples.items()
  }
  # The labels are the input ids themselves.
  result["labels"] = result["input_ids"].copy()
  return result

# Re-chunk every split into fixed-length blocks; group_texts also adds the
# "labels" column.
dataset = dataset.map(group_texts,batched=True,)

# Expose the three model input columns in "torch" format.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
from transformers import AutoModelForCausalLM

# Settings for the adapter injection and for the optimizer/scheduler helpers.
# - lora_modules / lora_layers are regexes that maybe_modify matches (with
#   re.fullmatch) against the parent module path and the child attribute name.
# - lora_rank / lora_scaling_rank / lora_init_scale are passed to LoRALinear.
# - the remaining keys are presumably read by get_optimizer / get_scheduler —
#   confirm against src/utils.
config  = OmegaConf.create({
    "lora_scaling_rank": 1,
    "lora_rank": 0,
    "lora_init_scale": 0.0,
    "lora_modules": ".*",
    "lora_layers": "k_proj|v_proj|fc2",
    "trainable_param_names": ".*lora_b.*",
    "model_modifier": "lora",
    "optimizer":"adamw",
    "lr": 3e-3,
    "num_steps": 1000,
    "scheduler":"cosine_annealing",
    "warmup_ratio":0.1,
    "weight_decay":0,
    })

# Load the checkpoint exactly once (the cell previously loaded it twice and
# threw the first copy away), then swap the matching layers for LoRALinear.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b")
model,_ = lora_modify(model,config)


Downloading:   0%|          | 0.00/2.45G [00:00<?, ?B/s]

In [11]:
from src.utils.get_optimizer import get_optimizer
from src.utils.get_scheduler import get_scheduler
from transformers import Trainer, TrainingArguments

In [12]:
# Build the optimizer and LR scheduler from the t-few helpers, driven by `config`.
# NOTE(review): config.num_steps is 1000, but the recorded run reports 9700
# total optimization steps — confirm the scheduler horizon is what is intended.
optimizer, _ = get_optimizer(model, config)
scheduler = get_scheduler(optimizer, config)

# NOTE(review): learning_rate below (5e-4) differs from config.lr (3e-3); since
# a ready-made optimizer is handed to the Trainer, presumably config.lr is the
# value that takes effect — confirm.
training_args = TrainingArguments(
    output_dir="./examples",
    do_train=True,
    num_train_epochs=50,
    per_device_train_batch_size=1,
    learning_rate=5e-4,
    remove_unused_columns=False,
)

# `Trainer` is the transformers class here: the later
# `from transformers import Trainer` rebinds the name that was first imported
# from pytorch_lightning.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    optimizers=(optimizer, scheduler),
)

In [13]:
# Start fine-tuning. The recorded run below was stopped manually
# (KeyboardInterrupt) after the checkpoint at step 1000 had been written.
trainer.train()

***** Running training *****
  Num examples = 194
  Num Epochs = 50
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 9700


Step,Training Loss
500,6.0836
1000,4.8726


Saving model checkpoint to ./examples/checkpoint-500
Configuration saved in ./examples/checkpoint-500/config.json
Model weights saved in ./examples/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./examples/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./examples/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./examples/checkpoint-1000
Configuration saved in ./examples/checkpoint-1000/config.json
Model weights saved in ./examples/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./examples/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./examples/checkpoint-1000/special_tokens_map.json


KeyboardInterrupt: 