# Import modules and load the model

In [1]:
import torch
import torch.nn.functional as F
import warnings
import numpy as np
from tqdm import tqdm
from scipy.linalg import block_diag
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from typing import List
from datasets import load_dataset
from promptsource.templates import DatasetTemplates
# from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import scipy
from sklearn.metrics import accuracy_score

import logging
import sys
warnings.filterwarnings("ignore")

In [2]:
import os

# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Fail early with a readable message instead of discarding the availability
# check: the model is moved to the GPU with .cuda() below and cannot load
# without one.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available, but this notebook loads the model onto the GPU with .cuda().")

model_type = "BAAI/glm-roberta-large"
# trust_remote_code=True runs the model repository's own Python code; the
# revision is pinned explicitly for both the tokenizer and the model.
tokenizer = AutoTokenizer.from_pretrained(model_type, trust_remote_code=True, revision='main')
# Loaded in half precision on the GPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_type, trust_remote_code=True, revision='main').half().cuda()
print(f"Model {model_type} loaded.")

Model BAAI/glm-roberta-large loaded.


# Generation

## ConditionalGenerationDataset

In [3]:
# Fine-tuning GLM on a classification/multiple-choice dataset: an example.
import torch
from sklearn.model_selection import train_test_split
from datasets.arrow_dataset import Dataset

class ConditionalGenerationDataset(torch.utils.data.Dataset):
    """
    Map-style dataset of prompted (input, answer) text pairs.

    Every sample of the requested split (or of a sub-sample of it) is rendered
    through a promptsource template at construction time and stored in
    `self.data` as a dict with the keys 'inputs_pretokenized' and
    'answer_pretokenized'.

    :param dataset_name: name passed to `load_dataset`; if it contains '/', the
        first two '/'-separated parts are passed as dataset name and config name.
    :param split: split passed to `load_dataset`.
    :param template_name: key used to look up `DatasetTemplates`.
    :param prompt_name: name of the prompt inside those templates.
    :param tokenizer: stored on the instance; not used by this class itself.
    :param shuffle: forwarded to `train_test_split` when sub-sampling.
    :param portion: 1 keeps the whole split; a value k > 1 keeps
        len(split) // k samples.
    :raises ValueError: if `portion` is smaller than 1.
    """

    def __init__(self, dataset_name, split, template_name, prompt_name, tokenizer, shuffle, portion ):
        super(ConditionalGenerationDataset, self).__init__()
        # Validate before any template lookup or dataset loading: with
        # portion < 1 no sampling branch would apply and `dataset` would be unbound.
        if portion < 1:
            raise ValueError(f"portion must be >= 1 (1 = whole dataset, k = 1/k of it), got {portion!r}")

        self.dataset_name = dataset_name
        self.split = split
        self.prompt = DatasetTemplates(template_name)[prompt_name]
        self.tokenizer = tokenizer

        if '/' in dataset_name:
            name_parts = dataset_name.split('/')
            iters = load_dataset(name_parts[0], name_parts[1], split=split)
        else:
            iters = load_dataset(dataset_name, split=split)

        if portion == 1:
            dataset = iters
        else:
            # Keep len(iters) // portion samples; random_state is fixed to 1.
            dataset_size = len(iters) // portion
            dataset, _ = train_test_split(iters, test_size=len(iters) - dataset_size, shuffle=shuffle, random_state=1)
            # `Dataset` here is the Hugging Face `datasets` class imported in this
            # cell (it shadows the torch `Dataset` imported in the first cell).
            dataset = Dataset.from_dict(dataset)

        keys = ['inputs_pretokenized', 'answer_pretokenized']
        self.data = [
            dict(zip(keys, self.prompting_single_sample(sample)))
            for sample in tqdm(dataset)
        ]

    def prompting_single_sample(self, sample):
        """
        Format a sample into a prompted sample.
        :return inputs_pretokenized (with ' [MASK]' appended), answer_pretokenized
        """
        inputs_pretokenized, answer_pretokenized = tuple(self.prompt.apply(sample))
        return inputs_pretokenized + ' [MASK]', answer_pretokenized

    def __len__(self):
        """Number of prompted samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the prompted sample dict stored at `index`."""
        return self.data[index]


## GenerationTrainerClass

In [4]:
# Directory the fine-tuned model is saved to after training and loaded from for evaluation.
# Saving creates two files in this directory: pytorch_model.bin and config.json.
# Set the GLM_MODEL_DIR environment variable to use a different directory without
# editing the notebook; the default below is the original location.
best_model_path = os.environ.get('GLM_MODEL_DIR', '/home/zyw/squad/model_gen')
def generate_texts(texts: List, targets: List, max_length=256):
    """
    Run the notebook-level `model` on a batch of prompts and their target texts.

    :param texts: list of prompt strings, tokenized with padding and truncation enabled.
    :param targets: list of target strings, forwarded to `build_inputs_for_generation`.
    :param max_length: forwarded to `build_inputs_for_generation` as `max_gen_length`.
    :return loss, logits taken from the model output
    """
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    batch = tokenizer.build_inputs_for_generation(encoded, targets = targets, max_gen_length=max_length)
    batch = batch.to('cuda')
    result = model(**batch)
    return result.loss, result.logits

def init_logger():
    """
    Return the "default" logger, configured to write INFO+ records to stdout.

    `logging.getLogger` returns the same logger object for the same name, so
    the stdout handler is attached only when the logger has no handler yet.
    Calling this function repeatedly (e.g. re-running a cell or creating
    several trainers) therefore does not duplicate every log line.

    :return the configured `logging.Logger`
    """
    logger = logging.getLogger("default")
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        cmd_handler = logging.StreamHandler(sys.stdout)
        cmd_handler.setLevel(logging.DEBUG)
        cmd_handler.setFormatter(logging.Formatter(r"[%(asctime)s][%(levelname)s][%(filename)s:%(lineno)s] %(message)s"))
        logger.addHandler(cmd_handler)
    return logger
class GenerationTrainer:
    """
    Fine-tunes the notebook-level `model` on a dataset of prompted samples.

    Hyper-parameters are fixed in `__init__`: batch size 4, 1 epoch, learning
    rate 8e-6 with a linear schedule whose warmup covers 5% of the steps.

    :param train_dataset: dataset whose items are dicts with the keys
        'inputs_pretokenized' and 'answer_pretokenized'.
    :param losses_eval: caller-owned list; `train` appends one [lr, loss] pair
        to it per optimisation step.
    """

    def __init__(self, train_dataset, losses_eval: List):
        self.train_bsz = 4
        self.epoch = 1
        self.lr = 8e-6
        self.losses_eval = losses_eval

        # Tokenizer & logger
        self.tokenizer = tokenizer  # notebook-level tokenizer from the model-loading cell
        self.logger = init_logger()

        # Dataset; incomplete final batches are dropped
        self.train_dataset = train_dataset
        self.train_loader = DataLoader(train_dataset, batch_size = self.train_bsz, shuffle=True, drop_last=True)

        # Training model, optimizer, and scheduler.
        # The notebook-level model is cast to float32 for training.
        self.model = model.float()
        self.model.train()
        # One scheduler step per batch; with drop_last=True the loader length
        # equals len(train_dataset) // train_bsz.
        num_training_steps = self.epoch * len(self.train_loader)
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=int(num_training_steps * 0.05),
                                                         num_training_steps=num_training_steps)

    def train(self):
        """
        Run the training loop and save the model to `best_model_path`.

        Each step appends [current learning rate, loss] to `self.losses_eval`.
        """
        for e in range(1, self.epoch + 1):
            self.logger.info(f"Epoch {e}")
            # Values shown at the end of the progress bar; NaN until the first step completes.
            tqdm_vars = {"lr": np.nan, "loss": np.nan}
            tbar = tqdm(enumerate(self.train_loader, start=1), desc="train", total=len(self.train_loader),
                        postfix=tqdm_vars)
            self.model.train()
            for _, sample in tbar:
                # generate_texts runs the notebook-level `model`, which __init__
                # stored as self.model.
                loss, _ = generate_texts(sample['inputs_pretokenized'], sample['answer_pretokenized'])
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                # Read the LR directly from the parameter group instead of
                # building a full optimizer state_dict on every step.
                tqdm_vars["lr"] = self.optimizer.param_groups[0]["lr"]
                tqdm_vars["loss"] = loss.item()
                tbar.set_postfix(tqdm_vars)

                # record the lr and loss of this step
                self.losses_eval.append([tqdm_vars["lr"], tqdm_vars["loss"]])

        # save model for evaluation
        self.model.save_pretrained(best_model_path)




## Load dataset

In [5]:
# `portion` selects the fraction of the split that is used:
# 1 means the whole dataset, 38 means 1/38 of the dataset.
portion = 38
prompt_name = 'given_context_answer_question_variation'
# The training samples are shuffled (random_state = 1 inside the dataset class).
train_dataset = ConditionalGenerationDataset(
    dataset_name='squad',
    split='train',
    template_name='squad',
    prompt_name=prompt_name,
    tokenizer=tokenizer,
    shuffle=True,
    portion=portion,
)


Found cached dataset squad (/home/zyw/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2305/2305 [00:20<00:00, 114.56it/s]


## Training

In [6]:
# Filled in place by the trainer with one [lr, loss] pair per training step.
loss_eval = []
trainer = GenerationTrainer(train_dataset=train_dataset, losses_eval=loss_eval)
trainer.train()

[2023-01-10 18:57:01,972][INFO][2855677035.py:50] Epoch 1


train:   0%|          | 0/576 [00:00<?, ?it/s, loss=nan, lr=nan]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
train: 100%|██████████| 576/576 [04:12<00:00,  2.28it/s, lr=0, loss=0.263]       


## Evaluation stage

In [7]:
# The portion should match the one used for training (38 above).
portion = 38
dataset_squad_validation = load_dataset("squad", split="validation")
dataset_size = len(dataset_squad_validation)//portion
# shuffle=False: the kept part is taken from the validation split without shuffling.
dataset, _ = train_test_split(dataset_squad_validation, test_size=len(dataset_squad_validation) - dataset_size, shuffle = False)
dataset_validation_ = Dataset.from_dict(dataset)

## the prompt needs to be the same as the one used for training
# Bind only `dataset_prompted`; `train_dataset` must keep pointing at the
# training split so that re-running the training cell never trains on
# validation data.
dataset_prompted = ConditionalGenerationDataset('squad', 'validation', 'squad', prompt_name, tokenizer, False, portion)

Found cached dataset squad (/home/zyw/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Found cached dataset squad (/home/zyw/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 278/278 [00:01<00:00, 141.67it/s]


## Load the saved model and do a small test to ensure the model is loaded

In [8]:
## Please unzip config.zip first and put all of its files into the model_gen directory so that the saved model can be loaded.
# best_model_path = '/path/to/your/model'
saved_model_gen = AutoModelForSeq2SeqLM.from_pretrained(best_model_path, trust_remote_code=True)
# Evaluate in half precision on the GPU.
saved_model_gen = saved_model_gen.half().cuda()
def generate_text_eval(text, max_length=512 ):
    """
    Generate the answer for a single prompt with the saved model.

    :param text: prompt string, tokenized without padding or truncation.
    :param max_length: used both as `max_gen_length` when building the inputs
        and as `max_length` for `generate`.
    :return the decoded, stripped text between the start-of-piece token and the
        first end-of-piece token that follows it (or the end of the sequence if
        no end-of-piece token was generated).
    :raises ValueError: if the start-of-piece token is not in the output.
    """
    inputs = tokenizer(text, return_tensors="pt")
    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=max_length)
    inputs = {key: value.cuda() for key, value in inputs.items()}
    sop_id = tokenizer.sop_token_id
    eop_id = tokenizer.eop_token_id
    # pad_token_id is passed explicitly with the value generate() otherwise falls
    # back to (the eos id), which avoids two warnings on every call.
    # NOTE(review): top_k=1 is kept from the original call; without
    # do_sample=True decoding is presumably greedy regardless — confirm.
    outputs = saved_model_gen.generate(
        **inputs,
        max_length=max_length,
        eos_token_id=eop_id,
        pad_token_id=eop_id,
        top_k=1,
    )[0].tolist()
    start_idx = outputs.index(sop_id) + 1
    # Look for the end-of-piece token only after the start-of-piece token, so an
    # earlier occurrence can never produce an empty or inverted slice.
    try:
        end_idx = outputs.index(eop_id, start_idx)
    except ValueError:
        end_idx = len(outputs)
    return tokenizer.decode(outputs[start_idx:end_idx]).strip()

## Smoke test that the saved model is loaded: print the generated answer for one
## validation sample and show the reference answer as the cell output.
check_sample = dataset_prompted[1]
print(generate_text_eval(check_sample["inputs_pretokenized"]))
check_sample["answer_pretokenized"]

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50266 for open-end generation.


Carolina Panthers


'Carolina Panthers'

## Load evaluation metric and evaluate

In [9]:
from datasets import list_metrics
from datasets import load_metric

# Generate one answer per prompted validation sample.
lst_answer = [generate_text_eval(sample["inputs_pretokenized"]) for sample in dataset_prompted]

# Pair each generated answer with the raw validation row at the same position.
# This relies on `dataset_prompted` having been built, without shuffling, from
# the leading rows of the validation split. zip() stops after len(lst_answer)
# rows, so the rest of the split is never read and an empty `lst_answer`
# simply yields empty lists.
dataset_index = len(lst_answer) - 1  # position of the last evaluated validation row
lst_ids, predictions, answers, references = [], [], [], []
for sample, prediction_text in zip(dataset_squad_validation, lst_answer):
    lst_ids.append(sample["id"])
    answers.append(sample["answers"])
    predictions.append({'prediction_text': prediction_text, 'id': sample["id"]})
    references.append({'answers': sample["answers"], 'id': sample["id"]})


squad_metric = load_metric("squad")
# NOTE: the `_6373` suffix does not reflect the number of evaluated samples,
# which is len(predictions).
results_6373 = squad_metric.compute(predictions=predictions, references=references)
print(results_6373)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50266 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50266 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50266 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50266 for open-end generati

{'exact_match': 92.0863309352518, 'f1': 94.69410937415482}
