In [1]:
!pip install BitsAndBytes liger_kernel deepspeed lightning gdown peft mpi4py

Collecting BitsAndBytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting liger_kernel
  Downloading liger_kernel-0.5.8-py3-none-any.whl.metadata (23 kB)
Collecting deepspeed
  Downloading deepspeed-0.16.7.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting lightning
  Downloading lightning-2.5.1.post0-py3-none-any.whl.metadata (39 kB)
Collecting mpi4py
  Downloading mpi4py-4.0.3.tar.gz (466 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m466.3/466.3 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hjson (from deepspeed)
  

In [2]:
%%writefile dp.json
{
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "steps_per_print": 2000,
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "none"
        },
        "offload_optimizer": {
            "device": "none"
        },
        "stage3_param_persistence_threshold": 1e4,
        "stage3_max_live_parameters": 3e7,
        "stage3_prefetch_bucket_size": 3e7,
        "memory_efficient_linear": false
    },
    "bf16": {
        "enabled": true,
        "loss_scale_window": 50,
        "min_loss_scale": 1e-10
    },
    "gradient_clipping": 1.0,
    "gradient_accumulation_steps": "auto",
    "prescale_gradients": false,
    "wall_clock_breakdown": false,
    "hybrid_engine": {
        "enabled": false,
        "max_out_tokens": 512,
        "inference_tp_size": 1,
        "release_inference_cache": false,
        "pin_parameters": true,
        "tp_gather_partition_size": 8
    }
    
}

Writing dp.json


In [3]:
%%writefile model.py
from transformers import AutoModelForCausalLM, AutoModel, AutoConfig
import torch
from transformers import AutoTokenizer
from torch import nn
from transformers.models.mt5.modeling_mt5 import MT5Attention
import torch.nn.functional as F
from peft import get_peft_model, LoraConfig, PeftModel, PeftConfig


class MLP(nn.Module):
    """Two-layer feed-forward projector from the encoder width to the LLM width.

    Layout: ``Linear(mt_dim -> 2 * mt_dim) -> ReLU -> Linear(2 * mt_dim -> llm_dim)``.

    Args:
        mt_dim: size of the last dimension of the incoming hidden states.
        llm_dim: size of the last dimension of the produced hidden states.
    """

    def __init__(self, mt_dim, llm_dim):
        super().__init__()
        widened_dim = mt_dim * 2
        self.linear1 = nn.Linear(mt_dim, widened_dim)
        self.linear2 = nn.Linear(widened_dim, llm_dim)
        self.relu = nn.ReLU()

    def forward(self, mt_hidden_state):
        """Apply linear1 -> ReLU -> linear2 over the last dimension of the input."""
        return self.linear2(self.relu(self.linear1(mt_hidden_state)))

class Mapping(nn.Module):
    """Projects encoder hidden states into the LLM space and owns a boundary embedding.

    Attributes:
        mlp: the ``MLP`` projector applied in ``forward``.
        end_boundary: trainable parameter of shape ``(1, 1, llm_dim)``, initialised
            to zeros and handed out unchanged by ``get_embed``.
    """

    def __init__(self, mt_dim, llm_dim):
        super().__init__()
        self.mlp = MLP(mt_dim, llm_dim)
        boundary_init = torch.zeros(1, 1, llm_dim)
        self.end_boundary = nn.Parameter(boundary_init, requires_grad=True)

    def forward(self, hidden_states):
        """Return ``hidden_states`` passed through the MLP projector."""
        return self.mlp(hidden_states)

    def get_embed(self):
        """Return the trainable end-boundary parameter (not a copy)."""
        return self.end_boundary

class BiasTunedLinear(nn.Module):
    """Wrapper around an ``nn.Linear`` that adds a bias offset and a global scale.

    The output is ``scale_tuned * (linear(x) + bias_delta)``. With the initial
    values (``bias_delta`` all zeros, ``scale_tuned`` equal to one) the wrapper
    reproduces the wrapped layer exactly.

    The wrapped layer's ``weight`` and ``bias`` are also bound directly on the
    wrapper so code that reads ``module.weight`` / ``module.bias`` keeps working;
    ``bias`` is ``None`` when the wrapped layer has no bias.
    """

    def __init__(self, linear: nn.Linear):
        super().__init__()
        # The pretrained layer is kept as a submodule.
        self.linear = linear
        # Aliases of the wrapped layer's own tensors (same objects, not copies).
        self.weight = linear.weight
        self.bias = linear.bias

        # Newly introduced tuning parameters.
        out_width = linear.out_features
        self.bias_delta = nn.Parameter(torch.zeros(out_width))
        self.scale_tuned = nn.Parameter(torch.ones(1))

    def forward(self, x):
        """Return the wrapped layer's output, shifted by ``bias_delta`` then scaled."""
        shifted = self.linear(x) + self.bias_delta
        return self.scale_tuned * shifted

def apply_bias_tuning(module: nn.Module):
    """Swap every ``nn.Linear`` found under ``module`` for a ``BiasTunedLinear``.

    The tree is walked depth-first; the replacement happens in place on the
    parent via ``setattr`` and nothing is returned. Children that are not
    ``nn.Linear`` are descended into instead of being replaced.
    """
    # Snapshot the children first because the loop mutates the parent.
    for child_name, child_module in list(module.named_children()):
        if not isinstance(child_module, nn.Linear):
            apply_bias_tuning(child_module)
            continue
        setattr(module, child_name, BiasTunedLinear(child_module))

class GatedMT5Attention(MT5Attention):
    """MT5 attention extended with learnable prompt vectors and a per-head gate.

    On top of the parent module this class adds two parameters:

    * ``prompts`` -- ``(prompt_length, hidden_size)``, initialised to zeros. They are
      prepended to ``hidden_states`` before the key/value projections.
    * ``gate`` -- one scalar per attention head, initialised to zeros. ``tanh(gate)``
      scales the attention weights over the prompt positions, so the prompt
      contribution is exactly zero at initialisation.

    Attention over prompt positions and over token positions is normalised with two
    separate softmaxes; the position bias (and mask) is only added to the token
    scores, and dropout is only applied to the token attention weights.

    NOTE(review): ``n_heads``, ``key_value_proj_dim``, ``inner_dim``, ``q``/``k``/``v``/``o``,
    ``dropout``, ``pruned_heads``, ``has_relative_attention_bias``,
    ``gradient_checkpointing`` and ``compute_bias`` are not defined here and are
    presumably provided by ``MT5Attention`` -- confirm against the installed
    transformers version.

    Args:
        config: model config forwarded to ``MT5Attention.__init__``.
        prompt_length: number of prompt vectors to learn.
        hidden_size: width of each prompt vector (should match the width of
            ``hidden_states`` because the two are concatenated in ``forward``).
        pretrained_attention: optional attention module whose ``state_dict`` is
            copied into this one.
        has_relative_attention_bias: forwarded to ``MT5Attention.__init__``.
    """
    def __init__(self, config, prompt_length, hidden_size, pretrained_attention=None, has_relative_attention_bias=False):
        super().__init__(config, has_relative_attention_bias)
        if pretrained_attention is not None:
            # strict=False tolerates any key mismatch between the two modules.
            # `gate` and `prompts` are created below, after loading, so they are
            # never overwritten by the pretrained weights.
            self.load_state_dict(pretrained_attention.state_dict(), strict=False)
        self.gate = nn.Parameter(torch.zeros(self.n_heads))
        self.prompts = nn.Parameter(torch.zeros(prompt_length, hidden_size))

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

        Returns a tuple ``(attn_output, past_key_value, position_bias)``, extended with
        the concatenated prompt+token attention weights when ``output_attentions`` is
        true. ``past_key_value`` is returned exactly as received; ``layer_head_mask``
        and ``use_cache`` are accepted but not used in this implementation.
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder)
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None
        
        # Queries are computed from the original tokens only, before the prompts are prepended.
        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        # Broadcast the shared prompts over the batch and prepend them along the sequence axis.
        prompt_embeds = self.prompts
        prompt_length = prompt_embeds.size(0)
        prompt = prompt_embeds.unsqueeze(0).expand(batch_size, -1, -1)
        hidden_states = torch.cat([prompt, hidden_states], dim=1)


        # NOTE(review): in the cross-attention case keys/values come from
        # `key_value_states`, which does NOT contain the prompts, yet the slicing
        # below still treats the first `prompt_length` positions as prompts.
        # This looks correct only for self-attention -- confirm intended usage.
        current_states = key_value_states if is_cross_attention else hidden_states
        key_states = self.k(current_states)
        value_states = self.v(current_states)
        key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        # Split projected keys/values back into their prompt part and token part.
        prompt_key = key_states[:, :, :prompt_length, :]
        token_key = key_states[:, :, prompt_length:, :]
        prompt_value = value_states[:, :, :prompt_length, :]
        token_value = value_states[:, :, prompt_length:, :]

        # Two independent score matrices: queries vs. prompts and queries vs. tokens.
        prompt_scores = torch.matmul(query_states, prompt_key.transpose(3, 2))
        scores = torch.matmul(query_states, token_key.transpose(3, 2))
        
        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
        # scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = token_key.shape[-2]
            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
            # NOTE(review): this indexes `cache_position`, which defaults to None; it
            # will fail if neither `query_length` nor `cache_position` is supplied.
            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device, cache_position=cache_position
                )
                position_bias = position_bias[:, :, -seq_length:, :]

            # The mask is folded into the bias only when the bias is built here; a
            # bias passed in by the caller is used as-is.
            if mask is not None:
                causal_mask = mask[:, :, :, : token_key.shape[-2]]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            # `mask` is re-used here as a per-head keep vector (1 = keep, 0 = pruned).
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        # Bias/mask only affects the token scores; prompt scores are left untouched.
        scores += position_bias_masked

        # Softmax over the prompt positions, then scale per head by tanh(gate);
        # the gate is reshaped to broadcast as (1, n_heads, 1, 1).
        attn_weights_prompts = nn.functional.softmax(prompt_scores.float(), dim=-1).type_as(prompt_scores)
        gated_prompt_scores = torch.tanh(self.gate).unsqueeze(0).unsqueeze(2).unsqueeze(3) * attn_weights_prompts
        
        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        # Dropout is applied to the token weights only, before the prompt weights are attached.
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Re-assemble [prompt | token] weights and values in the same order so they line up.
        attn_weights = torch.cat([gated_prompt_scores, attn_weights], dim=-1)
        value_states = torch.cat([prompt_value, token_value], dim=2)
        
        attn_output = torch.matmul(attn_weights, value_states)

        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, inner_dim).
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.o(attn_output)

        outputs = (attn_output, past_key_value, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs

class MultilingualEmbeddingModel(nn.Module):
    """mT5 encoder with gated prompt attention and bias/scale tuning.

    Construction steps:

    1. Load the checkpoint with ``AutoModel`` and keep only its ``encoder``.
    2. Replace the self-attention of the blocks listed in ``prompt_layers`` with
       ``GatedMT5Attention`` initialised from the pretrained attention weights.
    3. Wrap every ``nn.Linear`` in the encoder with ``BiasTunedLinear`` (this runs
       after step 2, so the projections inside the gated attention are wrapped too).
    4. Freeze every parameter whose name does not contain one of the substrings
       ``".bias"``, ``".scale"``, ``"prompts"``, ``"gate"`` or ``"layer_norm"``.

    Args:
        embedding_model: checkpoint name; must be one of ``ALLOWED_MODELS``.
        max_seq_len: maximum number of tokens kept per input text.
        prompt_length: number of prompt vectors per gated attention layer.
        prompt_layers: indices into ``encoder.block`` whose self-attention is
            replaced by the gated variant.

    Raises:
        ValueError: if ``embedding_model`` is not in ``ALLOWED_MODELS``.
    """

    ALLOWED_MODELS = {
        "google/mt5-small",
        "google/mt5-base",
        "google/mt5-large",
        "google/mt5-xl",
        "DKYoon/mt5-small-lm-adapt",
        "DKYoon/mt5-large-lm-adapt",
        "DKYoon/mt5-xl-lm-adapt",
    }

    def __init__(self, embedding_model, max_seq_len, prompt_length=10, prompt_layers=(-4, -3, -2, -1)):
        super().__init__()

        if embedding_model not in self.ALLOWED_MODELS:
            raise ValueError(f"Model is not in allowed models: {self.ALLOWED_MODELS}")

        # Only the encoder half of the seq2seq model is kept.
        self.embedding_model = AutoModel.from_pretrained(embedding_model)
        self.embedding_model = self.embedding_model.encoder

        self.tokenizer = AutoTokenizer.from_pretrained(embedding_model)

        self.embedding_dim = self.embedding_model.config.hidden_size

        self.max_seq_len = max_seq_len

        self.num_layers = len(self.embedding_model.block)

        self.prompt_length = prompt_length

        for idx in prompt_layers:
            attention_layer = self.embedding_model.block[idx].layer[0]
            attention_layer.SelfAttention = GatedMT5Attention(
                self.embedding_model.config,
                self.prompt_length,
                self.embedding_dim,
                pretrained_attention=attention_layer.SelfAttention,
            )

        apply_bias_tuning(self.embedding_model)

        # Name-substring filter deciding what stays trainable.
        trainable_tokens = (
            ".bias",        # matches BiasTunedLinear's `bias_delta` (and any `.bias`)
            ".scale",       # matches BiasTunedLinear's `scale_tuned`
            "prompts",      # gated prompts
            "gate",         # gated heads
            "layer_norm",   # MT5 layer norms
        )
        for n, p in self.named_parameters():
            p.requires_grad = any(tok in n for tok in trainable_tokens)

    def _input_device(self):
        """Device on which input tensors must live (that of the encoder weights)."""
        first_param = next(self.embedding_model.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def get_input_embeddings(self, model, input_ids):
        """Look up ``input_ids`` in ``model``'s input embedding table."""
        return model.get_input_embeddings()(input_ids)

    def get_last_hidden_states(self, encoded_inputs, model, tokenizer):
        """Tokenize the texts, run ``model`` and return ``(last_hidden_state, attention_mask)``."""
        input_ids, attention_mask = self.mt_input_features(encoded_inputs, tokenizer)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state, attention_mask

    def mt_input_features(self, input_texts_m2m, tokenizer):
        """Tokenize a batch of texts and right-pad them to a common length.

        Each text is tokenized on its own (truncated to ``self.max_seq_len``), then
        every sequence is padded on the right with ``tokenizer.pad_token_id`` and an
        attention mask of 0 up to the longest sequence in the batch.

        Returns:
            ``(input_ids, attention_mask)`` -- two ``torch.long`` tensors of shape
            ``(batch, longest_sequence)`` placed on the encoder's device.

        Raises:
            ValueError: if ``input_texts_m2m`` is empty.
        """
        input_ids_m2m, attention_mask_m2m = [], []
        for input_text_m2m in input_texts_m2m:
            encoding_m2m = tokenizer(input_text_m2m,
                                     padding='longest',
                                     max_length=self.max_seq_len,
                                     truncation=True)
            input_ids_m2m.append(list(encoding_m2m.input_ids))
            attention_mask_m2m.append(list(encoding_m2m.attention_mask))
        if not input_ids_m2m:
            raise ValueError("mt_input_features received an empty batch of texts")

        max_len = max(len(item) for item in input_ids_m2m)
        m2m_pad_id = tokenizer.pad_token_id
        for ids_item, mask_item in zip(input_ids_m2m, attention_mask_m2m):
            missing = max_len - len(ids_item)
            ids_item.extend([m2m_pad_id] * missing)
            mask_item.extend([0] * missing)

        # Follow the encoder's device instead of assuming CUDA is present.
        device = self._input_device()
        input_ids_m2m = torch.tensor(input_ids_m2m, dtype=torch.long, device=device)
        attention_mask_m2m = torch.tensor(attention_mask_m2m, dtype=torch.long, device=device)
        return input_ids_m2m, attention_mask_m2m

    def forward(self, encoded_inputs):
        """Encode a batch of raw texts.

        Returns:
            ``(embeddings, attention_mask)`` from ``get_last_hidden_states`` using
            this module's encoder and tokenizer.
        """
        embeddings, attention_mask = self.get_last_hidden_states(
            encoded_inputs,
            self.embedding_model,
            self.tokenizer,
        )

        return embeddings, attention_mask

class MPTModel(nn.Module):
    """Multilingual encoder + mapping layer feeding a frozen causal LLM.

    The LLM input is assembled in embedding space as
    ``[BOS | mapped encoder states | end-boundary | prompt tokens | label tokens]``,
    padding is squeezed out (see ``squeeze_pad``), and the result is either passed
    to ``generate`` (no labels) or used to compute the LM loss (labels given).

    Expected ``config`` keys: ``max_gen_len``, ``mt_path``, ``max_seq_len``,
    ``llm_path``, ``llm_pad_token_id``, ``llm_bos_token_id``.
    """

    def __init__(self, config):
        super(MPTModel, self).__init__()
        self.config = config  # Ensure there is a config attribute
        self.max_gen_len = config['max_gen_len']
        self.encoder_mt = MultilingualEmbeddingModel(config['mt_path'], config['max_seq_len'])

        model_llm = AutoModelForCausalLM.from_pretrained(config['llm_path'])

        self.model_llm = model_llm

        # Stored for later use; the LoRA wrapping itself is currently disabled below.
        self.lora_config = LoraConfig(
            r=32,
            lora_alpha=128,  # Maintains an effective scaling factor of 4
            target_modules=["q_proj", "k_proj", "v_proj"],
            lora_dropout=0.1,
            bias="all",
            task_type="CAUSAL_LM",
        )

        # Every LLM parameter is frozen.
        self.llm_embedding_layer = self.model_llm.get_input_embeddings()
        for name, parameter in self.model_llm.named_parameters():
            parameter.requires_grad = False

        # self.model_llm = get_peft_model(self.model_llm, self.lora_config)

        # for name, param in self.model_llm.named_parameters():
        #     if "lora" in name:
        #         param.requires_grad = True

        d_model = self.encoder_mt.embedding_dim
        self.mapping = Mapping(d_model, model_llm.config.hidden_size)
        self.llm_pad_token_id = config['llm_pad_token_id']
        self.llm_bos_token_id = config['llm_bos_token_id']
        print('mapping layer size:', sum(param.numel() for param in self.mapping.parameters()) / 1000000)

    def squeeze_pad(self, hidden_states, masks):
        """Move padding to the front of each row and drop all-padding columns.

        Args:
            hidden_states: tensor of shape ``(batch, seq, dim)``.
            masks: tensor of shape ``(batch, seq)``; non-zero marks a real position.

        Returns:
            ``(hidden_states, masks, keep)`` where the first two have the
            all-padding columns removed and ``keep`` is a boolean tensor of the
            original ``(batch, seq)`` shape marking the columns that were kept.
        """
        x_01 = (masks != 0).long()

        # Real positions get their 1-based index as sort key, padding gets 0, so an
        # ascending sort puts padding first and keeps real positions in order.
        # The relative order among padded positions is left to torch.sort.
        seq_num_len = x_01.size(1)
        offset = torch.arange(1, seq_num_len + 1, dtype=torch.long, device=x_01.device)
        offset = offset.unsqueeze(dim=0).expand_as(x_01)
        x_01 *= offset
        _, idx = x_01.sort(1, descending=False)

        masks = masks.gather(1, idx)
        idx = idx.unsqueeze(dim=-1).expand_as(hidden_states)
        hidden_states = hidden_states.gather(1, idx)

        # A column survives if at least one row has a real token in it.
        bs, _, dim = hidden_states.size()
        masks_sum = torch.sum(masks, dim=0)
        idx = masks_sum > 0
        idx = idx.unsqueeze(dim=0).expand_as(masks)
        masks = masks[idx]
        idx_ex = idx.unsqueeze(dim=-1).expand_as(hidden_states)
        hidden_states = hidden_states[idx_ex]
        hidden_states = hidden_states.view(bs, -1, dim)
        masks = masks.view(bs, -1)

        return hidden_states, masks, idx

    def merge_loara(self):
        """Merge adapters into the LLM and save it (name kept as-is for callers).

        NOTE(review): this calls ``merge_and_unload`` on ``self.model_llm``, but the
        ``get_peft_model`` wrapping in ``__init__`` is commented out -- presumably
        this only works once that wrapping is re-enabled; confirm before use.
        """
        self.model_llm = self.model_llm.merge_and_unload()
        self.model_llm.save_pretrained('/kaggle/working/llama_1b_xcsqa/')

    def forward(self, encoded_inputs,
                labels=None, mask_label=None, input_ids_prompt=None, mask_prompt=None):
        """Generate token ids (no ``labels``) or return the LM loss (``labels`` given).

        Args:
            encoded_inputs: batch of raw texts handed to the multilingual encoder.
            labels: optional target token ids, shape ``(batch, label_len)``.
            mask_label: attention mask for ``labels`` (required with ``labels``).
            input_ids_prompt: optional LLM prompt token ids.
            mask_prompt: attention mask for ``input_ids_prompt``.

        Returns:
            The output of ``model_llm.generate`` when ``labels`` is ``None``,
            otherwise ``output.loss`` of the LLM forward pass.
        """
        # Build helper tensors on the LLM's device rather than assuming CUDA.
        device = self.llm_embedding_layer.weight.device

        end_boundary = self.mapping.get_embed()
        bs = len(encoded_inputs)
        end_boundary = end_boundary.expand([bs, 1, end_boundary.size(-1)])

        # NOTE(review): `mask` is reused below for the end-boundary position, so
        # without a BOS token the end boundary is masked out as well -- confirm
        # this is intended.
        if self.llm_bos_token_id is None:
            bos = torch.full((bs,), self.llm_pad_token_id, dtype=torch.long, device=device)
            mask = torch.zeros([bs, 1], dtype=torch.long, device=device)
        else:
            bos = torch.full((bs,), self.llm_bos_token_id, dtype=torch.long, device=device)
            mask = torch.ones([bs, 1], dtype=torch.long, device=device)
        bos_embedding = self.llm_embedding_layer(bos)
        bos_embedding = bos_embedding.view(bs, 1, -1)
        llm_input_embedding = bos_embedding
        llm_input_mask = mask

        mt_encoder_outputs, attention_mask_mt = self.encoder_mt(encoded_inputs)

        mt_hidden_state = self.mapping(mt_encoder_outputs)
        llm_input_embedding = torch.cat([llm_input_embedding, mt_hidden_state, end_boundary],
                                        dim=1)
        llm_input_mask = torch.cat([llm_input_mask, attention_mask_mt, mask], dim=1)

        if input_ids_prompt is not None:

            hidden_states_prompt = self.llm_embedding_layer(input_ids_prompt)
            llm_input_embedding = torch.cat([llm_input_embedding, hidden_states_prompt], dim=1)
            llm_input_mask = torch.cat([llm_input_mask, mask_prompt], dim=1)
        if labels is not None:
            # Everything before the labels is ignored by the loss (-100 everywhere).
            pad_labels = torch.full_like(llm_input_mask, -100)
            label_embedding = self.llm_embedding_layer(labels)
            llm_input_embedding = torch.cat([llm_input_embedding, label_embedding], dim=1)
            llm_input_mask = torch.cat([llm_input_mask, mask_label], dim=1)
            # Masked-out label positions are ignored by the loss as well.
            labels = labels * mask_label - 100 * (1 - mask_label)
            labels = torch.cat([pad_labels, labels], dim=1)

        llm_input_embedding, llm_input_mask, cut_pad_idx \
            = self.squeeze_pad(llm_input_embedding, llm_input_mask)

        if labels is None:
            generate_ids = self.model_llm.generate(inputs_embeds=llm_input_embedding,
                                                   attention_mask=llm_input_mask,
                                                   max_new_tokens=self.max_gen_len,
                                                   pad_token_id=self.llm_pad_token_id,
                                                   do_sample=False)
            return generate_ids
        else:
            # Labels are not re-sorted; only the all-padding columns dropped by
            # squeeze_pad are removed from them.
            bs = labels.size(0)
            labels = labels[cut_pad_idx]
            labels = labels.view(bs, -1)
            output = self.model_llm(inputs_embeds=llm_input_embedding,
                                    attention_mask=llm_input_mask,
                                    labels=labels)
            return output.loss

Writing model.py


In [4]:
%%writefile read_data.py
import math
import os
import random
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, SequentialSampler
import json
from datasets import load_dataset
from torch.utils.data import Dataset
from huggingface_hub import login, upload_file

class ExperimentDataset(Dataset):
    """Map-style ``Dataset`` that exposes an indexable collection unchanged.

    Length and item access are delegated straight to the wrapped ``dataset``.
    """

    def __init__(self, dataset) -> None:
        super().__init__()
        self.dataset = dataset

    def __len__(self):
        """Number of samples in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.dataset[idx]

def llm_input_features(input_texts_llm, tokenizer_llm,
                         max_seq_len, add_bos_token, add_eos_token, device=None):
    """Tokenize a batch of texts for the LLM and return ids plus attention mask.

    Args:
        input_texts_llm: batch of strings to tokenize.
        tokenizer_llm: tokenizer; its ``add_bos_token`` attribute is overwritten
            as a side effect.
        max_seq_len: truncation length passed to the tokenizer.
        add_bos_token: value assigned to ``tokenizer_llm.add_bos_token``. The
            tokenizer is called with ``add_special_tokens=False``.
        add_eos_token: when true, ``tokenizer_llm.eos_token`` is appended to every
            text before tokenization.
        device: target device for the returned tensors. Defaults to CUDA when
            available, otherwise CPU.

    Returns:
        ``(input_ids, attention_mask)`` padded to the longest sequence; the last
        column of the attention mask is always forced to 1.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer_llm.add_bos_token = add_bos_token
    # tokenizer_llm.add_eos_token = add_eos_token
    if add_eos_token:
        input_texts_llm = [f"{prompt}{tokenizer_llm.eos_token}" for prompt in input_texts_llm]
    encoding_llm = tokenizer_llm(input_texts_llm,
                         padding='longest',
                         max_length=max_seq_len,
                         truncation=True,
                         add_special_tokens = False,
                         return_tensors="pt")
    input_ids_llm = encoding_llm.input_ids.to(device)
    attention_mask_llm = encoding_llm.attention_mask.to(device)
    # Force the final position of every row to be attended.
    attention_mask_llm[:, -1] = 1
    return input_ids_llm, attention_mask_llm

def read_dataset(path):
    """Load a dataset file, choosing the parser from the file-name suffix.

    * ``*.jsonl`` -- one JSON document per line; blank lines are skipped.
      Returns a list of the decoded objects.
    * ``*.json`` -- a single JSON document. If it is a dict containing a
      ``'data'`` key, the value under that key is returned instead.
    * anything else -- returns the raw text lines (newlines included).

    The suffix check is case-insensitive and looks only at the file name, so a
    directory that merely contains "json" in its name does not affect parsing.
    """
    file_name = os.path.basename(path).lower()

    if file_name.endswith('.jsonl'):
        with open(path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    if file_name.endswith('.json'):
        with open(path, 'r', encoding='utf-8') as f:
            dataset = json.load(f)
        if isinstance(dataset, dict) and 'data' in dataset:
            dataset = dataset['data']
        return dataset

    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()

Writing read_data.py


In [5]:
%%writefile utils.py
import os
import random
import torch
from tqdm import tqdm
from read_data import llm_input_features
import numpy as np

def set_seed(seed):
    """Seed every random number generator used by the project.

    Sets ``PYTHONHASHSEED`` in the environment, seeds Python's ``random``, NumPy
    and PyTorch (CPU, current CUDA device and all CUDA devices), and switches
    cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch_seeders = (
        torch.manual_seed,           # cpu
        torch.cuda.manual_seed,      # current gpu
        torch.cuda.manual_seed_all,  # every gpu
    )
    for seed_fn in torch_seeders:
        seed_fn(seed)
    # Ask cuDNN to use deterministic algorithms.
    torch.backends.cudnn.deterministic = True

def save_with_accelerate(accelerator, model, output_dir, model_name='pytorch_model.bin'):
    """Save ``model`` into ``output_dir`` through ``accelerator.save_model``.

    The directory is created if needed, then all processes are synchronised via
    ``accelerator.wait_for_everyone()`` before saving with a 30GB shard limit and
    ``safe_serialization=False``. ``model_name`` is accepted but not used.
    """
    os.makedirs(output_dir, exist_ok=True)
    accelerator.wait_for_everyone()
    accelerator.save_model(
        model,
        output_dir,
        max_shard_size="30GB",
        safe_serialization=False,
    )


def get_train_ds_config(train_batch_size=1,
                        train_micro_batch_size_per_gpu=1,
                        lr=2e-5,
                        gradient_accumulation_steps=1,
                        offload=True,
                        stage=2,
                        enable_hybrid_engine=False,
                        inference_tp_size=1,
                        release_inference_cache=False,
                        pin_parameters=True,
                        tp_gather_partition_size=8,
                        max_out_tokens=512,
                        warm_step=0,
                        train_step=0):
    """Assemble a DeepSpeed-style training configuration dictionary.

    ``offload`` selects ``"cpu"`` (true) or ``"none"`` (false) as the device for
    both parameter and optimizer offloading. The returned dict always has bf16
    disabled, gradient clipping at 1.0, an AdamW optimizer with betas
    ``[0.8, 0.999]`` and a ``WarmupCosineLR`` scheduler driven by ``warm_step`` /
    ``train_step``; the remaining arguments are copied into their matching keys.
    """
    offload_device = "cpu" if offload else "none"

    zero_section = {
        "stage": stage,
        "offload_param": {"device": offload_device},
        "offload_optimizer": {"device": offload_device},
        "stage3_param_persistence_threshold": 1e4,
        "stage3_max_live_parameters": 3e7,
        "stage3_prefetch_bucket_size": 3e7,
        "memory_efficient_linear": False,
    }

    hybrid_section = {
        "enabled": enable_hybrid_engine,
        "max_out_tokens": max_out_tokens,
        "inference_tp_size": inference_tp_size,
        "release_inference_cache": release_inference_cache,
        "pin_parameters": pin_parameters,
        "tp_gather_partition_size": tp_gather_partition_size,
    }

    optimizer_section = {
        "type": "AdamW",
        "params": {"lr": lr, "betas": [0.8, 0.999], "eps": 1e-8},
    }

    scheduler_section = {
        "type": "WarmupCosineLR",
        "params": {"total_num_steps": train_step, "warmup_num_steps": warm_step},
    }

    ds_config = {
        "train_batch_size": train_batch_size,
        "train_micro_batch_size_per_gpu": train_micro_batch_size_per_gpu,
        "steps_per_print": 2000,
        "zero_optimization": zero_section,
        "bf16": {"enabled": False},
        "gradient_clipping": 1.0,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "prescale_gradients": False,
        "wall_clock_breakdown": False,
        "hybrid_engine": hybrid_section,
        "optimizer": optimizer_section,
        "scheduler": scheduler_section,
    }
    return ds_config

def evaluate_classification(model, test_set, tokenizer_llm, max_gen_len, use_prompt):
    """Run exact-match evaluation of ``model`` over ``test_set``.

    Each batch from ``test_set`` must provide ``'prompt'`` and ``'target'``
    sequences. The model is called on the prompts (optionally also receiving the
    tokenized prompts when ``use_prompt`` is true), the generated ids are decoded
    with ``tokenizer_llm`` and the stripped prediction is compared to the target
    with ``==``.

    The model is switched to eval mode and left there; generation runs under
    ``torch.no_grad()`` so no autograd graph is built during evaluation.

    NOTE(review): ``max_gen_len`` is passed to ``llm_input_features`` as the
    prompt truncation length -- confirm that this is the intended limit.

    Returns:
        ``(acc, results_list)`` -- accuracy in percent rounded to two decimals
        (``0.0`` when nothing was evaluated) and one dict per sample with the
        keys ``'prompt'``, ``'prediction'`` and ``'answer'``.
    """
    model.eval()
    results_list = []
    hit = 0
    step_trange = tqdm(test_set)
    with torch.no_grad():
        for test_step in step_trange:
            prompts = test_step['prompt']
            targets = test_step['target']
            input_ids_prompt, mask_prompt = None, None
            if use_prompt:
                add_bos_token = False
                add_eos_token = False
                input_ids_prompt, mask_prompt = llm_input_features(prompts, tokenizer_llm, max_gen_len, add_bos_token,
                                                               add_eos_token)
            generate_ids = model(prompts,
                                 input_ids_prompt=input_ids_prompt,
                                 mask_prompt=mask_prompt)

            results = tokenizer_llm.batch_decode(generate_ids,
                                                   skip_special_tokens=True,
                                                   clean_up_tokenization_spaces=False)

            for result, prompt, target in zip(results, prompts, targets):
                result = result.strip()
                results_list.append({
                    'prompt': prompt,
                    'prediction': result,
                    'answer': target
                })
                if target == result:
                    hit += 1

            # Running accuracy shown in the progress bar; guarded against an empty tally.
            acc = round(hit / len(results_list) * 100, 2) if results_list else 0.0
            loss_show = 'Acc:' + str(acc)
            step_trange.set_postfix_str(loss_show)

    # An empty test set yields 0.0 instead of a ZeroDivisionError.
    acc = round(hit / len(results_list) * 100, 2) if results_list else 0.0
    return acc, results_list


Writing utils.py


In [6]:
%%writefile trainer.py
import os
import random
from accelerate import Accelerator
from tqdm import tqdm
from torch.utils.data import DataLoader, SequentialSampler
from transformers import AutoTokenizer, get_scheduler
from datasets import load_dataset
from huggingface_hub import login, upload_file
import json
import torch
import math
from model import MPTModel
from utils import set_seed, save_with_accelerate
from read_data import ExperimentDataset, llm_input_features, read_dataset

def construct_prompt(sample):
    """Wrap a news sentence in the instruction/response classification template."""
    return (
        "### Instruction:\n"
        f"News Sentence: {sample}\n"
        "Classify the given news sentence into one of the following categories.\n"
        "Business, Entertainment, Political, Sports, Science.\n\n"
        "### Response:"
    )


class Arguments:
    """Hard-coded run configuration for the news-classification training script.

    ``gradient_accumulation`` is derived as the total batch size integer-divided
    by the per-GPU batch size; every other attribute is a fixed literal.
    """

    def __init__(self):
        per_gpu_batch = 8
        total_batch = 8
        accumulation_steps = total_batch // per_gpu_batch

        # Model checkpoints
        self.llm_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
        self.mt_path = "google/mt5-large"

        # Data sizes
        self.train_num = 8888
        self.dev_size = 1000

        # Optimisation
        self.lr = 3e-5
        self.epoch_num = 3
        self.gradient_accumulation = accumulation_steps

        # Sequence limits
        self.max_seq_len = 200
        self.max_gen_len = 200

        # Batch sizes
        self.train_batch_size = total_batch
        self.eval_batch_size = per_gpu_batch
        self.train_micro_batch_size_per_gpu = per_gpu_batch

        # Run naming / logging
        self.augmentation = False
        self.save_name = 'news'
        self.stage_name = 'no_aug'
        self.report_to = 'wandb'
        self.logging_steps = 100

        # Learning-rate schedule
        self.warm_rate = 0.05
        self.lr_scheduler_name = 'cosine'

        # Optional extras
        self.system_prompt = None
        self.init_checkpoint = None

def main():
    
    args = Arguments()
    
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    accelerator_log_kwargs = {}
    
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation,
        **accelerator_log_kwargs
    )

    accelerator.wait_for_everyone()
    set_seed(0)

    llm_path = args.llm_path
    mt_path = args.mt_path
    
    token = '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'
    login(token=token)

    train_samples = []

    ds = load_dataset("Themira/en_si_news_classification_with_label_name")

    train_ds = ds['train_en']
        
    for i in range(len(train_ds)):
        example = train_ds[i]
        sample = {}
        sample['prompt'] = construct_prompt(example['sentence'])
        sample['target'] = example['label']
        train_samples.append(sample)

    args.train_num = len(train_samples)
    
    train_set = train_samples[args.dev_size:]
    dev_set = train_samples[:args.dev_size]
    
    dev_set = ExperimentDataset(
        dev_set
    )
    
    train_set = ExperimentDataset(
        train_set
    )

    train_num = args.train_num
    lr = args.lr
    epoch_num = args.epoch_num
    gradient_accumulation = args.gradient_accumulation
    max_seq_len = args.max_seq_len
    max_gen_len = args.max_gen_len

    train_batch_size = args.train_batch_size
    eval_batch_size = args.eval_batch_size
    train_micro_batch_size_per_gpu = args.train_micro_batch_size_per_gpu

    augmentation = args.augmentation
    save_name = args.save_name
    stage_name = args.stage_name
    result_path_base = f'./results/{save_name}/{stage_name}/'
    output_model_path_base = f'./outputs/{save_name}/{stage_name}/'
    tokenizer_m2m = AutoTokenizer.from_pretrained(mt_path)
    tokenizer_llm = AutoTokenizer.from_pretrained(llm_path, use_fast=True)
    tokenizer_llm.pad_token = tokenizer_llm.eos_token
    tokenizer_llm.padding_side = "left"
    # tokenizer_llm.pad_token = "[PAD]"

    print(json.dumps({
        'llm_path': llm_path,
        'mt_path': mt_path,
        'lr': lr,
        'epoch_num': epoch_num,
        'gradient_accumulation': gradient_accumulation,
        'train_set:': len(train_set),
        'dev_set:': len(dev_set),
        'max_seq_len': max_seq_len,
        'max_gen_len': max_gen_len,
        'train_batch_size': train_batch_size,
        'result_path': result_path_base,
        'output_model_path': output_model_path_base,
    }, indent=2))


    model_config = {
        'mt_path': mt_path,
        'llm_path': llm_path,
        'max_gen_len': max_gen_len,
        'llm_bos_token_id': tokenizer_llm.bos_token_id,
        'llm_pad_token_id': tokenizer_llm.pad_token_id,
        'augmentation' :  augmentation,
        'max_seq_len': max_seq_len
    }

    model = MPTModel(model_config)

    if args.init_checkpoint is not None:
        init_checkpoint = args.init_checkpoint
        checkpoint = torch.load(init_checkpoint, map_location='cpu')
        #model_dict = checkpoint['model_state_dict']
        model.load_state_dict(checkpoint, True)
        print('mapping init from:', init_checkpoint)
    print(model)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"Parameter name: {name}, requires_grad={param.requires_grad}, shape={param.shape}")
    #train_sampler = RandomSampler(train_set)
    dev_sampler = SequentialSampler(dev_set)
    train_dataloader = DataLoader(
        dataset=train_set,
        batch_size=train_micro_batch_size_per_gpu,
        shuffle=True
    )
    
    dev_dataloader = DataLoader(
        dataset=dev_set,
        batch_size=eval_batch_size,
        shuffle=False,
        sampler=dev_sampler,
        num_workers=1,
        drop_last=False)

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_set)), 3):
        print(f"Sample {index} of the training set: {train_set[index]}.")
    
    # Optimizer
    optimizer = torch.optim.AdamW(parameters, betas=[0.8,0.999], eps=1e-8, weight_decay=3e-7, lr=args.lr)

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation)
    max_train_steps = args.epoch_num * num_update_steps_per_epoch
    overrode_max_train_steps = True

    # Create the learning rate scheduler.
    # Note: the current accelerator.step() calls the .step() of the real scheduler `num_processes` times. This is because it assumes
    # the user initializes the scheduler with the entire training set. In the case of data parallel training, each process only
    # sees a subset (1/num_processes) of the training set, so each process needs to update the lr multiple times so that the total
    # number of updates in the end matches the num_training_steps here.
    # Here we need to set num_training_steps either using the entire training set (when epochs is specified) or we need to multiply
    # num_training_steps by num_processes so that the total number of updates matches the num_training_steps.
    num_training_steps_for_scheduler = max_train_steps if overrode_max_train_steps else max_train_steps * accelerator.num_processes
    """
    get_scheduler Agrs
    name:
        LINEAR = "linear"
        COSINE = "cosine"
        COSINE_WITH_RESTARTS = "cosine_with_restarts"
        POLYNOMIAL = "polynomial"
        CONSTANT = "constant"
        CONSTANT_WITH_WARMUP = "constant_with_warmup"
        INVERSE_SQRT = "inverse_sqrt"
        REDUCE_ON_PLATEAU = "reduce_lr_on_plateau"
    """
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_name,
        optimizer=optimizer,
        num_training_steps=num_training_steps_for_scheduler,
        num_warmup_steps=int(num_training_steps_for_scheduler * args.warm_rate),
    )


    # Prepare everything with `accelerator`.
    model, optimizer, train_dataloader, dev_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, dev_dataloader, lr_scheduler
    )

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation)
    if overrode_max_train_steps:
        max_train_steps = epoch_num * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    epoch_num = math.ceil(max_train_steps / num_update_steps_per_epoch)
    
    # Train!
    total_batch_size = train_micro_batch_size_per_gpu * accelerator.num_processes * gradient_accumulation

    print("***** Running training *****")
    print(f"  Num examples transet = {len(train_set)}")
    print(f"  Num examples dataloader = {len(train_dataloader)}")
    print(f"  Num Epochs = {epoch_num}")
    print(f"  Instantaneous batch size per device = {train_micro_batch_size_per_gpu}")
    print(f"  Total train batch size (w. parLayAlignl, distributed & accumulation) = {total_batch_size}")
    print(f"  Gradient Accumulation steps = {gradient_accumulation}")
    print(f"  Total optimization steps = {max_train_steps}")
    print(f"  parameters = {parameters}")
    print(f"  optimizer = {optimizer}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0
    progress_bar.update(completed_steps)
    
    for epoch in range(epoch_num):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            with accelerator.accumulate(model):
                sources = batch['prompt']
                targets = batch['target']

                add_bos_token = False
                add_eos_token = True
                labels, mask_label = llm_input_features(targets, tokenizer_llm,
                                                        max_gen_len, add_bos_token, add_eos_token)

                input_ids_prompt, mask_prompt = None, None
                if augmentation:
                    add_bos_token = False
                    add_eos_token = False
    
                    llm_input_prompts = [i for i in sources]
                    
                    # if args.system_prompt is not None:
                    #     user_prompts = [
                    #         [{
                    #             "role": "system", "content": args.system_prompt
                    #         },
                    #         {
                    #             "role": "user", "content": user_prompt_function(sources[i])
                    #         }]
                    #         for i in range(len(sources))
                    #     ]

                    #     llm_input_prompts = [tokenizer_llm.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) for prompt in user_prompts]
                        
                    input_ids_prompt, mask_prompt = llm_input_features(llm_input_prompts, tokenizer_llm,
                                                                        max_gen_len, add_bos_token,
                                                                        add_eos_token)
                output_loss = model(sources,
                            input_ids_prompt=input_ids_prompt, mask_prompt=mask_prompt,
                            labels=labels, mask_label=mask_label)
                loss = output_loss
                total_loss += output_loss.detach().float()
                # We keep track of the loss at each logged step
                accelerator.backward(loss)
                # clip gradient norm. don't do this with deepspeed
                optimizer.step()
                optimizer.zero_grad()
                lr_scheduler.step()

            if accelerator.sync_gradients:
                progress_bar.update(1)
                completed_steps += 1
                
                if args.logging_steps and completed_steps % args.logging_steps == 0:
                    avg_loss = accelerator.gather(total_loss).mean().item() / gradient_accumulation / args.logging_steps
                    total_loss = 0                   
                    print(f"  Step: {completed_steps}, LR: {lr_scheduler.get_last_lr()[0]}, Loss: {avg_loss}")
                  
        epoch_model_path = f'./outputs/{save_name}/epoch_{epoch}_{stage_name}/'
        save_with_accelerate(accelerator, model, epoch_model_path)
        print('save epoch model')
    accelerator.wait_for_everyone()

# Script entry point: run training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Writing trainer.py


In [7]:
%%writefile eval.py
import torch.fx
from transformers import AutoTokenizer
import torch
import argparse
import ast
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import json
import os
import sys
import deepspeed
from transformers import AutoModelForCausalLM, AutoModel, AutoConfig
import math
import random
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset
from torch import nn
from datasets import load_dataset
from torch.utils.data import Dataset
from huggingface_hub import login, upload_file

from utils import get_train_ds_config, set_seed, evaluate_classification
from model import MPTModel
from read_data import ExperimentDataset, llm_input_features, read_dataset

def construct_prompt(sample):
    """Wrap a news sentence in the instruction/response classification prompt.

    Args:
        sample: The news sentence to classify; it is interpolated into the
            prompt with default string formatting.

    Returns:
        The full prompt string, ending with the "### Response:" marker and
        no trailing newline.
    """
    prompt_lines = (
        "### Instruction:",
        f"News Sentence: {sample}",
        "Classify the given news sentence into one of the following categories.",
        "Business, Entertainment, Political, Sports, Science.",
        "",  # blank line separating the instruction from the response marker
        "### Response:",
    )
    return "\n".join(prompt_lines)


def _build_test_set(split, source_language):
    """Convert one dataset split into the list of sample dicts used for evaluation.

    Args:
        split: Iterable of examples, each providing 'sentence' and 'label' entries.
        source_language: Language tag stored under 'source_language' in every sample.

    Returns:
        A list of dicts with 'prompt', 'target' and 'source_language' keys.
    """
    return [
        {
            'prompt': construct_prompt(example['sentence']),
            'target': example['label'],
            'source_language': source_language,
        }
        for example in split
    ]


def main():
    """Evaluate a trained MPTModel checkpoint on the English and Sinhala news test splits.

    Steps:
      1. Optionally log in to the Hugging Face Hub using the token found in the
         ``HF_TOKEN`` environment variable (no credential is stored in the source).
      2. Load the classification dataset and build one prompt/target sample per
         test example for each language.
      3. Build the model, load the checkpoint weights and move the model to CUDA.
      4. Run ``evaluate_classification`` per language, writing the per-example
         results to ``<result_path_base>/<language>.json``.
      5. Print the per-language scores and their average, and write the scores
         to ``<result_path_base>/scores.tsv``.
    """
    llm_path = "HuggingFaceTB/SmolLM2-135M-Instruct"
    mt_path = "google/mt5-large"
    max_seq_len = 512
    max_gen_len = 512
    eval_batch_size = 4
    augmentation = False
    save_name = "no_aug"
    task = "news"

    result_path_base = f'./results/{save_name}/{task}/'

    # Never embed access tokens in source: read the token from the environment
    # and only log in when one is provided.
    # NOTE(review): the token that was previously hard-coded here has been
    # published with the notebook and should be revoked.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    ds = load_dataset("Themira/en_si_news_classification_with_label_name")

    # One list of prompt/target samples per evaluation language.
    test_sets = {
        'English': _build_test_set(ds['test_en'], 'en'),
        'Sinhala': _build_test_set(ds['test_si'], 'si'),
    }

    os.makedirs(result_path_base, exist_ok=True)
    tokenizer_llm = AutoTokenizer.from_pretrained(llm_path, use_fast=True)
    # Reuse the EOS token for padding and pad on the left.
    tokenizer_llm.pad_token = tokenizer_llm.eos_token
    tokenizer_llm.padding_side = "left"
    print(json.dumps({
        'llm_path': llm_path,
        'mt_path': mt_path,
        'max_seq_len': max_seq_len,
        'max_gen_len': max_gen_len,
        'save_name': save_name,
        'result_path_base': result_path_base
    }, indent=2))
    print("cuda available: " , torch.cuda.is_available())

    # NOTE(review): ds_config is not consumed anywhere in this function; it is
    # only needed if DeepSpeed initialization is reinstated. The call is kept
    # in case get_train_ds_config has effects not visible here - confirm and
    # remove if it is a pure helper.
    train_micro_batch_size_per_gpu = 4
    train_batch_size = 4
    gradient_accumulation = 1
    ds_config = get_train_ds_config(train_batch_size=train_batch_size,
                                    train_micro_batch_size_per_gpu=train_micro_batch_size_per_gpu,
                                    gradient_accumulation_steps=gradient_accumulation,
                                    )

    model_config = {
        'mt_path': mt_path,
        'llm_path': llm_path,
        'max_gen_len': max_gen_len,
        'llm_bos_token_id': tokenizer_llm.bos_token_id,
        'llm_pad_token_id': tokenizer_llm.pad_token_id,
        'augmentation' :  augmentation,
        'max_seq_len': max_seq_len
    }
    init_checkpoint = "./outputs/news/epoch_2_no_aug/pytorch_model.bin"
    model = MPTModel(model_config)
    if init_checkpoint is not None:
        # Load on CPU first; the model is moved to the GPU afterwards.
        checkpoint = torch.load(init_checkpoint, map_location='cpu')
        # strict=True: every key in the checkpoint must match the model.
        model.load_state_dict(checkpoint, True)
        print('mapping init from:', init_checkpoint)
    model.to('cuda')

    scores_map = {}
    for test_lang, samples in test_sets.items():
        # Keep the original example order so results line up with the dataset.
        test_sampler = SequentialSampler(samples)
        test_loader = torch.utils.data.DataLoader(
            dataset=ExperimentDataset(samples),
            batch_size=eval_batch_size,
            sampler=test_sampler,
            shuffle=False,
            num_workers=1,
            drop_last=False)
        acc, results_list = evaluate_classification(model, test_loader, tokenizer_llm, max_gen_len, augmentation)

        print('test_lang:', test_lang, 'acc:', acc)
        scores_map[test_lang] = acc
        # os.path.join avoids the doubled slash that plain concatenation
        # produced with the trailing '/' of result_path_base.
        result_path = os.path.join(result_path_base, f'{test_lang}.json')
        with open(result_path, 'w', encoding='utf-8') as f:
            json.dump(results_list, f, ensure_ascii=False, indent=2)

    print(scores_map)
    print('Average accuracy :', round(sum(scores_map.values()) / len(test_sets), 1))
    score_path = os.path.join(result_path_base, 'scores.tsv')
    with open(score_path, 'w', encoding='utf-8') as f:
        for lang, score in scores_map.items():
            f.write(f'{lang}\t{score}\n')


if __name__ == "__main__":
    # Restrict the process to GPU 0 and turn off tokenizers parallelism
    # before anything else runs, then seed and start the evaluation.
    os.environ.update({
        'CUDA_VISIBLE_DEVICES': "0",
        "TOKENIZERS_PARALLELISM": "false",
    })
    set_seed(0)
    main()

Writing eval.py


In [8]:
%%writefile trainAndEval.sh
#!/bin/bash

# Use GPU 0 only (list more ids, comma-separated, to train on several GPUs)
export CUDA_VISIBLE_DEVICES=0

random_port(){
    # Pick a pseudo-random port for the launcher's main process and store it
    # in the global MASTER_PORT, echoing the choice for the log.
    #
    # The upper bound is 65535 (the highest valid TCP port). The previous bound
    # of 99999 described an invalid range and only yielded usable ports because
    # $RANDOM never exceeds 32767, so results stay within 30000..62767.
    local port_min=30000
    local port_max=65535
    MASTER_PORT=$((port_min + RANDOM % (port_max - port_min + 1)))
    echo "MASTER_PORT=$MASTER_PORT"
}

export_world_info() {
    # Set world info for deepspeed: defines NUM_GPUS and, when
    # CUDA_VISIBLE_DEVICES is empty or unset, exports it as the list of all
    # GPUs reported by nvidia-smi.
    if [ -z "$CUDA_VISIBLE_DEVICES" ]; then
        echo "CUDA_VISIBLE_DEVICES is not set"
        NUM_GPUS=$(nvidia-smi -L | wc -l)
        # Without this guard, zero GPUs would lead to `seq 0 -1`, an empty
        # device list and a launch with 0 processes.
        if [ "$NUM_GPUS" -lt 1 ]; then
            echo "No GPUs detected by nvidia-smi; aborting" >&2
            exit 1
        fi
        CUDA_VISIBLE_DEVICES=$(seq -s ',' 0 $((NUM_GPUS - 1)))
        echo "Use all GPUs"
        export CUDA_VISIBLE_DEVICES
        echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
    else
        # Count the comma-separated device ids; quoted to avoid word
        # splitting and globbing of the value.
        NUM_GPUS=$(echo "$CUDA_VISIBLE_DEVICES" | tr ',' '\n' | wc -l)
        echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
    fi
}

random_port
export_world_info

# Train. If training fails, stop with its exit status so evaluation never runs
# against a missing or stale checkpoint.
accelerate launch \
    --main_process_port "$MASTER_PORT" \
    --num_machines 1 \
    --num_processes "$NUM_GPUS" \
    --use_deepspeed \
    --deepspeed_config_file dp.json \
    trainer.py || exit $?

# Evaluate
python eval.py

Writing trainAndEval.sh


In [10]:
!bash trainAndEval.sh

MASTER_PORT=37288
CUDA_VISIBLE_DEVICES=0
[2025-04-27 10:56:50,572] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2025-04-27 10:56:53.016774: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745751413.040441     479 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745751413.047771     479 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-27 10:57:01.365677: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745751421.390555     525 cuda_dnn.cc:8310] Unable to 