In [1]:
'''
#some requirements
!pip install loralib
!pip install sentencepiece
!pip install sacrebleu
!pip install peft
'''
myrank = 16

In [2]:
from transformers import AutoTokenizer, AdamW, get_scheduler, AutoModelForSeq2SeqLM
import torch
import torch.nn as nn
from torch.utils.data import Dataset, random_split, DataLoader
from peft import LoraConfig, get_peft_model
from tqdm.auto import tqdm
import numpy as np
from sacrebleu.metrics import BLEU
import time
from matplotlib import pyplot as plt 
%matplotlib inline

#data prepare and process
f = open("chinese.txt", "r")
cndata = f.readlines()
f.close()

f = open("english.txt", "r")
endata = f.readlines()
f.close()

assert len(cndata) == len(endata) == 252777

mydata = [{"cn": cn.strip(), "en": en.strip()} for cn, en in zip(cndata, endata)]

class Mydataset(Dataset):
    def __init__(self, mydata) -> None:
        super().__init__()
        self.data = mydata
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        return self.data[index]

mydata = Mydataset(mydata)
train_size = int(0.8 * 252777)
val_size = 252777 - train_size
trainset, valset = random_split(mydata, lengths=[train_size, val_size])

model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 128
max_target_length = 128



In [3]:
#model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
#model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, force_download=True, resume_download=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
config = LoraConfig(
    use_dora=True,
    r=myrank, #attention heads
    lora_alpha=32, #alpha scaling
    target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)
model = model.to(device)

assert model.model.model.encoder.layers[0].self_attn.v_proj.lora_A.default.weight.requires_grad == True
assert model.model.model.encoder.layers[0].self_attn.k_proj.weight.requires_grad == False

cuda
trainable params: 608256 || all params: 78551552 || trainable%: 0.7743398882812653


In [4]:
def mycollate_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample['cn'])
        batch_targets.append(sample['en'])
    batch_data = tokenizer(
        batch_inputs,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch_targets,
            padding=True,
            max_length=max_target_length,
            truncation=True,
            return_tensors="pt"
        )["input_ids"]
        batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)
        end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
        for idx, end_idx in enumerate(end_token_index):
            labels[idx][end_idx+1:] = -100
        batch_data['labels'] = labels
    return batch_data

train_dataloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=mycollate_fn)
valid_dataloader = DataLoader(valset, batch_size=32, shuffle=False, collate_fn=mycollate_fn)

In [5]:
#only optimize lora layers
optimizer = AdamW(filter(lambda p: p.requires_grad_, model.parameters()), lr=3e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))
progress_bar.set_description(f'loss: {0:>7f}')

total_loss = 0.0
losses = []

i = 1
model.train()
for epoch in range(num_epochs):
    t1 = time.time()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        losses.append(loss.item())
        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss/i:>7f}')
        progress_bar.update(1)
        i += 1
    t2 = time.time()
    print("epoch {}, training time {}".format(i, t2-t1))



  0%|          | 0/6320 [00:00<?, ?it/s]



epoch 6321, training time 1672.9888784885406


In [6]:
torch.save(model.state_dict(), "translate_peft_dora.pt")

In [4]:
model.load_state_dict(torch.load('translate_peft_dora.pt'))

<All keys matched successfully>

In [6]:
preds, labels = [], []
from sacrebleu import BLEU
import numpy as np
bleu = BLEU()

model.eval()
i = 50
for batch_data in tqdm(valid_dataloader):
    i -= 1
    batch_data = batch_data.to(device)
    with torch.no_grad():
        generated_tokens = model.generate(
            input_ids=batch_data["input_ids"],
            attention_mask=batch_data["attention_mask"],
            max_length=max_target_length,
        ).cpu().numpy()
    label_tokens = batch_data["labels"].cpu().numpy()

    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

    preds += [pred.strip() for pred in decoded_preds]
    labels += [[label.strip()] for label in decoded_labels]
    if i <= 0:
        break

for i in range(4):
    print(preds[i])
    print(labels[i])
bleu_score = bleu.corpus_score(preds, labels).score
print(f"BLEU: {bleu_score:>0.2f}\n")

  0%|          | 0/1580 [00:00<?, ?it/s]



Given India’s popular desire for regional peace and religious reconciliation, it is no surprise that the BJP, which represents Hinduism, has failed.
['As for the threat from the right, anyone hoping for peace in the region and reduced tensions within India between religious communities is relieved by the defeat of the Hindu nationalist Bharatiya Janata Party (BJP).']
Economic growth and increased energy demand would allow air pollution emissions to increase steadily, and rapidly increase the concentration of particulate matter (PM) and ozone over the next few decades, so that such an approach would not be desirable.
['With economic growth and rising energy demand set to fuel a steady rise in emissions of air pollutants and rapidly rising concentrations of particulate matter (PM) and ozone in the coming decades, this approach is untenable.']
The US could and should implement a policy that would allow real income to grow faster, but that would have to be discussed in the next column.
['T

In [7]:
with open("dora_pred.txt", 'w') as f:
    for i in preds:
        f.write(i+'\n')

with open("dora_label.txt", 'w') as f:
    for i in labels:
        f.write(i[0]+'\n')

In [5]:
def translate_sentence(model, tokenizer, sentence, device, max_target_length=128):
    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=max_target_length)
    print(inputs)
    # Move tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate translation using the model
    with torch.no_grad():
        generated_tokens = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_target_length,
        ).cpu().numpy()

    # Decode the generated tokens to a string
    decoded_translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    return decoded_translation

# Example usage
model.eval() # Make sure the model is in evaluation mode
sentence = ["你好，世界", 
            "你爱我我爱你，蜜雪冰城甜蜜蜜", 
            "人是生而自由的，却处处受到束缚。", 
            "诚实与勤勉应该成为你永久的伴侣。"] # Your Chinese sentence
translation = translate_sentence(model, tokenizer, sentence, device)
for tt in translation:
    print(tt)

{'input_ids': tensor([[ 5349,     2,   907,     0, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000],
        [  132, 28609, 41412,     2, 16351,  9128,  8677,  5257, 47772, 16351,
             0, 65000, 65000, 65000],
        [    7, 19558,  1592,   166,  1546,    11,     2,  2421,  1281,  1281,
          1018, 44624,     9,     0],
        [    7, 26628,    65, 12609, 37834,  1027,  1058,   146,  7727,    11,
         19679,     9,     0, 65000]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}
Hello, world.
You love me. I love you, honey.
Man is born free and bound everywhere.
Integrity and diligence should be your permanent companions.


In [9]:
plt.plot(list(range(len(losses))), losses)
plt.show()

NameError: name 'losses' is not defined