# Vicuna LLaMa 13B LoRa

In [1]:
# To avoid CUDA OOM error when getting state_dict
# !pip uninstall bitsandbytes
# !pip install bitsandbytes==0.37.2

import os
import sys
import gc
import re
import random
import warnings
import pickle
from tqdm.auto import tqdm
from typing import Tuple

from sklearn.model_selection import train_test_split
import numpy as np
import pymorphy2

import torch
import transformers
from datasets import load_dataset
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
from transformers import TrainerCallback
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, PeftConfig, PeftModel, AdaLoraConfig, TaskType

os.environ['WANDB_NOTEBOOK_NAME'] = "Vicuna_13B_CHGK.ipynb"


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/maksim/gitrepo/vicuna_lora/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [17]:
def get_model(source_model_name_or_path: str):
    """
    Returns a transformers model and tokenizer
    """
    model = LlamaForCausalLM.from_pretrained(
        source_model_name_or_path,
        load_in_8bit=True,
        device_map="auto",
    )
    tokenizer = LlamaTokenizer.from_pretrained(
        source_model_name_or_path,
        use_fast=False
    )
    tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

    return model, tokenizer

def generate_prompt(data_point):
    # result = f"""### Вопрос: {data_point["Вопрос"]} ### """
    # if data_point.get("Комментарий") is not None:
    #     result += f"""Комментарий: {data_point["Комментарий"]};"""
    # result += f"""Ответ: {data_point["Ответ"][:45]}"""
    return f"""Ответь на вопрос викторины. Вопрос: {data_point["Question"]} Ответ: {data_point["Answer"][:45]}"""
    
def generate_prompt_infer(data_point):
    return f"""Ответь на вопрос викторины. Вопрос:{data_point["Question"]} Ответ: """

def generate(quest, model, tokenizer, temperature=0.0, top_p=0.9, repetition_penalty=1.4, max_new_tokens=64, cutoff_len=512):
    inputs = tokenizer(
        generate_prompt_infer(quest),
        return_tensors="pt",
        truncation=True,
        max_length=cutoff_len,
    )
    input_ids = inputs["input_ids"].cuda()

    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    model.eval()
    model.config.use_cache = True
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens
        )
    output = tokenizer.decode(generation_output.sequences[0], skip_special_tokens=True)
    output = output.split(' Ответ: ')
    if len(output) > 1:
        return output[1]
    else:
        return ""
    
def generate_beams(quest, model, tokenizer, temperature=0.5, top_p=0.9, repetition_penalty=1.3, max_new_tokens=64, cutoff_len=512):
    inputs = tokenizer(
        generate_prompt_infer(quest),
        return_tensors="pt",
        truncation=True,
        max_length=cutoff_len,
    )
    input_ids = inputs["input_ids"].cuda()

    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        sampling=True,
        repetition_penalty=repetition_penalty,
        num_beams=6,
        num_beam_groups=2,
        min_length=2,
        use_cache=True,
        # diversity_penalty=0.17,
        # encoder_repetition_penalty=1.5
    )
    model.eval()
    model.config.use_cache = True
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            # no_repeat_ngram_size=2,
            remove_invalid_values=True,
        )
    output = tokenizer.decode(generation_output.sequences[0], skip_special_tokens=True)
    output = output.split(' Ответ: ')
    if len(output) > 1:
        return output[1]
    else:
        return ""

def print_dict(data: dict):
    if data is not None:
        for key, value in data.items():
            print(f'{key}: {value}')

def is_acceptable_quiz(quiz: dict) -> bool:
    if len(re.findall("\(pic: \d*.\w{1,3}\)", quiz['Question'])) > 0:
        return False
    if len(re.findall("<раздатка>", quiz['Question'])) > 0:
        return False
    return True


## Fine Tuning

In [15]:
SOURCE_MODEL = "vicuna-13b/"

MICRO_BATCH_SIZE = 6  
BATCH_SIZE = 256
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 1  # paper uses 3
LEARNING_RATE = 2e-4  # from the original paper with batch size 128 lr=2e-5
WARMUP_STEPS = 10
CUTOFF_LEN = 256  
CUTOFF_LEN_TEST = 160
MAX_ANSWER_LENGTH = 25
LORA_R = 128
LORA_ALPHA = 24
LORA_DROPOUT = 0.0
TEST_SIZE = 0.004
DEFAULT_LORA_0 = "checkpoints/vicuna-13b_LoRA_default_0"
SAVED_LORA = "checkpoints/vicuna-13b_checkpoint_LoRA_0.04504"
INPUT_DIR = 'chgk_baza'
DO_COMPILE = False

peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj", "v_proj", "o_proj", "gate_proj", "down_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="lora_only",
    task_type="CAUSAL_LM",
)


In [4]:
model, tokenizer = get_model(SOURCE_MODEL)
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, peft_config)
model.save_pretrained(DEFAULT_LORA_0)
# model.load_adapter(model_id=SAVED_LORA, adapter_name='default')



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
with open(os.path.join(INPUT_DIR, 'quest_clean_list.pkl'), 'rb') as f:
    quest_clean_list = pickle.load(f)

# Train test split
data_train_raw, data_test_raw = train_test_split(quest_clean_list, test_size=TEST_SIZE, random_state=42)
print(f"Train: {len(data_train_raw)}, Test:{len(data_test_raw)}", "\n")

raw_data = dict()
data = dict()
train_data = []
test_data = []

for quiz in tqdm(data_train_raw):
    text = generate_prompt(quiz)
    tokens = tokenizer(text, truncation=False)
    if len(tokens["input_ids"]) <= CUTOFF_LEN:
        train_data.append(tokenizer(
            text,
            truncation=True,
            max_length=CUTOFF_LEN,
            padding="max_length",
        ))
for quiz in tqdm(data_test_raw):
    text = generate_prompt_infer(quiz)
    tokens = tokenizer(text, truncation=False)
    if len(tokens["input_ids"]) <= CUTOFF_LEN_TEST:
        test_data.append(tokenizer(
            text,
            truncation=True,
            max_length=CUTOFF_LEN_TEST,
            padding="max_length",
        ))

data['train'] = train_data
data['test'] = test_data
raw_data['test'] = data_test_raw
raw_data['train'] = data_train_raw
del train_data, test_data, data_train_raw, data_test_raw

print(f"Train: {len(data['train'])}, Test:{len(data['test'])}")

Train: 290449, Test:1167 



  0%|          | 0/290449 [00:00<?, ?it/s]

  0%|          | 0/1167 [00:00<?, ?it/s]

Train: 290449, Test:1014


In [7]:
morph = pymorphy2.MorphAnalyzer()

def preprocess_metrics(text: str) -> str:
    replace_list = ["\.", ",", "\?", "!", "\n", '"', "/", ";", ":", "1\)", "2\)", "3\)", "4\)", "\(", "-", "\)"]
    text = re.sub('|'.join(replace_list), ' ', text)
    text = text.lower()
    text = ' '.join([morph.parse(x)[0].normal_form for x in text.split()])
    return text

def compute_metrics():
    model.save_pretrained(SAVED_LORA)
    model.eval()
    gc.collect()
    torch.cuda.empty_cache()
    model.config.use_cache = True
    predictions = []
    references = []
    for quest in tqdm(raw_data['test']):
        predictions.append(generate(quest, model, tokenizer, repetition_penalty=1.2, max_new_tokens=MAX_ANSWER_LENGTH))
        references.append(quest["Answer"][:45])
    references = [preprocess_metrics(x) for x in references]
    predictions = [preprocess_metrics(x) for x in predictions]
    model.train()
    model.config.use_cache = False

    f2_list = []
    pr_list = []
    rec_list = []

    for pred, ref in zip(predictions, references):
        ref_set = set(ref.split())
        pred_set = set(pred.split())
        intersect = ref_set & pred_set
        pr = len(intersect) / (len(pred_set) + 1e-5)
        rec = len(intersect) / (len(ref_set) + 1e-5)
        f2 = 5 * pr * rec / (4 * pr + rec + 1e-5)
        f2_list.append(f2)
        pr_list.append(pr)
        rec_list.append(rec)

    precision = np.array(pr_list).mean()
    recall = np.array(rec_list).mean()
    f2 = np.array(f2_list).mean()

    del predictions, references
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Presision={precision:.5f}\nRecall={recall:.5f}\nF2={f2:.5f}")
    return {'precision': precision, 'recall': recall, 'F2':f2}

def calc_metrics(predictions: list, references: list, print_best=False) -> Tuple[float, float, float]:
    f2_list = []
    pr_list = []
    rec_list = []

    for pred, ref in zip(predictions, references):
        ref_set = set(preprocess_metrics(ref).split())
        pred_set = set(preprocess_metrics(pred).split())
        intersect = ref_set & pred_set
        pr = len(intersect) / (len(pred_set) + 1e-5)
        rec = len(intersect) / (len(ref_set) + 1e-5)
        f2 = 5 * pr * rec / (4 * pr + rec + 1e-5)
        f2_list.append(f2)
        pr_list.append(pr)
        rec_list.append(rec)
        if print_best and pr > 0:
            print()
            print(ref)
            print("GPT finetuned:", pred, '\n')

    precision = np.array(pr_list).mean()
    recall = np.array(rec_list).mean()
    f2 = np.array(f2_list).mean()
    return precision, recall, f2

In [7]:
class MetricsCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):

        model.save_pretrained(SAVED_LORA)

        model.eval()
        gc.collect()
        torch.cuda.empty_cache()
        model.config.use_cache = True
        predictions = []
        references = []
        for quest in tqdm(raw_data['test']):
            predictions.append(generate(quest, model, tokenizer, repetition_penalty=1.4, max_new_tokens=MAX_ANSWER_LENGTH))
            references.append(quest["Answer"][:45])
        model.train()
        model.config.use_cache = False

        precision, recall, f2 = calc_metrics(predictions, references, print_best=False)

        del predictions, references
        gc.collect()
        torch.cuda.empty_cache()
        print(f"Presision={precision:.5f}\nRecall={recall:.5f}\nF2={f2:.5f}")
    

trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    callbacks=[MetricsCallback()],
    # compute_metrics=compute_metrics,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=WARMUP_STEPS,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        bf16=True,
        seed=44,
        dataloader_num_workers=2,
        logging_steps=1,
        evaluation_strategy="steps",
        eval_accumulation_steps=1,
        eval_steps=50, # 50
        save_strategy="steps",
        save_steps=100,
        save_safetensors=True,
        output_dir="lora-alpaca",
        save_total_limit=2,        
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
model.print_trainable_parameters()
model.gradient_checkpointing_enable()
if DO_COMPILE and torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)
    print("Model compiled!")


trainable params: 351272960 || all params: 13367137280 || trainable%: 2.6278847343445553


In [15]:
warnings.filterwarnings(action="ignore", message="MatMul8bitLt: inputs will be cast from")
warnings.filterwarnings(action="ignore", message="This implementation of AdamW")
trainer.train(resume_from_checkpoint='lora-alpaca/checkpoint-200')

model.save_pretrained(SAVED_LORA)

# Presision=0.02992
# Recall=0.03854
# F2=0.03434

[34m[1mwandb[0m: Currently logged in as: [33mpolushinm[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1071 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


In [23]:
# model.save_pretrained('checkpoints/vicuna-13b_checkpoint_LoRA_chk200')

In [8]:
model.load_adapter(model_id="checkpoints/vicuna-13b_checkpoint_LoRA_chk200", adapter_name='default')

In [9]:
# model.load_adapter(model_id=SAVED_LORA, adapter_name='default')
# model.tie_weights()
model.state_dict()['base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight']

tensor([[ 1.9868e-03,  1.2566e-04,  8.0444e-03,  ...,  7.2476e-04,
         -2.1179e-03, -4.8381e-03],
        [ 3.3036e-03,  4.3476e-03,  1.4770e-03,  ..., -4.0143e-03,
          6.8942e-03, -5.1013e-04],
        [ 8.6272e-03,  9.3042e-03,  2.8657e-03,  ..., -6.9144e-03,
          1.4964e-02, -9.9965e-05],
        ...,
        [ 2.6522e-03,  4.5705e-03,  5.5359e-03,  ..., -5.0041e-03,
          9.6899e-03, -2.1749e-03],
        [ 1.4776e-02,  1.5573e-02,  1.2891e-02,  ..., -6.4296e-03,
          1.4631e-02, -8.8199e-03],
        [-6.2615e-03, -8.4163e-03, -1.1182e-02,  ...,  5.6907e-03,
         -6.5634e-03,  5.8730e-03]], device='cuda:0')

## Generation

In [11]:
# model, tokenizer = get_model(SOURCE_MODEL)
# model = get_peft_model(model, peft_config)

In [18]:
f2_list = []
pr_list = []
rec_list = []
print()

def postprocess(text):
    output = text.split(' Тур: ')[0]
    output = re.split('\d{1,3}/\d{1,3}', output)[0]
    return output

model.config.use_cache = True

for quest in tqdm(raw_data['test']):
    # model.load_adapter(model_id=DEFAULT_LORA_0, adapter_name='default')
    # response = generate(quest, model, tokenizer, repetition_penalty=1.4)
    # print("GPT vicuna:", response, '\n')

    # model.load_adapter(model_id=SAVED_LORA, adapter_name='default')
    response = postprocess(generate_beams(quest, model, tokenizer, max_new_tokens=MAX_ANSWER_LENGTH, cutoff_len=CUTOFF_LEN_TEST))

    precision, recall, f2 = calc_metrics([response, ], [quest["Answer"], ])

    f2_list.append(f2)
    pr_list.append(precision)
    rec_list.append(recall)

    if recall > 0:
        print()
        print_dict(quest)
        print("GPT finetuned:", response)
        print()

precision = np.array(pr_list).mean()
recall = np.array(rec_list).mean()
f2 = np.array(f2_list).mean()
print(f"Precision={precision:.5f}\nRecall={recall:.5f}\nF2={f2:.5f}")




  0%|          | 0/1167 [00:00<?, ?it/s]


Question: ТЯЖЕЛАЯ ТЕМА  1. Так расшифровывается аббревиатура КВ в названии советского тяжелого танка  2. Так называется вода, в молекулу которой вместо протия входит дейтерий  3. Название этого химического элемента происходит от греческого слова "тяжелый"  4. Борис Громов сменил на посту губернатора Московской области именно его  5. Этот боксер снялся в главной роли в художественном фильме "Тяжелые перчатки"
Answer: 1. Клим Ворошилов  2. Тяжелая вода  3. Барий  4. Анатолий Тяжлов  5. Ласло Папп
GPT finetuned: 1. КВ (Клим Ворошилов)  2. Деутерий  3


Question: [Ведущему: неявно выделить голосом слово "определенный".]  Тренер "Арсенала" Герберт Чэпмен хотел, чтобы клуб был всюду на первом месте. Поэтому в определенный момент отдал распоряжение навсегда убрать ЭТО. Назовите ЭТО.
Answer: Определенный артикль The в названии клуба.  Определенный артикль, Артикль The, The.
Comment: Для того чтобы "The Arsenal" находился на первом месте не только по результатам, но и по алфавиту, тренер велел

In [8]:
quest = raw_data['test'][521]
response = generate_beams(quest, model, tokenizer, max_new_tokens=20, cutoff_len=CUTOFF_LEN_TEST)

print()
print_dict(quest)
print("GPT finetuned:", response, '\n')

In [13]:
print(precision)
print(recall)
print(f2)

0.04473945477158799
0.053768498611859804
0.04776260601520403


In [44]:
for i in range(100):
    print(i)
    print_dict(raw_data['test'][i])
    print()

0
Question: В статье из газеты "Секретные материалы 20 века", посвященной адмиралу Макарову, рассказывается, в частности, о том, что еще в 1892 г. Макаров выдвинул идею снабдить головки снарядов бронебойными наконечниками. Реализация идеи в металле потребовала много времени и усилий для разрешения технических трудностей, но Макаров не отступился и добился успеха. Однако в серию колпачки не пошли. Уже из Порт-Артура, готовясь к генеральному сражению с японским флотом, адмирал затребовал для тихоокеанской эскадры два вагона колпачков, но не получил. Автор статьи замечает, что одной из причин поражения русского флота стало то, что Макарову не удалось пробить броню... Какой машины - ответьте одним словом.
Answer: Бюрократической.

1
Question: Никулин вспоминал: "Я не понимал, почему вокруг его имени такой бум. А спустя три года я был восхищен". Сам он рассказывал: "В чем только не обвиняют меня, в чем только не подозревают! Я выхожу, чтобы говорить. Я не пищу, я не кричу, я молчу". Назовит

In [9]:
inputs = tokenizer(
        generate_prompt_infer(quest),
        return_tensors="pt",
        # truncation=True,
        # max_length=CUTOFF_LEN_TEST,
    )
inputs["input_ids"].size(1)

In [7]:
len(inputs['input_ids'][0])