## QWEN-2 7B Instruct LLM
- Developed by Alibaba Group: https://arxiv.org/abs/2309.16609
- Available in four sizes: 0.5B, 1.5B, 7B, and 72B parameters
- The 72B model outperforms current open-source LLMs such as Llama-70B

### Setup the model and tokenizer

In [None]:
import peft, accelerate, loralib
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import wandb
import torch
import  bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
import numpy as np

In [None]:
# Tokenizer and model are both pulled from the same checkpoint id.
base_checkpoint = "Qwen/Qwen2-7B"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Request 4-bit loading; every other quantization option keeps its default.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
orig_model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint,
    device_map="auto",
    quantization_config=quantization_config,
)

In [None]:
# Bare expression: the notebook renders the model's repr as the cell output.
orig_model # Looking at the model parameters and architecture

In [None]:
# Freeze every parameter currently on the model so none of them receives
# gradients.
for weight in orig_model.parameters():
    weight.requires_grad = False
    is_one_dimensional = weight.ndim == 1
    if is_one_dimensional:
        # 1-D tensors (presumably norm weights and biases -- verify against
        # the model) are stored in half precision.
        weight.data = weight.data.half()

# Turn on gradient checkpointing, then make the inputs require grad.
orig_model.gradient_checkpointing_enable()
orig_model.enable_input_require_grads()

In [None]:
class CastOutputToFloat(torch.nn.Sequential):
    """Sequential container whose output is always cast to ``torch.float16``.

    Note that, despite the class name, the cast target is half precision
    rather than float32.
    """

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as float16."""
        result = super().forward(x)
        return result.half()

# Replace the LM head with a wrapper around the original head, so the head's
# output goes through CastOutputToFloat.forward.
orig_model.lm_head = CastOutputToFloat(orig_model.lm_head) # Cast the last layer's output to float16

In [None]:
## Setting the Lora Adapters
def print_trainable_params(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()``
            and ``requires_grad``.

    Prints the trainable count, the total count, and the trainable share as
    a percentage. A model with no parameters reports ``0.0`` percent instead
    of raising ``ZeroDivisionError``.

    NOTE(review): counts come straight from ``numel()``; for quantized
    weights that may be the packed storage size rather than the logical
    parameter count -- confirm before quoting these numbers.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_params += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio so an empty model does not divide by zero.
    percent = (trainable_params / all_params) * 100 if all_params else 0.0
    print(f"Trainable Param: {trainable_params}")
    print(f"All Params: {all_params}\n% Trainable: {percent}")

In [None]:
# Names of the sub-modules that receive a LoRA adapter.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Rank-16 adapters with alpha 16, no dropout and no bias terms.
loraConfig = peft.LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the prepared base model and report the trainable share.
model = peft.get_peft_model(orig_model, loraConfig)
print_trainable_params(model)

### Load the datasets and modify them

In [None]:
import json
import os
from datasets import load_dataset
import re

In [None]:
# Read the three raw splits. The encoding is pinned to UTF-8 so parsing does
# not depend on the platform's default locale encoding.
with open("../data/train.json", encoding="utf-8") as f:
    train = json.load(f)
with open("../data/val.json", encoding="utf-8") as f:
    val = json.load(f)
with open("../data/test.json", encoding="utf-8") as f:
    test = json.load(f)

In [None]:
# Group 1 captures the text between "\n### Question: " and "\n### Response".
pattern_q = r"\n### Question: (.*?)\n### Response"
# Group 1 captures the text between "### System Prompt: " and "\n### Question:".
pattern_sp = r"### System Prompt: (.*?)\n### Question:"

In [None]:
# Reshape every record in place from {"prompt", "dsql", ...} into
# {"output", "text", "input", "instruction", ...}:
#   * "output"      <- the former "dsql" value
#   * "text"        <- empty placeholder
#   * "input"       <- question extracted from the prompt
#   * "instruction" <- system prompt extracted from the prompt
# "input" and "instruction" are always set (empty string when the prompt does
# not contain the expected section) so every record has the same keys.
for split in (train, val, test):
    for record in split:
        # Extend the system prompt with the clause ordering before extraction.
        prompt = record.pop("prompt").replace("closer to natural language", "closer to natural language. The ordering format is as follows: LIMIT -> UNION/INTERSECT/EXCEPT -> WHERE/HAVING -> ORDER/GROUP BY -> PROJECT -> SELECT -> FROM -> JOIN")
        record["output"] = record.pop("dsql")
        record["text"] = ""

        # DOTALL lets the captured sections span multiple lines.
        question_match = re.search(pattern_q, prompt, re.DOTALL)
        record["input"] = question_match.group(1) if question_match else ""

        system_match = re.search(pattern_sp, prompt, re.DOTALL)
        record["instruction"] = system_match.group(1) if system_match else ""

In [None]:
# Serialize each reshaped split to its own JSON file under ./datasets.
with open('./datasets/train.json', 'w') as sink:
    sink.write(json.dumps(train))

with open('./datasets/val.json', 'w') as sink:
    sink.write(json.dumps(val))

with open('./datasets/test.json', 'w') as sink:
    sink.write(json.dumps(test))

In [None]:
# Map split names to the JSON files, then load both splits in a single call.
train_val_files = {'train': "datasets/train.json", 'validation': "datasets/val.json"}
data_X = load_dataset("json", data_files=train_val_files)

In [None]:
# Bare expression: the notebook renders the loaded dataset object.
data_X

In [None]:
# Bare expression: show the first reshaped training record as a spot check.
train[0]

In [None]:
# Prompt template with three positional `{}` slots; `format_prompts` fills
# them in order with the instruction, the input, and the output.
main_prompt = """Below is an instruction that describes a task, paired with an input that provides further context for the given output.

### Instruction:
{}

### Input:
{}

### Output:
{}
"""

# The tokenizer's end-of-sequence string, appended to each formatted example.
EOS_TOKEN = tokenizer.eos_token

In [None]:
def format_prompts(examples):
    """Render a batch of examples into full prompt strings.

    Args:
        examples: Batch mapping with parallel ``"instruction"``, ``"input"``
            and ``"output"`` sequences.

    Returns:
        A dict with a single ``"text"`` key holding one string per example:
        ``main_prompt`` filled with that example's instruction, input and
        output, followed by ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [main_prompt.format(*row) + EOS_TOKEN for row in rows]
    return {"text": rendered}

In [None]:
# Two batched passes: build the "text" column, then tokenize that column.
data_X = data_X.map(format_prompts, batched=True).map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
)

In [None]:
# Bare expression: render the dataset again, now after the two map passes.
data_X # Includes both train and validation datasets

In [None]:
# Drop the in-memory Python lists; the splits were already written to
# ./datasets and are used from there from this point on.
del train, val, test

### Training the model

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import wandb
from random_word import RandomWords

# A random word is used as the run-name suffix so separate runs get different names.
r = RandomWords()
run_name = r.get_random_word()

# Weights & Biases settings are read from the environment by the integration.
os.environ["WANDB_PROJECT"] = "qwen-7B-finetune"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=data_X["train"],
    eval_dataset=data_X["validation"],
    args=TrainingArguments(
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=1,
        eval_strategy="steps",
        # Same cadence as logging_steps below, now stated explicitly.
        eval_steps=0.5,
        # load_best_model_at_end needs checkpoints written at the evaluation
        # points. With the default save_steps=500 and max_steps=60, no
        # checkpoint was ever saved, so there was nothing to reload (or to
        # upload through WANDB_LOG_MODEL).
        save_strategy="steps",
        save_steps=0.5,
        # NOTE: max_steps takes precedence over num_train_epochs, so the run
        # stops after 60 optimizer steps.
        num_train_epochs=3,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        weight_decay=1e-2,
        lr_scheduler_type="linear",
        seed=42,
        fp16=True,
        optim="adamw_8bit",
        # A float below 1 is a fraction of the total training steps.
        logging_steps=0.5,
        output_dir='outputs',
        report_to="wandb",
        run_name=f"finetune-lora-{run_name}",
        load_best_model_at_end=True,
    ),
    # mlm=False selects causal-LM collation instead of masked-LM.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Disable the generation cache for training.
model.config.use_cache = False

In [None]:
# Run the fine-tuning loop with the arguments configured on `trainer`.
trainer.train()

In [None]:
# The held-out split is loaded on its own, separately from train/validation.
test_files = {'test': "datasets/test.json"}
data_X_test = load_dataset("json", data_files=test_files)

In [None]:
# Same two batched passes as for training: format the prompts, then tokenize.
data_X_test = data_X_test.map(format_prompts, batched=True).map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
)

In [None]:
# Keep only the first three test examples.
sample_test = data_X_test["test"].select(range(3))

In [None]:
# Run prediction on the sampled test examples and keep the returned output object.
output_t = trainer.predict(sample_test)

In [None]:
# Print the model's predictions next to the ground truth.
#
# The previous version decoded `output_t.label_ids`, i.e. the reference
# tokens, and labelled them "LLM Output". The text is now decoded from the
# arg-max of the predicted logits. These are teacher-forced next-token
# predictions over the full formatted text, not free-running generation.
logits = output_t.predictions
if isinstance(logits, tuple):
    # Some models return extra outputs; the logits come first.
    logits = logits[0]

for i, (row_logits, row_labels) in enumerate(zip(logits, output_t.label_ids)):
    # Logits at position t score the token at position t + 1, so drop the
    # last logit row and the first label to line the two up.
    predicted_ids = np.argmax(row_logits[:-1], axis=-1)
    target_ids = row_labels[1:]
    # Positions whose label is -100 are ignored/padded; show them as pad tokens.
    preds = np.where(target_ids != -100, predicted_ids, tokenizer.pad_token_id)
    print("###############\nLLM Output:", tokenizer.decode(preds))
    print("###############\nGT ===>",sample_test[i]["output"], "\n\n")