In [1]:
import torch
torch.cuda.empty_cache()


In [2]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from transformers import Trainer, GPTQConfig, deepspeed
from dataclasses import dataclass, field
from typing import Dict, Optional, List

#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/envs/pytorch/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /opt/conda/envs/pytorch/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)


In [3]:
torch.cuda.is_available()

True

In [4]:
@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="Qwen/Qwen-14B-Chat-Int4")

In [5]:
argmodel = ModelArguments()

In [6]:
config = transformers.AutoConfig.from_pretrained(
        argmodel.model_name_or_path,
        trust_remote_code=True,
    )
config.use_cache = False

In [7]:



#model = AutoModelForCausalLM.from_pretrained(
#    argmodel.model_name_or_path,
#    pad_token_id= tokenizer.pad_token_id,
#    device_map ="auto",
#    trust_remote_code=True,
#    quantization_config=bnb_config)

model = transformers.AutoModelForCausalLM.from_pretrained(
        argmodel.model_name_or_path,
        config=config,
        device_map ="cuda",
        trust_remote_code=True,
        quantization_config=GPTQConfig(
            bits=4, disable_exllama=True
        )
    )


You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
CUDA extension not installed.
CUDA extension not installed.
Try importing flash-attention for faster inference...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    argmodel.model_name_or_path,
    model_max_length=2048,
    padding_side="right",
    use_fast=False,
    trust_remote_code=True,
)
tokenizer.pad_token_id = tokenizer.eod_id


In [9]:
def print_trainable_parameters(model):
  """
  Prints the number of trainable parameters in the model.
  """
  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
      trainable_params += param.numel()
  print(
      f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
)

In [10]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [11]:
@dataclass
class LoraArguments:
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: List[str] = field(
        default_factory=lambda: ["c_attn", "c_proj", "w1", "w2"]
    )
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False
    task_type="CAUSAL_LM",



In [12]:
arg = LoraArguments()

config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=arg.lora_target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    modules_to_save = ["wte", "lm_head"]
)


model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1780285440 || all params: 3337835520 || trainables%: 53.33652390396996


In [13]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1

#generation_config.pad_token_id = tokenizer.eos_token_id
#generation_config.eos_token_id = "<|endoftext|>"

In [14]:
#check device type
device = model.device
device

device(type='cuda', index=0)

In [15]:
device = model.device
def single_out(logits):
    temperature = 0.9
    print(logits['logits'].shape)
    logits = logits['logits'][:, -1, :]
    logits = logits / temperature
    greedy = False

    
    if greedy:
        out = torch.argmax(logits, dim=1).reshape(-1, 1)
        return out

    # Initialize mask with ones
    mask = torch.ones_like(logits).bool()
    top_p  = 0.5
    top_k = 0
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=1), dim=1)
        sorted_mask = cumulative_probs > top_p
        # Ensure at least the most probable is included if sorted_mask contains all True 
        if sorted_mask.all():
            sorted_mask[..., :1] = 0
        to_scatter = sorted_mask.type_as(logits) * float('-inf')
        to_scatter[sorted_mask == 0] = logits.gather(1, sorted_indices)[sorted_mask == 0]
        logits.scatter_(1, sorted_indices, to_scatter)
    elif top_k > 0:
        top_k = min(top_k, logits.shape[1])            
        values, _ = torch.topk(logits, top_k)
        # smallest allowed value
        kth_values = values[..., -1]
        logits = torch.where(logits < kth_values.unsqueeze(-1), torch.tensor(float('-inf')).type_as(logits), logits)


    probs = torch.softmax(logits, dim=1)
    m = torch.argmax(probs,dim=1)
    return m.reshape(-1, 1)

In [16]:
def gen_out(prompt):
    idx = tokenizer(prompt, return_tensors="pt").to(device)
    for _ in range(100):
        logits = model(**idx)
        next_token = single_out(logits)
        next_tokened = tokenizer.decode(next_token[0], skip_special_tokens=False)
        # print(idx.input_ids[0].shape)
        idx = tokenizer.decode(idx.input_ids[0], skip_special_tokens=False)
        
        idx = tokenizer(idx + next_tokened, return_tensors="pt",).to(device)

        if next_token.item() == tokenizer.eod_id:
             break
    return idx


# LOAD TRAINING DATA

In [81]:
traindata = load_dataset("csv", data_files="datasetV3.csv")

In [82]:
traindata

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'question', 'answers'],
        num_rows: 32609
    })
})

In [83]:
def fil_nan(text):
    if text == None:
        return " "
    else:
        return text

In [88]:
def generate_prompt(data_point):
  return f"""
  human: title->{fil_nan(data_point["title"])}\ncontext->{fil_nan(data_point["context"])}\nquestion->{fil_nan(data_point["question"])}\n
  assistant: {data_point["answers"]}
  """.strip()


def generate_and_tokenize_prompt(data_point):
  full_prompt = generate_prompt(data_point)
  tokenized_full_prompt = tokenizer(full_prompt)
  
  # Check if the tokenized prompt exceeds the maximum length
  if len(tokenized_full_prompt['input_ids']) > 2048:
    # Sliding window approach
    window_size = 2048
    stride = 100  # Adjust the stride value as needed
    num_windows = (len(tokenized_full_prompt['input_ids']) - window_size) // stride + 1
    
    # Create a list to store the windows
    windows = []
    
    # Generate the windows
    for i in range(num_windows):
      start_index = i * stride
      end_index = start_index + window_size
      window = {
        'input_ids': tokenized_full_prompt['input_ids'][start_index:end_index],
        'attention_mask': tokenized_full_prompt['attention_mask'][start_index:end_index]
      }
      windows.append(window)
    
    return windows
  else:
    return [tokenized_full_prompt]


In [85]:
traindata

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'question', 'answers'],
        num_rows: 32609
    })
})

In [89]:
list_sup = []
for i in range(32609):
    gen_lst = generate_and_tokenize_prompt(traindata["train"][i])
    for j in range(len(gen_lst)):
        list_sup.append(gen_lst[j])


In [90]:
texts = []
for i in list_sup:
    texts.append(tokenizer.decode(i["input_ids"]))

In [91]:
import pandas as pd

In [92]:
df = pd.DataFrame(texts, columns=["text"])

In [107]:
df.to_csv("datasetV4.csv", index=False)

In [120]:
truedata = load_dataset("csv", data_files="datasetV4.csv")

In [122]:
truedata['train'][0]['text']


'human: title->พัทธ์ธีรา ศรุติพงศ์โภคิน\ncontext->พัทธ์ธีรา ศรุติพงศ์โภคิน (เกิด 3 ธันวาคม พ.ศ. 2533) หรือชื่อเล่นว่า อร เป็นนักแสดงหญิงชาวไทย สำเร็จมัธยมศึกษาจากCatholic Cathedral College ประเทศนิวซีแลนด์ และปริญญาตรีจากRaffles International College สาขา Business Marketing\n\nเข้าสู่วงการตั้งแต่อายุ 6 ขวบ จากการแสดงละครเวทีกับ ครูชลประคัลภ์ จันทร์เรือง จากนั้นก็เล่นโฆษณาในวัยเด็ก 2- 3 ชิ้น และยังเคยแสดงช่วงละครสั้น ในรายการซุปเปอร์จิ๋ว ประมาณปี 2542\n\nปัจจุบันเป็นทั้ง นักแสดง , พิธีกร และ วีเจ อยู่ที่คลื่น เก็ท 102.5 Bangkok International Hits Music Station และยังเป็นพิธีกรให้กับช่อง ทรู มิวสิก\nquestion->พัทธ์ธีรา ศรุติพงศ์โภคิน เกิดวันที่เท่าไร\n\n  assistant: 3 ธันวาคม พ.ศ. 2533'

In [123]:
def gen_tok(datapoint):
    tok = tokenizer(datapoint['text'],truncation=True,padding='max_length', max_length=2048)
    return tok

In [124]:
traindata["train"] = truedata["train"].shuffle().map(gen_tok)

Map:   0%|          | 0/57957 [00:00<?, ? examples/s]

# Finetune the model

In [125]:
training_args = transformers.TrainingArguments(
      per_device_train_batch_size=3,
      gradient_accumulation_steps=10,
      num_train_epochs=20,
      learning_rate=2e-4,
      save_total_limit=3,
      logging_steps=5,
      output_dir="experiments",
      optim="paged_adamw_8bit",
      lr_scheduler_type="cosine",
      warmup_ratio=0.05,
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=traindata["train"],
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False 


In [126]:
trainer.train()

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


Step,Training Loss
1,2.0713
2,2.7905
3,2.1643
4,2.4451
5,2.8859
6,2.8865
7,2.4473
8,2.7676
9,2.2864
10,2.2747


KeyboardInterrupt: 

In [19]:
model.save_pretrained("trained-model")

In [6]:
config = PeftConfig.from_pretrained("trained-model")

In [7]:
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='Qwen/Qwen-14B-Chat-Int4', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=64, target_modules={'w1', 'w2', 'c_attn', 'c_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={})

In [8]:
base_model = transformers.AutoModelForCausalLM.from_pretrained(
        argmodel.model_name_or_path,
        config=config,
        device_map ="cuda",
        trust_remote_code=True,
        quantization_config=GPTQConfig(
            bits=4, disable_exllama=True
        )
    )

#base_model = prepare_model_for_kbit_training(base_model)

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
CUDA extension not installed.
CUDA extension not installed.
Try importing flash-attention for faster inference...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
not_base_model = transformers.AutoModelForCausalLM.from_pretrained(
        argmodel.model_name_or_path,
        config=config,
        device_map ="cuda",
        trust_remote_code=True,
    )

Try importing flash-attention for faster inference...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
model_to_merge = PeftModel.from_pretrained(model, "trained-model")

merged_model = model_to_merge.merge_and_unload()
merged_model.save_pretrained(merged_model)

ValueError: Cannot merge LORA layers when the model is gptq quantized

In [35]:
model_to_merge

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): QWenLMHeadModel(
          (transformer): QWenModel(
            (wte): ModulesToSaveWrapper(
              (original_module): Embedding(152064, 5120)
              (modules_to_save): ModuleDict(
                (default): Embedding(152064, 5120)
              )
            )
            (drop): Dropout(p=0.0, inplace=False)
            (rotary_emb): RotaryEmbedding()
            (h): ModuleList(
              (0-39): 40 x QWenBlock(
                (ln_1): RMSNorm()
                (attn): QWenAttention(
                  (core_attention_flash): FlashSelfAttention()
                  (attn_dropout): Dropout(p=0.0, inplace=False)
                  (c_attn): QuantLinear(
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
             

In [29]:
model.base_model_torch_dtype

torch.float16

In [30]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): QWenLMHeadModel(
      (transformer): QWenModel(
        (wte): ModulesToSaveWrapper(
          (original_module): Embedding(152064, 5120)
          (modules_to_save): ModuleDict(
            (default): Embedding(152064, 5120)
          )
        )
        (drop): Dropout(p=0.0, inplace=False)
        (rotary_emb): RotaryEmbedding()
        (h): ModuleList(
          (0-39): 40 x QWenBlock(
            (ln_1): RMSNorm()
            (attn): QWenAttention(
              (core_attention_flash): FlashSelfAttention()
              (attn_dropout): Dropout(p=0.0, inplace=False)
              (c_attn): QuantLinear(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (defau