In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    GPTQConfig,
    BitsAndBytesConfig,
    TrainingArguments
)
from datasets import load_dataset, Dataset
from peft import (
    prepare_model_for_kbit_training,
    get_peft_model,
    LoraConfig,
    AutoPeftModelForCausalLM
)
import json
import copy
from trl import SFTTrainer
import wandb
import os

In [None]:
# Authenticate this session with Weights & Biases.
wandb.login()

# W&B project name; it is exported through the environment only when non-empty.
wandb_project = 'ToraFT'
if wandb_project:
  os.environ['WANDB_PROJECT'] = wandb_project

In [None]:
# Interactive Hugging Face Hub login through the CLI (shell escape via `!`).
!huggingface-cli login

PREPARE CONFIG

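Prompt (original plan): System_prompt Câu hỏi: Q1 Đáp án: A1 Giải thích: E1 (i.e. system prompt, then Question / Answer / Explanation). Note: the `preprocess` cell currently leaves the system prompt empty and instead builds `<|user|>` question + choices, then `<|assistant|>` explanation followed by `Answer:`.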

In [None]:
class Config:
  """Constants shared by the model, LoRA, TrainingArguments and SFTTrainer cells.

  The attributes are plain class-level values; nothing is computed here.
  """

  # --- Base checkpoint and output location ---------------------------------
  MODEL_ID = 'llm-agents/tora-13b-v1.0'
  REVISION = 'main'
  OUTPUT_DIR = 'ToraZaloFT'
  PUSH_TO_HUB = False

  # --- Batch size and optimiser --------------------------------------------
  PER_DEVICE_TRAIN_BATCH_SIZE = 8
  GRADIENT_ACCUMULATION_STEPS = 32  # 8 * 32 = 256 samples per optimizer step per device
  OPTIM = 'paged_adamw_32bit'  # original note: "8or32" (8-bit or 32-bit variant)
  LEARNING_RATE = 2e-4
  # NOTE(review): a plain 'constant' schedule may not apply WARMUP_STEPS
  # ('constant_with_warmup' exists for that purpose) - confirm the intent.
  LR_SCHEDULER_TYPE = 'constant'
  WARMUP_STEPS = 5
  FP16 = True
  NEFTUNE_NOISE_ALPHA = 5

  # --- Run length ----------------------------------------------------------
  # NOTE(review): both values are passed to TrainingArguments; its documentation
  # says a positive max_steps overrides num_train_epochs - confirm which is meant.
  MAX_STEPS = 160
  NUM_TRAIN_EPOCHS = 2

  # --- Logging, checkpointing and evaluation cadence -----------------------
  LOGGING_DIR = './logs'
  LOGGING_STEPS = 32
  REPORT_TO = 'wandb'
  SAVE_STRATEGY = 'steps'
  SAVE_STEPS = 32
  DO_EVAL = True
  EVALUATION_STRATEGY = 'steps'
  EVAL_STEPS = 32

  # --- SFT dataset handling ------------------------------------------------
  DATASET_TEXT_FIELD = 'content'
  MAX_SEQ_LENGTH = 4096
  PACKING = False

  # --- LoRA adapter --------------------------------------------------------
  R = 128
  LORA_ALPHA = 256
  LORA_DROPOUT = 0.05
  TARGET_MODULES = [
      'q_proj', 'v_proj', 'o_proj', 'k_proj',
      'down_proj', 'gate_proj', 'up_proj',
      'lm_head',
  ]
  BIAS = 'none'
  TASK_TYPE = 'CAUSAL_LM'



PREPARE DATASET

In [None]:
train_file_path = './data/ztrain/mTrain.json'
val_file_path = './data/ztrain/mVal.json'

# Fields copied from every record into the column-oriented dicts built below.
RECORD_FIELDS = ('question', 'choices', 'answer', 'explanation', 'id')
# Answer letters tallied for the class-balance report.
ANSWER_LETTERS = ('A', 'B', 'C', 'D')


def _load_records(path):
    """Return the list stored under the top-level 'data' key of a JSON file.

    The file is opened with an explicit UTF-8 encoding so that non-ASCII text
    (the dataset is expected to be Vietnamese) decodes the same way on every
    platform instead of depending on the locale default.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the document has no 'data' key.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)['data']


def _to_columns(records):
    """Pivot a list of record dicts into a dict of equally long column lists.

    Raises:
        KeyError: if any record lacks one of RECORD_FIELDS.
    """
    return {field: [record[field] for record in records] for field in RECORD_FIELDS}


def _count_answers(records):
    """Tally records per answer letter.

    A record is counted for the first letter (checked in A, B, C, D order)
    whose 'X.' marker occurs anywhere in its 'answer' value; records matching
    none of the markers are not counted.
    """
    counts = dict.fromkeys(ANSWER_LETTERS, 0)
    for record in records:
        for letter in ANSWER_LETTERS:
            if f'{letter}.' in record['answer']:
                counts[letter] += 1
                break
    return counts


def _format_counts(title, counts):
    """Build the two-line balance report, e.g. 'Title:' then 'A: 1 B: 2 C: 3 D: 4'."""
    summary = " ".join(f"{letter}: {counts[letter]}" for letter in ANSWER_LETTERS)
    return f"{title}:\n{summary}"


# Read both files up front so a missing file fails before anything is printed.
train_data = _load_records(train_file_path)
val_data = _load_records(val_file_path)

len_train_data = len(train_data)
print(f"Length train dataset: {len_train_data}")
len_val_data = len(val_data)
print(f"Length validation dataset: {len_val_data}")

zalo_train_data = _to_columns(train_data)
train_answer_counts = _count_answers(train_data)
print(_format_counts("Training Data", train_answer_counts))

zalo_val_data = _to_columns(val_data)
val_answer_counts = _count_answers(val_data)
print(_format_counts("Validation Data", val_answer_counts))

# Legacy names: as before, they end up holding the validation-set tallies.
countA, countB, countC, countD = (val_answer_counts[letter] for letter in ANSWER_LETTERS)
# zalo_train_data / zalo_val_data now map each field name to a list of values.


In [None]:
# The system prompt is currently disabled (empty string) and is not used by
# preprocess. The earlier draft, written in Vietnamese, translates to:
#   "You are an assistant who helps me solve the following math problems. There
#   will be 4 options A, B, C, D. Take a deep breath, then solve step by step and
#   choose one of the 4 options A, B, C, D."
system_prompt = ""


def preprocess(samples):
  r"""Render one multiple-choice record as a single chat-style training string.

  Args:
    samples: mapping with 'question', 'choices' (a list of strings),
      'explanation' and 'answer' entries. Despite the plural name it is
      indexed as one record, not a batch.

  Returns:
    A dict with the single key 'content' whose value is
    "<|user|>\n{question}\n{choices joined by spaces}\n<|assistant|>\n{explanation}\nAnswer:{answer}".
  """
  user_header = f"<|user|>\n{samples['question']}\n"
  options_line = " ".join(samples['choices'].copy())
  assistant_turn = f"<|assistant|>\n{samples['explanation']}\nAnswer:{samples['answer']}"
  return {'content': f"{user_header}{options_line}\n{assistant_turn}"}
def _as_text_dataset(columns, split_name):
  """Wrap column lists in a Dataset, render rows with preprocess, then shuffle.

  Every original column is removed by the map call, so only what preprocess
  returns remains. The shuffle uses the fixed seed 100.
  """
  raw_split = Dataset.from_dict(columns, split = split_name)
  rendered = raw_split.map(preprocess, remove_columns = raw_split.column_names)
  return rendered.shuffle(seed = 100)


train_data = _as_text_dataset(zalo_train_data, 'train')
print(train_data[2])

val_data = _as_text_dataset(zalo_val_data, 'val')
print(val_data[0])

PREPARE MODEL

In [None]:
# Load the tokenizer for the checkpoint and revision named in Config.
tokenizer = AutoTokenizer.from_pretrained(
    Config.MODEL_ID,
    revision = Config.REVISION,
    trust_remote_code = True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Keyword arguments for loading the base causal LM.
# Options tried earlier and currently disabled:
#   quantization_config = GPTQConfig(bits = 4, use_exllama = True)
#   use_flash_attention_2 = True
# NOTE(review): no quantization_config is passed here, yet the PEFT cell calls
# prepare_model_for_kbit_training on this model - confirm that is intended.
model_load_kwargs = {
    'device_map': 'auto',
    'revision': Config.REVISION,
    'trust_remote_code': True,
}
model = AutoModelForCausalLM.from_pretrained(Config.MODEL_ID, **model_load_kwargs)

# Print the module tree for inspection.
print(model)

PREPARE PEFT

In [None]:
# LoRA adapter settings, all taken from Config.
lora_settings = {
    'r': Config.R,
    'lora_alpha': Config.LORA_ALPHA,
    'lora_dropout': Config.LORA_DROPOUT,
    'target_modules': Config.TARGET_MODULES,
    'bias': Config.BIAS,
    'task_type': Config.TASK_TYPE,
}
peft_config = LoraConfig(**lora_settings)

# Training-time tweaks on the base model before it is wrapped.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# NOTE(review): this cell rebinds `model`, so running it twice wraps the
# already-wrapped model again - reload the base model before re-running.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
model.print_trainable_parameters()

PREPARE TRAINING ARGUMENTS

In [None]:
# Every value is read from Config; the arguments are grouped by concern.
training_arguments = TrainingArguments(
    # output and reporting
    output_dir = Config.OUTPUT_DIR,
    logging_dir = Config.LOGGING_DIR,
    report_to = Config.REPORT_TO,
    push_to_hub = Config.PUSH_TO_HUB,
    # batching, optimiser and precision
    per_device_train_batch_size = Config.PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps = Config.GRADIENT_ACCUMULATION_STEPS,
    optim = Config.OPTIM,
    learning_rate = Config.LEARNING_RATE,
    lr_scheduler_type = Config.LR_SCHEDULER_TYPE,
    warmup_steps = Config.WARMUP_STEPS,
    fp16 = Config.FP16,
    neftune_noise_alpha = Config.NEFTUNE_NOISE_ALPHA,
    # run length
    # NOTE(review): both are passed; the TrainingArguments docs say a positive
    # max_steps overrides num_train_epochs - confirm which one is meant.
    max_steps = Config.MAX_STEPS,
    num_train_epochs = Config.NUM_TRAIN_EPOCHS,
    # logging, checkpoint and evaluation cadence
    logging_steps = Config.LOGGING_STEPS,
    save_strategy = Config.SAVE_STRATEGY,
    save_steps = Config.SAVE_STEPS,
    do_eval = Config.DO_EVAL,
    evaluation_strategy = Config.EVALUATION_STRATEGY,
    eval_steps = Config.EVAL_STEPS,
)

In [None]:
# Build the supervised fine-tuning trainer and start training.
# packing and dataset_text_field are read from Config rather than repeated as
# literals, so editing Config actually takes effect here. The values are the
# same as the previous literals (False and 'content').
# NOTE(review): `model` was already wrapped by get_peft_model in the PEFT cell
# and peft_config is passed again here - confirm the installed TRL version
# does not apply the adapter a second time.
trainer = SFTTrainer(
    model = model,
    args = training_arguments,
    train_dataset = train_data,
    eval_dataset = val_data,
    peft_config = peft_config,
    tokenizer = tokenizer,
    packing = Config.PACKING,
    dataset_text_field = Config.DATASET_TEXT_FIELD,
    max_seq_length = Config.MAX_SEQ_LENGTH,
)
trainer.train()