In [None]:
# Mount Google Drive at /content/drive; the dataset and model folders used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
!pip install datasets sentencepiece transformers[torch] bitsandbytes --quiet
!pip install peft --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import transformers
import numpy as np
import json
import os
from torch.utils.data import Dataset, random_split
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset

In [None]:
# Choose the compute device once: GPU when CUDA is present, CPU otherwise,
# and report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "CUDA is available. Using GPU."
    if device.type == "cuda"
    else "CUDA not available. Using CPU."
)

CUDA is available. Using GPU.


# **Load Pretrained Model**

In [None]:
# Configure BitsAndBytesConfig for 4-bit quantization to reduce memory usage.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,  # dtype passed as the 4-bit compute dtype
    bnb_4bit_use_double_quant=True,        # double quantization enabled
    bnb_4bit_quant_type="nf4",             # "nf4" quantization type
    load_in_4bit=True,                     # load the weights in 4-bit
)

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning.
# No target_modules are given, so PEFT's own default selection applies.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                # adapter rank
    lora_alpha=16,      # LoRA scaling numerator
    lora_dropout=0.05,  # dropout applied inside the adapters
    bias="none",        # bias setting passed through unchanged
)

In [None]:
model_name = "Konthee/Llama-3.1-8B-ThaiInstruct"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# This checkpoint's tokenizer ships without a padding token, so any call with
# padding enabled raises "Asking to pad but the tokenizer does not have a
# padding token". Reuse EOS as the pad token, as the error message suggests.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with the 4-bit BitsAndBytesConfig; device_map="auto" lets the
# loader decide where the weights are placed.
# NOTE(review): trust_remote_code=True allows code from the model repository
# to execute locally — confirm this checkpoint really needs it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Keep the model config in agreement with the tokenizer's padding token.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Add the LoRA adapters described by `lora_config` to the loaded model.
# NOTE(review): the common QLoRA recipe calls
# peft.prepare_model_for_kbit_training(model) before this step — confirm
# whether it is needed for this setup.
# NOTE(review): special tokens are added and the embeddings resized in the
# next cell, i.e. after wrapping; presumably the new embedding rows are not
# trained by the adapters — verify.
model = get_peft_model(model, lora_config)

In [None]:
# Marker tokens used by the label strings in the dataset,
# e.g. '<SYMBOL> TOP <ASPECT> ... <OPINION> ... <POS>'.
SPECIAL_TOKEN = ['<SYMBOL>', '<ASPECT>', '<OPINION>', '<POS>', '<NEG>', '<NEU>']
special_tokens_dict = {'additional_special_tokens': SPECIAL_TOKEN}
# Register the markers with the tokenizer; the return value is kept in
# num_added_tokens but not used further in the visible cells.
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
# Resize the embedding matrix to the new vocabulary size; the resulting
# embedding module is shown as the cell output.
model.resize_token_embeddings(len(tokenizer))

Embedding(128262, 3072)

# **Load Dataset**

In [None]:
# Location of the dataset CSV on the mounted Google Drive.
dataset_folder = '/content/drive/MyDrive/KMITL/FourthYear/Project/Data'
dataset_filename = 'Dataset_400.csv'

In [None]:
# Read the CSV file; the recorded output shows all rows in a single 'train'
# split with the columns 'text' and 'label'.
dataset = load_dataset('csv', data_files=os.path.join(dataset_folder, dataset_filename))

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 399
    })
})

In [None]:
dataset['train'][0]

{'text': 'TOP ปันผลระหว่างกาล 1.20 บาท ขึ้น XD วันที่ 12 ก.ย. 67',
 'label': '<SYMBOL> TOP <ASPECT> ปันผลระหว่างกาล <OPINION> ขึ้น <POS>'}

In [None]:
def process_data_to_model_inputs(batch, max_input_length=256, max_target_length=64):
    """Tokenize a batch of (text, label) pairs for causal-LM fine-tuning.

    A decoder-only model is trained on a single sequence, so every example is
    encoded as ``prompt tokens + label tokens + EOS``. ``labels`` is that same
    sequence with the prompt positions and the padding positions replaced by
    -100, which leaves only the label tokens and the closing EOS to be scored.
    ``input_ids``, ``attention_mask`` and ``labels`` are right-padded to the
    longest sequence of the batch and therefore always have equal length.

    Padding is done here by position instead of by the tokenizer, so the
    function also works when the tokenizer defines no pad token (EOS is used
    as the filler id in that case) and never masks the real EOS label.

    Args:
        batch: Mapping with the columns ``"text"`` (inputs) and ``"label"``
            (targets), as passed by ``datasets.map(..., batched=True)``.
        max_input_length: Maximum number of prompt tokens kept per example.
        max_target_length: Maximum number of target tokens per example,
            including the appended EOS.

    Returns:
        The same ``batch`` mapping with ``"input_ids"``, ``"attention_mask"``
        and ``"labels"`` added.

    Raises:
        ValueError: If the tokenizer defines neither a pad nor an EOS token.
    """
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id
    if pad_id is None:
        raise ValueError("tokenizer must define a pad token or an EOS token")

    # EOS marks the end of the label so generation learns where to stop.
    eos_tail = [] if eos_id is None else [eos_id]

    prompts = [str(text) for text in batch["text"]]
    targets = [str(label) for label in batch["label"]]

    prompt_ids = tokenizer(
        prompts, truncation=True, max_length=max_input_length
    )["input_ids"]
    # No special tokens here: the label continues the prompt, so a second
    # beginning-of-sequence token in the middle would be wrong.
    target_ids = tokenizer(
        targets,
        truncation=True,
        max_length=max_target_length - len(eos_tail),
        add_special_tokens=False,
    )["input_ids"]

    sequences = []
    label_rows = []
    for p_ids, t_ids in zip(prompt_ids, target_ids):
        completion = list(t_ids) + eos_tail
        sequences.append(list(p_ids) + completion)
        # Prompt positions are ignored by the loss.
        label_rows.append([-100] * len(p_ids) + completion)

    width = max((len(seq) for seq in sequences), default=0)

    batch["input_ids"] = [seq + [pad_id] * (width - len(seq)) for seq in sequences]
    batch["attention_mask"] = [
        [1] * len(seq) + [0] * (width - len(seq)) for seq in sequences
    ]
    batch["labels"] = [row + [-100] * (width - len(row)) for row in label_rows]
    return batch

In [None]:
# Tokenize the dataset: batched=True passes process_data_to_model_inputs a
# batch of rows at a time instead of one row per call.
tokenized_dataset = dataset.map(process_data_to_model_inputs, batched=True)

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 399
    })
})

In [None]:
# Preview of an 80/20 split. The result is only displayed, not assigned; the
# split actually used for training is produced by split_dataset further down.
tokenized_dataset['train'].train_test_split(test_size=0.2)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 319
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
        num_rows: 80
    })
})

In [None]:
# Drop the raw string columns so only the tokenized columns remain.
# NOTE(review): this rebinds tokenized_dataset, so re-running the cell
# presumably fails because the columns are already gone — verify.
tokenized_dataset = tokenized_dataset.remove_columns(['text','label'])

In [None]:
def split_dataset(dataset, train_size=0.8, test_size=0.1, seed=42):
    """Split ``dataset["train"]`` into train / validation / test subsets.

    The validation share is whatever remains after ``train_size`` and
    ``test_size``. The split is done in two steps: first the hold-out part
    (validation + test) is separated from the training part, then the
    hold-out part is divided so that the ``"test"`` subset receives the
    ``test_size`` share and ``"validation"`` the remainder.

    Args:
        dataset: Mapping whose ``"train"`` entry provides
            ``train_test_split(test_size=..., seed=...)`` returning a mapping
            with ``"train"`` and ``"test"`` entries.
        train_size: Fraction of all rows used for training.
        test_size: Fraction of all rows used for testing.
        seed: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        dict with the keys ``"train"``, ``"validation"`` and ``"test"``.

    Raises:
        ValueError: If the fractions are not positive or leave no room for a
            validation subset.
    """
    val_size = 1 - (train_size + test_size)
    if train_size <= 0 or test_size <= 0 or val_size <= 1e-9:
        raise ValueError(
            "train_size and test_size must be positive and sum to less than 1 "
            f"(got train_size={train_size}, test_size={test_size})"
        )

    # Rounding removes floating-point noise such as 0.30000000000000004,
    # which could otherwise change a row count that is derived from the
    # fraction.
    holdout_fraction = round(test_size + val_size, 10)
    # Share of the hold-out part that goes to the "test" side of the second
    # split; using the validation share here would swap the two subsets'
    # sizes whenever test_size != val_size.
    test_fraction = round(test_size / (test_size + val_size), 10)

    first_split = dataset["train"].train_test_split(test_size=holdout_fraction, seed=seed)
    holdout_split = first_split["test"].train_test_split(test_size=test_fraction, seed=seed)
    return {
        "train": first_split["train"],
        "validation": holdout_split["train"],
        "test": holdout_split["test"],
    }

In [None]:
# Build the train/validation/test subsets with split_dataset's defaults
# (train_size=0.8, test_size=0.1, seed=42).
splits = split_dataset(tokenized_dataset)

In [None]:
splits

{'train': Dataset({
     features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
     num_rows: 319
 }),
 'validation': Dataset({
     features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
     num_rows: 40
 }),
 'test': Dataset({
     features: ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
     num_rows: 40
 })}

# **Fine-Tuning**

In [None]:
# Folder on Google Drive for this run's model artifacts.
project_root = '/content/drive/MyDrive/KMITL/FourthYear/Project/model/'
model_folder = 'test_model_attention_mask'
model_path = os.path.join(project_root, model_folder)
os.makedirs(model_path, exist_ok=True)  # create the folder if it is missing
model_path  # shown as the cell output

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration.
# output_dir uses `model_path` (created in the previous cell); the name
# `output_dir` was never defined in this notebook and raised a NameError on a
# fresh kernel.
# Effective batch size is 1 x gradient_accumulation_steps = 4 per device.
# Saving and evaluation share the same step interval, which
# load_best_model_at_end relies on.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=model_path,
    num_train_epochs=3,
    per_device_train_batch_size=1,   # small batch size to save memory
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=50,
    save_steps=200,
    evaluation_strategy="steps",
    eval_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [None]:
def make_contiguous(model):
    """Rebind every parameter's storage to a contiguous copy, in place.

    Walks ``model.parameters()`` and replaces each parameter's ``.data`` with
    ``.data.contiguous()``. Returns nothing; the model is modified directly.
    """
    for weight in model.parameters():
        weight.data = weight.data.contiguous()

# Make all parameters of the LoRA-wrapped model contiguous before training.
make_contiguous(model)

In [None]:
# Build the Trainer from the LoRA-wrapped model, the training arguments and
# the train/validation subsets. No data collator is passed, so Trainer falls
# back to its own default.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["validation"],
)

In [None]:
# Run fine-tuning.
# NOTE(review): the recorded run asked for a wandb API key and then stopped
# with a CUDA out-of-memory error, so no completed training is shown here.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 4.88 MiB is free. Process 9596 has 39.54 GiB memory in use. Of the allocated memory 39.02 GiB is allocated by PyTorch, and 24.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# **Save Model**

In [None]:
# Save the LoRA-wrapped model and the tokenizer (including the added special
# tokens) side by side. The folder is derived from `model_path` instead of
# repeating the hard-coded Drive path, so both locations cannot drift apart.
saved_model_path = os.path.join(model_path, 'save_model')
os.makedirs(saved_model_path, exist_ok=True)
model.save_pretrained(saved_model_path)
tokenizer.save_pretrained(saved_model_path)

In [None]:
dataset['train'][0]

In [None]:
def generate_text(input_text, max_new_tokens=64, num_beams=5):
    """Generate the label string for one input text with beam search.

    Differences from a plain ``generate`` + ``decode`` round trip:

    * The length limit is ``max_new_tokens`` and counts generated tokens
      only. A total-length limit would also count the prompt and leaves no
      room to generate once the prompt is longer than the limit.
    * Only the newly generated tokens are decoded; the prompt is not echoed.
    * Decoding keeps special tokens, because the task markers
      (``<SYMBOL>``, ``<ASPECT>``, ``<OPINION>``, ``<POS>`` ...) are registered
      as special tokens and would otherwise disappear from the result. BOS,
      EOS and padding ids are removed explicitly instead.
    * Inputs are moved to the device the model already lives on; the model
      itself is not moved.

    Args:
        input_text: Text to generate a label for.
        max_new_tokens: Maximum number of tokens to generate.
        num_beams: Beam width for beam search.

    Returns:
        The decoded generated continuation as a string.
    """
    encoded = tokenizer(input_text, return_tensors="pt")
    target_device = model.device
    input_ids = encoded["input_ids"].to(target_device)
    attention_mask = encoded["attention_mask"].to(target_device)

    # Fall back to EOS when the tokenizer has no dedicated padding token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Make sure dropout is disabled while generating.
    model.eval()
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=True,
        pad_token_id=pad_id,
    )

    # A decoder-only model returns prompt + continuation; keep the latter.
    generated = outputs[0][input_ids.shape[1]:].tolist()
    structural_ids = {tokenizer.bos_token_id, tokenizer.eos_token_id, pad_id}
    kept = [token for token in generated if token not in structural_ids]
    return tokenizer.decode(kept, skip_special_tokens=False)

In [None]:
# ตัวอย่างการทดสอบ
input_text = "ICHI โบรกแนะซื้อเป้า 17.20 บ. ชี้หุ้นปลอดภัยนโยบายการเมือง-เอลนีโญยาว 3 ปีหนุน"
print(generate_text(input_text))

In [None]:
# Round-trip a hand-written label through the tokenizer (encode, then decode)
# to inspect how the marker tokens come back out.
# tokenizer.decode(splits['test'][0]['labels'], skip_special_tokens=True)
tokenizer.decode(tokenizer('<SYMBOL> ICHI <ASPECT> หุ้น <OPINION> ปลอดภัย <POS>').input_ids)