In [None]:
# Mount Google Drive at /content/drive; the dataset read and the results CSV
# written later in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install chardet --quiet
!pip install bitsandbytes accelerate --quiet
!pip install -U bitsandbytes --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os

import chardet
import pandas as pd
import torch
import transformers
from transformers import AutoModelForCausalLM,AutoTokenizer,BitsAndBytesConfig

In [None]:
################################################################################
# bitsandbytes quantization parameters
# These values are the inputs to the BitsAndBytesConfig built further below.
################################################################################

# Load the base model weights in 4-bit precision when True
use_4bit = True

# Name of the torch dtype used for computation with the 4-bit weights;
# it is resolved to the actual dtype object with getattr(torch, ...) below
bnb_4bit_compute_dtype = "float16"

# 4-bit quantization data type: "fp4" or "nf4"
bnb_4bit_quant_type = "nf4"

# Enable nested (double) quantization for 4-bit base models when True
use_nested_quant = False

In [None]:
# Build the bitsandbytes 4-bit quantization configuration.
# Full-precision weights are often too large to fit in memory, so the weights are
# quantized to make the model small enough to run, at some cost in accuracy.
# Resolve the dtype name (e.g. "float16") to the corresponding torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
pretrain_model_name = "Konthee/Llama-3.1-8B-ThaiInstruct"

# Read the Hugging Face access token from the environment instead of hardcoding it:
# a token stored in the notebook is exposed to everyone who can read the file.
# When HF_TOKEN is unset this is None and the library falls back to its default
# credential lookup (for example a cached login).
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(pretrain_model_name, use_fast=False, token=hf_token)

# Pass the bitsandbytes config so the weights are actually loaded in 4-bit;
# without `quantization_config` the checkpoint is loaded unquantized and
# `bnb_config` has no effect. device_map="auto" lets accelerate place the
# quantized weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    pretrain_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token,
)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/983 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [None]:
# System prompt (task description, required output format and two worked examples).
# generate_response() sends this text as the "system" message of every request.
# NOTE(review): the prompt contains the typo "indentify"; it is left untouched here
# because editing the prompt text changes what the model receives.
system_prompt = """
You are a Thai language expert in finance stock market model. Your task is to indentify symbols and extract aspect, opinion word, and sentiment from a given sentence that is an opinion about The Stock Exchange of Thailand.
Task:

For each sentence provided, identify:
- Symbol (the stock market abbreviation)
- Aspect (related term in the stock market)
- Opinion (word that conveys a sentiment or perspective)
- Sentiment (e.g., POS for positive, NEG for negative, NEU for neutral)

**Output Format**: Each output should be a list of lists, with each inner list structured as:
[[<symbol>SYMBOL<aspect>ASPECT<opinion>OPINION<sentiment>SENTIMENT]]

If no Symbol is mentioned, skip that sentence. If Aspect and Opinion are unclear, infer them based on stock market context.

Example Outputs:
Input: "TOP interim dividend 1.20 baht, XD on Sept 12"
Output: [[<symbol>TOP<aspect>interim dividend<opinion>increased<sentiment>POS]]

Input: "TRUE sells five bond issues, surpassing target with orders reaching 18 billion"
Output: [[<symbol>TRUE<aspect>bond sales<opinion>surpassed<sentiment>POS],[<symbol>TRUE<aspect>order volume<opinion>expanded<sentiment>POS]]
"""

In [None]:
def generate_response(model, tokenizer, prompt, max_new_tokens=50, temperature=0.3, top_p=0.9, system_message=None):
    """Generate the model's reply to a single user prompt.

    The prompt is wrapped in a two-message chat (system + user), rendered with the
    tokenizer's chat template, and completed with nucleus sampling. Because
    sampling is enabled, repeated calls with the same input may return different
    text.

    Args:
        model: Causal LM exposing ``device``, ``config.pad_token_id`` and
            ``generate()``.
        tokenizer: Tokenizer exposing ``apply_chat_template()``, ``eos_token_id``
            and ``decode()``.
        prompt: Text sent as the user message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``generate()``.
        top_p: Nucleus-sampling threshold forwarded to ``generate()``.
        system_message: Optional system message. When ``None`` (the default) the
            notebook-level ``system_prompt`` is used, which keeps existing calls
            working unchanged.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens
        are stripped), with special tokens removed.
    """
    # Fall back to the notebook-level system prompt when the caller gives none.
    if system_message is None:
        system_message = system_prompt

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    # Exactly one conversation is encoded, so nothing is padded and every position
    # is a real token. The mask is passed explicitly because pad_token_id may fall
    # back to the EOS id below, in which case generate() cannot reliably infer
    # which positions are padding from the ids alone.
    attention_mask = torch.ones_like(input_ids)

    # Use the tokenizer's EOS id directly rather than a list of ids.
    eos_token_id = tokenizer.eos_token_id

    # generate() is given a single integer pad id: take the first entry if the
    # config holds a list, and reuse the EOS id if no pad token is configured.
    pad_token_id = model.config.pad_token_id
    if isinstance(pad_token_id, list):
        pad_token_id = pad_token_id[0]
    elif pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )

    # The returned sequence starts with the prompt tokens; keep only what follows.
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
import chardet

# Location of the input dataset, defined once and reused for detection and loading.
dataset_path = '/content/drive/MyDrive/KMITL/Fourth Year/Project/Test01 Dataset.xlsx - 500.csv'

# Read a small part of the file to detect the encoding
with open(dataset_path, 'rb') as f:
    result = chardet.detect(f.read(10000))

# Only the first 10,000 bytes are sampled. If that sample happens to be pure ASCII
# (or detection yields no encoding at all), fall back to UTF-8: ASCII is a subset
# of UTF-8, so this cannot break an ASCII file, and it still decodes non-ASCII
# text that appears later in the file.
detected_encoding = result['encoding']
if detected_encoding is None or detected_encoding.lower() == 'ascii':
    detected_encoding = 'utf-8'

# Print the encoding that will be used to read the file
print(detected_encoding)

# Read the CSV file using that encoding
df = pd.read_csv(dataset_path, encoding=detected_encoding)

utf-8


In [None]:
def process_llama(sentence):
    """Return the model's raw reply for one dataset sentence.

    Args:
        sentence: Text of one row. Values that are not strings (e.g. the NaN
            pandas uses for an empty CSV cell, or None) and blank strings are
            skipped.

    Returns:
        The text produced by ``generate_response`` for ``sentence``, or an empty
        string when there is no text to send to the model.
    """
    # Do not spend a generation on missing or blank input.
    if not isinstance(sentence, str) or not sentence.strip():
        return ""
    return generate_response(model, tokenizer, sentence)

# Run every value of the 'text' column through the model, then attach the raw
# replies to the frame as a new column.
llama_replies = df['text'].apply(process_llama)
df['Llama-3.1-8B-Instruct_adjusted_prompt_test'] = llama_replies

# Write the annotated frame to Drive without the index, encoded as 'utf-8-sig'
# (UTF-8 with a byte-order mark).
df.to_csv(
    '/content/drive/MyDrive/KMITL/Fourth Year/Project/Llama-3.1-8B-Instruct_prompt_test.csv',
    index=False,
    encoding='utf-8-sig',
)