# Importing Libraries

In [1]:
import os
from dotenv import load_dotenv
import random
import numpy as np
from dataclasses import dataclass

# PyTorch
import torch

# Huggingface
import huggingface_hub
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

# Hyperparameters

In [2]:
@dataclass
class CONFIG:
    debug: bool = False
    
    # Model
    username: str = "PathFinderKR"
    model_name: str = "Llama-3.2-KO-1B-Instruct"
    model_id: str = f"{username}/{model_name}"
    
    # Inference
    max_new_tokens: int = 128000
    do_sample: bool = True
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    
    # Device
    device: torch.device = None
    attn_implementation: str = None
    torch_dtype: torch.dtype = torch.bfloat16
    
    # Seed
    seed: int = 42

# Reproducibility

In [3]:
def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    print(f"Seed: {seed}")
    
set_seed(CONFIG.seed)

Seed: 42


# Device

In [4]:
def configure_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        num_gpu = torch.cuda.device_count()
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", num_gpu)
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("> Running on MPS")
    else:
        device = torch.device("cpu")
        print("> Running on CPU")
    return device

CONFIG.device = configure_device()

> Running on GPU | Num of GPUs:  1


In [5]:
def configure_attn_implementation(device):
    if device == "cuda":
        if torch.cuda.get_device_capability()[0] >= 8: # Ampere, Ada, or Hopper GPUs
            attn_implementation = "flash_attention_2"
        else:
            attn_implementation = "eager"
    else:
        attn_implementation = None
    return attn_implementation

CONFIG.attn_implementation= configure_attn_implementation(CONFIG.device)

# HuggingFace

In [6]:
load_dotenv()
huggingface_hub.login(
    token=os.getenv("HUGGINGFACE_TOKEN"),
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


# Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG.model_id)
streamer = TextStreamer(tokenizer)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

# Model

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    CONFIG.model_id,
    device_map=CONFIG.device,
    attn_implementation=CONFIG.attn_implementation,
    torch_dtype=CONFIG.torch_dtype
)

config.json:   0%|          | 0.00/885 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

# Inference

In [9]:
# Llama-3-Instruct template
def prompt_template(system, user):
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{system}<|eot_id|>"
        
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user}<|eot_id|>"
        
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

def generate_instruct_model(system, user):
    prompt = prompt_template(system, user)
    
    input_ids = tokenizer.encode(
        prompt,
        add_special_tokens=True,
        return_tensors="pt"
    ).to(CONFIG.device)
    
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
        streamer=streamer
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=False)

In [10]:
system_prompt = "다음 지시사항에 대한 응답을 작성해 주세요."
user_prompt = "피보나치 수열에 대해 설명해주세요."

In [11]:
response = generate_instruct_model(system_prompt, user_prompt)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

다음 지시사항에 대한 응답을 작성해 주세요.<|eot_id|><|start_header_id|>user<|end_header_id|>

피보나치 수열에 대해 설명해주세요.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

피보나치 수열은 유일한 두 숫자, 0과 1로 시작하는 수열입니다. 이 수열은 다음의 방정식을 사용하여 정렬된 방식으로 계속 확장됩니다:

f(n) = f(n-1) + f(n-2)

이제 f(4), f(5), f(6)을 구해 봅시다. 첫 번째 피보나치 수열인 0과 1을 계산하고 그 다음 두 항을 계산합니다.

f(4) = f(3) + f(2)
f(3) = f(2) + f(1)
f(2) = f(1) + f(0)

따라서 f(4)는 다음과 같습니다:
f(4) = (f(3) + f(2)) + f(1)
f(4) = (f(2) + f(1)) + f(0)
f(4) = (f(1) + f(0)) + f(0)
f(4) = (F(0) + F(-1)) + F(0)
f(4) = F(-1) + F(0)

f(5) = f(4) + f(3)
f(4) = (F(3) + F(2)) + F(1)
f(3) = (F(2) + F(1)) + F(0)
f(2) = (F(1) + F(0)) + F(0)
f(2) = (F(0) + F(-1)) + F(0)
f(2) = F(-1) + F(0)

f(6) = f(5) + f(4)
f(5) = (F(4) + F(3)) + F(2)
f(4) = (F(3) + F(2)) + F(1)
f(3) = (F(2) + F(1)) + F(0)
f(2) = (F(1) + F(0)) + F(0)
f(1) = (F(0) + F(-1)) + F(0)
f(0) = F(-1) + F(0)

따라서 f(4) = F(-1) + F(0), f(5) = F(-1) + F(0), f(6) = F(-1) + F(0)의 값