# Importing Libraries

In [1]:
import os
from dotenv import load_dotenv
import random
import numpy as np
from dataclasses import dataclass

# PyTorch
import torch

# Huggingface
import huggingface_hub
from transformers import TextStreamer

# Unsloth
from unsloth import FastLanguageModel, is_bf16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Configuration

In [2]:
@dataclass
class CONFIG:
    """Central settings for model loading and text generation in this notebook.

    The notebook never instantiates this class: every cell reads (and, for
    ``device``, writes) the attributes directly on the class object, so it
    acts as a plain namespace of defaults.
    """
    # Model
    # Repository id handed to FastLanguageModel.from_pretrained as model_name.
    model_id: str = "PathFinderKR/KHU-Llama-3.2-11B-Instruct"
    # Chooses which sample cell branch runs: "base" (raw completion) or
    # "instruct" (chat-template prompt).
    model_type: str = "instruct"
    
    # Inference
    # Upper bound on generated tokens; the model-loading cell also reuses this
    # value as max_seq_length.
    max_new_tokens: int = 2048
    do_sample: bool = True
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    # Evaluated once, when the class body runs: bfloat16 when
    # is_bf16_supported() returns true, float16 otherwise.
    dtype: torch.dtype = torch.bfloat16 if is_bf16_supported() else torch.float16
    load_in_4bit: bool = True
    
    # Device
    # Placeholder; overwritten on the class with the result of
    # configure_device() in a later cell.
    device: torch.device = None
    
    # Seed
    # Passed to set_seed() to seed the Python, NumPy and PyTorch RNGs.
    seed: int = 42

## Reproducibility

In [3]:
def set_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and puts cuDNN into deterministic mode so repeated runs follow the same
    code paths.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Setting this at runtime does not re-seed hashing in the already-running
    # interpreter; it is only inherited by child processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one. The call is deferred
    # by PyTorch when CUDA is not initialised, so it is safe on CPU-only hosts.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # breaks reproducibility, so it has to be off when determinism is wanted.
    torch.backends.cudnn.benchmark = False
    print(f"Seed: {seed}")
    
# Seed all RNGs before anything stochastic runs, using the class-level default
# stored on CONFIG.
set_seed(CONFIG.seed)

Seed: 42


## Device

In [4]:
def configure_device():
    """Select the compute device, preferring CUDA, then MPS, then CPU.

    Prints a one-line summary of the choice (including the GPU count when
    CUDA is used).

    Returns:
        torch.device: The selected device.
    """
    # CUDA takes priority whenever it is available.
    if torch.cuda.is_available():
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", torch.cuda.device_count())
        return torch.device("cuda")

    # Next choice: the MPS backend.
    if torch.backends.mps.is_available():
        print("> Running on MPS")
        return torch.device("mps")

    # Nothing accelerated was found; fall back to the CPU.
    print("> Running on CPU")
    return torch.device("cpu")

# Detect the device once and store it on CONFIG, where the generation helpers
# read it when moving tokenized inputs.
CONFIG.device = configure_device()

> Running on GPU | Num of GPUs:  1


## HuggingFace

In [5]:
# Pull environment variables from a .env file so the token is not hardcoded
# in the notebook.
load_dotenv()
# Log in to the Hugging Face Hub with the token from HUGGINGFACE_TOKEN and
# also store it in the git credential helper.
# NOTE(review): os.getenv returns None when the variable is unset — confirm
# how login() behaves in that case (it may fall back to an interactive prompt).
huggingface_hub.login(
    token=os.getenv("HUGGINGFACE_TOKEN"),
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


# Model

In [6]:
# Load the model and its tokenizer through Unsloth using the settings in CONFIG.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CONFIG.model_id,
    # NOTE(review): the generation budget (max_new_tokens) is reused as the
    # sequence-length argument. A prompt plus a full-length generation would
    # exceed it — confirm whether a separate, larger max_seq_length setting
    # is wanted.
    max_seq_length=CONFIG.max_new_tokens,
    dtype=CONFIG.dtype,
    load_in_4bit=CONFIG.load_in_4bit
)

==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4080 SUPER. Max memory: 15.992 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]



In [7]:
# Switch the loaded model over to inference via Unsloth's for_inference helper.
FastLanguageModel.for_inference(model)
# NOTE(review): generate_text and generate_response each build their own
# TextStreamer, so this instance looks unused in the visible cells — confirm
# before removing it.
text_streamer = TextStreamer(tokenizer)

# Inference

In [8]:
# Llama 3 instruct chat layout with three positional slots, in order: the
# system message, the user message, and the beginning of the assistant turn.
# generate_response fills the last slot with an empty string so the model
# writes the assistant turn itself.
llama_3_instruct_prompt = """<|start_header_id|>system<|end_header_id|>

{}<|eot_id|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}"""

# Text completion for a base model (raw prompt, no chat template)
def generate_text(prompt):
    """Complete a raw prompt with the loaded model, streaming tokens to stdout.

    The prompt is tokenized as-is (no chat template is applied), moved to
    CONFIG.device, and generation is run with the sampling settings stored
    on CONFIG.

    Args:
        prompt: Text passed to the tokenizer unchanged.

    Returns:
        The list produced by tokenizer.batch_decode for the generated
        sequences, with special tokens kept in the text.
    """
    FastLanguageModel.for_inference(model)

    encoded_prompt = tokenizer([prompt], return_tensors="pt").to(CONFIG.device)

    # Gather every generation option in one place before the call.
    generation_options = dict(
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
        use_cache=True,
        # A fresh streamer per call prints tokens as they are generated.
        streamer=TextStreamer(tokenizer),
    )
    generated_ids = model.generate(**encoded_prompt, **generation_options)

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=False)

# Chat-style response for an instruction-tuned model (Llama 3 template)
def generate_response(system, user):
    """Answer a user message with the instruction-tuned model.

    Builds a Llama 3 instruct prompt from the system and user messages and
    runs it through generate_text, so tokenization, sampling settings,
    streaming and decoding live in one place instead of being duplicated.

    Args:
        system: System message placed in the first template slot.
        user: User message placed in the second template slot.

    Returns:
        The list returned by generate_text: the decoded generated sequences
        with special tokens kept in the text.
    """
    # The assistant slot stays empty so the model writes that turn itself.
    chat_prompt = llama_3_instruct_prompt.format(system, user, "")
    return generate_text(chat_prompt)

In [10]:
# Smoke-test the loaded model with one prompt matching its type; any other
# model_type value runs nothing.
if CONFIG.model_type in ("base", "instruct"):
    if CONFIG.model_type == "base":
        # Base models get a bare completion prompt.
        sample_text = "Capital of France:"
        sample_response = generate_text(sample_text)
    else:
        # Instruct models get a system + user message pair.
        sample_system = "You are a helpful assistant."
        sample_user = "What is the capital of France?"
        sample_response = generate_response(sample_system, sample_user)

    # Show the decoded output, then its token-level breakdown.
    print(sample_response)
    print(tokenizer.tokenize(sample_response[0]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Solve the following question. Instead of thinking step by step, insert <|pause|> token to your response when this is the time to think. This will delay your generation, making you to think more. for example, 2+6 equals <|pause|> 8<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of France, also known as the capital, is a city, country, or town that holds the political, economic, and social life. It's the largest and most prominent city in a country, holding all its government, business, and infrastructure.

Here's the answer: 

In a nation, there are two parts, - a capital and a capital. The capital is often called the "capital" because it has many important functions including governance, industry, law enforcement, and administration. A capital is typically a large city with many industries, particularly in urban areas, where