## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token, # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


## Imports

In [2]:
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

## Device

In [3]:
# Device setup
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


In [4]:
# Flash Attention Implementation
if device == "cuda:0":
    if torch.cuda.get_device_capability()[0] >= 8: # Ampere, Ada, or Hopper GPUs
        attn_implementation = "flash_attention_2"
        torch_dtype = torch.bfloat16
    else:
        attn_implementation = "eager"
        torch_dtype = torch.float16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float32
print(f"Attention Implementation = {attn_implementation}")

Attention Implementation = flash_attention_2


## Hyperparameters

In [5]:
################################################################################
# Tokenizer parameters
################################################################################
max_length=8192
padding="do_not_pad" # "max_length", "longest", "do_not_pad"
truncation=True

################################################################################
# Generation parameters
################################################################################
num_return_sequences=1
max_new_tokens=1024
do_sample=True # True for sampling, False for greedy decoding
temperature=0.6
top_k=0 # not recommended
top_p=0.9
repetition_penalty=1.1

################################################################################
# bitsandbytes parameters
################################################################################
load_in_4bit=True
bnb_4bit_compute_dtype=torch_dtype
bnb_4bit_quant_type="nf4" # "nf4", #fp4"
bnb_4bit_use_double_quant=True

## Model

In [6]:
# Model List

# gemma variants
# "google/gemma-1.1-7b-it"
# "google/codegemma-7b-it"

# llama variants
# "meta-llama/Meta-Llama-3-8B" // downloaded
# "meta-llama/Meta-Llama-3-8B-Instruct" // downloaded
# "codellama/CodeLlama-7b-Instruct-hf"
# "PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct"

# mistral variants
# "mistralai/Mistral-7B-Instruct-v0.2"

# openELM variants
# "apple/OpenELM-3B-Instruct" // downloaded

# solar variants
# "upstage/SOLAR-10.7B-Instruct-v1.0" // downloaded
# "PathFinderKR/Waktaverse-SOLAR-KO-10.7B-Instruct"

In [7]:
model_id = "PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct-v2"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"

In [9]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant
)

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation=attn_implementation,
    torch_dtype=torch_dtype,
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# Display the model architecture
display(Markdown(f'```{model}```'))

```LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(145792, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=145792, bias=False)
)```

## Inference

In [12]:
def generate_response(system ,user):
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user}
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
    
    input_ids = tokenizer.encode(
        prompt,
        max_length=max_length,
        padding=padding,
        truncation=truncation,
        add_special_tokens=True,
        return_tensors="pt"
    ).to(device)
    
    outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=num_return_sequences,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=False).split("### 응답:")[-1]

In [13]:
system_prompt = "다음 지시사항에 대한 응답을 작성해 주세요."

In [14]:
user_prompt = "피보나치 수열에 대해 설명해 주세요."

In [15]:
response = generate_response(system_prompt, user_prompt)
print(response)


피보나치 수열은 수학에서 가장 유명한 수열 중 하나입니다. 피보나치 수열의 각 항은 이전 두 항의 합이며, 첫 번째 항은 1, 두 번째 항은 2입니다. 따라서 피보나치 수열은 다음과 같습니다: 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 218217, 3524578, 5702881, 927378, 14930352, 24295332, 39102296, 63245986, 10233435, 16558064, 26744801, 43349441, 70145867, 11527315, 18832455, 30540897, 49150502, 79583137, 12885451, 20922798, 33860887, 54512200, 88179826, 14250703, 23058581, 37156677, 60071119, 97474199, 15778318, 25611415, 41466929, 67338853, 10827531, 17616361, 28657609, 46410056, 75025744, 12137366, 19641823, 31791112, 51369585, 83203088, 135631951, 220902990, 357843643, 579297547, 93484422, 151782885, 24427916, 39615433, 64078936, 103995354, 16773176, 27233169, 44109824, 71778588, 116268591, 188277651, 306145546, 495063216, 802407235, 130496641, 211713604, 343650706, 560487239, 913321037, 147573952, 240873996, 390112999, 627018970, 101906992

## Multi-turn Conversation

In [16]:
system_prompt = "다음 지시사항에 대한 응답을 작성해주세요."
conversation_history = [{"role": "system", "content": system_prompt}]

In [17]:
def generate_conversation(user):
    global conversation_history
    conversation_history.append({"role": "user", "content": user})
    prompt = tokenizer.apply_chat_template(
        conversation_history,
        tokenize=False,
        add_generation_prompt=False
    )
    
    input_ids = tokenizer.encode(
        prompt,
        max_length=max_length,
        padding=padding,
        truncation=truncation,
        add_special_tokens=True,
        return_tensors="pt"
    ).to(device)
    
    outputs = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=num_return_sequences,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    
    assistant_response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    conversation_history.append({"role": "assistant", "content": assistant_response.split("<|end_of_text|>")[-1]})
    return assistant_response.split("### 응답:")[-1]

In [18]:
user_input = "피보나치 수열에 대해 설명해 주세요."
response = generate_conversation(user_input)
print(response)


피보나치 수열의 n번째 항은 F(n)=F(n-1)+F(n-2)로 주어집니다. 따라서 첫 번째 항인 1과 두 번째 항인 2를 사용하여 이 공식을 적용할 수 있습니다. 예를 들어 3번째 항을 찾으려면 3번째 항은 1+2=3이 되어야 합니다. 마찬가지로 4번째 항은 2+3=5가 되어야 하며 5번째 항은 3+5=8이 되어야 합니다. 이러한 방식으로 피보나치 수열의 모든 항을 계산할 수 있습니다.

수학적으로 피보나치 수열의 n번째 항은 다음과 같이 표현할 수 있습니다:

F(n) = (φ^n - (-1)^n)/(√5)

여기서 φ는 황금비(0.618...)이며, 이는 φ^2 + φ = 1이라는 사실에서 파생됩니다.

따라서 피보나치 수열의 n번째 항은 F(n) = (0.618^n - (-1)^n)/(√5)로 주어지며, 여기서 n은 양의 정수입니다.<|end_of_text|>


In [19]:
user_input = "파이썬 코드도 작성해 주세요"
response = generate_conversation(user_input)
print(response)


피보나치 수열은 다음과 같은 방식으로 생성됩니다:

1. 처음 두 항은 0과 1입니다.
2. 다음 항은 이전 두 항의 합이 됩니다: F(n) = F(n-1) + F(n-2).
3. n번째 항은 n-1번째와 n-2번째의 합으로 주어집니다: F(n) = F(n-1) + F(n-2).

다음은 첫 번째 10개의 피보나치 수를 보여주는 표입니다:

0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10
---------------------------|--------------------------------------------------------
1 | 1 | 2 | 3 | 5 | 8 | 13 | 21 | 34 | 55 | 89

다음은 파이썬에서 피보나치 수열을 생성하는 방법에 대한 예시 코드입니다:

```python
def fib_sequence(n):
    sequence = [0, 1]
    for i in range(2, n+1):
        sequence.append(sequence[i-2] + sequence[i-1])
    return sequence[:n]

print(fib_sequence(10))
```

결과는 첫 번째 10개의 피보나치 수로 구성된 목록입니다:

[1, 1, 2, 3, 5, 8, 13, 21, 34, 55].<|end_of_text|>
