## Log in to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load variables from a local .env file (if one exists) so the access token
# never has to be written into the notebook itself.
load_dotenv()
token = os.environ.get("HUGGINGFACE_TOKEN")  # None when the variable is unset

# Authenticate with the Hugging Face Hub and also hand the token to the
# configured git credential helper.
login(token=token, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


## Imports

In [4]:
# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

## Device

In [5]:
# Pick the best available accelerator, in order of preference.
if torch.cuda.is_available():
    device = "cuda:0"  # Nvidia GPU
elif torch.backends.mps.is_available():
    device = "mps"  # Apple Silicon GPU
else:
    device = "cpu"
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [7]:
# --- Tokenizer ---
# Intended upper bound on tokenized input length.
# NOTE(review): not referenced by any cell shown here - confirm where it is used.
max_length = 1024

# --- Generation ---
# Cap on the number of tokens produced per generate() call.
max_new_tokens = 500

# --- Precision ---
# bfloat16 serves both as the model dtype and as the 4-bit compute dtype below.
dtype = torch.bfloat16

# --- Quantization ---
# 4-bit NF4 weights, with computation carried out in `dtype`.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=dtype,
    load_in_4bit=True,
)

## Model

In [8]:
# Model List

# gemma variants
# "google/gemma-7b-it" // downloaded
# "PathFinderKR/waktaverse-gemma-ko-7b-it"
# "beomi/gemma-ko-7b"

# llama2 variants
# "meta-llama/Llama-2-7b-chat-hf" // downloaded
# "PathFinderKR/waktaverse-Llama-2-ko-7b-it"
# "beomi/KoAlpaca-Polyglot-5.8B" // downloaded
# "beomi/open-llama-2-ko-7b"

# solar variants
# "chihoonlee10/T3Q-ko-solar-dpo-v4.0"

In [9]:
# Hub repository id of the checkpoint that the tokenizer and model are loaded from.
model_id = "PathFinderKR/waktaverse-gemma-ko-7b-it"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

# FlashAttention-2 kernels only run on CUDA GPUs, whereas `device` may also be
# "mps" or "cpu". Request them only on CUDA; otherwise omit the argument so
# transformers falls back to its default attention implementation.
attn_kwargs = (
    {"attn_implementation": "flash_attention_2"}
    if str(device).startswith("cuda")
    else {}
)

# NOTE(review): 4-bit bitsandbytes quantization has historically required a CUDA
# GPU as well - confirm it is supported before running this cell on mps/cpu.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=dtype,
    quantization_config=quantization_config,
    **attn_kwargs,
)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

## Running the model on a single GPU

In [None]:
# Chat Template
def generate_response(prompt, include_prompt=True):
    """Generate a single-turn chat reply for ``prompt``.

    The prompt is wrapped as one user message, rendered with the tokenizer's
    chat template (generation prompt appended), tokenized, and passed to
    ``model.generate`` with at most ``max_new_tokens`` new tokens. Uses the
    notebook-level ``tokenizer``, ``model`` and ``max_new_tokens``.

    Args:
        prompt: Text of the user message.
        include_prompt: When True (the default, and the original behaviour),
            decode the whole sequence returned by ``generate``. When False,
            decode only the tokens that follow the input positions.

    Returns:
        The decoded text with special tokens removed.
    """
    messages = [{"role": "user", "content": prompt}]
    chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # add_special_tokens=False: presumably the rendered chat template already
    # carries the special tokens it needs - verify against the tokenizer.
    encoded = tokenizer(
        chat,
        add_special_tokens=False,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # Pass the mask explicitly so generate() does not have to infer it from the
    # pad token id. Only these two tensors are forwarded, so extra tokenizer
    # outputs (e.g. token_type_ids) never reach the model.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
    )

    sequence = outputs[0]
    if not include_prompt:
        # Drop the positions occupied by the input, keeping only new tokens.
        sequence = sequence[input_ids.shape[-1]:]
    return tokenizer.decode(sequence, skip_special_tokens=True)

In [None]:
# English alternative (disabled):
# prompt = "Write me a poem about Machine Learning."
# Active prompt, in Korean: "Please write a poem about machine learning."
prompt = "머신러닝에 대한 시를 써주세요."

In [None]:
# Run the bare prompt through the model and show the decoded text.
response = generate_response(prompt)
print(response)

## Prompt Engineering

In [None]:
# English alternative (disabled):
# text = "Write me a poem about Machine Learning"
# User text to embed in the engineered prompt, in Korean:
# "Please write a poem about machine learning."
text = "머신러닝에 대한 시를 써주세요."

In [None]:
# Wrap the user text in an instruction prompt. The delimiter placed around
# {text} must be the one the instruction names ("triple backticks"), so real
# backticks are used here rather than triple single quotes.
prompt = f"""Waktaverse-Gemma, a Korean language model, capable of performing various natural language processing tasks such as text generation, question answering, translation, summarization, and more. 

Please respond to the following text delimited by triple backticks in Korean.
Use Korean only.
```{text}```
"""

In [None]:
# Run the engineered prompt through the model and show the decoded text.
response = generate_response(prompt)
print(response)