In [None]:
# Show the NVIDIA GPU(s) visible to this runtime, as reported by `nvidia-smi`.
!nvidia-smi

In [None]:
!pip install --q --upgrade transformers
!pip install --q --upgrade torch
!pip install --q --upgrade bitsandbytes
!pip install --q --upgrade accelerate

In [None]:
import os
import torch
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline)
from accelerate import Accelerator

In [None]:
from google.colab import userdata
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'.
HF_TOKEN = userdata.get('HF_TOKEN')
# huggingface_hub / transformers look for the token in the `HF_TOKEN`
# environment variable, so export it under that name.
os.environ["HF_TOKEN"] = HF_TOKEN
# Also exported under this notebook's legacy variable name for any code that reads it.
os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN

In [None]:
# Create an Accelerate `Accelerator` with its default settings (no arguments are passed).
accelerator = Accelerator()

In [None]:
# Hugging Face Hub repository id of the checkpoint used throughout this notebook.
model_name = "meta-llama/Llama-3.1-405B"
# Load the tokenizer published with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# 4-bit bitsandbytes quantization settings used when loading the model.
# The compute dtype is set to float16 to match the dtype the model is loaded
# with; left at its float32 default, bitsandbytes warns about slow inference
# when the layer inputs are float16.
# Only 4-bit options are given: LLM.int8() settings such as
# `llm_int8_threshold` apply to 8-bit loading and have no effect here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Load the checkpoint with the bitsandbytes quantization config defined above.
# NOTE: bitsandbytes needs a CUDA GPU; on a runtime without one this cell
# fails with "RuntimeError: CUDA is required but not available for bitsandbytes."
model_c = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",  # let the loader place the weights on the available devices
)

# The model is intentionally NOT passed through `accelerator.prepare()`:
# `device_map="auto"` has already placed it, and `prepare()` targets training
# setups -- it can reject a quantized model that carries a device map or
# CPU/disk offload.
# NOTE(review): confirm against the installed accelerate version.

In [None]:
# Wrap the loaded model and tokenizer in a Transformers text-generation pipeline.
text_generator = pipeline(task="text-generation", model=model_c, tokenizer=tokenizer)

In [None]:
def generate_response(prompt, max_length=1000, temperature=0.7, top_k=50, top_p=0.95):
    """Generate one sampled completion for ``prompt``.

    Relies on the notebook-level ``text_generator`` pipeline and ``tokenizer``.

    Args:
        prompt: Text handed to the pipeline as the generation prompt.
        max_length: Forwarded to the pipeline as ``max_length``.
        temperature: Forwarded to the pipeline as ``temperature``.
        top_k: Forwarded to the pipeline as ``top_k``.
        top_p: Forwarded to the pipeline as ``top_p``.

    Returns:
        The ``generated_text`` of the first returned sequence, with leading
        and trailing whitespace stripped.
    """
    outputs = text_generator(
        prompt,
        max_length=max_length,
        num_return_sequences=1,
        truncation=True,
        # The tokenizer's EOS id is supplied as the padding token id.
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
    )
    # Exactly one sequence is requested, so take the first result.
    return outputs[0]['generated_text'].strip()

In [None]:
# Example run: generate a completion for a sample prompt and print the result.
prompt = "Provide list of countries of the world."
response = generate_response(prompt)
print(response)