In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import accelerate

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_use_double_quant=True,
    bnb_8bit_quant_type='nf8',  # Can be 'nf4' or 'fp4'
    bnb_8bit_compute_dtype=torch.bfloat16  # Adjust compute type if needed
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map='auto',
    quantization_config=quantization_config,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
def get_model_size(model):
    total_params = sum(p.numel() for p in model.parameters())
    total_size_bytes = total_params * 2
    total_size_gb = total_size_bytes / (1024 ** 3)
    print(f"Model size: {total_size_gb:.2f} GB")

get_model_size(model)

Model size: 14.96 GB


In [7]:
model.device

device(type='cuda', index=0)

In [8]:
def generate_response(prompt, max_tokens=40):
    messages = [
        {"role": "system", "content": "You are a general purpose chatbot!"},
        {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        eos_token_id=terminators,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    del input_ids
    torch.cuda.empty_cache()
    return f'Prompt: {prompt}\n' + f'Response: {tokenizer.decode(response, skip_special_tokens=True)}'

In [34]:
prompt = 'When did Albert Einstein and Stephen Hawking win the Nobel Prize?'
output = generate_response(prompt)
print(output)

Prompt: When did Albert Einstein and Stephen Hawking win the Nobel Prize?
Response: Albert Einstein never won the Nobel Prize in Physics, despite being one of the most influential physicists of the 20th century. He was nominated for the prize several times, but the Nobel Committee did not


In [18]:
prompt = 'Neil Armstrong did not eat on the Apollo 11 module. Yes or no?'
output = generate_response(prompt)
print(output)

Prompt: Neil Armstrong did not eat on the Apollo 11 module. Yes or no?
Response: Yes.


In [17]:
prompt = 'Tell me about "Aptomania".'
output = generate_response(prompt)
print(output)

Prompt: Tell me about "Aptomania".
Response: Aptomania! That's a fascinating topic. Aptomania refers to the widespread and intense enthusiasm for aptitude tests, particularly the popular ones like the Myers-Briggs Type Indicator (MB


In [10]:
prompt = '256789/5 = ? Just give the answer.'
output1 = generate_response(prompt)
print(output1)

prompt = 'If 256789 is divided by 5, what will be the result? Just give the answer.'
output2 = generate_response(prompt)
print(output2)

prompt = 'What is the result of 256789 divided by 5? Just give the answer.'
output3 = generate_response(prompt)
print(output3)

Prompt: 256789/5 = ? Just give the answer.
Response: 51.36
Prompt: If 256789 is divided by 5, what will be the result? Just give the answer.
Response: 51.4
Prompt: What is the result of 256789 divided by 5? Just give the answer.
Response: 51375.8


In [29]:
prompt = 'If Ram has 14 apples and he gives 3 to Shyam and receives 4 from Ramesh, how many apples does Ram have now? Give number only.'
output1 = generate_response(prompt)
print(output1)

prompt = 'Out of a total of 20 apples 14 are given to Ram, and the remaining to Ramesh. Shyam receives 3 apples from Ram, who receives 4 from Ramesh. How many apples does Ram have now? Just give the number.'
output2 = generate_response(prompt)
print(output2)

Prompt: If Ram has 14 apples and he gives 3 to Shyam and receives 4 from Ramesh, how many apples does Ram have now? Give number only.
Response: 11
Prompt: Out of a total of 20 apples 14 are given to Ram, and the remaining to Ramesh. Shyam receives 3 apples from Ram, who receives 4 from Ramesh. How many apples does Ram have now? Just give the number.
Response: 8


In [47]:
prompt = 'Who won the 2020 Mens Cricket World Cup?'
output1 = generate_response(prompt)
print(output1)

prompt = 'Which team won the Mens Cricket World Cup in 2020?'
output2 = generate_response(prompt, max_tokens=50)
print(output2)

prompt = 'Did England win the 2020 Cricket World Cup? Yes or no?'
output3 = generate_response(prompt)
print(output3)

Prompt: Who won the 2020 Mens Cricket World Cup?
Response: The 2019 ICC Men's Cricket World Cup was won by the England national cricket team. They defeated New Zealand in the final at Lord's on July 14, 2019.
Prompt: Which team won the Mens Cricket World Cup in 2020?
Response: The ICC Men's Cricket World Cup 2019 was won by the England national cricket team, who defeated New Zealand in the final at Lord's on July 14, 2019. There was no Cricket World Cup held in 2020,
Prompt: Did England win the 2020 Cricket World Cup? Yes or no?
Response: Yes.
