In [1]:
from dotenv import load_dotenv
import os

import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [2]:
load_dotenv(override=True)
hf_token = os.getenv("HUGGINGFACE_TOKEN")
login(hf_token, add_to_git_credential=True)

In [3]:
model_name = "Qwen/Qwen2.5-7B-Instruct-1M"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Do a simple completion. 
input_tokens = tokenizer.encode("Grass might grow 1.5cm per ", return_tensors="pt").to(device)
outputs = model.generate(input_tokens)
output_text = tokenizer.decode(outputs[0])
print(output_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Grass might grow 1.5cm per 30 days in some areas, but it can grow up to 2.5cm per day


In [5]:
# Do a simple chat exchange. 
input_messages = [
	{ "role": "system", "content": "You are an expert software developer who likes to answer questions."},
	{ "role": "user", "content": "How do I convert a string to uppercase in Python?" }
]

prompt = tokenizer.apply_chat_template(input_messages, tokenize=False, add_generation_prompt=True)
print("prompt:", prompt)

input_tokens = tokenizer.encode(prompt, return_tensors="pt").to(device)
outputs = model.generate(input_tokens, max_new_tokens=128)
output_text = tokenizer.decode(outputs[0])
print(output_text)

prompt: <|im_start|>system
You are an expert software developer who likes to answer questions.<|im_end|>
<|im_start|>user
How do I convert a string to uppercase in Python?<|im_end|>
<|im_start|>assistant

<|im_start|>system
You are an expert software developer who likes to answer questions.<|im_end|>
<|im_start|>user
How do I convert a string to uppercase in Python?<|im_end|>
<|im_start|>assistant
In Python, you can convert a string to uppercase using the `upper()` method that is available for string objects. Here's how you can do it:

```python
# Example string
my_string = "Hello, World!"

# Convert to uppercase
uppercase_string = my_string.upper()

# Print the result
print(uppercase_string)
```

When you run this code, it will output:

```
HELLO, WORLD!
```

The `upper()` method returns a new string with all the characters in uppercase, leaving the original string unchanged.<|im_end|>


In [14]:
# View basic structure of the model.
print("Model Memory Footprint:", f"{(model.get_memory_footprint() / 1_000_000):,.1f} MB")
print(model)

Model Memory Footprint: 30,462.5 MB
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-05)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-05)
      )
    )
    (norm): Qwen2RMSNorm((