In [44]:
import os
from huggingface_hub import InferenceClient
from dotenv import load_dotenv

# Load the environment variables
load_dotenv()

## You need a token from https://hf.co/settings/tokens. If you run this on Google Colab, you can set it up in the "settings" tab under "secrets". Make sure to call it "HF_TOKEN"
os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")
# if the outputs for next cells are wrong, the free model may be overloaded. You can also use this public endpoint that contains Llama-3.2-3B-Instruct
# client = InferenceClient("https://jc26mwg228mkj8dw.us-east-1.aws.endpoints.huggingface.cloud")

### Text Generation

In [45]:
prompt="""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
The capital of France is<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
output = client.text_generation(
    prompt,
    max_new_tokens=100,
)

print(output)



...Paris!


### Chat method

In [46]:
output = client.chat.completions.create(
    messages=[
        {"role": "user", "content": "Which is the captial of France?"},
    ],
    stream=False,
    max_tokens=1024,
)
print(output.choices[0].message.content)

The capital of France is Paris.


The chat method is the RECOMMENDED method to use in order to ensure a smooth transition between models, but since this notebook is only educational, we will keep using the **“text_generation”** method to understand the details.

In [47]:
SYSTEM_PROMPT = """
Answer the following questions as best you can. You have access to the following tools:

get_weather: Get the current weather in a given location

The way you use the tools is by specifying a json blob.
Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).

The only values that should be in the "action" field are:
get_weather: Get the current weather in a given location, args: {"location": {"type": "string"}}
example use : 

{{
  "action": "get_weather",
  "action_input": {"location": "New York"}
}}

ALWAYS use the following format:

Question: the input question you must answer
Thought: you should always think about one action to take. Only one action at a time in this format:
Action:

$JSON_BLOB (inside markdown cell)

Observation: the result of the action. This Observation is unique, complete, and the source of truth.
... (this Thought/Action/Observation can repeat N times, you should take several steps when needed. The $JSON_BLOB must be formatted as markdown and only use a SINGLE action at a time.)

You must always end your output with the following format:

Thought: I now know the final answer
Final Answer: the final answer to the original input question

Now begin! Reminder to ALWAYS use the exact characters `Final Answer:` when you provide a definitive answer.
"""

USER_PROMPT = "What's the weather in London ?"

### Option 1

In [48]:
prompt=f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{SYSTEM_PROMPT}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{USER_PROMPT}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

### Option 2

In [49]:
from transformers import AutoTokenizer

messages=[
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "What's the weather in London ?"},
    ]

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [50]:
output = client.text_generation(
    prompt=prompt,
    max_new_tokens=200,
    stop=["Observation:"],
)
print(output)

Question: What's the weather in London?

Action:
```
{
  "action": "get_weather",
  "action_input": {"location": "London"}
}
```

Observation:


In [66]:
# Dummy function
def get_weather(location):
    return f"the weather in {location} is 20°C and sunny. \n"

get_weather("London")

'the weather in London is 20°C and sunny. \n'

In [67]:
new_prompt = prompt+output+get_weather("London")
final_output = client.text_generation(
    prompt=new_prompt,
    max_new_tokens=200,
)
print(final_output)

Final Answer: The final answer is 20°C and sunny.


Así queda el prompt nuevo

In [68]:
print(prompt+output+get_weather("London"))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 23 Feb 2025

Answer the following questions as best you can. You have access to the following tools:

get_weather: Get the current weather in a given location

The way you use the tools is by specifying a json blob.
Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).

The only values that should be in the "action" field are:
get_weather: Get the current weather in a given location, args: {"location": {"type": "string"}}
example use : 

{{
  "action": "get_weather",
  "action_input": {"location": "New York"}
}}

ALWAYS use the following format:

Question: the input question you must answer
Thought: you should always think about one action to take. Only one action at a time in this format:
Action:

$JSON_BLOB (inside markdown cell)

Observation: the result of the action. This Ob