In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
from pathlib import Path
import pandas as pd

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [3]:
base = Path("/home/vicente/Github/BDLab-Agent/backend/data/LLMs/Phi-3-4B")
Phi_3 = AutoModelForCausalLM.from_pretrained(
    str(base),
    dtype=torch.bfloat16,
    device_map="auto",
    local_files_only=True)
Phi_3_tokenizer = AutoTokenizer.from_pretrained(str(base),local_files_only=True)
Phi_3.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
!nvidia-smi

Thu Sep 18 16:33:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX 4000 Ada Gene...    Off |   00000000:01:00.0 Off |                  Off |
| 30%   40C    P2             30W /  130W |    7492MiB /  20475MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Quadro RTX 4000                Off |   00

In [11]:
print(Phi_3_tokenizer.chat_template)

{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'user' %}{{'<|user|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
' + message['content'] + '<|end|>
'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
' }}{% else %}{{ eos_token }}{% endif %}


You’re right—sorry for the extra. Here’s just the roles explanation for your Phi-3 `chat_template`.

### Roles Phi-3 expects

* **system**: Sets global behavior/instructions (tone, rules, persona). Usually one message at the start.
* **user**: What the human (or “other side”) says/asks each turn.
* **assistant**: What the model (Phi-3) previously replied. Including these lets the model “see” its own past messages and stay consistent.

### How your template uses them

For each message it emits:

* `<|system|>\n{content}\n<|end|>\n`
* `<|user|>\n{content}\n<|end|>\n`
* `<|assistant|>\n{content}\n<|end|>\n`

After all messages:

* If `add_generation_prompt=True`, it appends `<|assistant|>\n`
  → This tells the model: “now produce the next assistant reply.”
* Otherwise it appends the EOS token and won’t generate a reply.

### Why “assistant” matters

* It’s the only way to show the model its **own prior outputs** in structured form.
* Without prior `assistant` turns, the model lacks context about what it already said—leading to repeats or restarts.

That’s it: Phi-3 recognizes exactly these three roles in your template, and the final `<|assistant|>` tag is the cue for the next response.


In [None]:
def ask_phi3(model, tokenizer, system_prompt, user_prompt):
    # list of messages in the required format
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # padding token to prevent warnings
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # chat template and tokenize the input
    prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # the response from the model
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        eos_token_id=tokenizer.eos_token_id,
        #do_sample=True,
        #temperature=0.6,
        #top_p=0.9
    )

    # Decode and return only the new response text
    response_ids = outputs[0][inputs.input_ids.shape[-1]:]
    return tokenizer.decode(response_ids, skip_special_tokens=True)

In [None]:
sys_prompt = "You are a helpful assistant with weird comments"
usr_promt = "Explain the concept of Agentic AI vs LLMs"

reponse = ask_phi3(Phi_3,Phi_3_tokenizer, system_prompt=sys_prompt, user_prompt=usr_promt)

In [None]:
print(reponse)

In [6]:
sys_prompt2 = "You a college professor at CMU"
usr_promt2 = "Explain the class rubric and syllabus for a new Agentic AI course "

reponse2 = ask_phi3(Phi_3,Phi_3_tokenizer, system_prompt=sys_prompt2, user_prompt=usr_promt2)

In [7]:
print(reponse2)

As a college professor at CMU, I have designed a comprehensive syllabus and rubric for the new Agentic AI course. The course aims to provide students with a deep understanding of the principles, applications, and ethical considerations of agentic AI.

Course Title: Agentic AI: Principles, Applications, and Ethics

Course Description:
This course explores the concept of agentic AI, which refers to AI systems that can act autonomously and make decisions without human intervention. Students will learn about the underlying principles of ag


In [8]:
Phi3_second = AutoModelForCausalLM.from_pretrained(
    str(base),
    dtype=torch.bfloat16,
    device_map="auto",
    local_files_only =True
)
Phi3_second_tokenizer = AutoTokenizer.from_pretrained(str(base), local_files_only=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### 1 - What `ask_phi3_chat(...)` actually sends

* You pass a `messages` list with roles `system`, `user`, and later `assistant`.
* The tokenizer’s `chat_template` wraps each message like:

  * `<|system|>\n...<|end|>\n`
  * `<|user|>\n...<|end|>\n`
  * `<|assistant|>\n...<|end|>\n`
* Because you call `apply_chat_template(..., add_generation_prompt=True)`, the template **adds one more** `<|assistant|>\n` at the end. That tag says: “Now it’s the assistant’s turn—generate the next reply.”

So every call has the shape:

```
<|system|> ... <|end|>
<|user|>   ... <|end|>
[zero or more history turns...]
<|assistant|> ... <|end|>   # (only if you included prior assistant replies)
<|user|>   ... <|end|>
<|assistant|>                # <-- model starts generating here
```

### 2 - How your debate loop uses the roles

* `alice_msgs = [{"role":"system", ...}]`: sets global rules for Alice.
  This is where you impose “3–5 sentences, rebuttal, end with a question.”
* You append a **user** message: “Start with your stance…”
  → Alice’s model sees `<|system|>...<|end|>\n<|user|>Start...<|end|>\n<|assistant|>` and produces Alice’s **first** assistant reply.
* You then append that reply to history as `{"role":"assistant", "content": alice_text}`.
  This is crucial: it lets Alice “remember” what she just said on later turns.

Vicente is identical but with his own `system` message. For Vicente’s first **user** message you include Alice’s last text (“Alice said: ... Your rebuttal:”). That turns Alice’s text into *Vicente’s input* (a `user` turn from Vicente’s perspective), so Vicente replies appropriately.

### 3 - Why two separate histories?

* You keep `alice_msgs` and `vicente_msgs` separate because each model plays a different persona.
* Each history contains:

  * its **own** `system` rules (persona + debate constraints), and
  * alternating turns where we (the orchestrating script) feed in the opponent’s last message as a **user** message.
* After each generation, you append the model’s output to its own history as an **assistant** message so it has memory of its prior stance, tone, and claims.

(You *could* run a single shared history, but with two distinct models/voices it’s simpler and cleaner—no risk of the model responding as the wrong speaker.)

### 4 - How a single turn looks (concretely)
First Alice call (abridged):

```
<|system|>
You are Alice. Debate rules: ...
<|end|>
<|user|>
Start with your stance in 2-3 sentences. End with a question.
<|end|>
<|assistant|>
# model generates Alice’s opening
```

First Vicente call (abridged):

```
<|system|>
You are Vicente. Debate rules: ...
<|end|>
<|user|>
Alice said:
""" <Alice’s opening> """
Your rebuttal:
<|end|>
<|assistant|>
# model generates Vicente’s rebuttal
```

Next Alice turn:

* `alice_msgs` already has:

  * `system` (rules)
  * prior `user` (opening prompt)
  * prior `assistant` (Alice’s opening)
* You append a new **user** message containing Vicente’s last reply:

```
<|user|>
Vicente said:
""" <Vicente’s reply> """
Your rebuttal:
<|end|>
<|assistant|>
# model generates Alice’s rebuttal to that
```

### 5 - Why this fixes your original loop

* Before, you weren’t feeding back **assistant** history, so the model restarted the same intro each time.
* Now, each side “sees” what it said (assistant history) and what the other side just said (new user turn), so it naturally moves forward.

### 6 - Small tips (still using your function)

* Keep `max_new_tokens` tight (your 140 is fine) to enforce brief turns.
* If you ever see the model adding speaker labels, you can ask it not to in the `system` rules (“Do not write speaker labels.”).
* If turns get long, you can trim what you quote from the opponent in the **user** message (e.g., only last \~800–1000 chars) to keep prompts lean.

That’s the whole mapping: **system** = global rules/persona, **user** = the opponent’s latest message (what this speaker should respond to), **assistant** = the speaker’s previous replies so they have memory. The final `<|assistant|>` tag (added by `add_generation_prompt=True`) is the cue for the next answer.


In [10]:
def ask_phi3_chat(model, tokenizer, messages):
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=140, eos_token_id=tokenizer.eos_token_id)
    response_ids = outputs[0][inputs.input_ids.shape[-1]:]
    return tokenizer.decode(response_ids, skip_special_tokens=True).strip()


In [12]:
topic = "Mars vs. Europa as a realistic second home for Earth"

base_rules = f"""Debate rules:
- Topic: {topic}
- 3-5 sentences, no greetings
- Rebut the opponent's last point
- End with a question
"""

alice_msgs   = [{"role":"system","content":"You are Alice. " + base_rules}]
vicente_msgs = [{"role":"system","content":"You are Vicente. " + base_rules}]

# opening statements
alice_msgs.append({"role":"user","content":"Start with your stance in 2-3 sentences. End with a question."})
alice_text = ask_phi3_chat(Phi_3, Phi_3_tokenizer, alice_msgs)
alice_msgs.append({"role":"assistant","content":alice_text})

vicente_msgs.append({"role":"user","content":f"Alice said:\n\"\"\"\n{alice_text}\n\"\"\"\nYour rebuttal:"})
vicente_text = ask_phi3_chat(Phi3_second, Phi3_second_tokenizer, vicente_msgs)
vicente_msgs.append({"role":"assistant","content":vicente_text})

print("Convo Start")
print("Alice:   ", alice_text)
print("Vicente: ", vicente_text)

for _ in range(10):
    alice_msgs.append({"role":"user","content":f"Vicente said:\n\"\"\"\n{vicente_text}\n\"\"\"\nYour rebuttal:"})
    alice_text = ask_phi3_chat(Phi_3, Phi_3_tokenizer, alice_msgs)
    alice_msgs.append({"role":"assistant","content":alice_text})
    print("Alice:   ", alice_text)

    vicente_msgs.append({"role":"user","content":f"Alice said:\n\"\"\"\n{alice_text}\n\"\"\"\nYour rebuttal:"})
    vicente_text = ask_phi3_chat(Phi3_second, Phi3_second_tokenizer, vicente_msgs)
    vicente_msgs.append({"role":"assistant","content":vicente_text})
    print("Vicente: ", vicente_text)

Convo Start
Alice:    Mars has been a primary candidate for human colonization due to its proximity and the presence of water ice. While Europa's subsurface ocean is intriguing, the technological challenges and risks of radiation exposure make Mars a more feasible option for establishing a second home for humanity. How do you justify the higher risks associated with Europa's harsh environment compared to the more accessible Mars? Europa's potential for life and the abundance of water ice make it a compelling choice for a second home. The technological advancements required to explore and potentially colonize Europa could drive innovation and provide a unique opportunity for humanity.
Vicente:  While Alice highlights the potential for life and water ice on Europa, the current technological limitations and the extreme radiation environment present significant risks that cannot be overlooked. Mars, on the other hand, offers a more immediate and practical opportunity for human colonization