<a href="https://colab.research.google.com/github/Sounakray2003/Asmadiya-tech/blob/main/Token_count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio transformers torch



In [None]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

# -------------------------------------------------
# 1. Load model
# -------------------------------------------------
# Hugging Face checkpoint id. The repo is gated – run `huggingface-cli login` first.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision when CUDA is available, full precision otherwise.
if torch.cuda.is_available():
    _weights_dtype = torch.float16
else:
    _weights_dtype = torch.float32

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=_weights_dtype,
    device_map="auto",
)

# Fall back to the end-of-sequence token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------------------------------
# 2. Globals
# -------------------------------------------------
# System message placed in front of every conversation.
SYSTEM_PROMPT = "You are a helpful assistant."

# Running transcript: a list of {"role": ..., "content": ...} dicts.
conversation = []

# Per-turn token counts.
all_input_counts = []    # content-only input token count
all_output_counts = []   # raw generated token count

# Per-turn token strings for display (the "Ġ" marker is removed).
all_input_strings = []
all_output_strings = []

# Per-turn raw token strings (the "Ġ" marker is kept) – used for decoding.
all_input_raw = []
all_output_raw = []

# Token strings that are left out of the display lists.
UNNECESSARY = {
    # empty / newline
    '',
    '\n',
    # punctuation and word fragments
    '<',
    '|',
    '>',
    '_',
    'system',
    'user',
    'ass',
    'istant',
    'im',
    'start',
    'end',
    # special tokens
    '<|begin_of_text|>',
    '<|end_of_text|>',
    '<|eot_id|>',
    '<|start_header_id|>',
    '<|end_header_id|>',
}

# -------------------------------------------------
# 3. Chat function
# -------------------------------------------------
def _display_tokens(raw_tokens):
    """Return the display form of *raw_tokens*.

    Tokens listed in ``UNNECESSARY`` are dropped and the ``Ġ`` marker is
    removed from the remaining ones.
    """
    # The former extra check `t.strip() != "\n"` could never be false
    # (strip() removes newlines), so it is omitted without changing results.
    return [t.replace("Ġ", "") for t in raw_tokens if t not in UNNECESSARY]


def _safe_decode(token_list):
    """Decode raw token strings back to text; return "" when that is not possible."""
    if not token_list:
        return ""
    ids = tokenizer.convert_tokens_to_ids(token_list)
    if None in ids:
        return ""
    return tokenizer.decode(ids, skip_special_tokens=True).strip()


def _render_outputs(history):
    """Build the 8-tuple of values expected by the Gradio output components."""
    decoded_inputs = [_safe_decode(raw) for raw in all_input_raw]
    decoded_outputs = [_safe_decode(raw) for raw in all_output_raw]
    return (
        "", history,
        f"{all_input_counts}",
        f"{all_output_counts}",
        f"{all_input_strings}",
        f"{decoded_inputs}",
        f"{all_output_strings}",
        f"{decoded_outputs}"
    )


def chat(user_input: str, history: list):
    """Generate a reply to *user_input* and update the token statistics.

    Args:
        user_input: Text typed by the user.
        history: Current Gradio chatbot history as a list of
            ``(user, assistant)`` tuples, or ``None``.

    Returns:
        An 8-tuple: an empty string (clears the textbox), the updated history,
        and the string forms of the input counts, output counts, input display
        tokens, decoded inputs, output display tokens and decoded outputs
        across all recorded turns.

    Blank input is ignored: the current state is returned and the model is
    not called. The conversation and statistics are only updated after
    generation has succeeded, so a failed ``generate`` call does not leave a
    dangling user message or input/output lists of different lengths.

    NOTE(review): all state lives in module-level globals, so it appears to be
    shared by every browser session connected to this app – confirm that this
    is intended.
    """
    global conversation

    history = history or []

    # Nothing to send: keep the UI in sync without invoking the model.
    if not user_input or not user_input.strip():
        return _render_outputs(history)

    # ---- 1. content-only text (system + previous messages + current) ----
    content = " ".join(
        [SYSTEM_PROMPT, *(m["content"] for m in conversation), user_input]
    )

    # ---- 2. tokenise content-only text (no chat template) ----
    raw_tokens = tokenizer.convert_ids_to_tokens(tokenizer(content).input_ids)  # keep Ġ
    clean_tokens = _display_tokens(raw_tokens)

    # ---- 3. build the chat-template prompt used for generation ----
    user_msg = {"role": "user", "content": user_input}
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}, *conversation, user_msg]
    full_prompt = tokenizer.apply_chat_template(
        msgs, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already contains the special tokens, so they must
    # not be added a second time here (this would duplicate the BOS token).
    inputs = tokenizer(
        full_prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)

    # ---- 4. generate (input ids and attention mask are both passed) ----
    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # ---- 5. keep only the newly generated tokens ----
    prompt_len = inputs.input_ids.shape[1]
    new_ids = out_ids[0][prompt_len:].tolist()
    reply = tokenizer.decode(new_ids, skip_special_tokens=True).strip()

    out_raw = tokenizer.convert_ids_to_tokens(new_ids)
    out_clean = _display_tokens(out_raw)

    # ---- 6. commit input and output statistics together ----
    all_input_counts.append(len(clean_tokens))
    all_input_strings.append(clean_tokens)
    all_input_raw.append(raw_tokens)

    all_output_counts.append(len(new_ids))      # raw count, nothing filtered
    all_output_strings.append(out_clean)
    all_output_raw.append(out_raw)

    # ---- 7. update the conversation (keep only the last 20 messages) ----
    conversation.append(user_msg)
    conversation.append({"role": "assistant", "content": reply})
    if len(conversation) > 20:
        conversation = conversation[-20:]

    # ---- 8. Gradio history + rendered statistics ----
    history.append((user_input, reply))
    return _render_outputs(history)

# -------------------------------------------------
# 4. Clear
# -------------------------------------------------
def clear_chat():
    """Reset the conversation and all per-turn token statistics.

    Returns:
        An 8-tuple of reset values for the Gradio outputs: an empty chatbot
        history, an empty textbox string, and "[]" for each of the six
        statistics panels.
    """
    global conversation, all_input_counts, all_output_counts
    global all_input_strings, all_output_strings, all_input_raw, all_output_raw

    conversation = []
    # Each global gets its OWN list. A chained assignment (`a = b = []`) would
    # bind both names to one list, so later input and output appends would end
    # up interleaved in the same object.
    all_input_counts, all_output_counts = [], []
    all_input_strings, all_output_strings = [], []
    all_input_raw, all_output_raw = [], []

    return [], "", "[]", "[]", "[]", "[]", "[]", "[]"

# -------------------------------------------------
# 5. UI
# -------------------------------------------------
with gr.Blocks(title="Llama-3.2-1B – Clean Tokens") as demo:
    # Page header describing what the statistics panels show.
    gr.Markdown(
        "# Llama-3.2-1B-Instruct Chat\n"
        "**Input Tokens** = system + history + user (content only, no tags).\n"
        "`Ġ` is stripped **only** for the token-string display."
    )

    with gr.Row():
        # Left column: the chat window and the message box.
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=500)
            txt = gr.Textbox(placeholder="Type and press Enter…", label="Message")

        # Right column: one caption plus one value panel per statistic.
        with gr.Column(scale=1):
            gr.Markdown("**Input count (content-only):**")
            i_cnt = gr.Markdown("[]")

            gr.Markdown("**Output count (raw):**")
            o_cnt = gr.Markdown("[]")

            gr.Markdown("**Input tokens (no Ġ):**")
            i_str = gr.Markdown("[]")

            gr.Markdown("**Decoded input:**")
            i_dec = gr.Markdown("[]")

            gr.Markdown("**Output tokens (no Ġ):**")
            o_str = gr.Markdown("[]")

            gr.Markdown("**Decoded output:**")
            o_dec = gr.Markdown("[]")

    # The six statistics panels, in the order both callbacks return them.
    stat_panels = [i_cnt, o_cnt, i_str, i_dec, o_str, o_dec]

    # "Clear" resets the chat window, the textbox and every panel.
    clear_button = gr.Button("Clear")
    clear_button.click(
        fn=clear_chat,
        inputs=None,
        outputs=[chatbot, txt, *stat_panels],
    )

    # Pressing Enter in the textbox sends the message through chat().
    txt.submit(
        fn=chat,
        inputs=[txt, chatbot],
        outputs=[txt, chatbot, *stat_panels],
    )

# -------------------------------------------------
# 6. Launch
# -------------------------------------------------
if __name__ == "__main__":
    # Start the app with debug output enabled and without a public share link.
    launch_options = {"debug": True, "share": False}
    demo.launch(**launch_options)

  chatbot = gr.Chatbot(height=500)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>