<a href="https://colab.research.google.com/github/cs1090218/conv/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the model (may take 5-10 minutes for a non-tiny model)

In [1]:
%%capture
!pip install gradio

In [2]:
# Download the full model from the hub (used when the local dir doesn't have adapter files)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Exactly one `model_name` assignment below must be active; with all of them
# commented out the loading calls fail with a NameError on a fresh kernel.
# The trailing timings are the average seconds per response measured at the
# end of this notebook.

##### Baseline model for comparison
# model_name = "meta-llama/Llama-3.2-1B-Instruct"                              # 1.50s per response (also too verbose)

##### Finetuned models
# model_name = "shashankverma590/qwen-0_5b-kid-friendly-chatbot"                # 0.56s per response
# model_name = "shashankverma590/llama-3.2-1b-kid-friendly-chatbot"             # 0.42s per response
# model_name = "shashankverma590/tiny-llama-1b-kid-friendly-chatbot-tiny"       # 0.40s per response
# model_name = "shashankverma590/llama-3-1-8b-kid-friendly-chatbot"             # 8.39s per response (0.42s with A100)
model_name = "shashankverma590/llama-3.1-8b-kid-friendly-chatbot"             # 19.23s per response (this is finetuned on instruct base model)

# Fetch the tokenizer and the half-precision weights for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/509 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/909 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]



# ChatBot

In [3]:
from transformers import pipeline

# Template for the system turn; the "{emotion}" placeholder is filled in with
# the mood typed in below.
system_prompt = (
    "You are a helpful chatbot for conversing with kids under the age of 7.\n"
    "You should be empathetic, encouraging and positive minded in general.\n"
    'The current mood of the user is "{emotion}", you should reply accordingly.'
)

# Ask for the mood once, when this cell runs.
emotion = input('Enter the current mood: ')

# The running transcript starts with just the formatted system message.
conversation_history = [
    dict(role="system", content=system_prompt.format(emotion=emotion)),
]

# Text-generation pipeline built around the already-loaded model and tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
def get_chatbot_response(user_input):
    """Generate the assistant's reply to ``user_input`` and record the exchange.

    The user turn is appended to the module-level ``conversation_history``,
    the whole history is rendered with the tokenizer's chat template, and the
    text generated after the prompt is returned (whitespace-stripped) and
    appended to the history as the assistant turn.

    Args:
        user_input: The message typed by the user.

    Returns:
        The assistant's reply as a string.

    Raises:
        Any exception raised by templating or generation is re-raised after
        the just-added user turn has been removed from the history.
    """
    conversation_history.append({"role": "user", "content": user_input})
    try:
        prompt = pipe.tokenizer.apply_chat_template(
            conversation_history, tokenize=False, add_generation_prompt=True
        )

        # Some tokenizers define no pad token; fall back to EOS rather than
        # passing None to generation.
        pad_token_id = pipe.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = pipe.tokenizer.eos_token_id

        # Greedy decoding: sampling knobs (temperature/top_k/top_p) are not
        # passed because they have no effect when do_sample=False.
        outputs = pipe(
            prompt,
            max_new_tokens=256,
            do_sample=False,
            eos_token_id=pipe.tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
        )
    except Exception:
        # Roll back the user turn so a failed request does not leave two
        # consecutive user messages in the history.
        conversation_history.pop()
        raise

    # The pipeline output starts with the prompt; keep only the continuation.
    response = outputs[0]['generated_text'][len(prompt):].strip()
    conversation_history.append({"role": "assistant", "content": response})
    return response

Enter the current mood: happy


In [4]:
import gradio as gr
from datetime import datetime

# Wall-clock duration, in seconds, of every chatbot_interface call.
inference_times = []
def chatbot_interface(user_input, history=None):
    """Gradio callback: answer ``user_input`` and extend the chat history.

    Args:
        user_input: Text submitted from the message box.
        history: List of ``(user_message, bot_response)`` tuples shown in the
            chat widget. ``None`` (the default) starts a fresh list, so no
            list is shared between calls.

    Returns:
        A ``("", history)`` tuple: an empty string to clear the message box
        and the history with the new exchange appended.
    """
    if history is None:
        history = []
    start_time = datetime.now()
    response = get_chatbot_response(user_input)
    end_time = datetime.now()
    inference_times.append((end_time - start_time).total_seconds())
    history.append((user_input, response))
    return "", history

# Assemble the chat UI: a title, the conversation panel and a message box,
# each placed in its own row.
with gr.Blocks() as chat_interface:
    with gr.Row():
        gr.Markdown(value="# Chatbot Interface", elem_id="title")

    with gr.Row():
        # Fixed height
        chatbot = gr.Chatbot(height=275, label="Chatbot Conversation")

    with gr.Row():
        message = gr.Textbox(
            placeholder="Type your message here...",
            label="Your Message",
        )

    # On submit, pass the textbox text and the chat history to
    # chatbot_interface and write its two return values back to the same
    # two components.
    message.submit(
        fn=chatbot_interface,
        inputs=[message, chatbot],
        outputs=[message, chatbot],
    )



In [5]:
# Start the Gradio app. share=True asks for a public share link (the recorded
# output of this cell shows a gradio.live URL).
# NOTE(review): presumably anyone holding that link can use the bot - confirm
# that is intended before sharing it.
chat_interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7af7bd260c27d6fe79.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [11]:
# Exclude the first recorded request (presumably a slower warm-up call) from
# the average. The slice goes into a new name instead of rebinding
# `inference_times`, so re-running this cell does not drop further samples.
steady_state_times = inference_times[1:]
if steady_state_times:
    print (f"Average inference time: {sum(steady_state_times)/len(steady_state_times):.4f} over {len(steady_state_times)} inferences")
else:
    # Avoid a ZeroDivisionError when fewer than two requests were recorded.
    print ("Not enough inference times recorded to compute an average (need at least 2 requests).")

Average inference time: 19.2327 over 9 inferences


In [10]:
# Show the recorded per-request inference times, in seconds.
inference_times

[11.767854,
 14.491291,
 14.368534,
 27.397306,
 32.020001,
 25.048534,
 19.419484,
 9.834452,
 22.862349,
 7.6525]