In [2]:
# Standard library
import base64
import io
import json
import os

# Third-party
import gradio as gr
import IPython.display
import requests
from dotenv import load_dotenv, find_dotenv
from PIL import Image

# NOTE(review): requests does not appear to read this attribute, so it most
# likely has no effect on request timeouts - pass `timeout=` on each call instead.
requests.adapters.DEFAULT_TIMEOUT = 60

# Load the .env file BEFORE reading the key: on a fresh kernel the variable is
# not in the process environment yet and the lookup below would raise KeyError.
load_dotenv(find_dotenv(), override=True)
hf_api_key = os.environ['HF_API_KEY']

In [3]:
# Load variables from the .env file located by find_dotenv(); override=True lets
# values from the file take precedence over variables already set in the environment.
load_dotenv(find_dotenv(), override=True)

True

## Building An App To Chat With Any LLM

Here we'll be using an [Inference Endpoint](https://huggingface.co/inference-endpoints) for `falcon-40b-instruct`, the best-ranking open-source LLM on the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) at the time of writing.

In [7]:
import requests

# Inference Endpoint that receives the generation requests.
API_URL = "https://pgwpq7ijn5o78468.us-east-1.aws.endpoints.huggingface.cloud"

# The access token is read from the environment (populated from .env by
# load_dotenv) instead of being committed to the notebook. A missing
# HF_API_KEY raises KeyError here rather than failing later with an auth error.
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {os.environ['HF_API_KEY']}",
    "Content-Type": "application/json",
}

def query(payload, timeout=60):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    Args:
        payload: JSON-serializable request body (here: "inputs" and "parameters").
        timeout: Seconds to wait for the endpoint. requests waits indefinitely
            when no timeout is given, which would hang the cell.

    Returns:
        The response body parsed from JSON.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

# Send a short prompt with an empty parameters object and keep the response.
output = query(dict(
    inputs="Can you please let us know more details about your ",
    parameters={},
))

In [8]:
# Display the response stored by the previous cell.
output

[{'generated_text': '"problem"?'}]

In [9]:
import requests

# Inference Endpoint that receives the generation requests.
API_URL = "https://pgwpq7ijn5o78468.us-east-1.aws.endpoints.huggingface.cloud"

# The access token is read from the environment (populated from .env by
# load_dotenv) instead of being committed to the notebook. A missing
# HF_API_KEY raises KeyError here rather than failing later with an auth error.
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {os.environ['HF_API_KEY']}",
    "Content-Type": "application/json",
}

def query(payload, timeout=60):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    Args:
        payload: JSON-serializable request body (here: "inputs" and "parameters").
        timeout: Seconds to wait for the endpoint. requests waits indefinitely
            when no timeout is given, which would hang the cell.

    Returns:
        The response body parsed from JSON.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

# Model identifier that is forwarded inside the request parameters.
# NOTE(review): API_URL points at one fixed endpoint, so it is unclear whether
# the "model" parameter has any effect there - confirm before relying on it.
model_name = "tiiuae/falcon-40b-instruct"

# Ask a single question, allowing up to 256 newly generated tokens.
output = query(dict(
    inputs="Has math been invented or discovered?",
    parameters=dict(model=model_name, max_new_tokens=256),
))

print(output)


[{'generated_text': '\nMath has been both invented and discovered. It is a human invention in the sense that it is a system of rules and concepts that we have created to help us understand the world around us. However, it is also a discovery in the sense that it is a fundamental aspect of the universe that we have uncovered through observation and experimentation.'}]


In [16]:
import gradio as gr
import requests

# Endpoint configuration and the helper that talks to the Hugging Face API.
API_URL = "https://pgwpq7ijn5o78468.us-east-1.aws.endpoints.huggingface.cloud"

# The access token is read from the environment (populated from .env by
# load_dotenv) instead of being committed to the notebook. A missing
# HF_API_KEY raises KeyError here rather than failing later with an auth error.
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {os.environ['HF_API_KEY']}",
    "Content-Type": "application/json",
}

def query(payload, timeout=60):
    """POST `payload` as JSON to API_URL and return the decoded response.

    Args:
        payload: JSON-serializable request body (here: "inputs" and "parameters").
        timeout: Seconds to wait for the endpoint. requests waits indefinitely
            when no timeout is given, which would freeze the UI callback.

    Returns:
        The response body parsed from JSON. When the body is not valid JSON,
        a dict with a single "error" key describing the HTTP response is
        returned instead, so callers can report it rather than crash.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    try:
        response_json = response.json()
    except ValueError:
        # Body was not JSON; keep a short excerpt for debugging.
        response_json = {"error": f"HTTP {response.status_code}: {response.text[:200]}"}
    print("API Response:", response_json)  # Print the API response for debugging
    return response_json

# Callback used by the Gradio interface.
def generate(input_text, slider_value):
    """Generate a completion for `input_text` through `query`.

    Args:
        input_text: Prompt sent as the request's "inputs".
        slider_value: Maximum number of new tokens. A slider may deliver this
            as a float, so it is converted to an int before being sent.

    Returns:
        The "generated_text" of the first item in the response, or a string
        starting with "Error in generation: " followed by the raw response
        when the response does not have that shape.
    """
    payload = {
        "inputs": input_text,
        "parameters": {"max_new_tokens": int(slider_value)},
    }
    output = query(payload)

    # A successful response is a non-empty list whose first item is a dict
    # holding "generated_text" (see the outputs printed earlier in the notebook).
    first = output[0] if isinstance(output, list) and output else None
    if isinstance(first, dict) and "generated_text" in first:
        return first["generated_text"]

    # Anything else is shown to the user verbatim to ease debugging.
    return f"Error in generation: {output}"

# Build the single-turn demo: a prompt box and a token-budget slider feed
# `generate`, whose return value is shown in the "Completion" box.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Type your prompt here..."),
        # step=1 makes every whole number selectable; without an explicit step
        # the slider derives a coarser one from the 1..1024 range.
        gr.Slider(label="Max new tokens", value=20, maximum=1024, minimum=1, step=1),
    ],
    outputs=gr.Textbox(label="Completion"),
)

# Close any existing Gradio interfaces and launch the new one
gr.close_all()
demo.launch()

Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




API Response: [{'generated_text': '\nArtificial intelligence (AI) is the simulation of human intelligence processes by computer systems. These processes include learning (the acquisition of information and rules for using the information), reasoning (using rules to reach approximate or definite conclusions) and self-correction.'}]


## `gr.Chatbot()`

- `gr.Chatbot()` allows you to save the chat history (between the user and the LLM) as well as display the dialogue in the app.
- Define your `fn` to take in a `gr.Chatbot()` object.  
  - Within your defined `fn` function, append a tuple (or a list) containing the user message and the LLM's response:
`chatbot_object.append( (user_message, llm_message) )`

- Include the chatbot object in both the inputs and the outputs of the app.

In [17]:
import random

def respond(message, chat_history):
    """Answer `message` with a randomly chosen canned reply (no LLM involved).

    The (message, reply) pair is appended to `chat_history` in place.

    Returns:
        A tuple of an empty string (used to clear the prompt box) and the
        same `chat_history` list, now containing the new turn.
    """
    canned_replies = (
        "Tell me more about it",
        "Cool, but I'm not interested",
        "Hmmmm, ok then",
    )
    reply = random.choice(canned_replies)
    turn = (message, reply)
    chat_history.append(turn)
    return "", chat_history

# Chat UI: components are created in the order they should appear on the page.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240) # small height, just to fit the notebook
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    # Button labelled "Clear console", attached to the prompt box and the chatbot.
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # Both triggers call `respond` with (prompt, history) and write its two
    # return values back to the prompt box and the chatbot.
    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot]) # Press enter to submit

# Close servers left over from earlier cells, then start this demo.
gr.close_all()
demo.launch()

Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




#### Format the prompt with the chat history

- You can iterate through the chatbot object with a for loop.
- Each item is a tuple containing the user message and the LLM's message.

```Python
for turn in chat_history:
    user_msg, bot_msg = turn
    ...
```

In [19]:
import gradio as gr
import requests

# Endpoint configuration and the helper that talks to the Hugging Face API.
API_URL = "https://pgwpq7ijn5o78468.us-east-1.aws.endpoints.huggingface.cloud"

# The access token is read from the environment (populated from .env by
# load_dotenv) instead of being committed to the notebook. A missing
# HF_API_KEY raises KeyError here rather than failing later with an auth error.
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {os.environ['HF_API_KEY']}",
    "Content-Type": "application/json",
}

def query(payload, timeout=60):
    """POST `payload` as JSON to API_URL and return the decoded response.

    Args:
        payload: JSON-serializable request body (here: "inputs" and "parameters").
        timeout: Seconds to wait for the endpoint. requests waits indefinitely
            when no timeout is given, which would freeze the UI callback.

    Returns:
        The response body parsed from JSON. When the body is not valid JSON,
        a dict with a single "error" key describing the HTTP response is
        returned instead, so callers can report it rather than crash.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    try:
        response_json = response.json()
    except ValueError:
        # Body was not JSON; keep a short excerpt for debugging.
        response_json = {"error": f"HTTP {response.status_code}: {response.text[:200]}"}
    print("API Response:", response_json)  # Print the API response for debugging
    return response_json

# Text-generation helper used by the chat callback.
def generate(input_text, max_new_tokens=1024, stop_sequences=None):
    """Generate a completion for `input_text` through `query`.

    Args:
        input_text: Prompt sent as the request's "inputs".
        max_new_tokens: Maximum number of new tokens; converted to an int
            because UI components may deliver numbers as floats.
        stop_sequences: Optional list of strings sent as "stop_sequences".
            Empty strings are dropped because they cannot mark a meaningful
            stopping point; the key is omitted when nothing remains.

    Returns:
        The "generated_text" of the first item in the response, or a string
        starting with "Error in generation: " followed by the raw response
        when the response does not have that shape.
    """
    payload = {
        "inputs": input_text,
        "parameters": {"max_new_tokens": int(max_new_tokens)},
    }
    usable_stops = [stop for stop in (stop_sequences or []) if stop]
    if usable_stops:
        payload["parameters"]["stop_sequences"] = usable_stops

    output = query(payload)

    # A successful response is a non-empty list whose first item is a dict
    # holding "generated_text" (see the outputs printed earlier in the notebook).
    first = output[0] if isinstance(output, list) and output else None
    if isinstance(first, dict) and "generated_text" in first:
        return first["generated_text"]

    # Include the raw response so failures can be diagnosed from the UI.
    return f"Error in generation: {output}"

def format_chat_prompt(message, chat_history):
    """Render the conversation so far plus the new message as one prompt.

    Every past turn contributes "\\nUser: <user>\\nAssistant: <bot>"; the new
    message is appended as "\\nUser: <message>\\nAssistant:" so that the model
    continues with the assistant's reply.

    Args:
        message: The user's new message.
        chat_history: Iterable of (user_message, bot_message) pairs.

    Returns:
        The assembled prompt string.
    """
    past_turns = [
        f"\nUser: {asked}\nAssistant: {answered}"
        for asked, answered in chat_history
    ]
    return "".join(past_turns) + f"\nUser: {message}\nAssistant:"

def respond(message, chat_history):
    """Chat callback: answer `message` using the conversation so far.

    Builds a prompt from `chat_history` and `message`, asks `generate` for the
    reply, and appends the (message, reply) pair to `chat_history` in place.

    Returns:
        A tuple of an empty string (used to clear the prompt box) and the
        updated `chat_history`.
    """
    formatted_prompt = format_chat_prompt(message, chat_history)
    # Only "\nUser:" is passed as a stop sequence - presumably to keep the model
    # from writing the user's next turn. The empty string that used to accompany
    # it was removed: an empty stop sequence cannot mark a stopping point.
    bot_message = generate(formatted_prompt, stop_sequences=["\nUser:"])
    chat_history.append((message, bot_message))
    return "", chat_history

# Chat UI: components are created in the order they should appear on the page.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # The chatbot itself carries the history in and out of `respond`. A separate
    # gr.State was used here before, but it was never an output and was not
    # cleared by the button above, so it could drift from what the chatbot shows.
    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])  # Press enter to submit

# Close servers left over from earlier cells, then start this demo.
gr.close_all()
demo.launch()

Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Running on local URL:  http://127.0.0.1:7865



To create a public link, set `share=True` in `launch()`.




API Response: [{'generated_text': " I'm sorry, but as an AI language model, I don't have a personal belief system or a definitive answer to that question. However, many philosophers and religious leaders have proposed different theories and beliefs about the meaning of life."}]


### Adding Other Advanced Features

In [20]:
def format_chat_prompt(message, chat_history, instruction):
    """Render a system instruction, the conversation so far and the new message.

    The prompt starts with "System:<instruction>"; every past turn adds
    "\\nUser: <user>\\nAssistant: <bot>"; the new message is appended as
    "\\nUser: <message>\\nAssistant:" so that the model continues with the
    assistant's reply.

    Args:
        message: The user's new message.
        chat_history: Iterable of (user_message, bot_message) pairs.
        instruction: System message placed at the top of the prompt.

    Returns:
        The assembled prompt string.
    """
    pieces = [f"System:{instruction}"]
    pieces.extend(
        f"\nUser: {asked}\nAssistant: {answered}"
        for asked, answered in chat_history
    )
    pieces.append(f"\nUser: {message}\nAssistant:")
    return "".join(pieces)

### Streaming

- If your LLM can provide its tokens one at a time in a stream, you can accumulate those tokens in the chatbot object.
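- A streaming `respond` function uses a `for` loop to go through all the tokens in the stream and append each one to the most recent conversational turn in the chatbot's message history. The `respond` function below does not stream: it sends a single request and stores the complete reply in the most recent turn.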

In [22]:
def respond(message, chat_history, instruction, temperature=0.7):
    """Chat callback with a system instruction and a sampling temperature.

    Builds the prompt from `instruction`, `chat_history` and `message`, sends
    one (non-streaming) request through `query`, and appends the new turn to
    `chat_history` in place.

    Args:
        message: The user's new message.
        chat_history: List of [user_message, bot_message] turns.
        instruction: System message placed at the top of the prompt.
        temperature: Sampling temperature forwarded to the endpoint.

    Returns:
        A tuple of an empty string (used to clear the prompt box) and the
        updated `chat_history`, matching the two outputs the UI wires to this
        callback. When the response has an unexpected shape, the turn's reply
        is "Error in generation".
    """
    # Format the prompt before the new turn is added to the history,
    # otherwise the message would appear in the prompt twice.
    prompt = format_chat_prompt(message, chat_history, instruction)

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 1024,
            "temperature": temperature,
            # The empty string previously listed here was dropped: an empty
            # stop sequence cannot mark a stopping point.
            "stop_sequences": ["\nUser:"],
        },
    }
    output = query(payload)

    # The endpoint answers with a list of dicts (see the outputs printed earlier
    # in the notebook); a bare dict is still accepted for compatibility.
    candidate = output[0] if isinstance(output, list) and output else output
    if isinstance(candidate, dict) and "generated_text" in candidate:
        bot_message = candidate["generated_text"]
    else:
        bot_message = "Error in generation"

    # Record the turn only once the reply (or the error text) is known, so the
    # history never keeps a turn with an empty reply.
    chat_history.append([message, bot_message])
    return "", chat_history

In [25]:
# Chat UI with advanced options: components are created in display order.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240)  # small height, just to fit the notebook
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # The temperature slider is passed as the fourth input; it used to be left
    # out, so `respond` always ran with its default temperature.
    btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])  # Press enter to submit

# Close servers left over from earlier cells, then start this demo with queuing enabled.
gr.close_all()
demo.queue().launch()

Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Closing server running on port: 7863
Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.




Notice that the cell above uses `demo.queue().launch()` instead of `demo.launch()`. Calling `queue()` places incoming requests in a queue, which helps your demo perform better. You can read [setting up a demo for maximum performance](https://www.gradio.app/guides/setting-up-a-demo-for-maximum-performance) for more details.

In [None]:
# Shut down the Gradio servers started by the cells above.
gr.close_all()