### **Installing the necessary packages:**

In [None]:
!pip install openvino-dev[onnx]
!pip install optimum[openvino]
!pip install transformers
!pip install gradio
!pip install langchain langserve langsmith

### **Model Setup & Loading (with OpenVINO)**

We will load both the chatbot model and sentiment analysis model using **OpenVINO for performance optimization**.

**Chatbot Model:** `togethercomputer/RedPajama-INCITE-Chat-3B-v1`
- Built for natural, multi-turn chat interactions.
- Size: Medium (3 billion parameters), optimized for efficient responses.
- Ideal for conversational AI in resource-constrained environments, offering good performance in understanding and generating contextually relevant replies across various domains.

---


**Sentiment Model:** `distilbert-base-uncased-finetuned-sst-2-english`
- Detects positive or negative sentiment in English.
- Size: Small, efficient version of BERT, with 66 million parameters.
- Fast and accurate for short text sentiment analysis, ideal for quick insights on opinions.





In [2]:
import openvino as ov
from transformers import AutoTokenizer, AutoConfig, DataCollatorWithPadding
from optimum.intel.openvino import OVModelForCausalLM, OVModelForSequenceClassification, OVQuantizer
from pathlib import Path
import warnings

# Suppress deprecation and tracer warnings
# NOTE(review): these filters silence every DeprecationWarning and UserWarning
# for the rest of the session, not only the export noise — consider narrowing.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Define models
model_name_chat = "togethercomputer/RedPajama-INCITE-Chat-3B-v1"
model_name_sentiment = "distilbert-base-uncased-finetuned-sst-2-english"

# Load and convert models to OpenVINO format, no BetterTransformer
# (compile=False: presumably defers device compilation, since these objects are
# only saved and quantized here and are replaced by the reloaded models below.)
ov_model_chat = OVModelForCausalLM.from_pretrained(model_name_chat, export=True, compile=False)
ov_model_sentiment = OVModelForSequenceClassification.from_pretrained(model_name_sentiment, export=True, compile=False)

# Load tokenizers
ov_model_chat_tok = AutoTokenizer.from_pretrained(model_name_chat, trust_remote_code=False)
ov_model_sentiment_tok = AutoTokenizer.from_pretrained(model_name_sentiment, trust_remote_code=False)

# Apply dynamic padding to avoid padding-related issues
# NOTE(review): neither collator is referenced again in the cells visible here.
data_collator_chat = DataCollatorWithPadding(tokenizer=ov_model_chat_tok, padding="longest")
data_collator_sentiment = DataCollatorWithPadding(tokenizer=ov_model_sentiment_tok, padding="longest")

# Save models and tokenizers locally
# Model and tokenizer share one directory per model.
path_model_chat = Path("model_chat")
path_model_sentiment = Path("model_sentiment")
ov_model_chat.save_pretrained(path_model_chat)
ov_model_chat_tok.save_pretrained(path_model_chat)
ov_model_sentiment.save_pretrained(path_model_sentiment)
ov_model_sentiment_tok.save_pretrained(path_model_sentiment)

# Quantize models for optimized CPU performance
# NOTE(review): the recorded run logs "`quantization_config` was not provided" and
# "assuming weight only quantization" for both calls, so `export` and
# `optimization_config` are presumably not recognised by `quantize()` — confirm
# against the optimum-intel documentation and pass an explicit config if so.

# Pass the model object to OVQuantizer
quantizer_chat = OVQuantizer.from_pretrained(ov_model_chat)
quantized_model_chat = quantizer_chat.quantize(
    export=True,
    optimization_config={"approach": "dynamic"},
    save_directory=path_model_chat / "quantized",
)

quantizer_sentiment = OVQuantizer.from_pretrained(ov_model_sentiment)
quantized_model_sentiment = quantizer_sentiment.quantize(
    export=True,
    optimization_config={"approach": "dynamic"},
    save_directory=path_model_sentiment / "quantized",
)

# Reload quantized models after saving
# This rebinds ov_model_chat / ov_model_sentiment: from here on the names refer
# to the models loaded from the "quantized" sub-directories, targeting the CPU device.
ov_model_chat = OVModelForCausalLM.from_pretrained(path_model_chat / "quantized", device="CPU")
ov_model_sentiment = OVModelForSequenceClassification.from_pretrained(path_model_sentiment / "quantized", device="CPU")

print("Models loaded, tokenizers initialized, and models quantized for optimal performance on CPU.")


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
  or len(self.key_cache[layer_idx]) == 0  # the layer has no cache
  if sequence_length != 1:
  elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ int8_asym                 │ 100% (130 / 130)            │ 100% (130 / 130)                       │
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


Output()

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

`quantization_config` was not provided. In the future, please provide `quantization_config`
Calibration dataset was not provided, assuming weight only quantization.


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


`quantization_config` was not provided. In the future, please provide `quantization_config`
Calibration dataset was not provided, assuming weight only quantization.


INFO:nncf:Statistics of the bitwidth distribution:
┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑
│ Weight compression mode   │ % all parameters (layers)   │ % ratio-defining parameters (layers)   │
┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥
│ int8_asym                 │ 100% (40 / 40)              │ 100% (40 / 40)                         │
┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙


Output()

Models loaded, tokenizers initialized, and models quantized for optimal performance on CPU.


In [4]:
import os
from getpass import getpass

from langsmith import Client

# Never hardcode credentials in a notebook: notebooks are shared and committed,
# and any key that was ever embedded here must be treated as leaked and revoked.
# The key is read from the environment; if absent, prompt for it without echo.
langsmith_api_key = os.environ.get("LANGSMITH_API_KEY") or os.environ.get("LANGCHAIN_API_KEY")
if not langsmith_api_key:
    langsmith_api_key = getpass("LangSmith API key: ")

client = Client(api_key=langsmith_api_key)

### **Sentiment Analysis Function**

Let's define the function `get_sentiment()` to return a signed sentiment score: the classifier's confidence, multiplied by `-1` when the predicted sentiment is negative.

This code also logs the input text and the resulting sentiment label and score to LangSmith, so the model's behavior and performance can be traced and evaluated.






In [20]:
from transformers import pipeline

# sentiment pipeline
# Text-classification pipeline built from the sentiment model and tokenizer
# created in the model-setup cell (that cell must have been run first).
pipe = pipeline("text-classification", model=ov_model_sentiment, tokenizer=ov_model_sentiment_tok)

def get_sentiment(text):
    """Return a signed sentiment score for ``text`` and record the call in LangSmith.

    The first prediction of the module-level ``pipe`` is used. Its score is
    returned unchanged, except that it is negated when the predicted label is
    ``"NEGATIVE"``.

    Args:
        text: Input passed as-is to the sentiment pipeline.

    Returns:
        The prediction score, negative when the label is ``"NEGATIVE"``.
    """
    top_prediction = pipe(text)[0]
    label = top_prediction["label"]
    signed_score = top_prediction["score"]
    if label == "NEGATIVE":
        signed_score = -signed_score

    # Log the input together with the label and signed score via the LangSmith client
    client.create_run(
        name="distilbert-sentiment",
        run_type="llm",
        inputs={"text": text},
        outputs={"sentiment": label, "score": signed_score},
    )
    return signed_score



In [21]:
# Sanity check with a short affirmative input; a positive score is expected.
get_sentiment("Yes")

0.9997833371162415

In [22]:
# Sanity check with a short negative input; a negative (sign-flipped) score is expected.
get_sentiment("NO")

-0.9965683221817017

### **Chatbot with Sentiment-Infused Responses**

To make the chatbot's responses sentiment-aware, we will adjust the `bot()` function. It scores the sentiment of the latest user message and prefixes the streamed reply with the corresponding label (Positive, Negative, or Neutral).

In [27]:
from threading import Event, Thread
from uuid import uuid4
from typing import List, Tuple
import torch
from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer

# Model Configuration
# Prompt templates used by convert_history_to_text(): one for completed turns,
# one for the turn currently being answered (both use the <human>/<bot> format).
history_template = "\n<human>:{user}\n<bot>:{assistant}"
current_message_template = "\n<human>:{user}\n<bot>:{assistant}"
# Text placed before the formatted history (empty: no system preamble).
start_message = ""
# Token ids that end generation; wrapped into a StopOnTokens criterion further down.
# Presumably 29 is the "<" token and 0 the end-of-text token of this tokenizer — TODO confirm.
stop_tokens = [29, 0]
# Extra keyword arguments forwarded to the chat tokenizer call in bot().
tokenizer_kwargs = {}

# Define Token Processor for Partial Text Updates
def red_pijama_partial_text_processor(partial_text, new_text):
    """Merge a streamed chunk into the reply text accumulated so far.

    A chunk that is exactly ``'<'`` is dropped and the accumulated text is
    returned untouched. Otherwise the chunk is appended and only the part
    after the last ``'<bot>:'`` marker is kept.
    """
    if new_text != '<':
        combined = partial_text + new_text
        # rpartition yields the whole string as the tail when the marker is absent
        partial_text = combined.rpartition('<bot>:')[2]
    return partial_text

# Value passed as `max_new_tokens` to the chat model's generate() call in bot().
max_new_tokens = 256

# Stopping Criteria Class
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token is one of ``token_ids``."""

    def __init__(self, token_ids):
        # Token ids that should terminate generation
        self.token_ids = token_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of the first sequence is a stop id."""
        # Only row 0 of ``input_ids`` is inspected; ``scores`` is not used.
        return any(input_ids[0][-1] == stop_id for stop_id in self.token_ids)

# Set Stop Tokens
# Rebinds `stop_tokens` from a list of token ids to a one-element list holding a
# StopOnTokens criterion, the form handed to StoppingCriteriaList in bot().
if stop_tokens is not None:
    stop_tokens = [StopOnTokens(stop_tokens)]

# Helper for Text Processing
def default_partial_text_processor(partial_text:str, new_text:str):
    """Pass-through processor: return the accumulated text with the new chunk appended."""
    return partial_text + new_text

# Processor applied to each streamed chunk in bot(); the RedPajama variant strips
# the prompt markers, default_partial_text_processor is the plain alternative.
text_processor = red_pijama_partial_text_processor

# History Conversion Function
def convert_history_to_text(history:List[Tuple[str, str]]):
    """Render the chat history as a single prompt string.

    Every turn except the last is formatted with ``history_template``; the last
    turn is formatted with ``current_message_template``. The pieces are
    concatenated after ``start_message``.

    Args:
        history: Sequence of (user, assistant) pairs; must contain at least one turn.

    Returns:
        The prompt text for the whole conversation.
    """
    pieces = [start_message]
    for turn in history[:-1]:
        pieces.append(history_template.format(user=turn[0], assistant=turn[1]))
    latest_turn = history[-1]
    pieces.append(
        current_message_template.format(user=latest_turn[0], assistant=latest_turn[1])
    )
    return "".join(pieces)

# User Message Handling
def user(message, history):
    """Append the submitted message to the chat history as a new, unanswered turn.

    Returns:
        A pair: an empty string (the first element of the returned pair), and a
        new history list ending with ``[message, ""]``. The input list is not modified.
    """
    pending_turn = [message, ""]
    extended_history = history + [pending_turn]
    return "", extended_history

# Define a helper function to convert sentiment score to a label
def get_sentiment_label(score, threshold=0.5):
    """Map a signed sentiment score to a human-readable label.

    Args:
        score: Signed score such as the one returned by ``get_sentiment``
            (positive for positive sentiment, negative for negative sentiment).
        threshold: Half-width of the neutral band. Scores strictly above
            ``threshold`` are "Positive", scores strictly below ``-threshold``
            are "Negative", everything else is "Neutral". Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        One of "Positive", "Negative" or "Neutral".

    Note:
        If the score is the winning probability of a two-class softmax, its
        magnitude is never below 0.5, so the default band yields "Neutral"
        only at exactly +/-0.5. Pass a larger ``threshold`` (e.g. 0.8) to get
        a meaningful neutral range.
    """
    if score > threshold:
        return "Positive"
    if score < -threshold:
        return "Negative"
    return "Neutral"


In [28]:
# Bot Response Generation with Sentiment Annotation
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """Stream the chat model's reply to the latest user message, prefixed with its sentiment label.

    This is a generator: after every streamed chunk it yields the chat history
    with the last turn's reply slot set to the label line plus the text so far.

    Args:
        history: List of [user_message, bot_reply] pairs; the last pair is the
            turn being answered and its reply slot is overwritten in place.
        temperature: Sampling temperature; sampling is enabled only when it is > 0.0.
        top_p: Forwarded to ``generate`` as ``top_p``.
        top_k: Forwarded to ``generate`` as ``top_k``.
        repetition_penalty: Forwarded to ``generate`` as ``repetition_penalty``.
        conversation_id: Accepted from the UI wiring; not used in this function.

    Yields:
        The updated history list after each streamed chunk.
    """
    # Sentiment is computed once, on the newest user message only.
    user_message = history[-1][0]
    user_sentiment = get_sentiment(user_message)

    # Convert sentiment score to label
    sentiment_label = get_sentiment_label(user_sentiment)

    messages = convert_history_to_text(history)
    input_ids = ov_model_chat_tok(messages, return_tensors="pt", **tokenizer_kwargs).input_ids

    # If the prompt is longer than 2000 tokens, drop all earlier turns and
    # re-tokenize using only the current one. Note that `history` is rebound,
    # so the yielded history then contains just this turn.
    if input_ids.shape[1] > 2000:
        history = [history[-1]]
        messages = convert_history_to_text(history)
        input_ids = ov_model_chat_tok(messages, return_tensors="pt", **tokenizer_kwargs).input_ids

    # The streamer hands decoded text from the generation thread to this generator.
    streamer = TextIteratorStreamer(ov_model_chat_tok, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )
    if stop_tokens is not None:
        generate_kwargs["stopping_criteria"] = StoppingCriteriaList(stop_tokens)

    # Set once generate() returns. NOTE(review): nothing in this function waits on it.
    stream_complete = Event()

    def generate_and_signal_complete():
        # Runs in the worker thread: generate the reply, then flag completion.
        ov_model_chat.generate(**generate_kwargs)
        stream_complete.set()

    # Generation runs in a background thread so the chunks can be consumed below
    # while it is still producing. NOTE(review): the thread is never joined.
    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)

        # Display sentiment label instead of numeric score
        history[-1][1] = f"**USER_SENTIMENT: {sentiment_label}**\n" + partial_text
        yield history

### **Gradio Interface for Chatbot**

Use Gradio to create an interactive user interface (UI) for the chatbot where users can input their messages and see the chatbot's responses. The sentiment label of each user message is displayed at the top of the bot's response.

In [None]:
#!pip install gradio

In [29]:
import gradio as gr
from uuid import uuid4

def get_uuid():
    """Return a freshly generated random UUID (version 4) in its string form."""
    fresh_id = uuid4()
    return str(fresh_id)

# Build the chat UI: a chatbot pane, a message box with Submit/Stop/Clear buttons,
# and a collapsed accordion holding the generation parameters passed to bot().
with gr.Blocks(
    theme=gr.themes.Soft(),
    css=".disclaimer {font-variant-caps: all-small-caps;}",
) as demo:
    # The callable is passed (not its result); presumably Gradio evaluates it to
    # give each session its own id — TODO confirm. bot() currently ignores the value.
    conversation_id = gr.State(get_uuid)
    gr.Markdown(f"""<h1><center>Sentiment Analysis Chatbot</center></h1>""")
    chatbot = gr.Chatbot(height=800)

    # Message Input and Buttons
    with gr.Row():
        with gr.Column():
            msg = gr.Textbox(
                label="Chat Message Box",
                placeholder="Chat Message Box",
                show_label=False,
                container=False
            )
        with gr.Column():
            with gr.Row():
                submit = gr.Button("Submit")
                stop = gr.Button("Stop")
                clear = gr.Button("Clear")

    # Advanced Options
    # Each slider value is forwarded to bot() as the generation argument of the same name.
    with gr.Row():
        with gr.Accordion("Advanced Options:", open=False):
            with gr.Row():
                with gr.Column():
                    temperature = gr.Slider(
                        label="Temperature",
                        value=0.1,
                        minimum=0.0,
                        maximum=1.0,
                        step=0.1,
                        interactive=True,
                        info="Higher values produce more diverse outputs",
                    )
                with gr.Column():
                    top_p = gr.Slider(
                        label="Top-p (nucleus sampling)",
                        value=1.0,
                        minimum=0.0,
                        maximum=1,
                        step=0.01,
                        interactive=True,
                        info="Sample from the smallest set of tokens whose cumulative probability exceeds top_p.",
                    )
                with gr.Column():
                    top_k = gr.Slider(
                        label="Top-k",
                        value=50,
                        minimum=0.0,
                        maximum=200,
                        step=1,
                        interactive=True,
                        info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",
                    )
                with gr.Column():
                    repetition_penalty = gr.Slider(
                        label="Repetition Penalty",
                        value=1.1,
                        minimum=1.0,
                        maximum=2.0,
                        step=0.1,
                        interactive=True,
                        info="Penalize repetition — 1.0 to disable.",
                    )

    # Button Events
    # Pressing Enter in the textbox and clicking Submit run the same two-step chain:
    # user() appends the message and clears the box, then bot() streams the reply.
    submit_event = msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    ).then(
        fn=bot,
        inputs=[chatbot, temperature, top_p, top_k, repetition_penalty, conversation_id],
        outputs=chatbot,
        queue=True,
    )
    submit_click_event = submit.click(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    ).then(
        fn=bot,
        inputs=[chatbot, temperature, top_p, top_k, repetition_penalty, conversation_id],
        outputs=chatbot,
        queue=True,
    )

    # Stop has no handler of its own; it only cancels the two chains registered above.
    stop.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[submit_event, submit_click_event],
        queue=False,
    )
    # Clear resets the chatbot component by setting its value to None.
    clear.click(lambda: None, None, chatbot, queue=False)

# Enable request queuing with a maximum queue size of 2, then start the app.
# NOTE(review): the recorded run shows Colab switching to share=True automatically,
# which exposes the app on a public URL — pass share=False to launch() if unwanted.
demo.queue(max_size=2)
demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cf2adb0d37366a152c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


