## Building a router agent to classify text/reviews and draft up a response

In [1]:
# built-in libraries
import os
os.environ['PYTHONUTF8'] = '1'
from typing import TypeVar, Any

# litellm libraries
import litellm
from litellm.types.utils import ModelResponse, Message
from litellm import completion
from instructor import from_litellm, Mode

# misc libraries
from pydantic import BaseModel, create_model

In [None]:
# built-in libraries
from typing import Literal, TypedDict,  Any, Optional, Tuple, List, Dict, Union

# langgraph libraries
from langgraph.graph import StateGraph, START, END
from langchain_core.runnables.graph import  MermaidDrawMethod

# misc libraries
from pydantic import Field
from IPython.display import Image
import pandas as pd
import numpy as np
#from decouple import config
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from tqdm import tqdm
from sklearn.metrics import f1_score


In [None]:

# local modules
from src.llm import LLMCaller

In [None]:
from datasets import load_dataset
from typing import List, Dict
from nltk.tokenize import sent_tokenize
from collections import defaultdict

import nltk
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to C:\Users\Valdemar
[nltk_data]     Schultz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [56]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

Watsonx credentials

In [6]:
# Watsonx connection settings. The API key and project id are read from the
# environment (each is None when its variable is unset); the endpoint is fixed.
WX_API_URL = "https://us-south.ml.cloud.ibm.com"
WX_API_KEY = os.environ.get("WX_API_KEY")
WX_PROJECT_ID_RAG = os.environ.get("WX_PROJECT_ID_RAG")



Instantiating our LLM caller class (imported from `src/llm.py`)

In [86]:

# Single shared LLM client used by both classify_emotion and draft_response.
# NOTE(review): the generation params below presumably apply to every call made
# through this client, including the free-text reply in draft_response — confirm
# that a 50-token cap does not truncate drafted replies.
model = LLMCaller(
    api_key=WX_API_KEY,
    project_id=WX_PROJECT_ID_RAG,
    api_url=WX_API_URL,
    model_id="watsonx/mistralai/mistral-large",
    params={
        GenParams.TEMPERATURE: 0.3,
        GenParams.MAX_NEW_TOKENS: 50,
    }
)

### Creating our few-shot example dataset for the LLM
We will supply these few-shot examples to the LLM as in-context guidance so it can classify the text before a response is drafted; the model itself is not retrained.

In [8]:
# 1) Load the training split from data/ekman_train.csv
csv_path = os.path.join("data", "ekman_train.csv")
# The next cell reads a 'text' column plus one column per emotion
# (anger, disgust, fear, joy, neutral, sadness, surprise) and derives
# 'ekman_emotion' from them, so those are the columns required here.
df = pd.read_csv(csv_path)


Creating the few-shot DataFrame

In [9]:
# Emotion indicator columns, in priority order: when several columns share the
# row maximum, idxmax reports the one listed first.
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# Collapse the per-emotion columns into a single 'ekman_emotion' label per row
# and keep only the two columns the following cells consume. `df` is rebound
# to the reduced frame.
df = df.assign(ekman_emotion=df[emotion_columns].idxmax(axis=1))[['text', 'ekman_emotion']]

# Quick visual check of the reshaped frame
print("Transformed DataFrame head:")
print(df.head())

Transformed DataFrame head:
                                                text ekman_emotion
0                        [NAME] good one i like that           joy
1                  That’s actually interesting to me           joy
2  Why is this getting downvoted. I love Meepo so...       sadness
3                    I'm not offended. Just curious.      surprise
4  We have reached the stage where below 2 millio...       neutral


Creating my embedding tool

In [10]:
# Local sentence-embedding model shared by every embed() call
embed_model = SentenceTransformer("all-MiniLM-L6-v2")


def embed(texts):
    """
    Encode text with the module-level `embed_model`.

    texts: either one string or a list of strings.
    returns: for a list, the numpy array produced by the model (one row per
             text; 384 columns with the model loaded above). For a single
             string, just that string's vector (the first row of a
             one-element batch).
    """
    # A bare string is encoded as a one-element batch and unwrapped again
    if isinstance(texts, str):
        return embed_model.encode([texts], convert_to_numpy=True)[0]

    return embed_model.encode(texts, convert_to_numpy=True)


Chunking the few-shot examples

In [11]:

# Split a text into consecutive groups of at most `max_sentences` sentences
def chunk_text(text, max_sentences=2):
    """
    Lazily yield chunks of `text`, each made of up to `max_sentences`
    consecutive sentences (as split by `sent_tokenize`) joined with a single
    space and stripped of surrounding whitespace.
    """
    sentences = sent_tokenize(text)
    window_starts = range(0, len(sentences), max_sentences)
    yield from (
        " ".join(sentences[start:start + max_sentences]).strip()
        for start in window_starts
    )

# Every chunk of a row's text inherits that row's emotion label
shots = [
    {"text": chunk, "emotion": row["ekman_emotion"]}
    for _, row in df.iterrows()
    for chunk in chunk_text(row["text"].strip())
]

# Cap each emotion at N examples so no single label dominates the pool.
# The first N shots per emotion (in dataset order) are kept.
N = 200
by_emo = defaultdict(list)
for shot in shots:
    by_emo[shot["emotion"]].append(shot)

few_shots = [shot for group in by_emo.values() for shot in group[:N]]

print(f"Prepared {len(few_shots)} few-shot examples "
      f"({len(by_emo)} emotions × up to {N} each)")

Prepared 1400 few-shot examples (7 emotions × up to 200 each)


Embedding the few-shot examples

In [12]:
# 1) Embed all shot texts in a single batched call instead of one model call
#    per shot, then attach each row to its shot in place. embed() returns one
#    row per input text when given a list.
shot_embeddings = embed([shot["text"] for shot in few_shots])
for shot, vector in zip(few_shots, shot_embeddings):
    shot["embedding"] = vector

# 2) Build the (N, D) float32 matrix used for similarity retrieval;
#    row i corresponds to few_shots[i].
emb_matrix = np.vstack([shot["embedding"] for shot in few_shots]).astype("float32")

print(f"Embedded {len(few_shots)} few-shot examples into a {emb_matrix.shape} array")


Embedded 1400 few-shot examples into a (1400, 384) array


In [13]:
# State passed between the nodes of the review graph
class ReviewState(TypedDict):
    """Dictionary schema shared by every node of the review-handling graph."""

    # Raw review text supplied by the caller
    review_text: str
    # {"text", "emotion"} pairs written by retrieve_few_shot_examples
    few_shot_examples: List[Dict[str, str]]
    # Written by classify_emotion, whose response model yields a list of labels
    emotion: Optional[Union[str, List[str]]]
    # Written by draft_response
    draft_reply: Optional[str]
    # Toggles the print statements in read_review / notify_user.
    # TypedDict fields cannot carry default values (an assignment here is
    # never applied to instances), so callers pass this key explicitly.
    verbose: bool

Creating the nodes of the system

In [27]:
# Re-declare the retrieval inputs with explicit types and rebuild the
# similarity matrix from the embeddings stored on each shot
# (row i of emb_matrix belongs to few_shots[i]).
few_shots: List[Dict[str, object]] = few_shots
emb_matrix: np.ndarray = np.vstack(
    [example["embedding"] for example in few_shots]
).astype("float32")

In [None]:
def read_review(state: ReviewState) -> dict:
    """
    Entry node: optionally log the incoming customer review.

    Reads `review_text` and, when the optional `verbose` flag is truthy,
    prints it. A missing `verbose` key is treated as False instead of raising
    KeyError. Returns an empty dict because this node changes no state.
    """
    review = state["review_text"]
    if state.get("verbose", False):
        print(f"Reviewbot received review: “{review}”")
    # no changes to state here
    return {}

# Dynamic few-shotting: match an unseen review to its most similar few-shot examples
def retrieve_few_shot_examples(state: ReviewState) -> dict:
    """
    Pick the few-shot examples most similar to the incoming review.

    Embeds `state["review_text"]`, computes cosine similarity against the
    module-level `emb_matrix`, and returns the (text, emotion) pairs of the
    five highest-scoring rows, most similar first.

    Entries of `few_shots` that are out of range, not dicts, or missing the
    'text'/'emotion' keys are skipped, as before, but now with a
    RuntimeWarning instead of silently: such entries mean `emb_matrix` and
    `few_shots` are out of sync.

    Returns: {"few_shot_examples": [{"text": ..., "emotion": ...}, ...]}
    """
    import warnings  # local import: only needed on the malformed-data path

    # 1) Embed the new review as a (1, D) float32 row
    q_emb = embed(state["review_text"]).reshape(1, -1).astype("float32")

    # 2) Cosine similarity against every stored shot -> shape (N,)
    sims = cosine_similarity(q_emb, emb_matrix)[0]

    # 3) Indices of the five largest similarities, in descending order
    topk = sims.argsort()[-5:][::-1]

    new_examples = []
    for i in topk:
        shot = few_shots[i] if 0 <= i < len(few_shots) else None
        if not isinstance(shot, dict) or "text" not in shot or "emotion" not in shot:
            warnings.warn(
                f"Skipping few-shot index {i}: entry is out of range, not a dict, "
                "or lacks 'text'/'emotion' (emb_matrix and few_shots may be misaligned).",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        # Copy only the prompt-relevant fields (drops the stored embedding)
        new_examples.append({"text": shot["text"], "emotion": shot["emotion"]})

    return {"few_shot_examples": new_examples}

def classify_emotion(state: ReviewState) -> dict:
    """
    Classify the review into one or more Ekman emotions with the LLM.

    Builds a prompt from `state["review_text"]` and the retrieved
    `state["few_shot_examples"]`, then asks `model` for a structured response
    whose `emotion` field is a list of lowercase labels.

    The prompt's output instructions match the response schema: labels are
    listed in lowercase (the schema's Literal values are lowercase) and the
    model is told to always return a list (the schema does not accept a bare
    string).

    Returns: {"emotion": <list of labels>}
    Raises: whatever `model.invoke` raises, e.g. a validation error when the
            LLM output does not fit the response model.
    """
    text = state["review_text"]
    # Format few-shot context, one "text → label" bullet per example
    example_lines = "\n".join(
        f"- “{ex['text']}” → {ex['emotion']}"
        for ex in state["few_shot_examples"]
    )
    
    prompt = f"""
You are a highly precise expert emotion-classification agent. Your task is to read a text / customer review  and assign **one or more** of these labels: 
anger, disgust, fear, joy, sadness, surprise, or neutral.  

Here are a few similar, examples **With Labels** to help you classify the new review:
{example_lines}

Now classify this new review into one or more of:
anger, disgust, fear, joy, sadness, surprise or neutral.

Review:
\"\"\"{text}\"\"\"

Respond **only** in JSON with:
  • emotion: a list of one or more lowercase strings from the set above (use a one-element list for a single emotion).
"""
    response_model = model.create_response_model(
        "FewShotEmotionClassification",
        {
            "emotion": (
                # multi-label: one or more of the seven labels
                List[Literal["anger","disgust","fear","joy","sadness","surprise","neutral"]],
                Field(description="The chosen Ekman emotion(s)")
            ),
        }
    )
    resp = model.invoke(prompt, response_model=response_model)
    
    return {"emotion": resp.emotion}

def draft_response(state: ReviewState) -> dict:
    """
    Draft an empathetic reply based on the classified emotion(s).

    `state["emotion"]` is normally the list produced by classify_emotion.
    A non-empty list/tuple is rendered as a comma-separated string
    (e.g. "joy, surprise") so the prompt does not contain a Python list repr
    such as "['joy', 'surprise']". Any other value is interpolated unchanged.

    Returns: {"draft_reply": <text of the model's `answer` field>}
    """
    text = state["review_text"]
    emo = state["emotion"]
    if isinstance(emo, (list, tuple)) and emo:
        emo = ", ".join(str(label) for label in emo)
    prompt = f"""A customer expresses **{emo}** in this review:
\"\"\"{text}\"\"\"
You are a senior customer-service specialist with excellent empathy and brand voice consistency.  
Given a review and its classified emotion(s), draft a short reply that:

  - Acknowledges their {emo}, WITHOUT writing the specific emotions in the response,
  - Addresses any concerns they raise,
  - Invites further dialogue if needed.
"""
    resp = model.invoke(prompt)
    return {"draft_reply": resp.answer}


def notify_user(state: ReviewState) -> dict:
    """
    Final node: present the review, its classification and the drafted reply.

    Prints a summary block when the optional `verbose` flag is truthy; a
    missing `verbose` key is treated as False. Returns an empty dict because
    this node changes no state.
    """
    if state.get("verbose", False):
        print("\n" + "="*40)
        # Single quotes inside the f-string: reusing the enclosing double
        # quotes is only valid syntax from Python 3.12 onwards.
        print(f"review : {state['review_text']}")
        print(f"Review classified as: {state['emotion']}")
        print("Response:")
        print(state["draft_reply"])
        print("="*40 + "\n")
    return {}

No conditional edges are needed because the system is linear: every node has exactly one successor.

### Creating Graph of the system

In [109]:
# Build the graph over the shared ReviewState schema
review_graph = StateGraph(ReviewState)

# Nodes in execution order: (node name, node function)
node_sequence = [
    ("read review",                read_review),
    ("retrieve few-shot examples", retrieve_few_shot_examples),
    ("classify emotion",           classify_emotion),
    ("draft response",             draft_response),
    ("notify user",                notify_user),
]
for node_name, node_fn in node_sequence:
    review_graph.add_node(node_name, node_fn)

# Linear wiring: START → read review → retrieve few-shot examples
#   → classify emotion → draft response → notify user → END
edge_path = [START, *(node_name for node_name, _ in node_sequence), END]
for source, target in zip(edge_path, edge_path[1:]):
    review_graph.add_edge(source, target)

compiled_graph = review_graph.compile()

Print the graph

In [None]:
# Render the compiled graph as a Mermaid PNG and display it as the cell output
graph_png = compiled_graph.get_graph().draw_mermaid_png(draw_method=MermaidDrawMethod.API)
Image(graph_png)

Testing the system

In [120]:
# Hand-written sample reviews to pass through the review bot
# (clearly positive, clearly negative, lukewarm, and mixed sentiment)
Example_text1 = "I love this product! It works great and exceeded my expectations."
Example_text2 = "I am very disappointed with the service. It was not what I expected."
Example_text3 = "The product is okay, but it could be better. I expected more."
Example_text4 = "the service was nothing special. I was hoping for a better experience. but product was good."


System is correctly categorizing the text and drafting a response

In [121]:
# Run one sample review through the compiled graph end to end;
# `test_examples` holds the value returned by compiled_graph.invoke.
print("\nProcessing review")
test_examples = compiled_graph.invoke(
    {
        "review_text": Example_text4, # the review text to classify and reply to
        "verbose": True,  # enable verbose output
    }
)


Processing review
Reviewbot received review: “the service was nothing special. I was hoping for a better experience. but product was good.”

Review classified as: ['neutral']
Response:
Thank you for your feedback. We're sorry to hear that your experience with our service didn't meet your expectations, but we're glad you found the product to be good. We strive to improve, so if there's anything specific you'd like us to address, please let us know.



Evaluating the system

In [None]:
# Label set used for evaluation; must match both the test CSV's indicator
# columns and the labels classify_emotion can return.
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# 1. Load the test split, keeping only the first 60 rows so that the
#    evaluation run stays short.
try:
    df_test = pd.read_csv("data/ekman_test.csv").head(60)
except FileNotFoundError:
    print("Error: data/ekman_test.csv not found. Make sure the path is correct.")
    raise

# Accumulators for gold and predicted label lists (one entry per test row)
all_true_labels, all_pred_labels = [], []


In [94]:

print("Evaluating classify_emotion function...")
# Relies on `few_shots` / `emb_matrix` (used by retrieve_few_shot_examples),
# `model` (used by classify_emotion), `df_test` and `emotion_columns` from
# earlier cells.

# Reset the accumulators in this cell so that re-running it evaluates from a
# clean slate instead of appending to the labels of a previous run.
all_true_labels = []
all_pred_labels = []

# 2. & 3. Collect gold labels and predictions, one entry per test row
for index, row in tqdm(df_test.iterrows(), total=df_test.shape[0], desc="Processing test data"):
    text = row['text']

    # Gold labels: every emotion column flagged with 1 for this row
    true_emotions_for_sample = [col for col in emotion_columns if col in row and row[col] == 1]
    all_true_labels.append(true_emotions_for_sample)

    # Fresh state for this sample
    current_state = ReviewState(
        review_text=text,
        few_shot_examples=[],
        emotion=None,
        draft_reply=None,
        verbose=False
    )

    # Retrieval step; on failure the sample is scored as "no labels predicted"
    try:
        retrieved_state_update = retrieve_few_shot_examples(current_state)
        current_state.update(retrieved_state_update)
    except Exception as e:
        print(f"Error during retrieve_few_shot_examples for text: '{text[:50]}...': {e}")
        all_pred_labels.append([])
        continue

    # Classification step; on failure the sample is scored as "no labels predicted"
    try:
        classification_result = classify_emotion(current_state)
        predicted_emotions_for_sample = classification_result.get('emotion', [])

        # Normalise a single label to a one-element list
        if isinstance(predicted_emotions_for_sample, str):
            predicted_emotions_for_sample = [predicted_emotions_for_sample]

        all_pred_labels.append(predicted_emotions_for_sample)
    except Exception as e:
        print(f"Error during classify_emotion for text: '{text[:50]}...': {e}")
        all_pred_labels.append([])
        continue

# 4. Evaluate
# Binarize both label lists over the full, fixed class set so the indicator
# columns line up even when some emotion never occurs in the sample.
mlb = MultiLabelBinarizer(classes=emotion_columns)
mlb.fit([emotion_columns])
y_true_binarized = mlb.transform(all_true_labels)
y_pred_binarized = mlb.transform(all_pred_labels)

# Per-class report
print("\nClassification Report for classify_emotion:")
report = classification_report(y_true_binarized, y_pred_binarized, target_names=mlb.classes_, zero_division=0)
print(report)


def _scores_for(average):
    """Return (precision, recall, f1) on the binarized labels for one averaging mode."""
    return tuple(
        scorer(y_true_binarized, y_pred_binarized, average=average, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )


# Overall metrics under each averaging mode
precision_micro, recall_micro, f1_micro = _scores_for('micro')
precision_macro, recall_macro, f1_macro = _scores_for('macro')
precision_weighted, recall_weighted, f1_weighted = _scores_for('weighted')

print(f"\nOverall Micro Averages: Precision: {precision_micro:.4f}, Recall: {recall_micro:.4f}, F1-Score: {f1_micro:.4f}")
print(f"Overall Macro Averages: Precision: {precision_macro:.4f}, Recall: {recall_macro:.4f}, F1-Score: {f1_macro:.4f}")
print(f"Overall Weighted Averages: Precision: {precision_weighted:.4f}, Recall: {recall_weighted:.4f}, F1-Score: {f1_weighted:.4f}")


Evaluating classify_emotion function...


Processing test data:  25%|██▌       | 15/60 [00:59<03:50,  5.12s/it]

Error during classify_emotion for text: 'Thats fair. I know its a joke it just always struc...': 1 validation error for FewShotEmotionClassification
emotion
  Field required [type=missing, input_value={'answer': 'Thats fair. I...te it off. No worries.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing


Processing test data: 100%|██████████| 60/60 [03:19<00:00,  3.33s/it]



Classification Report for classify_emotion:
              precision    recall  f1-score   support

       anger       0.17      0.17      0.17         6
     disgust       0.00      0.00      0.00         1
        fear       0.33      0.50      0.40         2
         joy       0.72      0.54      0.62        24
     neutral       0.43      0.60      0.50        20
     sadness       0.57      0.57      0.57         7
    surprise       0.29      0.57      0.38         7

   micro avg       0.42      0.52      0.47        67
   macro avg       0.36      0.42      0.38        67
weighted avg       0.50      0.52      0.50        67
 samples avg       0.43      0.52      0.46        67


Overall Micro Averages: Precision: 0.4217, Recall: 0.5224, F1-Score: 0.4667
Overall Macro Averages: Precision: 0.3583, Recall: 0.4216, F1-Score: 0.3769
Overall Weighted Averages: Precision: 0.5011, Recall: 0.5224, F1-Score: 0.4974
