# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
# Standard library imports
from pydantic import BaseModel
import threading
import warnings
import random
import string
import queue
import json
import csv
import os

warnings.filterwarnings("ignore")
JWT_SECRET_API = !echo $(aws --profile "chat-prod_ro" secretsmanager get-secret-value --secret-id "arn:aws:secretsmanager:us-west-2:242659714806:secret:shared/cresta-server-jwt_secret-VDn5My" --query SecretString --output text) # type: ignore
os.environ["JWT_SECRET_API"] = json.loads(JWT_SECRET_API[0])["jwt-secret"]
os.environ["CONFIG_SERVICE_ADDR"] = "auth.chat-prod.internal.cresta.ai:443"
os.environ["CONFIG_USE_SECURE_CHANNEL"] = "true"

# Third party imports

from sentence_transformers import SentenceTransformer, util
from retry import retry

# Greyparrot imports
from greyparrot.llm.prompting import prompts as prompts_utils
from greyparrot.conversations.db import ConversationsDBConn
from greyparrot.multi_tenancy.v3_config import V3Config
from greyparrot.chats_common import PartialChat
from greyparrot.dataset_common import Dataset
from greyparrot.common import get_logger

logger = get_logger(__name__)

# Local imports
from llm_proxy import get_open_ai_client, LLMProxyProperties

ModuleNotFoundError: No module named 'sentence_transformers'

In [4]:
# Sentence-embedding model shared by the query-matching and workflow-grouping
# cells further down.
embedder = SentenceTransformer("all-mpnet-base-v2")

In [5]:
# Tenant scope for this analysis. These are read as notebook-level globals by
# get_chats_with_ids and by the LLM proxy setup.
customer_id = "brinks"
profile_id = "care-voice"
usecase_id = "care-voice"
language_code = "en-US"

In [6]:
def get_chats_with_ids(chat_ids: list[str]):
    """Fetch the detailed chats for the given conversation ids.

    The customer is resolved from the notebook-level ``customer_id`` and
    ``profile_id``; the query itself is additionally scoped by ``usecase_id``
    and ``language_code``.

    Args:
        chat_ids: Conversation ids to look up.

    Returns:
        Whatever ``ConversationsDBConn.get_detailed_chats`` returns for those ids.
    """
    db_conn = ConversationsDBConn.from_customer_name(
        V3Config.short_name_from_ids(customer_id, profile_id))
    query = dict(
        customer_id=customer_id,
        profile_id=profile_id,
        usecase_id=usecase_id,
        language_code=language_code,
        conversation_ids=chat_ids,
        is_dev_user=False,
    )
    return db_conn.get_detailed_chats(**query)

In [7]:
# Default model name for discover_flow_in_chat.
# NOTE(review): discover_flow_in_chat sends response_format=Flows (a schema).
# Confirm this model accepts schema-based structured output through the proxy.
LLM_ENGINE = "gpt-4-0125-preview"
# NOTE(review): not referenced by any code visible in this notebook;
# extract_flows_from_chats carries its own default of 10 — confirm intent.
CONCURRENCY = 10

In [8]:
# Build the OpenAI client from the llm_proxy helpers, tagged with the
# "virtual-agent-simulation" project and this customer/profile.
# NOTE(review): usecase_id is passed as "" even though a notebook-level
# `usecase_id` is defined — confirm this is intentional.
llm_proxy_properties = LLMProxyProperties(
    project_id="virtual-agent-simulation",
    customer_id=customer_id,
    profile_id=profile_id,
    usecase_id="",
)
open_ai_client = get_open_ai_client(
    llm_proxy_properties=llm_proxy_properties,
    provider="openai",
)

In [9]:
@retry(tries=3, delay=60, backoff=2, logger=logger)
def chat_completion(**kwargs):
    """Forward ``kwargs`` to ``open_ai_client.beta.chat.completions.parse``.

    The call is wrapped in ``retry`` with ``tries=3, delay=60, backoff=2``;
    the notebook ``logger`` is handed to ``retry`` for its messages.

    Returns:
        The object returned by ``completions.parse``.
    """
    return open_ai_client.beta.chat.completions.parse(**kwargs)

In [10]:
def save_jsonl_file(data, output_file):
    """Write ``data`` to ``output_file`` as JSON Lines.

    Each item is serialized with ``json.dumps`` and written on its own line.
    The file is opened in ``"w"`` mode, so existing content is replaced.

    Args:
        data: Iterable of JSON-serializable items.
        output_file: Destination path.
    """
    with open(output_file, "w") as sink:
        sink.writelines(f"{json.dumps(record)}\n" for record in data)

In [11]:
def print_workflow(w: dict):
    """Print one workflow dict, followed by a 50-dash rule.

    Keys are capitalized. A list value is rendered as a ``* ``-bulleted list
    under its key; any other value is printed inline after the key.
    """
    for field, value in w.items():
        label = field.capitalize()
        if not isinstance(value, list):
            print(f"{label}: {value}")
            continue
        bullets = "\n* ".join(value)
        print(f"{label}:\n* {bullets}")
    print("-" * 50)

# Discovery prompts

In [12]:
# TODO remove this after fixing speaker_role flips
# Maps each speaker role to the opposite one; chat_to_prompt_text applies it
# when called with speakers_flipped=True.
flips = {"agent": "visitor", "visitor": "agent"}

In [13]:
def chat_to_prompt_text(chat: PartialChat, speakers_flipped: bool = False):
    """Render a chat as one ``Speaker: text`` line per message.

    Args:
        chat: Chat whose ``messages`` are rendered in order.
        speakers_flipped: When true, each message's role is swapped through
            ``flips`` before it is turned into a prompt label.

    Returns:
        The rendered lines joined with newlines.
    """
    rendered = []
    for message in chat.messages:
        role = message.speaker_role.value
        if speakers_flipped:
            role = flips[role]
        label = string.capwords(
            prompts_utils.speaker_role_str_for_prompts(role))
        rendered.append(f"{label}: {message.text}")
    return "\n".join(rendered)

In [14]:
# System prompt for extracting Agent troubleshooting workflows from a single
# conversation. It asks for product, issue and steps per workflow.
SYSTEM_PROMPT_AGENT_WORKFLOW_DISCOVERY = """### Context and data description
You are a conversation analyst working for a Call Center.

You will be given 1 conversation at a time. Each conversation is between a Call Center Agent and a Customer. Your primary goal is to extract workflows of steps which the Agent takes in **the given conversation** to help resolve the Customer's needs related to a product issue.

The primary use case of these workflows is to create a troubleshooting template to address similar customer needs in the future.

Each workflow should be a list of steps which the Agent needs to take.

For each workflow, return the product, issue, and a list of steps which the Agent needs to take.

Make sure the product is specific and not general.
Make sure the issue is specific and not general.
Make sure the steps are detailed.

**Important**: There could be more than 1 workflow in a single conversation. There could also be no workflows in a single conversation. The workflows will be used to create troubleshooting guides to address similar customer needs in the future."""

In [15]:
# System prompt for extracting the Customer-side flows from a single
# conversation.
# NOTE(review): this prompt describes a title/steps schema, while
# discover_flow_in_chat always requests response_format=Flows, whose items have
# product/issue/steps — confirm which schema is intended for visitor flows.
SYSTEM_PROMPT_VISITOR_WORKFLOW_DISCOVERY = """### Context and data description
You are a conversation analyst working for a Call Center.

You will be given 1 conversation at a time. Each conversation is between a Call Center Agent and a Customer. Your primary goal is to extract flows of steps which Customer takes in **the given conversation**.

The primary use case of this flow is to create a template to simulate similar customer scenarios.

The flow should be a list of steps which the Customer takes.

The schema of each flow should be as follows:
- **title**: title of the flow
- **steps**: a list of steps which Customer needs to follow

**Important**: There could be more than 1 flow in a single conversation. The flows will be used to create templates to simulate similar customer scenarios."""

In [17]:
# System prompt per speaker role; discover_flow_in_chat looks up its
# `speaker_role` argument here, so only "agent" and "visitor" are valid.
FLOW_PROMPTS = {
    "agent": SYSTEM_PROMPT_AGENT_WORKFLOW_DISCOVERY,
    "visitor": SYSTEM_PROMPT_VISITOR_WORKFLOW_DISCOVERY
}

In [None]:
# One extracted flow: the product and issue it concerns, plus its steps.
class Flow(BaseModel):
    product: str
    issue: str
    steps: list[str]

# Container for all flows found in one conversation; passed as
# response_format in discover_flow_in_chat.
class Flows(BaseModel):
    flows: list[Flow]

In [18]:
def discover_flow_in_chat(chat: PartialChat,
                          speaker_role: str,
                          speakers_flipped: bool = False,
                          llm_engine: str = LLM_ENGINE):
    """Ask the LLM for the flows of one speaker role in a single chat.

    Args:
        chat: Conversation to analyse.
        speaker_role: Key into ``FLOW_PROMPTS`` selecting the system prompt.
        speakers_flipped: Forwarded to ``chat_to_prompt_text``.
        llm_engine: Model name passed to ``chat_completion``.

    Returns:
        The ``parsed`` attribute of the first choice's message, requested
        with ``response_format=Flows``.
    """
    logger.info(f"Discovering {speaker_role} flow in chat {chat.chat_name}")

    system_message = {
        "role": "system",
        "content": FLOW_PROMPTS[speaker_role],
    }
    user_message = {
        "role": "user",
        "content": chat_to_prompt_text(chat, speakers_flipped),
    }

    # Note: temperature=0.1 to allow for some exploration
    completion = chat_completion(
        model=llm_engine,
        messages=[system_message, user_message],
        temperature=0.1,
        response_format=Flows,
    )
    return completion.choices[0].message.parsed

In [19]:
def extract_flows_from_chats(chats: list[PartialChat], speaker_role: str,
                                concurrency: int = 10,
                                speakers_flipped: bool = False):
    """Run ``discover_flow_in_chat`` over many chats using worker threads.

    Chat indexes are placed on a queue and drained by ``concurrency`` threads.
    A chat whose extraction raises is logged and skipped, so the result may
    contain fewer entries than ``chats``.

    Args:
        chats: Conversations to process.
        speaker_role: Role whose flows are extracted; forwarded to
            ``discover_flow_in_chat``.
        concurrency: Number of worker threads to start.
        speakers_flipped: Forwarded to ``discover_flow_in_chat``. Defaults to
            ``False``, which matches the previous behaviour.

    Returns:
        Dict mapping ``str(chat)`` to the value returned by
        ``discover_flow_in_chat`` for that chat.
    """
    lock = threading.Lock()
    indexes = queue.Queue()

    workflows = {}
    for idx in range(len(chats)):
        indexes.put(idx)

    def workflow_labeler_worker():
        while True:
            try:
                idx = indexes.get(block=False)
            except queue.Empty:
                return
            chat = chats[idx]
            try:
                extracted_workflows = discover_flow_in_chat(
                    chat, speaker_role, speakers_flipped)
                with lock:
                    workflows[str(chat)] = extracted_workflows
                    if len(workflows) % 10 == 0:
                        print(f"Workflows from {len(workflows)} chats extracted!")
            except Exception as e:
                # Use a format string: logging %-formats the message with the
                # remaining args, so passing the exception as the message and
                # the chat as an arg produced a logging error instead of a
                # warning.
                logger.warning("Failed to extract %s flows from chat %s: %s",
                               speaker_role, chat, e)
            finally:
                # Always balance the get() above, whatever happened.
                indexes.task_done()

    logger.info(
        f"Starting processing {len(chats)} chats with {concurrency} workers")
    workers = [
        threading.Thread(target=workflow_labeler_worker)
        for _ in range(concurrency)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    logger.info(f"Finished processing all {len(chats)} chats")

    return workflows

In [None]:
test_chat = get_chats_with_ids(["0843c54c-6487-45ce-946a-cc6257484f54"])[0]
str(test_chat), test_chat.messages

In [None]:
# Smoke-test flow discovery on a single chat.
# discover_flow_in_chat returns a `Flows` model, so take its `.flows` list:
# iterating the model itself yields (field_name, value) tuples, and the
# len(workflows) cell below needs a list as well.
workflows = discover_flow_in_chat(test_chat, "agent", speakers_flipped=True).flows
for idx, w in enumerate(workflows):
    print(f"[Agent Workflow#{idx + 1}]")
    # print_workflow expects a dict and ends each workflow with a dashed rule.
    print_workflow(w.model_dump())

In [None]:
len(workflows)

# Relevant chats (from KA-QE trainset)

In [None]:
qe_dataset = Dataset.pull_from_repo(
    "brinks-care-voice/hf:082720241521675544.train")
len(qe_dataset)

In [None]:
qe_dataset[0]

In [None]:
# Unique chat ids in the dataset. Sorted rather than left in set order so the
# list is identical on every kernel run and downstream sampling can be
# reproduced.
chat_ids = sorted({pc.chat_name for pc, _ in qe_dataset})
len(chat_ids)

In [26]:
queries_to_chat = {query: pc.chat_name for pc, query in qe_dataset}

In [None]:
queries = list(queries_to_chat.keys())
len(queries)

In [None]:
source_embeddings = embedder.encode(queries,
                                    convert_to_tensor=True,
                                    show_progress_bar=False)
len(source_embeddings)

In [None]:
# Load the customer evaluation questions.
# newline="" is what the csv module asks for when opening the file, so quoted
# fields containing line breaks are parsed correctly.
with open("Brinks Evaluation v.20240806_ - Response Evaluation (internal).csv",
          newline="") as f:
    reader = csv.DictReader(f)
    data = list(reader)
# The rows are fully materialized above, so this no longer needs the open file.
customer_queries = [entry["Question"] for entry in data]
len(customer_queries)

In [40]:
# For each customer question, collect every dataset query whose cosine
# similarity to it is above 0.7. A dataset query can be added more than once
# when it matches several customer questions.
matched_queries = []
for customer_query in customer_queries:
    query_embedding = embedder.encode(customer_query,
                                      convert_to_tensor=True,
                                      show_progress_bar=False)
    similarities = util.cos_sim(query_embedding,
                                source_embeddings).cpu().tolist()[0]
    matched_queries.extend(
        queries[position]
        for position, similarity in enumerate(similarities)
        if similarity > 0.7)

In [None]:
len(matched_queries)

In [None]:
matched_chats = [queries_to_chat[q] for q in matched_queries]
len(matched_chats)

In [None]:
num_chats = 1000
sample_seed = 42  # fixed so a re-run draws the same random chats

# random.sample raises ValueError when asked for more items than exist, so cap
# the draw at the number of available chats.
sample_size = min(num_chats, len(chat_ids))
# Sort before sampling: chat_ids is built from a set, and a seeded draw only
# repeats if the input order repeats.
random_chat_ids = random.Random(sample_seed).sample(sorted(chat_ids), sample_size)
# Union of the random draw and the chats matched to customer questions.
sampled_chat_ids = sorted(set(random_chat_ids + matched_chats))
print(len(sampled_chat_ids))
sampled_chats = get_chats_with_ids(sampled_chat_ids)
len(sampled_chats)

# Agent Workflow Discovery

In [None]:
# Extract agent flows for every sampled chat; the result maps str(chat) to the
# flows found in that chat.
# NOTE(review): this run uses the default (unflipped) speaker roles, whereas
# the single-chat test earlier passed speakers_flipped=True — confirm which is
# correct for this dataset.
workflows = extract_flows_from_chats(sampled_chats, "agent")

In [None]:
len(workflows)

In [None]:
# Preview the flows extracted from the first five chats.
for pc, ws in list(workflows.items())[:5]:
    print(f"\n\n<{pc}>")
    # Each value is a `Flows` model; its flows live in `.flows`. Iterating the
    # model itself would yield (field_name, value) tuples.
    for idx, w in enumerate(ws.flows):
        print(f"[Agent Workflow#{idx + 1}]")
        # print_workflow expects a dict and ends each workflow with a rule.
        print_workflow(w.model_dump())

In [None]:
## start here

In [97]:
# Index every extracted flow by a title so near-duplicates can be grouped.
# `Flow` has no `title` field, so one is derived from product and issue. Each
# flow is stored as a plain dict (title + model fields) because the cells
# below read w["title"], w["steps"] and w.items().
title_to_workflow, title_to_pc_id = {}, {}
for pc_id, ws in workflows.items():
    if ws is None:
        # No parsed result was stored for this chat; nothing to index.
        continue
    for flow in ws.flows:
        title = f"{flow.product}: {flow.issue}"
        # On a title collision the later flow wins, as before.
        title_to_workflow[title] = {"title": title, **flow.model_dump()}
        title_to_pc_id[title] = pc_id

In [None]:
len(title_to_workflow), len(title_to_pc_id)

In [62]:
def group_semantically_similar_workflows(workflows: list[str], embedder: SentenceTransformer,
                                           semantic_threshold: float = 0.8) -> tuple[list[str], list[list[str]]]:
    """Split workflow strings into solo items and groups of similar items.

    All strings are embedded once with ``embedder.encode``. Each string that
    is not yet in a group is compared with every string by cosine similarity;
    those scoring above ``semantic_threshold`` (other than itself) are its
    matches. A string with matches forms a group with them, absorbing any
    existing group that shares a member with the matches. A string without
    matches is recorded as solo. Progress is printed along the way.

    Args:
        workflows: Strings to compare.
        embedder: Model used to embed the strings.
        semantic_threshold: Similarity a pair must exceed to be grouped.

    Returns:
        ``(solo, groups)``: the ungrouped strings, and a list of groups, each
        a list of the original strings.
    """
    workflow_embeddings = embedder.encode(workflows, convert_to_tensor=True)
    # Both lists hold indexes into `workflows`, not the strings themselves.
    workflow_groups, solo_workflows = [], []
    for idx, workflow in enumerate(workflows):
        # Skip anything an earlier iteration already placed in a group.
        existing_groups = [group for group in workflow_groups if idx in group]
        if existing_groups:
            assert len(
                existing_groups) == 1, "A workflow should only be in 1 group"
            continue

        print(f"Finding similar Workflows for Workflow#{idx}")

        # Similarity of this one embedding against every embedding.
        scores = util.cos_sim(workflow_embeddings[idx:idx + 1],
                              workflow_embeddings).cpu().tolist()[0]
        matches = [
            i for i, score in enumerate(scores) if score > semantic_threshold
        ]
        # Drop the self-match.
        filtered_matches = [i for i in matches if i != idx]
        if filtered_matches:
            outstanding_groups = []
            new_group = filtered_matches + [idx]
            # Merge every existing group that overlaps the matches into the
            # new group; groups without overlap are carried over unchanged.
            for group in workflow_groups:
                if set(filtered_matches).intersection(set(group)):
                    new_group.extend(group)
                else:
                    outstanding_groups.append(group)
            # set() removes indexes duplicated by the merge.
            workflow_groups = outstanding_groups + [list(set(new_group))]
        else:
            solo_workflows.append(idx)

    print(f"Found {len(solo_workflows)} # of Solo Workflows..")
    print(f"Found {len(workflow_groups)} # Groups of Workflows..")

    # Translate indexes back into the original strings.
    return [workflows[idx] for idx in solo_workflows
           ], [[workflows[idx] for idx in group] for group in workflow_groups]

In [None]:
solo_workflows, grouped_workflows = group_semantically_similar_workflows(
    list(title_to_workflow.keys()), embedder)

In [None]:
for each in grouped_workflows:
    print("-" * 50)
    print(len(each))
    for e in each:
        print(e)

In [None]:
unique_workflow_titles = solo_workflows + [random.choice(w) for w in grouped_workflows]
unique_workflows = [title_to_workflow[w] for w in unique_workflow_titles]
len(unique_workflows)

In [None]:
filtered_unique_workflows = [w for w in unique_workflows if len(w["steps"]) > 3] # skip too small workflows
len(filtered_unique_workflows)

In [None]:
filtered_unique_workflows[0]

In [None]:
for w in filtered_unique_workflows[:10]:
    print(w["title"])

In [None]:
# Print every retained workflow. The per-field formatting is delegated to
# print_workflow, which also closes each workflow with a dashed rule, so the
# separate leading rule is no longer needed.
for w in filtered_unique_workflows:
    print("\n\n")
    print_workflow(w)

In [None]:
[w["title"] for w in filtered_unique_workflows]

In [None]:
len(filtered_unique_workflows)