In [1]:
!pip install -q google-genai python-dotenv

In [3]:
import os

from google import genai

def get_gemini_client():
    """Create a Gemini client from an API key found in the environment.

    ``GEMINI_API_KEY`` is consulted first, then ``GOOGLE_API_KEY``; a variable
    that is unset or empty falls through to the next one.

    Returns:
        The object returned by ``genai.Client(api_key=...)``.

    Raises:
        RuntimeError: if neither variable holds a non-empty value.
    """
    for env_var in ("GEMINI_API_KEY", "GOOGLE_API_KEY"):
        api_key = os.getenv(env_var)
        if api_key:
            return genai.Client(api_key=api_key)
    raise RuntimeError("No Gemini API key found in environment variables.")

# Build the client once for this notebook session.
client = get_gemini_client()

# A single short request to confirm that the key and the model name work.
resp = client.models.generate_content(
    contents=(
        "In one sentence, explain why trustworthy metrics matter "
        "in A/B testing LLM prompts for sales calls."
    ),
    model="gemini-2.5-flash",
)

print(resp.text)


Trustworthy metrics are essential to accurately identify LLM prompts that genuinely improve sales call outcomes, directly driving conversions and revenue rather than implementing ineffective or detrimental changes.


In [4]:
import os

BASE_DIR = "/content/svg_prompt_ab_tester"  # you can rename if you want

# Project skeleton: the root folder plus the two sub-folders it needs.
subdirs = [BASE_DIR] + [
    os.path.join(BASE_DIR, folder) for folder in ("assets", ".streamlit")
]

# exist_ok=True keeps this cell safe to re-run.
for d in subdirs:
    os.makedirs(d, exist_ok=True)

BASE_DIR

'/content/svg_prompt_ab_tester'

In [5]:
# Logo location as a relative path (the archive listings later in this notebook
# show the file at svg_prompt_ab_tester/assets/logo.png).
# NOTE(review): the generated app.py hard-codes its own LOGO_PATH instead of
# reading this variable -- keep the two values in sync.
LOGO_PATH = "assets/logo.png"

In [11]:
app_path = os.path.join(BASE_DIR, "app.py")

# The generated source lives in a raw string delimited by ''' so that it can
# contain ordinary """ strings and backslash escapes such as \n verbatim.
# (Inside r"""...""" an escaped \" keeps its backslash, which would write
# invalid `\"\"\"` tokens into app.py.)
app_code = r'''
import os
import streamlit as st

COMPANY_NAME = "Spring Venture Group"
# Relative to the folder the app is started from; the project ships assets/logo.png.
LOGO_PATH = "assets/logo.png"

st.set_page_config(
    page_title=f"{COMPANY_NAME} Prompt A/B Tester",
    page_icon="🧪",
    layout="wide",
)

# --- Header with logo ---
col_logo, col_title = st.columns([1, 3])

with col_logo:
    if os.path.exists(LOGO_PATH):
        st.image(LOGO_PATH, use_column_width=True)
    else:
        st.markdown(
            f"**{COMPANY_NAME}**\n\n_(Logo missing at `{LOGO_PATH}` — please add it to the assets folder.)_"
        )

with col_title:
    st.title("Spring Venture Group Prompt A/B Tester — Trustworthy Conversational Metrics")
    st.markdown(
        """\
This prototype Streamlit app is built **specifically for Spring Venture Group** to explore how different
LLM prompts extract structured, trustworthy metrics from unstructured sales and health insurance conversations.
"""
    )

st.markdown("---")

# --- Start with WHY ---
st.subheader("Why this app?")
st.markdown(
        """\
Modern call transcripts are rich but unstructured. To make them useful, we define prompts that transform raw
conversations into **structured signals** (e.g., intent, sentiment, friction, next-best-action).
This app is designed to:
- Compare two prompt definitions side-by-side (Prompt A vs Prompt B)
- Evaluate how trustworthy and consistent their outputs are
- Provide lightweight, explainable metrics that align with a **scientific, experiment-driven mindset**
"""
)

st.info(
    "Next steps (to be implemented): transcript input, Prompt A/B definitions, Gemini-powered JSON extraction, "
    "and evaluation metrics (compliance, coverage, consistency, risk)."
)
'''

# Fail fast in the notebook if the generated source is not valid Python.
# compile() only parses the text; it does not run the Streamlit app.
compile(app_code, app_path, "exec")

with open(app_path, "w", encoding="utf-8") as f:
    f.write(app_code)

app_path

'/content/svg_prompt_ab_tester/app.py'

In [12]:
req_path = os.path.join(BASE_DIR, "requirements.txt")

# One requirement per line; the leading newline is stripped again when writing.
requirements = (
    "\n"
    "streamlit>=1.40.0\n"
    "google-genai>=0.3.0\n"
    "python-dotenv>=1.0.0\n"
    "pillow>=10.0.0\n"
)

with open(req_path, "w", encoding="utf-8") as f:
    # print() supplies the single trailing newline.
    print(requirements.strip(), file=f)

req_path

'/content/svg_prompt_ab_tester/requirements.txt'

In [13]:
readme_path = os.path.join(BASE_DIR, "README.md")

# README text; the mis-encoded dash sequences are restored to real em dashes
# so the file (written as UTF-8 below) renders correctly.
readme = """
# Spring Venture Group Prompt A/B Tester — Trustworthy Conversational Metrics

> Built specifically for **Spring Venture Group** to demonstrate a "Start with Why" mindset around evaluating LLM prompts
> for trustworthy, experiment-ready conversational metrics.

This repository contains a Streamlit MVP that compares two LLM prompts (Prompt A vs Prompt B) on the same call transcript
and computes lightweight evaluation metrics such as JSON compliance, coverage of expected keys, consistency, and
hallucination-risk heuristics.

**Status:** MVP skeleton — UI shell and project structure created. Functionality will be added in subsequent steps.
"""

with open(readme_path, "w", encoding="utf-8") as f:
    # Trim the blank first line of the literal and end with exactly one newline.
    f.write(readme.strip() + "\n")

readme_path

'/content/svg_prompt_ab_tester/README.md'

In [18]:
import os

BASE_DIR = "/content/svg_prompt_ab_tester"
app_path = os.path.join(BASE_DIR, "app.py")

# The generated source lives in a raw string delimited by ''' so that it can
# contain ordinary """ strings and backslash escapes such as \n verbatim.
# (Inside r"""...""" an escaped \" keeps its backslash, which would write
# invalid `\"\"\"` tokens into app.py.)
app_code = r'''
import os
import streamlit as st
from google import genai
import json
import re # For cleaning JSON output

COMPANY_NAME = "Spring Venture Group"
# Relative to the folder the app is started from; the project ships assets/logo.png.
LOGO_PATH = "assets/logo.png"

EXAMPLE_TRANSCRIPT = """Agent: Hi, thanks for calling Spring Venture Group. How can I help you today?
Customer: I'm trying to understand my options for health insurance. My employer plan is getting too expensive.
Agent: Got it. I'll ask a few questions about your needs and budget, then we can compare some options.
Customer: Sure, that sounds good.
Agent: Great. First, are you primarily concerned about monthly premium, out-of-pocket costs, or keeping your current doctors?
Customer: Mostly monthly premium, but I don't want surprise bills either.
Agent: Understood. Based on what you're telling me, I can walk you through a couple of plans and highlight trade-offs.
Customer: Okay, let's do that.
"""


def get_gemini_client():
    """Return a genai.Client built from GEMINI_API_KEY, falling back to GOOGLE_API_KEY.

    Raises:
        RuntimeError: if neither environment variable holds a non-empty value.
    """
    api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("No Gemini API key found in environment variables.")
    return genai.Client(api_key=api_key)


def generate_json(prompt_text: str, transcript: str, schema_keys: list):
    """Run one prompt against a transcript and parse the model reply as JSON.

    Args:
        prompt_text: Prompt template; every "{{TRANSCRIPT}}" is replaced by ``transcript``.
        transcript: Conversation text to analyse.
        schema_keys: Expected top-level keys. Not used by this function yet; it is
            accepted so callers can already pass the schema.

    Returns:
        A ``(raw_text, parsed_json, error)`` tuple. ``error`` is None on success.
        ``parsed_json`` is None when the reply is not valid JSON. ``raw_text`` is
        also None when the request itself failed.

    Raises:
        RuntimeError: propagated from get_gemini_client() when no API key is set.
    """
    client = get_gemini_client()

    full_prompt = prompt_text.replace("{{TRANSCRIPT}}", transcript)

    try:
        # In the google-genai SDK, generation is a method of client.models;
        # client.models.get() only returns model metadata.
        response = client.models.generate_content(
            model="gemini-2.5-flash",  # Or another suitable model
            contents=full_prompt,
        )
        raw_text = response.text

        # Attempt to clean and parse the JSON
        # Remove any leading/trailing ```json or ```
        cleaned_text = raw_text.strip()
        if cleaned_text.startswith("```json") and cleaned_text.endswith("```"):
            cleaned_text = cleaned_text[7:-3].strip()
        elif cleaned_text.startswith("```") and cleaned_text.endswith("```"):
            cleaned_text = cleaned_text[3:-3].strip()

        parsed_json = json.loads(cleaned_text)
        return raw_text, parsed_json, None
    except json.JSONDecodeError as e:
        return raw_text, None, f"JSON parse error: {e}"
    except Exception as e:
        return None, None, f"Error during content generation: {e}"

def evaluate_prompt_output(
    parsed: dict,
    raw_1: str,
    raw_2: str = None,
    required_keys: list = None,
    transcript_text: str = None,
):
    """Compute lightweight quality metrics for one parsed prompt output.

    Args:
        parsed: Parsed JSON from the model, or None when parsing failed.
        raw_1: Raw model text (not used by the metrics implemented so far).
        raw_2: Optional second raw output (not used yet).
        required_keys: Keys the output is expected to contain.
        transcript_text: Source transcript (not used yet).

    Returns:
        dict with "is_json_compliant", "missing_keys", "key_coverage_score" and the
        placeholder entries "consistency_score" / "hallucination_risk" (both None).
    """
    metrics = {}

    # 1. JSON Compliance (handled implicitly by successful parsing)
    metrics["is_json_compliant"] = parsed is not None

    # 2. Key Coverage
    # Only a non-empty JSON object can cover keys; a top-level list or scalar
    # is scored as covering nothing instead of raising on .keys().
    if required_keys and parsed and isinstance(parsed, dict):
        # Keep the order of required_keys so the report is deterministic.
        missing_keys = [key for key in required_keys if key not in parsed]
        metrics["missing_keys"] = missing_keys
        metrics["key_coverage_score"] = (len(required_keys) - len(missing_keys)) / len(required_keys)
    else:
        metrics["missing_keys"] = list(required_keys) if required_keys else []
        metrics["key_coverage_score"] = 0.0

    # Placeholder for other metrics (consistency, risk, etc.)
    metrics["consistency_score"] = None # To be implemented
    metrics["hallucination_risk"] = None # To be implemented

    return metrics


st.set_page_config(
    page_title=f"{COMPANY_NAME} Prompt A/B Tester",
    page_icon="🧪",
    layout="wide",
)

# --- Header with logo ---
col_logo, col_title = st.columns([1, 3])

with col_logo:
    if os.path.exists(LOGO_PATH):
        st.image(LOGO_PATH, use_column_width=True)
    else:
        st.markdown(
            f"**{COMPANY_NAME}**\n\n_(Logo missing at `{LOGO_PATH}` — please add it to the assets folder.)_"
        )

with col_title:
    st.title("Spring Venture Group Prompt A/B Tester — Trustworthy Conversational Metrics")
    st.markdown(
        """\
This prototype Streamlit app is built **specifically for Spring Venture Group** to explore how different
LLM prompts extract structured, trustworthy metrics from unstructured sales and health insurance conversations.
"""
    )

st.markdown("---")

# --- Start with WHY ---
st.subheader("Why this app?")
st.markdown(
        """\
Modern call transcripts are rich but unstructured. To make them useful, we define prompts that transform raw
conversations into **structured signals** (e.g., intent, sentiment, friction, next-best-action).
This app is designed to:
- Compare two prompt definitions side-by-side (Prompt A vs Prompt B)
- Evaluate how trustworthy and consistent their outputs are
- Provide lightweight, explainable metrics that align with a **scientific, experiment-driven mindset**
"""
)

st.markdown("---")

# -----------------------
# 1. Conversation Transcript
# -----------------------
st.header("1. Conversation Transcript")

st.markdown(
    """\
Paste a **single call transcript** here, or upload a `.txt` file.
    You can also load a small example transcript for quick demo purposes.
"""
)

# Session state for transcript text
if "transcript_text" not in st.session_state:
    st.session_state["transcript_text"] = ""

col_input, col_side = st.columns([3, 1])

with col_input:
    transcript_text = st.text_area(
        "Paste call transcript",
        value=st.session_state["transcript_text"],
        height=260,
        placeholder="Paste the full conversation transcript here...",
    )
    # keep state in sync with manual edits
    st.session_state["transcript_text"] = transcript_text

with col_side:
    uploaded_file = st.file_uploader(
        "Or upload transcript (.txt)",
        type=["txt"],
        help="Upload a plain-text file with the call transcript.",
    )
    if uploaded_file is not None:
        try:
            content = uploaded_file.read().decode("utf-8", errors="ignore")
            st.session_state["transcript_text"] = content
            st.success("Transcript loaded from file. You can review/edit it on the left.")
        except Exception as e:
            st.error(f"Could not read uploaded file: {e}")

    if st.button("Load example transcript"):
        st.session_state["transcript_text"] = EXAMPLE_TRANSCRIPT
        st.info("Example transcript loaded. You can edit it in the text area.")

st.markdown(
    """\
_This transcript will be used as the shared input when we later compare **Prompt A** vs **Prompt B** using Gemini._
"""
)

st.info(
    "Next steps (to be implemented): Prompt A/B inputs, expected JSON schema keys, Gemini-backed extraction, "
    "and evaluation metrics (compliance, coverage, consistency, risk)."
)
'''

# Fail fast in the notebook if the generated source is not valid Python.
# compile() only parses the text; it does not run the Streamlit app.
compile(app_code, app_path, "exec")

with open(app_path, "w", encoding="utf-8") as f:
    f.write(app_code)

print("Rewrote app.py")

Rewrote app.py


In [24]:
%cd /content
!zip -r svg_prompt_ab_tester.zip svg_prompt_ab_tester

/content
  adding: svg_prompt_ab_tester/ (stored 0%)
  adding: svg_prompt_ab_tester/__pycache__/ (stored 0%)
  adding: svg_prompt_ab_tester/__pycache__/app.cpython-312.pyc (deflated 54%)
  adding: svg_prompt_ab_tester/app.py (deflated 70%)
  adding: svg_prompt_ab_tester/README.md (deflated 38%)
  adding: svg_prompt_ab_tester/assets/ (stored 0%)
  adding: svg_prompt_ab_tester/assets/logo.png (stored 0%)
  adding: svg_prompt_ab_tester/requirements.txt (deflated 11%)
  adding: svg_prompt_ab_tester/.streamlit/ (stored 0%)


In [25]:
# Switch the notebook's working directory to the project folder.
%cd /content/svg_prompt_ab_tester

/content/svg_prompt_ab_tester


In [27]:
%%writefile test_backend.py
import os
import json

from app import generate_json, evaluate_prompt_output, EXAMPLE_TRANSCRIPT

# Make sure key exists
api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("No GEMINI_API_KEY / GOOGLE_API_KEY set in environment!")

schema_keys = ["summary", "customer_intent", "sentiment", "friction_points", "next_best_action"]

print("Calling Gemini once for Prompt A-style test...")

prompt_text = """
You are a conversation analyst at Spring Venture Group. Given the call transcript below, extract a single JSON object
with exactly these keys: summary, customer_intent, sentiment, friction_points, next_best_action.

Output STRICT JSON only.

Transcript:
{{TRANSCRIPT}}
"""

raw, parsed, err = generate_json(prompt_text, EXAMPLE_TRANSCRIPT, schema_keys)

print("\nRaw output:\n", raw[:500], "...\n")

if err:
    print("ERROR:", err)
else:
    print("Parsed JSON:\n", json.dumps(parsed, indent=2))

    metrics = evaluate_prompt_output(
        parsed=parsed,
        raw_1=raw,
        raw_2=None,
        required_keys=schema_keys,
        transcript_text=EXAMPLE_TRANSCRIPT,
    )
    print("\nMetrics:\n", json.dumps(metrics, indent=2))

Writing test_backend.py


In [29]:
%cd /content/svg_prompt_ab_tester
!pip install -q -r requirements.txt

/content/svg_prompt_ab_tester
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.1/9.1 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [32]:
%cd /content/svg_prompt_ab_tester
# Byte-compile app.py first; "Syntax OK" is echoed only if that succeeds.
!python -m py_compile app.py && echo "Syntax OK"
# Runs the smoke test script, which imports app.py and calls Gemini once.
!python test_backend.py

/content/svg_prompt_ab_tester
Syntax OK
2026-02-04 01:03:10.743 
  command:

    streamlit run test_backend.py [ARGUMENTS]
2026-02-04 01:03:10.747 Session state does not function when running a script without `streamlit run`
Calling Gemini once for Prompt A-style test...

Raw output:
 ```json
{
  "summary": "The customer called Spring Venture Group to explore health insurance options because their employer plan is becoming too expensive. The agent proposed asking questions about needs and budget to compare options, which the customer agreed to. The customer's primary concern is monthly premium, but they also want to avoid surprise bills.",
  "customer_intent": "To understand and compare health insurance options, specifically looking for more affordable alternatives to their c ...

Parsed JSON:
 {
  "summary": "The customer called Spring Venture Group to explore health insurance options because their employer plan is becoming too expensive. The agent proposed asking questions about need

In [33]:
%cd /content
!zip -r svg_prompt_ab_tester2.zip svg_prompt_ab_tester

/content
  adding: svg_prompt_ab_tester/ (stored 0%)
  adding: svg_prompt_ab_tester/__pycache__/ (stored 0%)
  adding: svg_prompt_ab_tester/__pycache__/app.cpython-312.pyc (deflated 54%)
  adding: svg_prompt_ab_tester/test_backend.py (deflated 49%)
  adding: svg_prompt_ab_tester/app.py (deflated 70%)
  adding: svg_prompt_ab_tester/README.md (deflated 38%)
  adding: svg_prompt_ab_tester/assets/ (stored 0%)
  adding: svg_prompt_ab_tester/assets/logo.png (stored 0%)
  adding: svg_prompt_ab_tester/requirements.txt (deflated 11%)
  adding: svg_prompt_ab_tester/.streamlit/ (stored 0%)
