# State Extraction from Conversations - Accuracy Analysis

**Goal:** Extract state from conversations and compare accuracy to actual data availability.

**Hypothesis:** Extraction accuracy should match the percentage of conversations that actually mention state.

In [16]:
import json
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm
import time

# OpenAI import with fallback for older versions
try:
    from openai import OpenAI
except ImportError:
    print("Warning: OpenAI class not found. You may need to upgrade openai package.")
    print("Run: pip install --upgrade openai")
    OpenAI = None

Run: pip install --upgrade openai


## Step 1: Load Data

In [17]:
# Load ABCD dataset
DATA_PATH = "DATA/abcd_v1.1.json"

with open(DATA_PATH, "r") as f:
    data = json.load(f)

print(f"Train samples: {len(data['train'])}")

Train samples: 8034


## Step 2: Check State Presence in Transcripts

In [18]:
# US States
US_STATES = {
    'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR', 'california': 'CA',
    'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE', 'florida': 'FL', 'georgia': 'GA',
    'hawaii': 'HI', 'idaho': 'ID', 'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA',
    'kansas': 'KS', 'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
    'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS',
    'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV', 'new hampshire': 'NH',
    'new jersey': 'NJ', 'new mexico': 'NM', 'new york': 'NY', 'north carolina': 'NC',
    'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK', 'oregon': 'OR', 'pennsylvania': 'PA',
    'rhode island': 'RI', 'south carolina': 'SC', 'south dakota': 'SD', 'tennessee': 'TN',
    'texas': 'TX', 'utah': 'UT', 'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA',
    'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY'
}

STATE_ABBREVS = set(US_STATES.values())
STATE_NAMES = set(US_STATES.keys())

def check_state_in_transcript(transcript):
    """Check if transcript mentions any US state."""
    transcript_lower = transcript.lower()
    
    # Check for state names
    for state_name in STATE_NAMES:
        if state_name in transcript_lower:
            return True, US_STATES[state_name]
    
    # Check for abbreviations
    for abbrev in STATE_ABBREVS:
        if re.search(r'\b' + abbrev.lower() + r'\b', transcript_lower):
            return True, abbrev
    
    return False, None

In [19]:
# Analyze state presence in test set
test_data = data['train']

state_analysis = []

for item in test_data:
    transcript = " ".join([f"{speaker}: {text}" for speaker, text in item["original"]])
    true_state = item["scenario"].get("order", {}).get("state", None)
    state_in_transcript, mentioned_state = check_state_in_transcript(transcript)
    
    state_analysis.append({
        "convo_id": item.get("convo_id", ""),
        "transcript": transcript,
        "true_state": true_state,
        "state_mentioned": state_in_transcript,
        "mentioned_state": mentioned_state
    })

state_df = pd.DataFrame(state_analysis)

# Calculate state mention percentage
total = len(state_df)
with_state = state_df['state_mentioned'].sum()
state_percentage = (with_state / total) * 100

print(f"\n{'='*60}")
print(f"STATE PRESENCE IN TRANSCRIPTS")
print(f"{'='*60}")
print(f"Total conversations: {total}")
print(f"Conversations with state mentioned: {with_state}")
print(f"Percentage with state: {state_percentage:.2f}%")
print(f"{'='*60}\n")


STATE PRESENCE IN TRANSCRIPTS
Total conversations: 8034
Conversations with state mentioned: 7937
Percentage with state: 98.79%



## Step 3: Extract States Using OpenAI API

In [20]:
# Initialize OpenAI client
API_KEY = os.getenv("OPENAI_API_KEY")

if OpenAI is None:
    raise ImportError("OpenAI package is not properly installed. Run: pip install --upgrade openai")

client = OpenAI(api_key=API_KEY)

def extract_state_from_conversation(transcript):
    """Use OpenAI API to extract state."""
    
    prompt = f"""Extract the US STATE from this conversation.

Rules:
- Only extract if state is EXPLICITLY mentioned
- Return 2-letter abbreviation (e.g., "CA", "TX", "NY")
- If NO state mentioned, return null

Examples:
- "123 Main St, Los Angeles, CA" â†’ {{"state": "CA"}}
- "I want to return my order" â†’ {{"state": null}}
- "456 Oak Ave, Austin, Texas" â†’ {{"state": "TX"}}

Conversation:
{transcript}

Response (JSON only):"""
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Extract US state from conversations. Respond with valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0,
            max_tokens=50,
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        return result.get("state", None)
    
    except Exception as e:
        print(f"Error: {e}")
        return None

ImportError: OpenAI package is not properly installed. Run: pip install --upgrade openai

In [None]:
# Extract states from sample (100 conversations)
SAMPLE_SIZE = 100
sample_df = state_df.head(SAMPLE_SIZE).copy()

print(f"Extracting states from {SAMPLE_SIZE} conversations...\n")

extracted_states = []

for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Processing"):
    extracted = extract_state_from_conversation(row['transcript'])
    extracted_states.append(extracted)
    time.sleep(0.1)  # Rate limiting

sample_df['extracted_state'] = extracted_states

print("\nExtraction complete!")

Extracting states from 100 conversations...



Processing: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [01:20<00:00,  1.24it/s]


Extraction complete!





## Step 4: Compare Accuracy vs Data Availability

In [None]:
# Normalize extracted states to lowercase for comparison
sample_df['extracted_state_lower'] = sample_df['extracted_state'].apply(lambda x: x.lower() if pd.notna(x) else None)
sample_df['true_state_lower'] = sample_df['true_state'].apply(lambda x: x.lower() if pd.notna(x) else None)

# Calculate metrics

# 1. State mention rate in sample
state_mention_rate = (sample_df['state_mentioned'].sum() / len(sample_df)) * 100

# 2. Extraction rate (how often we extracted a state)
extraction_rate = (sample_df['extracted_state'].notna().sum() / len(sample_df)) * 100

# 3. Exact match accuracy (extracted state matches true state)
sample_df['exact_match'] = sample_df['extracted_state_lower'] == sample_df['true_state_lower']
exact_match_accuracy = (sample_df['exact_match'].sum() / len(sample_df)) * 100

# 4. Accuracy when state IS mentioned
mentioned_subset = sample_df[sample_df['state_mentioned']]
if len(mentioned_subset) > 0:
    correct_when_mentioned = (mentioned_subset['extracted_state'].notna()).sum()
    extraction_when_mentioned = (correct_when_mentioned / len(mentioned_subset)) * 100
    
    # Exact match when mentioned
    exact_match_when_mentioned = (mentioned_subset['exact_match'].sum() / len(mentioned_subset)) * 100
else:
    extraction_when_mentioned = 0
    exact_match_when_mentioned = 0

# 5. Accuracy when state NOT mentioned (should be null)
not_mentioned_subset = sample_df[~sample_df['state_mentioned']]
if len(not_mentioned_subset) > 0:
    correct_when_not_mentioned = (not_mentioned_subset['extracted_state'].isna()).sum()
    accuracy_when_not_mentioned = (correct_when_not_mentioned / len(not_mentioned_subset)) * 100
else:
    accuracy_when_not_mentioned = 0

print(f"\n{'='*70}")
print(f"RESULTS: STATE EXTRACTION ACCURACY")
print(f"{'='*70}")
print(f"\nðŸ“Š DATA AVAILABILITY:")
print(f"   Conversations with state in transcript: {state_mention_rate:.1f}%")
print(f"\nðŸ¤– EXTRACTION PERFORMANCE:")
print(f"   Extraction rate (extracted any state): {extraction_rate:.1f}%")
print(f"   Exact match accuracy (correct state): {exact_match_accuracy:.1f}%")
print(f"\nðŸŽ¯ DETAILED ACCURACY:")
print(f"   When state IS mentioned:")
print(f"      - Extracted something: {extraction_when_mentioned:.1f}%")
print(f"      - Extracted correctly: {exact_match_when_mentioned:.1f}%")
print(f"   When state NOT mentioned:")
print(f"      - Correctly returned null: {accuracy_when_not_mentioned:.1f}%")
print(f"\nâœ… KEY INSIGHT:")
print(f"   Data availability: {state_mention_rate:.1f}%")
print(f"   Extraction rate: {extraction_rate:.1f}%")
print(f"   Exact match accuracy: {exact_match_accuracy:.1f}%")
print(f"\nðŸ’¡ INTERPRETATION:")
if abs(extraction_rate - state_mention_rate) < 10:
    print(f"   âœ“ Prompt is working correctly!")
    print(f"   âœ“ Extraction rate closely matches data availability")
    print(f"   âœ“ Accuracy limited by information in transcripts, not prompt quality")
else:
    print(f"   âš  Extraction rate: {extraction_rate:.1f}% vs State mention rate: {state_mention_rate:.1f}%")
    print(f"   â†’ Difference of {abs(extraction_rate - state_mention_rate):.1f}% suggests the model")
    print(f"   â†’ is being conservative (only extracting when VERY confident)")
print(f"{'='*70}\n")


RESULTS: STATE EXTRACTION ACCURACY

ðŸ“Š DATA AVAILABILITY:
   Conversations with state in transcript: 98.0%

ðŸ¤– EXTRACTION PERFORMANCE:
   Extraction rate (extracted any state): 13.0%
   Exact match accuracy (correct state): 13.0%

ðŸŽ¯ DETAILED ACCURACY:
   When state IS mentioned:
      - Extracted something: 13.3%
      - Extracted correctly: 13.3%
   When state NOT mentioned:
      - Correctly returned null: 100.0%

âœ… KEY INSIGHT:
   Data availability: 98.0%
   Extraction rate: 13.0%
   Exact match accuracy: 13.0%

ðŸ’¡ INTERPRETATION:
   âš  Extraction rate: 13.0% vs State mention rate: 98.0%
   â†’ Difference of 85.0% suggests the model
   â†’ is being conservative (only extracting when VERY confident)



## Step 5: Show Examples

In [None]:
# Save results
sample_df.to_csv('state_extraction_results.csv', index=False)
print("Results saved to: state_extraction_results.csv\n")

# Show successful extractions (exact matches)
print(f"{'='*70}")
print("EXAMPLES: CORRECT EXTRACTIONS (Exact Match)")
print(f"{'='*70}\n")

correct_extractions = sample_df[sample_df['exact_match'] == True].head(5)
for idx, row in correct_extractions.iterrows():
    print(f"Convo {row['convo_id']}:")
    print(f"  True: {row['true_state']} | Extracted: {row['extracted_state']} | Match: âœ“")
    print(f"  Transcript: {row['transcript'][:150]}...\n")

# Show incorrect extractions
print(f"\n{'='*70}")
print("EXAMPLES: INCORRECT EXTRACTIONS")
print(f"{'='*70}\n")

incorrect = sample_df[(sample_df['state_mentioned']) & (sample_df['exact_match'] == False)].head(5)
if len(incorrect) > 0:
    for idx, row in incorrect.iterrows():
        print(f"Convo {row['convo_id']}:")
        print(f"  True: {row['true_state']} | Extracted: {row['extracted_state']} | Match: âœ—")
        print(f"  Mentioned in transcript: {row['mentioned_state']}")
        print(f"  Transcript: {row['transcript'][:150]}...\n")
else:
    print("No incorrect extractions in this sample!\n")

# Show correct nulls
print(f"{'='*70}")
print("EXAMPLES: CORRECT NULLS (No state mentioned â†’ null extracted)")
print(f"{'='*70}\n")

null_cases = sample_df[(~sample_df['state_mentioned']) & (sample_df['extracted_state'].isna())].head(3)
if len(null_cases) > 0:
    for idx, row in null_cases.iterrows():
        print(f"Convo {row['convo_id']}:")
        print(f"  True: {row['true_state']} (not in transcript) | Extracted: None âœ“")
        print(f"  Transcript: {row['transcript'][:150]}...\n")
else:
    print("All conversations in sample mentioned states.\n")

# Summary statistics
print(f"{'='*70}")
print("SUMMARY STATISTICS")
print(f"{'='*70}")
print(f"Total samples: {len(sample_df)}")
print(f"Correct extractions: {sample_df['exact_match'].sum()}")
print(f"Incorrect extractions: {len(sample_df) - sample_df['exact_match'].sum()}")
print(f"Overall accuracy: {exact_match_accuracy:.1f}%")
print(f"{'='*70}")

Results saved to: state_extraction_results.csv

EXAMPLES: CORRECT EXTRACTIONS (Exact Match)

Convo 8141:
  True: ca | Extracted: CA | Match: âœ“
  Transcript: customer: I bought the wrong size, I want to return it. agent: Thanks for contacting client support, how may I help you today? agent: No problem, Can ...

Convo 8937:
  True: wa | Extracted: WA | Match: âœ“
  Transcript: agent: welcome, how may I help you? customer: Hey, I see somethings I may order can you tell me what time the store closes here locally customer: I am...

Convo 4162:
  True: tx | Extracted: TX | Match: âœ“
  Transcript: agent: hello, How can i help you customer: Hi I want to start a return, the item I received was the wrong size. customer: My name is Albert Sanders an...

Convo 3550:
  True: tx | Extracted: TX | Match: âœ“
  Transcript: agent: Hello, thank you for contacting AcmeBrand. How may I help you? customer: HI! I wanted to buy a couple of items and wanted to inquire about your...

Convo 169:
  True: ca |