 # LLM-Based Uncertainty Classification for Earnings Call Transcripts



 This notebook classifies question-answer pairs from earnings calls into three uncertainty levels:

 - **No Uncertainty**: Clear, definitive statements

 - **Intermediate Uncertainty**: Some hedging or conditional language

 - **High Uncertainty**: Explicit lack of visibility, wide outcome ranges, inability to estimate

 ## 1. Setup and Installation

In [1]:
!pip install transformers torch accelerate bitsandbytes pandas tqdm


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 59.1/59.1 MB 11.6 MB/s eta 0:00:00
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [21]:
import torch
import pandas as pd
import json
import re
from dataclasses import dataclass
from typing import Optional, Literal
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

 ## 2. Configuration

In [4]:
@dataclass
class ClassifierConfig:
    """Configuration for the uncertainty classifier.

    Bundles the model identifier with the loading and generation settings
    that EarningsCallUncertaintyClassifier reads from its ``config``.
    """
    # HuggingFace model id or local path, passed to ``from_pretrained``.
    model_path: str
    # Forwarded as ``device_map`` when the model is loaded.
    device: str = "auto"
    # Any value other than the ones listed is treated as float32 by the loader.
    torch_dtype: str = "auto"  # "auto", "float16", "bfloat16", "float32"
    # Quantization switches; when both are set, 8-bit takes precedence.
    load_in_8bit: bool = False
    load_in_4bit: bool = False
    # Upper bound on generated tokens per classification.
    max_new_tokens: int = 500
    # Sampling is enabled only when this is > 0.
    temperature: float = 0.1  # Low temperature for consistent classification

# Ready-made configurations for popular instruct models, keyed by a short
# alias. Every preset loads its weights with 8-bit quantization.
EXAMPLE_CONFIGS = {
    alias: ClassifierConfig(model_path=repo_id, load_in_8bit=True)
    for alias, repo_id in (
        ("llama3-8b", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("mistral-7b", "mistralai/Mistral-7B-Instruct-v0.3"),
    )
}


 ## 3. Prompt Template

In [5]:
# Prompt template sent to the model for every Q&A pair. It is rendered with
# str.format(question=..., answer=...), so {question} and {answer} are the only
# placeholders; the braces of the JSON example are doubled ({{ and }}) so that
# they come out as literal braces after formatting.
CLASSIFICATION_PROMPT = """You are an expert financial analyst specializing in analyzing earnings call transcripts.

TASK: Classify the uncertainty level in the following Question-Answer pair from an earnings call.

DEFINITION OF UNCERTAINTY:
- Uncertainty measures second-moment uncertainty: lack of visibility, conditionality, inability to estimate, or wide range of possible outcomes.
- Do not treat positive/negative sentiment or clear numeric guidance as uncertainty by itself.
- Focus on whether the speaker expresses confidence in their knowledge/predictions vs. acknowledges limitations.

CLASSIFICATION LABELS:
- NO_UNCERTAINTY: Speaker provides clear, definitive information with confidence. No hedging about visibility or outcomes.
- INTERMEDIATE_UNCERTAINTY: Some hedging language, mild conditionality, or acknowledgment of moderate unknowns, but still provides substantive guidance.
- HIGH_UNCERTAINTY: Explicit statements about lack of visibility, inability to estimate, dependence on unknown factors, or wide range of possible outcomes.

QUESTION-ANSWER PAIR:
<question>
{question}
</question>

<answer>
{answer}
</answer>

INSTRUCTIONS:
1. Analyze the answer for indicators of second-moment uncertainty.
2. Provide a brief reasoning (maximum 5 sentences).
3. Output your final classification.

REQUIRED OUTPUT FORMAT:
```json
{{
    "reasoning": "<your reasoning here, max 5 sentences>",
    "classification": "<NO_UNCERTAINTY|INTERMEDIATE_UNCERTAINTY|HIGH_UNCERTAINTY>"
}}
```

Your response:"""


 ## 4. Uncertainty Classifier Class

In [9]:
class EarningsCallUncertaintyClassifier:
    """Classifies uncertainty in earnings call Q&A pairs using a HuggingFace LLM.

    The tokenizer and model are loaded eagerly in ``__init__``. Each Q&A pair
    is rendered into ``CLASSIFICATION_PROMPT``, passed through the model, and
    the reply is parsed into one of ``VALID_LABELS`` (or ``"PARSE_ERROR"``
    when no label can be recovered).
    """

    VALID_LABELS = ["NO_UNCERTAINTY", "INTERMEDIATE_UNCERTAINTY", "HIGH_UNCERTAINTY"]

    def __init__(self, config: "ClassifierConfig"):
        """Store *config* and immediately load the tokenizer and model."""
        self.config = config
        self.model = None
        self.tokenizer = None
        self._load_model()

    def _load_model(self):
        """Load the model and tokenizer from HuggingFace."""
        print(f"Loading model: {self.config.model_path}")

        # "auto" means "do not pass a dtype"; any unrecognised name falls back
        # to float32.
        if self.config.torch_dtype == "auto":
            dtype = None
        else:
            dtype = {
                "float16": torch.float16,
                "bfloat16": torch.bfloat16,
            }.get(self.config.torch_dtype, torch.float32)

        # NOTE(review): trust_remote_code=True allows the model repository to
        # execute its own Python code at load time; only point model_path at
        # repositories you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_path,
            trust_remote_code=True
        )
        if self.tokenizer.pad_token is None:
            # Reuse EOS so the pad_token_id handed to generate() is defined.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        load_kwargs = {
            "device_map": self.config.device,
            "trust_remote_code": True,
        }

        if dtype is not None:
            load_kwargs["torch_dtype"] = dtype

        # Quantization goes through BitsAndBytesConfig; passing load_in_8bit /
        # load_in_4bit straight to from_pretrained is deprecated. 8-bit wins
        # when both flags are set.
        if self.config.load_in_8bit or self.config.load_in_4bit:
            from transformers import BitsAndBytesConfig

            if self.config.load_in_8bit:
                quantization = BitsAndBytesConfig(load_in_8bit=True)
            else:
                quantization = BitsAndBytesConfig(load_in_4bit=True)
            load_kwargs["quantization_config"] = quantization

        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.model_path,
            **load_kwargs
        )
        print("Model loaded successfully!")

    def _format_prompt(self, question: str, answer: str) -> str:
        """Format the classification prompt with the Q&A pair."""
        return CLASSIFICATION_PROMPT.format(question=question, answer=answer)

    def _apply_chat_template(self, prompt: str) -> str:
        """Wrap *prompt* in the tokenizer's chat template when possible.

        Falls back to returning *prompt* unchanged when the tokenizer has no
        ``apply_chat_template`` method or when applying the template fails;
        a failure is reported as a warning rather than silently ignored.
        """
        if not hasattr(self.tokenizer, 'apply_chat_template'):
            return prompt

        messages = [{"role": "user", "content": prompt}]
        try:
            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
        except Exception as exc:
            # Best effort: keep going with the raw prompt, but say so.
            import warnings

            warnings.warn(
                f"Chat template could not be applied ({exc!r}); "
                "using the raw prompt instead.",
                RuntimeWarning,
                stacklevel=2,
            )
            return prompt

    @classmethod
    def _normalize_label(cls, raw) -> Optional[str]:
        """Map a model-provided label to a member of ``VALID_LABELS``.

        Accepts case differences, surrounding whitespace, spaces or hyphens in
        place of underscores, and partial labels, as long as exactly one valid
        label matches. Returns ``None`` for non-strings, empty strings and
        ambiguous values, so that they are never coerced to an arbitrary label.
        """
        if not isinstance(raw, str):
            return None
        text = re.sub(r"[\s\-]+", "_", raw.strip().upper())
        if not text:
            return None
        if text in cls.VALID_LABELS:
            return text
        candidates = [
            label for label in cls.VALID_LABELS
            if label in text or text in label
        ]
        # Only accept an unambiguous partial match.
        return candidates[0] if len(candidates) == 1 else None

    def _scan_for_label(self, response: str) -> dict:
        """Last-resort parse: look for a label name anywhere in *response*.

        The first entry of ``VALID_LABELS`` found in the upper-cased text is
        returned; the result is always flagged ``parse_success=False``.
        """
        upper = response.upper()
        for label in self.VALID_LABELS:
            if label in upper:
                return {
                    "reasoning": "Could not parse structured response.",
                    "classification": label,
                    "parse_success": False
                }
        return {
            "reasoning": "Failed to parse response.",
            "classification": "PARSE_ERROR",
            "parse_success": False
        }

    def _parse_response(self, response: str) -> dict:
        """Parse the model's JSON response.

        Looks for a fenced ``json`` block first, then for a bare JSON object
        containing a "classification" key, then for a label name in free text.

        Returns:
            dict with keys: reasoning, classification, parse_success.
            ``classification`` is a member of ``VALID_LABELS`` or
            ``"PARSE_ERROR"``.
        """
        fenced = re.search(r'```json\s*(.*?)\s*```', response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            # Try to find JSON without code blocks
            bare = re.search(r'\{[^{}]*"classification"[^{}]*\}', response, re.DOTALL)
            if bare is None:
                return self._scan_for_label(response)
            json_str = bare.group(0)

        try:
            parsed = json.loads(json_str)
        except json.JSONDecodeError:
            # Malformed JSON (e.g. unescaped quotes in the reasoning): try to
            # recover the classification field itself before giving up.
            field = re.search(r'"classification"\s*:\s*"([^"]*)"', json_str)
            label = self._normalize_label(field.group(1)) if field else None
            if label is not None:
                return {
                    "reasoning": "Could not parse structured response.",
                    "classification": label,
                    "parse_success": False
                }
            return {
                "reasoning": "JSON decode error.",
                "classification": "PARSE_ERROR",
                "parse_success": False
            }

        if not isinstance(parsed, dict):
            # Valid JSON, but not the object the prompt asked for.
            return {
                "reasoning": "Failed to parse response.",
                "classification": "PARSE_ERROR",
                "parse_success": False
            }

        label = self._normalize_label(parsed.get("classification"))
        return {
            "reasoning": parsed.get("reasoning", ""),
            "classification": label if label is not None else "PARSE_ERROR",
            "parse_success": label is not None
        }

    def classify(self, question: str, answer: str) -> dict:
        """
        Classify a single Q&A pair.

        Returns:
            dict with keys: reasoning, classification, raw_response, parse_success
        """
        prompt = self._format_prompt(question, answer)
        formatted = self._apply_chat_template(prompt)

        # A chat template already inserts the special tokens it needs; adding
        # them again during tokenization would duplicate them.
        templated = formatted != prompt
        inputs = self.tokenizer(
            formatted,
            return_tensors="pt",
            add_special_tokens=not templated,
        ).to(self.model.device)

        generation_kwargs = {
            "max_new_tokens": self.config.max_new_tokens,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }
        if self.config.temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = self.config.temperature
        else:
            # Greedy decoding: temperature is a sampling-only setting, so it
            # is not passed at all.
            generation_kwargs["do_sample"] = False

        with torch.no_grad():
            outputs = self.model.generate(**inputs, **generation_kwargs)

        # Decode only the new tokens
        prompt_length = inputs['input_ids'].shape[1]
        response = self.tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )

        result = self._parse_response(response)
        result["raw_response"] = response
        return result

    def classify_batch(self, qa_pairs: list[dict], show_progress: bool = True) -> pd.DataFrame:
        """
        Classify multiple Q&A pairs.

        Pairs are processed one at a time, in order.

        Args:
            qa_pairs: List of dicts with 'question' and 'answer' keys; an
                optional 'id' key is kept, otherwise the list index is used
            show_progress: Whether to show progress bar

        Returns:
            DataFrame with classification results
        """
        results = []
        iterator = tqdm(qa_pairs, desc="Classifying") if show_progress else qa_pairs

        for i, qa in enumerate(iterator):
            result = self.classify(qa['question'], qa['answer'])
            result['id'] = qa.get('id', i)
            result['question'] = qa['question']
            result['answer'] = qa['answer']
            results.append(result)

        return pd.DataFrame(results)


 ## 5. Example Usage

In [6]:
# Sample earnings call Q&A pairs for testing.
# Each entry has the 'id', 'question' and 'answer' keys that classify_batch reads.
SAMPLE_QA_PAIRS = [
    # Forward guidance given as a specific numeric range with stated drivers.
    {
        "id": 1,
        "question": "Can you provide guidance on next quarter's revenue?",
        "answer": "We expect revenue to be between $2.1 billion and $2.3 billion, driven by strong demand in our cloud segment and continued growth in enterprise subscriptions."
    },
    # Speaker explicitly states a lack of visibility.
    {
        "id": 2,
        "question": "What's your outlook on margins for the rest of the year?",
        "answer": "It's really difficult to say at this point. There are so many moving pieces with supply chain costs, and we honestly don't have great visibility into how that's going to play out. It could go either way depending on factors largely outside our control."
    },
    # Confident positioning mixed with hedging about how things will unfold.
    {
        "id": 3,
        "question": "How do you see the competitive landscape evolving?",
        "answer": "We're monitoring the situation closely. While we feel good about our position, there's some uncertainty around new entrants and how pricing dynamics might shift. We're prepared for multiple scenarios but can't predict exactly how things will unfold."
    },
    # Backward-looking explanation with concrete figures.
    {
        "id": 4,
        "question": "What was the driver of the revenue miss this quarter?",
        "answer": "The miss was primarily due to delayed enterprise deals that pushed into Q1. We closed $450 million less than expected, specifically from three large contracts that required additional legal review."
    },
]

In [None]:

# Build the configuration for the model to run.
config = ClassifierConfig(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # <-- Change to your model
    load_in_8bit=True,  # Use quantization for memory efficiency
    temperature=0.1,    # Low temp for consistent classification
)
# Initialize the classifier (the constructor loads the tokenizer and model)
classifier = EarningsCallUncertaintyClassifier(config)

# Classify sample Q&A pairs; the result is a DataFrame with one row per pair
results_df = classifier.classify_batch(SAMPLE_QA_PAIRS)

In [15]:
# Print a banner, then one report per classified sample: the raw model output
# first, followed by the (truncated) question and answer and the parsed fields.
print("\n" + "=" * 80, "CLASSIFICATION RESULTS", "=" * 80, sep="\n")

for _, row in results_df.iterrows():
    print(
        f"\n--- Sample {row['id']} ---",
        row['raw_response'],
        f"Q: {row['question'][:100]}...",
        f"A: {row['answer'][:100]}...",
        f"\nClassification: {row['classification']}",
        f"Reasoning: {row['reasoning']}",
        f"Parse Success: {row['parse_success']}",
        sep="\n",
    )



CLASSIFICATION RESULTS

--- Sample 1 ---
Here is the analysis:

```json
{
    "reasoning": "The answer provides a specific revenue range with a clear direction (upward) and a relatively narrow range ($2.1-2.3 billion). The language used is straightforward and lacks any hedging or conditionality. The speaker attributes the expected revenue to specific factors (strong demand in cloud segment and continued growth in enterprise subscriptions), indicating a high level of confidence in their prediction.",
    "classification": "NO_UNCERTAINTY"
}
```

In this answer, the speaker provides a clear and specific revenue range with a clear direction, indicating a high level of confidence in their prediction. The language used is straightforward and lacks any hedging or conditionality, which further supports the classification of NO_UNCERTAINTY.
Q: Can you provide guidance on next quarter's revenue?...
A: We expect revenue to be between $2.1 billion and $2.3 billion, driven by strong demand in our

In [16]:
# Banner followed by the number of samples assigned to each label
print("\n" + "=" * 80, "SUMMARY STATISTICS", "=" * 80, sep="\n")
print(results_df['classification'].value_counts())



SUMMARY STATISTICS
classification
NO_UNCERTAINTY              2
HIGH_UNCERTAINTY            1
INTERMEDIATE_UNCERTAINTY    1
Name: count, dtype: int64


In [18]:
# Save results to JSON as a list with one object per classified Q&A pair
# (orient="records"), pretty-printed with a 2-space indent.
output_path = "uncertainty_classifications.json"
results_df.to_json(output_path, orient="records", indent=2)
print(f"Results saved to {output_path}")

Results saved to uncertainty_classifications.json


 ## 6. Load Your Own Data

In [19]:
def load_qa_pairs_from_json(filepath: str) -> list[dict]:
    """
    Load Q&A pairs from a JSON file.

    Expected JSON format (the "label" key is optional and only used for
    evaluation):
    [
        {
            "id": 1,
            "question": "...",
            "answer": "...",
            "label": "NO_UNCERTAINTY"
        },
        ...
    ]

    Items without an "id" get their position in the list as id.

    Args:
        filepath: Path to JSON file

    Returns:
        List of dicts ready for classification

    Raises:
        ValueError: If the top-level value is not a list, an item is not an
            object, or an item lacks 'question' or 'answer'.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Reject anything that is not a list up front; iterating a dict or a
    # string would otherwise produce confusing per-item errors.
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON array of Q&A objects, got {type(data).__name__}"
        )

    # Validate required fields
    for i, item in enumerate(data):
        if not isinstance(item, dict):
            raise ValueError(
                f"Item {i} must be a JSON object, got {type(item).__name__}"
            )
        if 'question' not in item or 'answer' not in item:
            raise ValueError(f"Item {i} missing required 'question' or 'answer' field")
        item.setdefault('id', i)

    return data

# Example usage:
# my_qa_pairs = load_qa_pairs_from_json("my_earnings_calls.json")
# my_results = classifier.classify_batch(my_qa_pairs)

 ## 7. Analyze Classification Performance

In [22]:
def evaluate_classifications(results_df: pd.DataFrame, label_col: str = "label") -> Optional[dict]:
    """
    Evaluate classification accuracy when ground truth labels are available.

    Rows whose classification is 'PARSE_ERROR' count against the parse
    success rate and are excluded from scoring. Rows with a missing label
    are excluded from scoring as well.

    Args:
        results_df: DataFrame with 'classification' column from classifier
        label_col: Column name containing true labels

    Returns:
        Dict containing evaluation metrics, or None when *label_col* is not a
        column of *results_df*. 'accuracy', 'classification_report' and
        'confusion_matrix' are None when no row can be scored.
    """
    if label_col not in results_df.columns:
        print(f"No '{label_col}' column found. Skipping evaluation.")
        return None

    labels = ["NO_UNCERTAINTY", "INTERMEDIATE_UNCERTAINTY", "HIGH_UNCERTAINTY"]

    n_total = len(results_df)
    # Filter out parse errors
    valid_results = results_df[results_df['classification'] != 'PARSE_ERROR']
    # Only rows that also carry a ground-truth label can be scored.
    scored = valid_results[valid_results[label_col].notna()]

    metrics = {
        # Guard against an empty frame instead of dividing by zero.
        "parse_success_rate": len(valid_results) / n_total if n_total else 0.0,
        "n_total": n_total,
        "n_valid": len(valid_results),
        "n_scored": len(scored),
        "accuracy": None,
        "classification_report": None,
        "confusion_matrix": None,
    }

    if scored.empty:
        return metrics

    y_true = scored[label_col]
    y_pred = scored['classification']

    # float(...) keeps the value a plain Python number for JSON output.
    metrics["accuracy"] = float(accuracy_score(y_true, y_pred))
    metrics["classification_report"] = classification_report(
        y_true,
        y_pred,
        labels=labels,
        output_dict=True
    )
    metrics["confusion_matrix"] = confusion_matrix(
        y_true,
        y_pred,
        labels=labels
    ).tolist()

    return metrics

## 8. Complete Pipeline


In [24]:
def run_full_pipeline(input_path: str, output_path: str, classifier) -> pd.DataFrame:
    """
    Run the full classification pipeline from JSON input to JSON output.

    The output file holds a "metadata" section, the per-sample "results",
    and, when at least one input item carries a non-null label, an
    "evaluation" section computed over the labeled items only.

    Args:
        input_path: Path to input JSON file with Q&A pairs
        output_path: Path for output JSON file with results
        classifier: Initialized EarningsCallUncertaintyClassifier

    Returns:
        DataFrame with classification results
    """
    # Load input data
    qa_pairs = load_qa_pairs_from_json(input_path)
    print(f"Loaded {len(qa_pairs)} Q&A pairs from {input_path}")

    # Look at every item, not just the first one, to decide whether labels
    # are present.
    has_labels = any('label' in qa for qa in qa_pairs)

    # Run classification
    results_df = classifier.classify_batch(qa_pairs)

    # Merge labels if they exist (items without one get None)
    if has_labels:
        results_df['label'] = [qa.get('label') for qa in qa_pairs]

    # An empty input yields a frame without columns, so the parse rate is
    # only computed when there is at least one row.
    n_samples = len(results_df)
    if n_samples:
        parse_success_rate = float(results_df['parse_success'].sum() / n_samples)
    else:
        parse_success_rate = 0.0

    # Prepare output structure
    output_data = {
        "metadata": {
            "model_path": classifier.config.model_path,
            "temperature": classifier.config.temperature,
            "n_samples": n_samples,
            "parse_success_rate": parse_success_rate
        },
        "results": results_df.to_dict(orient="records")
    }

    # Add evaluation metrics, restricted to rows that actually have a label
    if has_labels:
        labeled = results_df[results_df['label'].notna()]
        if not labeled.empty:
            metrics = evaluate_classifications(labeled, "label")
            if metrics:
                output_data["evaluation"] = metrics

    # Save to JSON
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    print(f"Results saved to {output_path}")
    return results_df

# Example usage:
# results_df = run_full_pipeline(
#     input_path="earnings_calls_input.json",
#     output_path="classification_results.json",
#     classifier=classifier
# )

In [26]:
# Run the pipeline end-to-end on the sample input file: the results are
# written to classification_results.json and also returned as a DataFrame.
results_df = run_full_pipeline(
    input_path="sample-input.json",
    output_path="classification_results.json",
    classifier=classifier
)

Loaded 10 Q&A pairs from sample-input.json


Classifying: 100%|██████████| 10/10 [04:21<00:00, 26.15s/it]

Results saved to classification_results.json



