In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from qualitative_analysis import (
    clean_and_normalize,
    sanitize_dataframe,
    get_llm_client,
    compute_cohens_kappa,
    process_general_verbatims,
    calculate_and_log,
)
import qualitative_analysis.config as config

In [2]:
# File name of each dataset split inside the sh0416/ag_news dataset repository
splits = dict(train='train.jsonl', test='test.jsonl')

# Only the 'train' split is loaded in this notebook
train_url = f"hf://datasets/sh0416/ag_news/{splits['train']}"

# JSON Lines input: every line of the file becomes one DataFrame row
df = pd.read_json(train_url, lines=True)

# Quick look at the first rows
print(df.head())

  from .autonotebook import tqdm as notebook_tqdm


   label                                              title  \
0      3  Wall St. Bears Claw Back Into the Black (Reuters)   
1      3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2      3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3      3  Iraq Halts Oil Exports from Main Southern Pipe...   
4      3  Oil prices soar to all-time record, posing new...   

                                         description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  


In [3]:
# Free-text columns that are passed through clean_and_normalize
text_columns = ["title", "description"]

# Overwrite each text column of df with its normalized version
for column_name in text_columns:
    normalized_column = clean_and_normalize(df[column_name])
    df[column_name] = normalized_column

# Frame-level sanitizing; `data` is the frame the following cells work on
data = sanitize_dataframe(df)

In [4]:
# Build one text per article ("verbatim") from its title and description
data['verbatim'] = [
    f"Title: {title}\n\nDescription: {description}"
    for title, description in zip(data['title'], data['description'])
]

# Plain Python list of all verbatims
verbatims = data['verbatim'].to_list()

print(f"Total number of verbatims: {len(verbatims)}")
print(f"Verbatim example:\n{verbatims[:10]}")

Total number of verbatims: 120000
Verbatim example:
["Title: Wall St. Bears Claw Back Into the Black (Reuters)\n\nDescription: Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'Title: Carlyle Looks Toward Commercial Aerospace (Reuters)\n\nDescription: Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.', "Title: Oil and Economy Cloud Stocks' Outlook (Reuters)\n\nDescription: Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.", 'Title: Iraq Halts Oil Exports from Main Southern Pipeline (Reuters)\n\nDescription: Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel milit

In [5]:
label_column = df['label']
# Distinct class labels present in the 'label' column
print(label_column.unique())
# Number of rows per class label
print(label_column.value_counts())

[3 4 2 1]
label
3    30000
4    30000
2    30000
1    30000
Name: count, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split

# Sampling parameters, named once so the comments cannot drift from the values.
SUBSET_SIZE = 200    # number of rows drawn from the full dataset
VAL_FRACTION = 0.2   # share of the subset held out for validation
SPLIT_SEED = 42      # fixed seed so the same rows are drawn on every run

# Step 1: draw a label-stratified subset of SUBSET_SIZE rows.
# train_test_split returns two parts; only the first (train_size rows) is kept.
data_subset, _ = train_test_split(
    data,
    train_size=SUBSET_SIZE,
    stratify=data['label'],
    random_state=SPLIT_SEED
)

# Step 2: split the subset into train/val, again stratified by label.
train_data, val_data = train_test_split(
    data_subset,
    test_size=VAL_FRACTION,
    stratify=data_subset['label'],
    random_state=SPLIT_SEED
)

# Later cells assign a "ModelPrediction" column to both frames. Taking explicit
# copies makes them independent DataFrames, so those assignments do not act on
# a selection of `data` (which can trigger pandas' SettingWithCopyWarning).
train_data = train_data.copy()
val_data = val_data.copy()

print("Train size:", len(train_data))
print("Val size:", len(val_data))


Train size: 160
Val size: 40


In [14]:
# Prompt building blocks.
# The six scenarios use three prompt templates that share most of their text,
# so the shared parts are written once and concatenated. The first three
# category lines and the first "OR" carry a trailing space; it is spelled out
# here so the resulting strings stay exactly as they were.

_PROMPT_HEAD = (
    "You are a helpful assistant tasked with evaluating:\n"
    "\n"
    "Input:\n"
    "{verbatim_text}\n"
    "\n"
    "Evaluate using:\n"
    "**Codebook:**\n"
    "The article belongs to one of the following categories:\n"
    "1: World \n"
    "2: Sports \n"
    "3: Business \n"
    "4: Sci/Tech\n"
    "\n"
)

_PROMPT_EXAMPLES = (
    "Examples:\n"
    "1.  Title: Leaders Sign Historic Pact Amid Growing Regional Tensions\n"
    "    Description: In a landmark conference, heads of state from three neighboring countries signed a peace agreement to curb escalating border disputes and promote economic cooperation.\n"
    "    Reasoning: The article discusses international diplomacy and involves multiple countries working together to resolve conflicts, which falls under the broader context of global affairs.\n"
    "    Final answer: 1\n"
    "\n"
    "2.  Title: Hometown Stars Triumph in Nail-Biting Championship Finale\n"
    "    Description: The Greenfield Gators clinched their first league title with a thrilling last-minute score, sending fans into a frenzy and ending a decades-long championship drought.\n"
    "    Reasoning: This article is focused on a sports event—a championship game—and the excitement surrounding a team’s victory, which clearly places it in the sports domain.\n"
    "    Final answer: 2\n"
    "\n"
    "3.  Title: Tech Retailers Merge to Create Largest Electronics Chain\n"
    "    Description: The surprise merger between two major electronics retailers is expected to reshape the market, boost consumer choice, and significantly impact industry competitors.\n"
    "    Reasoning: The content centers on a merger between large companies, discussing market impact and investor interest, which falls under business and financial news.\n"
    "    Final answer: 3\n"
    "\n"
    "4.  Title: Scientists Reveal New Satellite Data for Early Storm Detection\n"
    "    Description: A team of researchers has launched a cutting-edge satellite that uses artificial intelligence to predict tropical storms and hurricanes with unprecedented accuracy.\n"
    "    Reasoning: This article discusses scientific innovation and technological advancements—specifically, satellite technology and AI—placing it squarely in the sci/tech category.\n"
    "    Final answer: 4\n"
    "\n"
)

_PROMPT_DATA_FORMAT = (
    "Data format:\n"
    "Title: The title of the article\n"
    "Description: The description of the article\n"
    "\n"
)

_PROMPT_ANSWER_ONLY = (
    "Respond ONLY with the category number (1, 2, 3, or 4). No explanations.\n"
)

_PROMPT_STEP_BY_STEP = (
    "First, think step-by-step, and give the classification label in this format only:\n"
    "Final answer: 1\n"
    "OR \n"
    "Final answer: 2\n"
    "OR\n"
    "Final answer: 3\n"
    "OR\n"
    "Final answer: 4\n"
)

# (prompt_name, prefix, template) for each prompt variant, in run order.
_PROMPT_VARIANTS = [
    (
        "base_prompt",
        None,
        _PROMPT_HEAD + _PROMPT_DATA_FORMAT + _PROMPT_ANSWER_ONLY,
    ),
    (
        "reasoning_prompt",
        "Final answer",
        _PROMPT_HEAD + _PROMPT_DATA_FORMAT + _PROMPT_STEP_BY_STEP,
    ),
    (
        "reasoning_prompt_examples",
        "Final answer",
        _PROMPT_HEAD + _PROMPT_EXAMPLES + _PROMPT_DATA_FORMAT + _PROMPT_STEP_BY_STEP,
    ),
]

# One scenario per (model, prompt variant): all variants for gpt-4o-mini
# first, then all variants for gpt-4o.
scenarios = [
    {
        "provider": "azure",
        "model_name": model_name,
        "temperature": 0.0,
        "prefix": prefix,
        "prompt_name": prompt_name,
        "template": template,
    }
    for model_name in ("gpt-4o-mini", "gpt-4o")
    for prompt_name, prefix, template in _PROMPT_VARIANTS
]

In [None]:
import time


def _evaluate_split(split_df, client, model, prompt, answer_prefix, temp, show_details):
    """
    Classify every verbatim of one split and score the predictions.

    The predictions are written to split_df["ModelPrediction"] (the frame that
    is passed in is modified). Returns (kappa, accuracy, totals), where totals
    is the third value returned by process_general_verbatims.
    """
    pred_df, _cost_info, totals = process_general_verbatims(
        verbatims_subset=split_df["verbatim"].tolist(),
        llm_client=client,
        model_name=model,
        prompt_template=prompt,
        prefix=answer_prefix,
        temperature=temp,
        verbose=show_details
    )

    split_df["ModelPrediction"] = pred_df["Label"].values
    y_true = split_df["label"].tolist()
    # Missing predictions become -1, a value outside the label set 1-4
    y_pred = split_df["ModelPrediction"].fillna(-1).tolist()

    kappa = compute_cohens_kappa(y_true, y_pred, labels=[1,2,3,4])
    accuracy = accuracy_score(y_true, y_pred)
    return kappa, accuracy, totals


all_results = []
verbose = True
N_train = len(train_data)
N_val = len(val_data)

for scenario in scenarios:
    provider = scenario["provider"]
    model_name = scenario["model_name"]
    temperature = scenario["temperature"]
    prompt_name = scenario["prompt_name"]
    template = scenario["template"]
    prefix = scenario.get("prefix", None)

    print(f"\n=== Running scenario: {prompt_name} | model={model_name} | temp={temperature} ===")

    # The timer covers client creation plus both evaluation passes
    start_time = time.time()

    # A fresh client is created for every scenario
    llm_client = get_llm_client(provider=provider, config=config.MODEL_CONFIG[provider])

    # Train split first, then validation split, with identical settings
    kappa_train, accuracy_train, train_totals = _evaluate_split(
        train_data, llm_client, model_name, template, prefix, temperature, verbose
    )
    kappa_val, accuracy_val, val_totals = _evaluate_split(
        val_data, llm_client, model_name, template, prefix, temperature, verbose
    )

    # Token and cost totals over both splits
    total_tokens = train_totals["total_tokens_used"] + val_totals["total_tokens_used"]
    total_cost = train_totals["total_cost"] + val_totals["total_cost"]

    elapsed_time_seconds = time.time() - start_time

    # One summary row per scenario
    all_results.append({
        "data_set": "AG_news",
        "N_train": N_train,
        "N_val": N_val,
        "provider": provider,
        "model_name": model_name,
        "temperature": temperature,
        "prompt_name": prompt_name,
        "kappa_train": kappa_train,
        "kappa_val": kappa_val,
        "accuracy_train": accuracy_train,
        "accuracy_val": accuracy_val,
        "tokens_used": total_tokens,
        "cost": total_cost,
        "running_time_s": elapsed_time_seconds,
    })

# Collect all scenario rows in one table
summary_df = pd.DataFrame(all_results)


=== Running scenario: base_prompt | model=gpt-4o-mini | temp=0.0 ===

=== Processing Verbatim 1/56 ===
Prompt:
You are a helpful assistant tasked with evaluating:

Input:
Title: El Duque Restores Order in the Bronx (AP)

Description: AP - Orlando Hernandez was one of the few people in New York unaware of the final score of the historic rout the Cleveland Indians handed the Yankees.

Evaluate using:
**Codebook:**
The article belongs to one of the following categories:
1: World 
2: Sports 
3: Business 
4: Sci/Tech

Data format:
Title: The title of the article
Description: The description of the article

Respond ONLY with the category number (1, 2, 3, or 4). No explanations.



=== LLM Response ===
2

Label: 2

=== Processing Verbatim 2/56 ===
Prompt:
You are a helpful assistant tasked with evaluating:

Input:
Title: Retail rivals moving to town

Description: A town that hasn't had a grocery store for at least a decade may soon have two. A Super Stop  amp; Shop is set to open today on Pl

In [16]:
# Display the summary table (one row per scenario) built by the previous cell
summary_df

Unnamed: 0,data_set,N_train,N_val,provider,model_name,temperature,prompt_name,kappa_train,kappa_val,accuracy_train,accuracy_val,total_tokens_used,total_cost,running_time_s
0,AG_news,56,14,azure,gpt-4o-mini,0.0,base_prompt,0.904762,0.716216,0.928571,0.785714,11389,0.001738,15.111477
1,AG_news,56,14,azure,gpt-4o-mini,0.0,reasoning_prompt,0.857143,0.810811,0.892857,0.857143,27731,0.010757,141.831936
2,AG_news,56,14,azure,gpt-4o-mini,0.0,reasoning_prompt_examples,0.857143,0.810811,0.892857,0.857143,42761,0.009315,67.510695
3,AG_news,56,14,azure,gpt-4o,0.0,base_prompt,0.904762,0.619048,0.928571,0.714286,11389,0.02901,55.914787
4,AG_news,56,14,azure,gpt-4o,0.0,reasoning_prompt,0.880952,0.736842,0.910714,0.571429,30868,0.216286,515.056727
5,AG_news,56,14,azure,gpt-4o,0.0,reasoning_prompt_examples,0.878694,0.810811,0.892857,0.857143,49525,0.222937,436.457889


In [17]:
# Pass the summary table to calculate_and_log (imported from qualitative_analysis
# in the first cell; what it computes and where it logs is not visible here)
calculate_and_log(summary_df)

In [None]:
import json
import pandas as pd
from typing import List, Dict, Any, Optional

verbose = False

# Settings shared by both prompt-improvement scenarios: LLM1 is the model whose
# prompt is being improved, LLM2 is the "improver" that proposes new prompts.
_ITER_LLM_SETTINGS = {
    "provider_llm1": "azure",
    "model_name_llm1": "gpt-4o-mini",
    "temperature_llm1": 0.0,
    "provider_llm2": "azure",
    "model_name_llm2": "gpt-4o",
    "temperature_llm2": 0.7,
}

# Opening part common to both initial prompts
_ITER_PROMPT_HEAD = (
    "You are a helpful assistant tasked with evaluating:\n"
    "\n"
    "Input:\n"
    "{verbatim_text}\n"
    "\n"
    "Evaluate using:\n"
    "**Codebook:**\n"
    "The article belongs to one of the following categories:\n"
    "1: World\n"
    "2: Sports\n"
    "3: Business\n"
    "4: Sci/Tech\n"
    "\n"
)

# Initial prompt that asks for the bare category number
_ITER_BASE_TEMPLATE = _ITER_PROMPT_HEAD + (
    "Data format:\n"
    "Title: The title of the article\n"
    "Description: The description of the article\n"
    "\n"
    "Respond ONLY with the category number (1, 2, 3, or 4). No explanations.\n"
)

# Initial prompt with worked examples and a "Final answer: N" reply format
_ITER_REASONING_TEMPLATE = _ITER_PROMPT_HEAD + (
    "Examples:\n"
    "1.  Title: Leaders Sign Historic Pact Amid Growing Regional Tensions\n"
    "    Description: In a landmark conference, heads of state from three neighboring countries signed a peace agreement to curb escalating border disputes and promote economic cooperation.\n"
    "    Reasoning: The article discusses international diplomacy and involves multiple countries working together to resolve conflicts, which falls under the broader context of global affairs.\n"
    "    Final answer: 1\n"
    "\n"
    "2.  Title: Hometown Stars Triumph in Nail-Biting Championship Finale\n"
    "    Description: The Greenfield Gators clinched their first league title with a thrilling last-minute score, sending fans into a frenzy and ending a decades-long championship drought.\n"
    "    Reasoning: This article is focused on a sports event—a championship game—and the excitement surrounding a team’s victory, which clearly places it in the sports domain.\n"
    "    Final answer: 2\n"
    "\n"
    "3.  Title: Tech Retailers Merge to Create Largest Electronics Chain\n"
    "    Description: The surprise merger between two major electronics retailers is expected to reshape the market, boost consumer choice, and significantly impact industry competitors.\n"
    "    Reasoning: The content centers on a merger between large companies, discussing market impact and investor interest, which falls under business and financial news.\n"
    "    Final answer: 3\n"
    "\n"
    "4.  Title: Scientists Reveal New Satellite Data for Early Storm Detection\n"
    "    Description: A team of researchers has launched a cutting-edge satellite that uses artificial intelligence to predict tropical storms and hurricanes with unprecedented accuracy.\n"
    "    Reasoning: This article discusses scientific innovation and technological advancements—specifically, satellite technology and AI—placing it squarely in the sci/tech category.\n"
    "    Final answer: 4\n"
    "\n"
    "First, think step-by-step, and give the classification label in this format only:\n"
    "Final answer: 1\n"
    "OR\n"
    "Final answer: 2\n"
    "OR\n"
    "Final answer: 3\n"
    "OR\n"
    "Final answer: 4\n"
)

scenarios = [
    # Scenario 1 has no "prefix" key
    {
        **_ITER_LLM_SETTINGS,
        "template": _ITER_BASE_TEMPLATE,
        "max_iterations": 3,
        "prompt_name": "base_prompt",
    },
    # Scenario 2 sets the prefix "Final answer"
    {
        **_ITER_LLM_SETTINGS,
        "prefix": "Final answer",
        "template": _ITER_REASONING_TEMPLATE,
        "max_iterations": 3,
        "prompt_name": "reasoning_prompt",
    },
]

def find_discrepancies(df: pd.DataFrame, verbose: bool = True) -> List[Dict[str, Any]]:
    """
    Collect the rows of df on which the model and the human label disagree.

    Reads the "verbatim", "label" and "ModelPrediction" columns. Every row
    whose "ModelPrediction" differs from its "label" yields one dict with the
    keys "verbatim", "human_label" and "llm1_label"; the dicts are returned in
    row order. When verbose is true, the number of mismatches is printed.
    """
    mismatches = [
        {
            "verbatim": record["verbatim"],
            "human_label": record["label"],
            "llm1_label": record["ModelPrediction"],
        }
        for _, record in df.iterrows()
        if record["ModelPrediction"] != record["label"]
    ]
    if verbose:
        print(f"Found {len(mismatches)} discrepancies.")
    return mismatches

def _load_json_from_reply(reply_text: str) -> Any:
    """Parse reply_text as JSON; if that fails, retry on the outermost {...} span.

    The retry covers replies that wrap the JSON object in extra text such as a
    Markdown code fence. Raises json.JSONDecodeError when neither attempt parses.
    """
    try:
        return json.loads(reply_text)
    except json.JSONDecodeError:
        first, last = reply_text.find("{"), reply_text.rfind("}")
        if first == -1 or last <= first:
            raise
        return json.loads(reply_text[first:last + 1])


def _json_fallback(value: Any) -> Any:
    """json.dumps fallback: unwrap scalars exposing .item() (e.g. numpy), else use str()."""
    return value.item() if hasattr(value, "item") else str(value)


def call_llm2_for_improvement(
    llm2_client,
    llm2_model_name: str,
    current_prompt: str,
    discrepancies: List[Dict[str, Any]],
    temperature: float = 0.7,
    verbose: bool = True
) -> Optional[str]:
    """
    Calls LLM2 to propose a revised prompt that addresses the given discrepancies.

    Args:
        llm2_client: Object with a get_response(prompt=..., model=...,
            max_tokens=..., temperature=..., verbose=...) method whose return
            value unpacks into (response_text, <second value, ignored here>).
        llm2_model_name: Model name forwarded to get_response.
        current_prompt: The prompt that produced the discrepancies.
        discrepancies: Misclassified items; embedded in the instructions as JSON.
        temperature: Sampling temperature forwarded to get_response.
        verbose: When true, prints the raw reply and the outcome of parsing.

    Returns:
        The stripped "new_prompt" string from LLM2's JSON reply, or None when
        the reply is not text, cannot be parsed as a JSON object, has an empty
        or non-string "new_prompt", or lacks the {verbatim_text} placeholder.
    """
    # Values that json cannot serialize natively (e.g. numpy scalars) go
    # through _json_fallback instead of raising TypeError.
    discrepancies_json = json.dumps(discrepancies, indent=2, default=_json_fallback)

    # Build instructions that ask for JSON:
    instructions = f"""
You are an assistant tasked with improving a codebook that an evaluator uses for classification.
You are provided with a prompt and a list of discrepancies between human labels and LLM1 predictions.
Clearly identify the classifcation task and the possible labels.
Analyse the missclassification (identify what the label should be and what LLM1 predicted).
Make a new prompt for the task that would help LLM1 to improve its predictions.

Here is the current prompt:
{current_prompt}

Here are the discrepancies the current prompt caused:
{discrepancies_json}

Please output a JSON object in the following format exactly:

{{
  "new_prompt": "Your revised prompt here."
}}

IMPORTANT:
- Do not include backticks or triple quotes.
- Do not include any additional keys or text outside the JSON object.
- The JSON must be valid and parseable by Python's json.loads.
- You MUST preserve exactly the placeholder line: `Input:\n{{verbatim_text}}` in your revised prompt. 
"""

    # Call LLM2
    llm2_response_text, _ = llm2_client.get_response(
        prompt=instructions,
        model=llm2_model_name,
        max_tokens=1000,
        temperature=temperature,
        verbose=verbose
    )

    if verbose:
        print("LLM2 raw response:\n", llm2_response_text)

    if not isinstance(llm2_response_text, str):
        if verbose:
            print("LLM2 returned no text; cannot extract a revised prompt.")
        return None

    try:
        payload = _load_json_from_reply(llm2_response_text)
    except json.JSONDecodeError as e:
        if verbose:
            print(f"Failed to parse JSON from LLM2 response: {e}")
        return None

    # Valid JSON that is not an object (e.g. a list) has no "new_prompt" key.
    if not isinstance(payload, dict):
        if verbose:
            print("JSON parsing succeeded, but the result is not a JSON object.")
        return None

    candidate = payload.get("new_prompt", "")
    revised_prompt = candidate.strip() if isinstance(candidate, str) else ""
    if not revised_prompt:
        if verbose:
            print("JSON parsing succeeded, but 'new_prompt' key is empty.")
        return None

    # The instructions require the placeholder to be kept; a prompt without it
    # has no slot for the text to classify, so it is rejected.
    if "{verbatim_text}" not in revised_prompt:
        if verbose:
            print("Revised prompt rejected: the {verbatim_text} placeholder is missing.")
        return None

    if verbose:
        print("Revised prompt successfully extracted from LLM2 JSON.")
    return revised_prompt
    
def _evaluate_prompt_on_split(
    split_df: pd.DataFrame,
    llm_client,
    model_name: str,
    prompt: str,
    prefix: Optional[str],
    temperature: float,
    verbose: bool
) -> Dict[str, Any]:
    """
    Classify one split with `prompt` and score the predictions against the labels.

    The predictions are stored in split_df["ModelPrediction"] (the frame passed
    in is modified). Returns a dict with the keys "kappa", "accuracy",
    "tokens", "cost" and "seconds"; "seconds" covers only the
    process_general_verbatims call.
    """
    import time

    started = time.time()
    pred_df, _cost_info, totals = process_general_verbatims(
        verbatims_subset=split_df["verbatim"].tolist(),
        llm_client=llm_client,
        model_name=model_name,
        prompt_template=prompt,
        prefix=prefix,
        temperature=temperature,
        verbose=verbose
    )
    seconds = time.time() - started

    split_df["ModelPrediction"] = pred_df["Label"].values
    y_true = split_df["label"].tolist()
    # Missing predictions become -1, a value outside the label set 1-4
    y_pred = split_df["ModelPrediction"].fillna(-1).tolist()

    return {
        "kappa": compute_cohens_kappa(y_true, y_pred, labels=[1,2,3,4]),
        "accuracy": accuracy_score(y_true, y_pred),
        "tokens": totals["total_tokens_used"],
        "cost": totals["total_cost"],
        "seconds": seconds,
    }


def run_iterative_prompt_improvement(
    train_data: pd.DataFrame,
    val_data: pd.DataFrame,
    llm1_client,
    llm2_client,
    llm1_model_name: str,
    llm2_model_name: str,
    initial_prompt: str,
    temperature_llm1: float = 0.0,
    temperature_llm2: float = 0.7,
    max_iterations: int = 3,
    scenario_info: Optional[Dict[str, Any]] = None,
    prefix_llm1: Optional[str] = None,
    verbose: bool = True
):
    """
    Iteratively evaluate a prompt with LLM1 and let LLM2 revise it.

    In each iteration the current prompt is evaluated on the train and val
    splits, the train misclassifications are collected, and LLM2 is asked for
    a revised prompt. The loop ends after max_iterations, when the train split
    has no misclassifications, or when LLM2 yields no usable prompt.

    Side effect: the "ModelPrediction" column of train_data and val_data is
    overwritten in every iteration.

    Args:
        scenario_info: Optional values copied into every result row
            ("data_set", "N_train", "N_val", "provider", "model_name",
            "temperature", "prompt_name"). Missing entries fall back to
            defaults derived from the other arguments.
        prefix_llm1: Prefix forwarded to process_general_verbatims for LLM1.

    Returns:
      best_prompt (str): the prompt with the highest validation kappa,
      best_kappa (float): that validation kappa,
      iteration_rows (List[Dict]): one result row per iteration.
    """
    current_prompt = initial_prompt
    best_prompt = initial_prompt
    best_kappa = -1.0

    # One dict per iteration
    iteration_rows = []

    # scenario_info is optional; without it the row metadata comes from the arguments
    scenario_info = scenario_info or {}
    data_set = scenario_info.get("data_set", "AG_news")
    N_train = scenario_info.get("N_train", len(train_data))
    N_val = scenario_info.get("N_val", len(val_data))
    provider = scenario_info.get("provider")
    model_name = scenario_info.get("model_name", llm1_model_name)
    temperature = scenario_info.get("temperature", temperature_llm1)
    prompt_name = scenario_info.get("prompt_name", "default_prompt")

    for iteration in range(1, max_iterations + 1):
        if verbose:
            print(f"\n=== Iteration {iteration}/{max_iterations} ===")

        # 1) Evaluate on the train set
        train_metrics = _evaluate_prompt_on_split(
            train_data, llm1_client, llm1_model_name, current_prompt,
            prefix_llm1, temperature_llm1, verbose
        )

        # 2) Evaluate on the val set (per-verbatim output is always suppressed here)
        val_metrics = _evaluate_prompt_on_split(
            val_data, llm1_client, llm1_model_name, current_prompt,
            prefix_llm1, temperature_llm1, False
        )

        # 3) Detect discrepancies on the train set
        discrepancies = find_discrepancies(train_data, verbose=verbose)

        # 4) Record all iteration-level info
        iteration_rows.append({
            "data_set": data_set,
            "N_train": N_train,
            "N_val": N_val,
            "provider": provider,
            "model_name": model_name,
            "temperature": temperature,
            "prompt_name": prompt_name,
            "iteration": iteration,

            "kappa_train": train_metrics["kappa"],
            "kappa_val": val_metrics["kappa"],
            "accuracy_train": train_metrics["accuracy"],
            "accuracy_val": val_metrics["accuracy"],

            "tokens_used": train_metrics["tokens"] + val_metrics["tokens"],
            "cost": train_metrics["cost"] + val_metrics["cost"],

            "running_time_s": train_metrics["seconds"] + val_metrics["seconds"],
        })

        # 5) Keep the prompt with the highest validation kappa seen so far
        if val_metrics["kappa"] > best_kappa:
            best_kappa = val_metrics["kappa"]
            best_prompt = current_prompt

        if not discrepancies:
            # Nothing left for LLM2 to fix
            break

        # 6) Ask LLM2 for a new prompt
        new_prompt = call_llm2_for_improvement(
            llm2_client=llm2_client,
            llm2_model_name=llm2_model_name,
            current_prompt=current_prompt,
            discrepancies=discrepancies,
            temperature=temperature_llm2,
            verbose=verbose
        )
        if new_prompt:
            current_prompt = new_prompt
        else:
            break

    return best_prompt, best_kappa, iteration_rows


all_results = []
# Best prompt of every scenario, keyed by prompt_name. The plain best_prompt
# variable below only keeps the value of the last scenario.
best_prompts_by_scenario = {}

for scenario in scenarios:
    # 1) Extract scenario-level info for LLM1
    provider_1 = scenario["provider_llm1"]
    model_name_1 = scenario["model_name_llm1"]
    temperature_1 = scenario["temperature_llm1"]

    # Get the prefix, if any
    prefix_1 = scenario.get("prefix", None)

    # 2) Extract scenario-level info for LLM2
    provider_2 = scenario["provider_llm2"]
    model_name_2 = scenario["model_name_llm2"]
    temperature_2 = scenario["temperature_llm2"]

    # 3) General scenario info
    prompt_name = scenario.get("prompt_name", "default_prompt")
    max_iterations = scenario.get("max_iterations", 3)
    initial_prompt = scenario["template"]

    # 4) Create a dictionary with scenario info for logging
    scenario_info = {
        "data_set": "AG_news",
        "N_train": len(train_data),
        "N_val": len(val_data),
        "provider": provider_1,
        "model_name": model_name_1,
        "temperature": temperature_1,
        "prompt_name": prompt_name
    }

    # 5) Initialize LLM1 + LLM2 clients
    llm1_client = get_llm_client(provider=provider_1, config=config.MODEL_CONFIG[provider_1])
    llm2_client = get_llm_client(provider=provider_2, config=config.MODEL_CONFIG[provider_2])

    # 6) Run iterative improvement
    best_prompt, best_kappa_val, iteration_rows = run_iterative_prompt_improvement(
        train_data=train_data,
        val_data=val_data,
        llm1_client=llm1_client,
        llm2_client=llm2_client,
        llm1_model_name=model_name_1,
        llm2_model_name=model_name_2,
        initial_prompt=initial_prompt,
        temperature_llm1=temperature_1,
        temperature_llm2=temperature_2,
        max_iterations=max_iterations,
        scenario_info=scenario_info,  # pass scenario details
        prefix_llm1=prefix_1,  # passed explicitly; the function no longer reads a global
        verbose=True
    )
    best_prompts_by_scenario[prompt_name] = best_prompt

    # 7) The function returns iteration_rows, which has one row per iteration
    #    Append them to the global list
    all_results.extend(iteration_rows)

# Finally, build the DataFrame
summary_df = pd.DataFrame(all_results)


=== Iteration 1/3 ===

=== Processing Verbatim 1/80 ===
Prompt:
You are a helpful assistant tasked with evaluating:

Input:
Title: Sears Launches Venture for Online Sales

Description: Sears, Roebuck and Co., which has successfully sold its tools and appliances on the Web, is counting on having the same magic with bedspreads and sweaters, thanks in part to expertise gained by its purchase of Lands' End Inc.

Evaluate using:
**Codebook:**
The article belongs to one of the following categories:
1: World
2: Sports
3: Business
4: Sci/Tech

Data format:
Title: The title of the article
Description: The description of the article

Respond ONLY with the category number (1, 2, 3, or 4). No explanations.



=== LLM Response ===
3

Label: 3

=== Processing Verbatim 2/80 ===
Prompt:
You are a helpful assistant tasked with evaluating:

Input:
Title: Nortel delays reporting, again

Description: OTTAWA - Nortel Networks has announced that it #39;s once again delaying the release of financial reports

In [8]:
# Display the per-iteration summary table built by the previous cell
summary_df

Unnamed: 0,data_set,N_train,N_val,provider,model_name,temperature,prompt_name,iteration,kappa_train,kappa_val,accuracy_train,accuracy_val,total_tokens_used,total_cost,running_time_s
0,AG_news,80,20,azure,gpt-4o-mini,0.0,base_prompt,1,0.816667,0.933333,0.8625,0.95,16124,0.00246,15.488558
1,AG_news,80,20,azure,gpt-4o-mini,0.0,base_prompt,2,0.816667,0.933333,0.8625,0.95,21324,0.003235,16.547199
2,AG_news,80,20,azure,gpt-4o-mini,0.0,base_prompt,3,0.8,0.933333,0.85,0.95,24724,0.00375,16.473289
3,AG_news,80,20,azure,gpt-4o-mini,0.0,reasoning_prompt,1,0.816667,0.933333,0.8625,0.95,56732,0.01162,74.556122
4,AG_news,80,20,azure,gpt-4o-mini,0.0,reasoning_prompt,2,0.8,0.933333,0.85,0.95,73805,0.014931,88.617742
5,AG_news,80,20,azure,gpt-4o-mini,0.0,reasoning_prompt,3,0.8,0.933333,0.85,0.95,73159,0.013288,64.150306


In [10]:
# Pass the per-iteration summary table to calculate_and_log.
# NOTE(review): the recorded output of this cell is a ValueError about a missing
# "cost" column, although the rows built in the previous cell use the key "cost";
# the recorded run presumably used a differently-named frame - confirm on a fresh run.
calculate_and_log(summary_df)

ValueError: DataFrame is missing required columns: cost

In [9]:
# best_prompt is reassigned for every scenario in the loop above, so this prints
# the best prompt of the last scenario only
print(best_prompt)

You are a helpful assistant tasked with evaluating:

Input:
{verbatim_text}

Evaluate using:
**Codebook:**
The article belongs to one of the following categories:
1: World
2: Sports
3: Business
4: Sci/Tech

Examples:
1.  Title: Leaders Sign Historic Pact Amid Growing Regional Tensions
    Description: In a landmark conference, heads of state from three neighboring countries signed a peace agreement to curb escalating border disputes and promote economic cooperation.
    Reasoning: The article discusses international diplomacy and involves multiple countries working together to resolve conflicts, which falls under the broader context of global affairs.
    Final answer: 1

2.  Title: Hometown Stars Triumph in Nail-Biting Championship Finale
    Description: The Greenfield Gators clinched their first league title with a thrilling last-minute score, sending fans into a frenzy and ending a decades-long championship drought.
    Reasoning: This article is focused on a sports event—a champio