In [1]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from qualitative_analysis import (
    clean_and_normalize,
    sanitize_dataframe,
    get_llm_client,
    compute_cohens_kappa,
    process_general_verbatims,
    calculate_and_log,
)
import qualitative_analysis.config as config
import json

In [2]:
# Load the SQuAD training file and flatten it into one row per question.
# Resulting frame `result` has the columns Title, Context, Question, QuestionID,
# answers, IsImpossible, (PlausibleAnswers when present) and answer_text.

# Location of the data file, relative to the notebook's working directory
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)
data_file_path = os.path.join(data_dir, 'train_squad.json')

# Fail early with an actionable message instead of a bare open() error
if not os.path.isfile(data_file_path):
    raise FileNotFoundError(
        f"Expected the SQuAD training file at '{data_file_path}'. "
        "Place train_squad.json in the data directory and re-run this cell."
    )

# Step 1: Load the JSON file
with open(data_file_path, 'r', encoding='utf-8') as file:
    raw_data = json.load(file)

# Step 2: Validate JSON structure
if not isinstance(raw_data, dict) or "data" not in raw_data:
    raise ValueError("JSON structure does not match expected SQuAD format.")

articles = raw_data["data"]

# Step 3: One row per (article, paragraph).
# explode() repeats the article index for every paragraph, so the index is reset
# here; every later column-wise concat then aligns purely by position.
# Articles without paragraphs produce NaN after explode and are dropped because
# they cannot contribute any question.
data = pd.DataFrame(articles)
exploded_data = (
    data.explode('paragraphs')
    .dropna(subset=['paragraphs'])
    .reset_index(drop=True)
)

# Step 4: Normalize paragraphs and retain title.
# A plain list is passed so the normalized frame always gets a fresh 0..n-1 index,
# independent of how the installed pandas version treats Series input.
paragraphs_df = pd.json_normalize(exploded_data['paragraphs'].tolist())
paragraphs_df = pd.concat(
    [exploded_data[['title']], paragraphs_df],
    axis=1
)

# Step 5: One row per (paragraph, question); paragraphs without questions are dropped
paragraphs_df = (
    paragraphs_df.explode('qas')
    .dropna(subset=['qas'])
    .reset_index(drop=True)
)
qas_df = pd.json_normalize(paragraphs_df['qas'].tolist())

# Step 6: Combine all data (both frames share the same 0..n-1 index)
result = pd.concat(
    [paragraphs_df[['title', 'context']], qas_df],
    axis=1
)

# Step 7: Extract answer texts (first answer if any, otherwise None)
result['answer_text'] = result['answers'].apply(
    lambda x: x[0]['text'] if isinstance(x, list) and len(x) > 0 else None
)

# Step 8: Rename columns
result = result.rename(columns={
    'title': 'Title',
    'context': 'Context',
    'question': 'Question',
    'id': 'QuestionID',
    'is_impossible': 'IsImpossible',
    'plausible_answers': 'PlausibleAnswers'
})

# Preview the DataFrame
print(result.head())

     Title                                            Context  \
0  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
1  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
2  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
3  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
4  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   

                                            Question  \
0           When did Beyonce start becoming popular?   
1  What areas did Beyonce compete in when she was...   
2  When did Beyonce leave Destiny's Child and bec...   
3      In what city and state did Beyonce  grow up?    
4         In which decade did Beyonce become famous?   

                 QuestionID  \
0  56be85543aeaaa14008c9063   
1  56be85543aeaaa14008c9065   
2  56be85543aeaaa14008c9066   
3  56bf6b0f3aeaaa14008c9601   
4  56bf6b0f3aeaaa14008c9602   

                                             answers  IsImpossible  \
0  [{'text': 'i

In [3]:
# Free-text columns that go through text normalization
text_columns = ["Context", "Question"]

# Normalize each text column in turn, writing the outcome back into `result`
for column_name in text_columns:
    normalized_column = clean_and_normalize(result[column_name])
    result[column_name] = normalized_column

# Run the project-level sanitizer over the whole frame; the frame it returns
# is what the following cells use as `data`
data = sanitize_dataframe(result)


In [4]:
# Build one prompt-ready "verbatim" per question: the context followed by the question.
# A single pass over the two columns replaces a row-wise DataFrame.apply(axis=1),
# which builds a Series object for every row and is slow on a frame of this size.
# The f-string is unchanged, so every verbatim is formatted exactly as before.
verbatims = [
    f"Context: {context}\n\n"
    f"Question: {question}"
    for context, question in zip(data['Context'], data['Question'])
]

# Keep the verbatims next to their source rows; later cells read data['verbatim']
data['verbatim'] = verbatims

print(f"Total number of verbatims: {len(verbatims)}")
print(f"Verbatim example:\n{verbatims[0]}")

Total number of verbatims: 130319
Verbatim example:
Context: Beyonce Giselle Knowles-Carter (/bijnse/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonce's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".

Question: When did Beyonce start becoming popular?


In [5]:
# Inspect the label column: first the distinct values, then how often each occurs
is_impossible_column = result['IsImpossible']
print(is_impossible_column.unique())
print(is_impossible_column.value_counts())

[False  True]
IsImpossible
False    86821
True     43498
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

# Sampling parameters, kept in one place so the comments cannot drift from the code
SUBSET_SIZE = 200     # number of rows drawn from the full data set
VAL_FRACTION = 0.2    # share of the subset held out for validation
SPLIT_SEED = 42       # fixed seed so both splits are reproducible

# Step 1: Draw a stratified subset of SUBSET_SIZE rows.
# train_test_split returns (train, test); only the first part (train_size rows) is kept.
# The remainder is bound to a named throwaway instead of `_`, which IPython uses
# for the last cell output.
data_subset, _unused_remainder = train_test_split(
    data,
    train_size=SUBSET_SIZE,
    stratify=data['IsImpossible'],
    random_state=SPLIT_SEED
)

# Step 2: Split the subset into train/val, again stratified on the label
train_data, val_data = train_test_split(
    data_subset,
    test_size=VAL_FRACTION,
    stratify=data_subset['IsImpossible'],
    random_state=SPLIT_SEED
)

# Detach the splits from the frame they were taken from, so columns added to them
# later are written to independent frames rather than to a slice
train_data = train_data.copy()
val_data = val_data.copy()

print("Train size:", len(train_data))
print("Val size:", len(val_data))


Train size: 160
Val size: 40


In [7]:
# Prompt templates, each defined exactly once and shared by every model, so the
# model comparison always runs on byte-identical prompts. `{verbatim_text}` is a
# placeholder left in the string for the downstream processing helper to fill in.

# Bare-label prompt: the model is asked to answer with 0 or 1 only
BASE_PROMPT_TEMPLATE = """You are a helpful assistant tasked with evaluating:

Input:
{verbatim_text}

Evaluate using:
**Codebook:**
0: The question is answerable based on the reference text
1: The question is impossible to answer based on the reference text

Data format:
Context: The reference text
Question: The question

Respond ONLY with 0 or 1. No explanations.
"""

# Step-by-step prompt: the label is reported on a "Final answer: <label>" line
REASONING_PROMPT_TEMPLATE = """You are a helpful assistant tasked with evaluating:

Input:
{verbatim_text}

Evaluate using:
**Codebook:**
0: Answerable (context contains answer)
1: Impossible (context lacks information)

Data format:
Context: The reference text
Question: The question

First, think step-by-step, and give the classification label in this format only:
Final answer: 0
OR
Final answer: 1
"""

# Step-by-step prompt with two worked examples (one per label)
REASONING_EXAMPLES_PROMPT_TEMPLATE = """You are a helpful assistant tasked with evaluating:

Input:
{verbatim_text}

Evaluate using:
**Codebook:**
0: Answerable (context contains answer)
1: Impossible (context lacks information)

Examples:
1. Context: Photosynthesis converts sunlight into chemical energy, occurring in plant chloroplasts. The process requires water and carbon dioxide.
   Question: Where in plant cells does photosynthesis occur?
   Reasoning: The context specifically mentions chloroplasts.
   Final answer: 0

2. Context: The first transatlantic telegraph cable was laid in 1858, allowing communication between Europe and North America in hours rather than weeks.
   Question: What material was used for the first fiber optic cables?
   Reasoning: The context discusses telegraph cables, not fiber optics.
   Final answer: 1

Data format:
Context: The reference text
Question: The question

First, think step-by-step, and give the classification label in this format only:
Final answer: 0
OR
Final answer: 1
"""

# (prompt_name, prefix, template) for every prompt variant.
# `prefix` is None for the bare-label prompt and "Final answer" for the prompts
# that report the label on a "Final answer: <label>" line.
PROMPT_VARIANTS = [
    ("base_prompt", None, BASE_PROMPT_TEMPLATE),
    ("reasoning_prompt", "Final answer", REASONING_PROMPT_TEMPLATE),
    ("reasoning_prompt_examples", "Final answer", REASONING_EXAMPLES_PROMPT_TEMPLATE),
]

# Models to compare; every model is run against every prompt variant
SCENARIO_MODEL_NAMES = ["gpt-4o-mini", "gpt-4o"]

# One scenario per (model, prompt) pair, ordered model-first:
# all prompts for gpt-4o-mini, then all prompts for gpt-4o.
scenarios = [
    {
        "provider": "azure",
        "model_name": scenario_model,
        "temperature": 0.0,
        "prefix": scenario_prefix,
        "prompt_name": scenario_prompt_name,
        "template": scenario_template,
    }
    for scenario_model in SCENARIO_MODEL_NAMES
    for scenario_prompt_name, scenario_prefix, scenario_template in PROMPT_VARIANTS
]

In [8]:
import time

all_results = []
verbose = True

# Convert `IsImpossible` from bool to int.
# assign() returns new frames, so the conversion never writes into a slice of the
# original data and re-running this cell gives the same result.
train_data = train_data.assign(IsImpossible=train_data["IsImpossible"].astype(int))
val_data = val_data.assign(IsImpossible=val_data["IsImpossible"].astype(int))


def evaluate_split(split_df, llm_client, scenario, verbose=True):
    """Classify every verbatim of one split with one scenario and score the result.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Frame with a `verbatim` column (model input) and an integer
        `IsImpossible` column (ground truth). A `ModelPrediction` column is
        written to it in place, replacing the predictions of any earlier scenario.
    llm_client
        Client object returned by `get_llm_client`.
    scenario : dict
        Scenario entry providing `model_name`, `template`, `temperature` and,
        optionally, `prefix`.
    verbose : bool
        Forwarded unchanged to `process_general_verbatims`.

    Returns
    -------
    tuple
        `(kappa, accuracy, totals)`, where `totals` is the third value returned
        by `process_general_verbatims` (read below via its `total_tokens_used`
        and `total_cost` keys).
    """
    pred_df, _cost_info, totals = process_general_verbatims(
        verbatims_subset=split_df["verbatim"].tolist(),
        llm_client=llm_client,
        model_name=scenario["model_name"],
        prompt_template=scenario["template"],
        prefix=scenario.get("prefix"),
        temperature=scenario["temperature"],
        verbose=verbose,
    )

    # Positional assignment: raises if the helper did not return one label per verbatim
    split_df["ModelPrediction"] = pred_df["Label"].values

    y_true = split_df["IsImpossible"].tolist()
    # Missing predictions become -1, which never matches a true label for accuracy.
    # NOTE(review): how compute_cohens_kappa treats a value outside labels=[0, 1]
    # is not visible here; confirm that unparsed answers are not silently ignored.
    y_pred = split_df["ModelPrediction"].fillna(-1).tolist()

    kappa = compute_cohens_kappa(y_true, y_pred, labels=[0, 1])
    accuracy = accuracy_score(y_true, y_pred)
    return kappa, accuracy, totals


for scenario in scenarios:
    provider = scenario["provider"]
    model_name = scenario["model_name"]
    temperature = scenario["temperature"]
    prompt_name = scenario["prompt_name"]

    print(f"\n=== Running scenario: {prompt_name} | model={model_name} | temp={temperature} ===")

    # Start timing for this scenario; perf_counter is a monotonic clock, so the
    # measured duration is unaffected by system clock adjustments
    start_time = time.perf_counter()

    # The client is created per scenario because the provider may differ between scenarios
    llm_client = get_llm_client(provider=provider, config=config.MODEL_CONFIG[provider])

    # ---- TRAIN ---
    kappa_train, accuracy_train, train_totals = evaluate_split(
        train_data, llm_client, scenario, verbose=verbose
    )

    # ---- VAL ---
    kappa_val, accuracy_val, val_totals = evaluate_split(
        val_data, llm_client, scenario, verbose=verbose
    )

    # Summarize cost usage over both splits
    total_tokens = train_totals["total_tokens_used"] + val_totals["total_tokens_used"]
    total_cost = train_totals["total_cost"] + val_totals["total_cost"]

    # End timing for this scenario
    elapsed_time_seconds = time.perf_counter() - start_time

    # Build a single result row for the scenario.
    # NOTE(review): the saved output of the summary cell shows the columns
    # `total_tokens_used` / `total_cost`, while this row uses `tokens_used` / `cost`;
    # confirm which names the downstream summary/logging step expects.
    all_results.append({
        "data_set": "SQuAD",
        "provider": provider,
        "model_name": model_name,
        "temperature": temperature,
        "prompt_name": prompt_name,
        "kappa_train": kappa_train,
        "kappa_val": kappa_val,
        "accuracy_train": accuracy_train,
        "accuracy_val": accuracy_val,
        "tokens_used": total_tokens,
        "cost": total_cost,
        "running_time_s": elapsed_time_seconds,
    })

# Create a DataFrame of results: one row per scenario
summary_df = pd.DataFrame(all_results)


=== Running scenario: base_prompt | model=gpt-4o-mini | temp=0.0 ===

=== Processing Verbatim 1/160 ===
Prompt:
You are a helpful assistant tasked with evaluating:

Input:
Context: The idea of building a tunnel under the Irish Sea has been raised since 1895, when it was first investigated. Several potential Irish Sea tunnel projects have been proposed, most recently the Tusker Tunnel between the ports of Rosslare and Fishguard proposed by The Institute of Engineers of Ireland in 2004. A rail tunnel was proposed in 1997 on a different route, between Dublin and Holyhead, by British engineering firm Symonds. Either tunnel, at 50 mi (80 km), would be by far the longest in the world, and would cost an estimated 15 billion or 20 billion. A proposal in 2007, estimated the cost of building a bridge from County Antrim in Northern Ireland to Galloway in Scotland at 3.5bn (5bn).

Question: Either of the tunnels, would be the shortest in the world and are estimated to cost how much?

Evaluate usi

In [9]:
# Display the summary table built from `all_results` (one row per scenario)
summary_df

Unnamed: 0,provider,model_name,temperature,prompt_name,kappa_train,kappa_val,accuracy_train,accuracy_val,total_tokens_used,total_cost,running_time_s
0,azure,gpt-4o-mini,0.0,base_prompt,0.707174,0.72067,0.875,0.875,53338,0.008088,131.962675
1,azure,gpt-4o-mini,0.0,reasoning_prompt,0.743772,0.593023,0.8875,0.825,74879,0.020193,237.491811
2,azure,gpt-4o-mini,0.0,reasoning_prompt_examples,0.725731,0.77208,0.88125,0.9,85109,0.014826,88.920012
3,azure,gpt-4o,0.0,base_prompt,0.604262,0.752322,0.8375,0.9,53338,0.134875,74.438933
4,azure,gpt-4o,0.0,reasoning_prompt,0.767948,0.818182,0.9,0.925,93404,0.522016,888.298173
5,azure,gpt-4o,0.0,reasoning_prompt_examples,0.724132,0.881306,0.86875,0.95,132170,0.717676,1062.043289


In [None]:
# Pass the per-scenario summary table to the project helper `calculate_and_log`.
# NOTE(review): what the helper computes and where it logs is defined in
# `qualitative_analysis` and is not visible here; this cell also has no recorded
# execution count, so confirm it runs on a fresh kernel.
calculate_and_log(summary_df)