In [None]:
import pandas as pd
import json
import numpy as np

In [None]:
from pprint import pprint

In [None]:
# Load the pickled validation and test extract frames used by the rest of the notebook.
# NOTE(review): unpickling can execute arbitrary code — only read pickle files from a trusted source.
val_data = pd.read_pickle("rephrasals/val_extracts.rephrased.pickle")
test_data = pd.read_pickle("results/test_extracts.pickle")

In [None]:
# Display the loaded validation frame for a visual check (the notebook renders the cell's last expression).
val_data

In [None]:
# Keep one row per claim_id in each frame; keep="last" retains the final occurrence of a repeated id.
val_data = val_data.drop_duplicates(subset=["claim_id"], keep="last")
test_data = test_data.drop_duplicates(subset=["claim_id"], keep="last")

In [None]:
# NOTE(review): a later cell in this notebook defines sort_by_max_score again; Python uses
# whichever definition was executed most recently.
def sort_by_max_score(data, min_words=50):
    """
    Sort documents by their best qualifying first-entry score, highest first.

    For each of 'low_lev_scores', 'mid_lev_scores' and 'high_lev_scores' present in a
    document's 'top_n' mapping, only the first (score, text) entry is inspected. Its
    score counts only when the text is non-empty and contains at least ``min_words``
    whitespace-separated words; otherwise that level contributes -inf. The sort key of
    a document is the maximum over its levels (-inf when no level is present).

    Args:
        data (list): Dictionaries that may carry a 'top_n' mapping from level name to a
            list of (score, text) pairs.
        min_words (int): Minimum word count the text needs for its score to count.
            Defaults to 50, the threshold that used to be hard-coded.

    Returns:
        list: A new list ordered by descending key; items with equal keys keep their
        input order.
    """
    levels = ('low_lev_scores', 'mid_lev_scores', 'high_lev_scores')

    def get_max_score(item):
        # 'top_n' may be present but None; .get()'s default only covers a missing key.
        top_n = item.get('top_n') or {}
        scores = []
        for level in levels:
            entries = top_n.get(level)
            if not entries:
                continue
            score, text = entries[0][0], entries[0][1]
            long_enough = bool(text) and len(text.split()) >= min_words
            # Short or empty passages are pushed to the bottom rather than dropped.
            scores.append(score if long_enough else float('-inf'))
        return max(scores) if scores else float('-inf')

    return sorted(data, key=get_max_score, reverse=True)

In [None]:
def merge_highest_score_text(top_n):
    """
    Join the best-scoring passage of each score level into one string.

    Levels are visited in the order low, mid, high. Within a level the (score, text)
    pair with the strictly greatest score wins (the earliest one on ties). A level is
    skipped when it is absent, empty, or when its winning text is empty.

    Args:
        top_n (dict): Mapping from level name to a list of (score, text) pairs.

    Returns:
        str: The winning passages, each wrapped as "...text...", separated by newlines;
        an empty string when no level contributes.
    """
    snippets = []
    for level in ('low_lev_scores', 'mid_lev_scores', 'high_lev_scores'):
        if level not in top_n or not top_n[level]:
            continue
        best_score, best_text = float('-inf'), ""
        for score, text in top_n[level]:
            if score > best_score:
                best_score, best_text = score, text
        if best_text:
            snippets.append(f"...{best_text}...")
    return "\n".join(snippets)

In [None]:
# print(merge_highest_score_text(q[0].get("top_n")))

In [None]:
# NOTE(review): this cell redefines sort_by_max_score, which an earlier cell of this notebook
# already defines; when the cells run in order, this later definition is the one in effect.
def sort_by_max_score(data, min_words=50):
    """
    Sort documents by their best qualifying first-entry score, highest first.

    For each of 'low_lev_scores', 'mid_lev_scores' and 'high_lev_scores' present in a
    document's 'top_n' mapping, only the first (score, text) entry is inspected. Its
    score counts only when the text is non-empty and contains at least ``min_words``
    whitespace-separated words; otherwise that level contributes -inf. The sort key of
    a document is the maximum over its levels (-inf when no level is present).

    Args:
        data (list): Dictionaries that may carry a 'top_n' mapping from level name to a
            list of (score, text) pairs.
        min_words (int): Minimum word count the text needs for its score to count.
            Defaults to 50, the threshold that used to be hard-coded.

    Returns:
        list: A new list ordered by descending key; items with equal keys keep their
        input order.
    """
    levels = ('low_lev_scores', 'mid_lev_scores', 'high_lev_scores')

    def get_max_score(item):
        # 'top_n' may be present but None; .get()'s default only covers a missing key.
        top_n = item.get('top_n') or {}
        scores = []
        for level in levels:
            entries = top_n.get(level)
            if not entries:
                continue
            score, text = entries[0][0], entries[0][1]
            long_enough = bool(text) and len(text.split()) >= min_words
            # Short or empty passages are pushed to the bottom rather than dropped.
            scores.append(score if long_enough else float('-inf'))
        return max(scores) if scores else float('-inf')

    return sorted(data, key=get_max_score, reverse=True)

In [None]:
def create_relevant_text_list(row):
    """
    Build the list of merged evidence extracts for one row.

    The row's "filtered_docs" are ranked with ``sort_by_max_score``; the first five are
    each condensed with ``merge_highest_score_text``. Empty extracts are discarded after
    the top-five cut, so fewer than five strings may be returned.

    Args:
        row: Mapping-like row exposing a "filtered_docs" list of document dicts.

    Returns:
        list: Non-empty merged extract strings, in ranked order.
    """
    ranked_docs = sort_by_max_score(row["filtered_docs"])
    merged = (merge_highest_score_text(doc.get("top_n", {})) for doc in ranked_docs[:5])
    return [extract for extract in merged if extract]

In [None]:
# Compute each row's merged extracts and store them in a new "relevant_docs" column.
# The function is passed to apply() directly; no wrapper lambda is needed.
test_data["relevant_docs"] = test_data.apply(create_relevant_text_list, axis=1)
val_data["relevant_docs"] = val_data.apply(create_relevant_text_list, axis=1)

In [None]:
# Persist the validation frame (with whatever columns it holds at this point) as a pickle file.
val_data.to_pickle("rephrasals/val.rephrased.withextracts.pickle")

In [None]:
# Persist the test frame (with whatever columns it holds at this point) as a pickle file.
test_data.to_pickle("results/test.withextracts.pickle")

In [None]:
# Display the test frame for a visual check.
test_data

In [None]:
# data["relevant_docs"] = data.apply(lambda row: create_relevant_text_list(row), axis=1)

In [None]:
# data

In [None]:
 
# Load the dev-set tasks. The path is relative to the notebook's working directory, like
# every other data path in this notebook, instead of pointing into one user's home folder.
with open("baseline/AVeriTeC/data/dev.json", encoding="utf-8") as f:
    dev_tasks = json.load(f)

In [None]:
def filter_doc(doc, top_k=3):
    """
    Return the concatenated passages of the best-scoring documents.

    Documents are ranked by the score of the first entry in their
    ``top_n['low_lev_scores']`` list, highest first; a document without such an entry
    ranks with a sentinel score of -10. For each of the first ``top_k`` documents, the
    text of every (score, text) entry across all of its ``top_n`` levels is
    concatenated, each text preceded by a newline.

    Args:
        doc (dict): Mapping whose values are document dicts that may carry a 'top_n'
            mapping from level name to a list of (score, text) pairs.
        top_k (int): Number of documents to keep. Defaults to 3.

    Returns:
        list: One concatenated string per kept document (an empty string for a
        document with no entries); an empty list when ``doc`` is empty.
    """
    def _first_low_level_score(item):
        # Use the 'low_lev_scores' key used by the rest of the notebook, and compare the
        # numeric score rather than the whole (score, text) pair.
        entries = (item.get("top_n") or {}).get("low_lev_scores") or []
        return entries[0][0] if entries else -10

    # Highest score first, so unscored documents (sentinel -10) fall to the end.
    ranked = sorted(doc.values(), key=_first_low_level_score, reverse=True)

    ret_docs = []
    for item in ranked[:top_k]:
        top_n = item.get("top_n") or {}
        ret_docs.append(
            "".join(f"\n{entry[1]}" for entries in top_n.values() for entry in entries)
        )
    return ret_docs
        

In [None]:
import pandas as pd

In [None]:
# Load the validation rephrasals CSV; later cells read its "claim_id" and "rephrasals" columns.
doc_with_rephrasals = pd.read_csv("rephrasals/validation.rephrasals.csv")

In [None]:
# Display a claim_id-sorted view. The result is not assigned, so doc_with_rephrasals keeps its order.
doc_with_rephrasals.sort_values(by=["claim_id"])

In [None]:
# Display a view with one row per claim_id (the first occurrence is kept by default).
# The result is not assigned, so doc_with_rephrasals itself still contains any duplicates.
doc_with_rephrasals.drop_duplicates(subset=["claim_id"])

In [None]:
import re

def extract_rephrasals(text):
    """
    Pull the items out of a numbered list embedded in a string.

    An item is whatever follows "<digits>." plus one whitespace character, up to the
    end of that line. Anything that is not a non-empty string yields an empty list.

    Args:
        text (str): Text containing lines such as "1. first item".

    Returns:
        list: The item texts in order of appearance.
    """
    if not (text and isinstance(text, str)):
        return []
    numbered_item = re.compile(r'\d+\.\s(.*?)\n')
    # A trailing newline is appended so the final item is terminated like the others.
    return numbered_item.findall(text + '\n')

In [None]:
# Parse each raw "rephrasals" string into a list of items, stored in a new "rephrasal_list" column.
doc_with_rephrasals["rephrasal_list"] = doc_with_rephrasals["rephrasals"].apply(extract_rephrasals)

In [None]:
# Display the frame ordered by claim_id; the sorted result is not stored.
doc_with_rephrasals.sort_values(by=['claim_id'])

In [None]:
# Display val_data ordered by claim_id; the sorted result is not stored.
val_data.sort_values(by=["claim_id"])

In [None]:
# Load the rephrasals CSV that the following cells sort, parse and export.
# NOTE(review): this is a different file from "rephrasals/validation.rephrasals.csv" read earlier — confirm both are needed.
rephrasals_csv = pd.read_csv("rephrasals/val.rephrasals.csv")

In [None]:
# Order the rows by claim_id and renumber them. reset_index() is called without drop=True,
# so the previous index is kept as an extra column.
rephrasals_csv = rephrasals_csv.sort_values(by=["claim_id"]).reset_index()

In [None]:
# Parse each raw "rephrasals" string into a list of items, stored in a new "rephrasal_list" column.
rephrasals_csv["rephrasal_list"] =  rephrasals_csv["rephrasals"].apply(extract_rephrasals)

In [None]:
# Write the whole frame to JSON using pandas' default layout for DataFrame.to_json.
rephrasals_csv.to_json("rephrasals/val.with_rephrasals.json")

In [None]:
# rephrasal_df is a second name for the same DataFrame object; no copy is made.
rephrasal_df = rephrasals_csv

In [None]:
# Build and display a claim_id -> rephrasal_list mapping (for a repeated claim_id the last row wins).
# The mapping is only displayed; it is not stored in a variable.
dict(zip(rephrasal_df["claim_id"], rephrasal_df["rephrasal_list"]))

In [None]:
import re

def parse_question_answer(text):
    """
    Split a "<question> ... <answer> ..." string into its two parts.

    The question is the text between the first "<question>" marker and the next
    "<answer>" marker; the answer is everything after that marker. Both parts are
    stripped of surrounding whitespace and may span several lines.

    Args:
        text (str): Raw string expected to contain both markers.

    Returns:
        dict: {"question": ..., "answer": ...}; both values are empty strings when the
        input is not a non-empty string or the markers are not found.
    """
    parsed = {"question": "", "answer": ""}
    if text and isinstance(text, str):
        found = re.search(
            r"<question>\s*(.*?)\s*<answer>\s*(.*)",
            text,
            re.DOTALL | re.MULTILINE,
        )
        if found is not None:
            question, answer = found.groups()
            parsed["question"] = question.strip()
            parsed["answer"] = answer.strip()
    return parsed

In [None]:
def process_and_save_json(class_no_rephrasal_path, qa_no_rephrasal_path, output_json_path, fill=500):
    """
    Merge classification and QA results, aggregate them per claim, and save them as JSON.

    The two CSV files are joined on "claim_id", "label" and "claim" (pandas' default
    inner join), each "qanda" string is parsed with ``parse_question_answer``, and the
    rows are grouped by "claim_id". One record per claim is written to
    ``output_json_path`` with the keys "claim_id", "claim", "pred_label" (taken from the
    "classification" column) and "evidence" (the list of parsed QA dicts).

    Claim ids in ``range(fill)`` that are absent from the aggregated result are printed
    for inspection only; no placeholder records are created for them.

    Args:
        class_no_rephrasal_path (str): Path to the classification CSV file.
        qa_no_rephrasal_path (str): Path to the QA CSV file.
        output_json_path (str): Path to save the output JSON file.
        fill (int): Number of claim ids expected, numbered from 0 to ``fill - 1``.
            Defaults to 500.

    Returns:
        pandas.DataFrame: The per-claim aggregated frame (one row per claim_id).
    """
    class_no_rephrasal = pd.read_csv(class_no_rephrasal_path)
    qa_no_rephrasal = pd.read_csv(qa_no_rephrasal_path)
    no_rephrasal_df = class_no_rephrasal.merge(qa_no_rephrasal, on=["claim_id", "label", "claim"])
    no_rephrasal_df["parsed_qa"] = no_rephrasal_df["qanda"].apply(parse_question_answer)

    # List-valued columns collect every row of a claim; the others take the first row's value.
    aggregated_data = no_rephrasal_df.groupby("claim_id").agg({
        "document": list,
        "qanda": list,
        "parsed_qa": list,
        "relevant_docs": list,
        "filtered_docs": "first",
        "classification": "first",
        "label": "first",
        "claim": "first"
    }).reset_index()

    # Report claim ids with no rows. The range starts at 0 so that claim_id 0 is checked
    # too: the dev-set filtering cell in this notebook uses these ids as 0-based positions.
    all_claim_ids = set(range(fill))
    existing_claim_ids = set(aggregated_data["claim_id"])
    missing_claim_ids = all_claim_ids - existing_claim_ids
    print(missing_claim_ids)

    output_json = aggregated_data.apply(
        lambda row: {
            "claim_id": row["claim_id"],
            "claim": row["claim"],
            "pred_label": row["classification"],
            "evidence": row["parsed_qa"]
        },
        axis=1
    ).tolist()

    # Save the JSON to a file
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(output_json, f, indent=4)

    return aggregated_data


In [None]:
# Positional arguments for process_and_save_json: classification CSV, QA CSV, output JSON path
# (all under results/dev/rephrasals).
with_rephrasals_args = ["results/dev/rephrasals/classification.rephrased.csv", "results/dev/rephrasals/qanda.rephrased.csv", "results/dev/rephrasals/dev.rephrasals.averitec.json"]
# NOTE(review): not used in the visible cells, and it holds a single directory rather than the
# three paths process_and_save_json expects.
no_rephrasals_args = ["results/dev/"]
data = process_and_save_json(*with_rephrasals_args)

In [None]:
# Same pipeline for the files under results/dev/test: classification CSV, QA CSV, output JSON path.
test_set_no_rephrasals_args = ["results/dev/test/classification_testset_results.csv", "results/dev/test/qanda_testset_results.csv", "results/dev/test/test.averitec.json"]
data_test = process_and_save_json(*test_set_no_rephrasals_args)

In [None]:
# Display the aggregated frame returned by process_and_save_json for the rephrasal run.
data

In [None]:
# Read this CSV only to display it; the resulting frame is not stored.
pd.read_csv("results/dev/rephrasals/dev.qa.rephrasals.csv")

In [None]:
# Read this CSV only to display it; the resulting frame is not stored.
pd.read_csv("results/dev/test/qanda_testset_results.csv")

In [None]:
import json

# 0-based positions (in file order) of the dev records to leave out.
# NOTE(review): presumably the ids that process_and_save_json printed as missing — confirm.
claim_ids_to_filter = [161, 35, 261, 135, 361, 42, 235, 461, 142, 335, 242, 435, 499, 342, 442, 61]

# Read the original dev set.
with open('baseline/AVeriTeC/data/dev.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Records are matched by their position in the list, not by a 'claim_id' field on the record.
filtered_data = [
    record for position, record in enumerate(data)
    if position not in claim_ids_to_filter
]

# Write the remaining records to a new JSON file, keeping non-ASCII characters readable.
with open('baseline/AVeriTeC/data/dev.filtered.json', 'w', encoding='utf-8') as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=4)


In [None]:
[{'question': 'What are the policies of Joe Biden regarding immigration?', 'answer': 'Joe Biden\'s immigration policies include:\n\n* Repealing the travel ban on countries in Africa and the Middle East\n* Ending the Trump administration\'s "zero-tolerance" policy at the southern border\n* Reinstating the Deferred Action for Childhood Arrivals (DACA) program\n* Providing a pathway to citizenship for undocumented immigrants\n* Increasing the number of refugees admitted to the United States\n* Implementing a merit-based immigration system\n* Repealing the Trump administration\'s public charge rule\n* Repealing the Trump administration\'s travel ban on countries in Africa and the Middle East\n* Increasing funding for border security and immigration enforcement\n* Creating a pathway to citizenship for undocumented immigrants\n* Implementing a pathway to citizenship for undocumented immigrants\n* Repealing the Trump administration\'s travel ban on countries in Africa and the Middle East\n* Implementing a pathway to citizenship for undocumented immigrants'}, {'question': 'Does Joe Biden support open borders?', 'answer': 'Joe Biden supports a pathway to citizenship for undocumented immigrants.'}, {'question': 'Does Joe Biden support open borders?', 'answer': '...the border is a national security issue, and we will take the necessary steps to secure the border...'}]

In [None]:
# Display the aggregated frame returned by process_and_save_json for the results/dev/test files.
data_test