# Filter dataset

First, we loop through each training set (for example `BioASQ-trainingDataset2b.json`) and extract the PubMed IDs of the documents used to answer the questions.

In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Define the directories
json_dir = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets'
json_dir_expanded = os.path.expanduser(json_dir)  # resolve '~' once instead of on every use
csv_dir = os.path.expanduser(json_dir + '/csv')  # per-file and aggregated CSVs are written here

# Create the CSV directory if it doesn't exist
os.makedirs(csv_dir, exist_ok=True)

# Unique PubMed IDs across all training-set files
all_pubmed_ids = set()

# List all JSON files in the directory; sorted because os.listdir() returns
# entries in arbitrary order and we want reproducible runs
json_files = sorted(f for f in os.listdir(json_dir_expanded) if f.endswith('.json'))

# Loop through files with a tqdm progress bar
for json_file in tqdm(json_files, desc="Processing JSON Files"):
    json_path = os.path.join(json_dir_expanded, json_file)

    # Read as UTF-8 explicitly rather than relying on the platform default encoding
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The PubMed ID is the last path segment of each document URL. A trailing '/'
    # is stripped first so that int() does not fail on an empty last segment.
    file_pubmed_ids = {
        int(url.rstrip('/').split('/')[-1])
        for question in data.get('questions', [])
        for url in question.get('documents', [])
    }

    # A set only stores unique values, so IDs seen in several files are kept once
    all_pubmed_ids.update(file_pubmed_ids)

    # Save this file's IDs to its own CSV; sorted because set iteration order is arbitrary
    df = pd.DataFrame({'pubmedid': sorted(file_pubmed_ids)})
    csv_filename = os.path.splitext(json_file)[0] + '.csv'
    csv_path = os.path.join(csv_dir, csv_filename)
    df.to_csv(csv_path, index=False)

# Convert the set to a (sorted, hence reproducible) list with tqdm progress
all_pubmed_ids_list = list(tqdm(sorted(all_pubmed_ids), desc="Aggregating PubMed IDs"))

# Save all PubMed IDs with an extra flag column, initialised to 0 for every row
all_pubmed_ids_df = pd.DataFrame({'pubmedid': all_pubmed_ids_list, 'enthalten_in_dataset': 0})
complete_csv_path = os.path.join(csv_dir, 'csv_complete.csv')
all_pubmed_ids_df.to_csv(complete_csv_path, index=False)

Processing JSON Files: 100%|██████████| 11/11 [00:09<00:00,  1.21it/s]
Aggregating PubMed IDs: 100%|██████████| 43188/43188 [00:00<00:00, 3358304.77it/s]


Now we read in all the PubMed IDs that are currently loaded into our system.

This script reads the previously created list of all PubMed IDs used to answer questions, together with the list of all PubMed IDs contained in our dataset.

It then flags all the PubMed IDs that are available both in the questions and in the data subset we use (remember, we created the latter in the previous step).

Finally, we first update the flag in `csv_complete.csv` that records whether or not each PubMed ID is contained in our dataset; afterwards we save the matched PubMed IDs into a separate file.


In [4]:
import pandas as pd

# Input / output locations for this step
# - complete_csv_path:    every unique PubMed ID referenced by a question (read, then overwritten)
# - rag_pubmed_csv_path:  every PubMed ID currently in our system (read only)
# - matched_ids_csv_path: result of this cell, the IDs present in both lists
complete_csv_path = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets/csv/csv_complete.csv'
rag_pubmed_csv_path = '~/data/faiss_indices/bioBERT/PMIDs/concatenated_pubmed_ids.csv'
matched_ids_csv_path = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets/csv/matched_pubmed_ids.csv'

# Load both ID lists; the system-side file is parsed as headerless with a single integer column
complete_df = pd.read_csv(complete_csv_path)
rag_pubmed_df = pd.read_csv(
    rag_pubmed_csv_path,
    header=None,
    names=['PMID'],
    dtype={'PMID': int},
)

# 1 where the question-side ID also exists in our system, otherwise 0
is_in_dataset = complete_df['pubmedid'].isin(rag_pubmed_df['PMID'])
complete_df['enthalten_in_dataset'] = is_in_dataset.astype(int)

# Persist the refreshed flag column back to the same file
complete_df.to_csv(complete_csv_path, index=False)

# Keep only the IDs flagged with 1 and write them to their own CSV
flagged_rows = complete_df['enthalten_in_dataset'] == 1
matched_pubmed_ids = complete_df.loc[flagged_rows, 'pubmedid']
matched_pubmed_ids.to_csv(matched_ids_csv_path, index=False, header=['pubmedid'])

# Share of question-side IDs that we can actually serve
percentage = (complete_df['enthalten_in_dataset'].sum() / len(complete_df)) * 100

print(f"Percentage of PubMed IDs with a 1: {percentage}%")



Percentage of PubMed IDs with a 1: 3.035565434843012%


Now we extract every question that has at least one PubMed ID among its answer documents that is present in our dataset, and save these questions to a JSON file. We also report how many questions should be answerable.

In [11]:
import os
import json
import pandas as pd

# Define the directories
json_dir = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets'
matched_ids_csv_path = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets/csv/matched_pubmed_ids.csv'
output_json_path = '~/Questions_answers_data/all_questions_in_system.json'

# Minimum number of a question's PubMed IDs that must be present in our dataset
# for the question to be kept. Raise it (e.g. to 2) for stricter selections.
MIN_MATCHES = 1

# Read the matched PubMed IDs
matched_ids_df = pd.read_csv(matched_ids_csv_path)
matched_pubmed_ids = set(matched_ids_df['pubmedid'])

# List all JSON files in the directory; sorted because os.listdir() order is
# arbitrary and the order of the saved questions should be reproducible
json_dir_expanded = os.path.expanduser(json_dir)
json_files = sorted(f for f in os.listdir(json_dir_expanded) if f.endswith('.json'))

# Entries that meet the criterion, and a counter for all entries seen
selected_entries = []
total_entries = 0

# Loop through each JSON file
for json_file in json_files:
    json_path = os.path.join(json_dir_expanded, json_file)

    # Read as UTF-8 explicitly rather than relying on the platform default encoding
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    questions = data.get('questions', [])
    total_entries += len(questions)

    for question in questions:
        # The PubMed ID is the last URL segment; a trailing '/' is stripped so
        # int() does not fail on an empty segment
        pubmed_ids = [
            int(url.rstrip('/').split('/')[-1])
            for url in question.get('documents', [])
        ]
        # Count how many PubMed IDs of this question are in the matched list
        match_count = sum(id_ in matched_pubmed_ids for id_ in pubmed_ids)
        if match_count >= MIN_MATCHES:
            selected_entries.append(question)

# Save the selected entries to a new JSON file.
# NOTE: entries are not de-duplicated here; the remove_duplicates cells further
# down in this notebook do that on the saved file.
with open(os.path.expanduser(output_json_path), 'w', encoding='utf-8') as file:
    json.dump({'questions': selected_entries}, file, indent=4)

# Print the total count of entries and the count of selected entries
print(f"Total entries in all JSON files: {total_entries}")
print(f"Total selected entries saved: {len(selected_entries)}")


Total entries in all JSON files: 30212
Total selected entries saved: 723


Note for later:

- If only one PubMed ID is required, we can answer 4000 questions.
- With at least 2, we can answer 2036.
- With at least 3: 1221.
- With at least 4: 731.
- With all: 23.

The script below does exactly the same as the one above, but it only counts a question if all of the PubMed IDs needed to answer it are in our dataset.

In [7]:
import os
import json
import pandas as pd

# Define the directories
json_dir = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets'
matched_ids_csv_path = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets/csv/matched_pubmed_ids.csv'
output_json_path = '~/Questions_answers_data/all_questions_with_all_ids_matched.json'  # Updated file name

# Read the matched PubMed IDs
matched_ids_df = pd.read_csv(matched_ids_csv_path)
matched_pubmed_ids = set(matched_ids_df['pubmedid'])

# List all JSON files in the directory; sorted because os.listdir() order is
# arbitrary and the order of the saved questions should be reproducible
json_dir_expanded = os.path.expanduser(json_dir)
json_files = sorted(f for f in os.listdir(json_dir_expanded) if f.endswith('.json'))

# Entries that meet the criterion, and a counter for all entries seen
selected_entries = []
total_entries = 0

# Loop through each JSON file
for json_file in json_files:
    json_path = os.path.join(json_dir_expanded, json_file)

    # Read as UTF-8 explicitly rather than relying on the platform default encoding
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    questions = data.get('questions', [])
    total_entries += len(questions)

    for question in questions:
        # The PubMed ID is the last URL segment; a trailing '/' is stripped so
        # int() does not fail on an empty segment
        pubmed_ids = [
            int(url.rstrip('/').split('/')[-1])
            for url in question.get('documents', [])
        ]
        # Keep the question only if it references at least one document and every
        # referenced ID is in the matched list. The non-empty check matters because
        # all() over an empty list is True, which would select questions that
        # have no documents at all.
        if pubmed_ids and all(id_ in matched_pubmed_ids for id_ in pubmed_ids):
            selected_entries.append(question)

# Save the selected entries to a new JSON file
with open(os.path.expanduser(output_json_path), 'w', encoding='utf-8') as file:
    json.dump({'questions': selected_entries}, file, indent=4)

# Print the total count of entries and the count of selected entries
print(f"Total entries in all JSON files: {total_entries}")
print(f"Total selected entries saved: {len(selected_entries)}")


Total entries in all JSON files: 30212
Total selected entries saved: 23


Since I did not think about it much beforehand, we now proceed to remove the duplicates from all generated question files.

In [2]:
import json


def remove_duplicates(json_file_path):
    """Drop questions with a repeated ``id`` from a JSON file, rewriting it in place.

    The file must contain a JSON object; its ``"questions"`` list (treated as
    empty when the key is absent) is de-duplicated by each question's ``"id"``.
    The first question seen for an ``id`` is kept and the original order is
    preserved. Questions without an ``id`` cannot be compared with each other,
    so every one of them is kept instead of being collapsed into a single entry.

    Args:
        json_file_path: Path of the JSON file to read and overwrite.

    Returns:
        A dict with ``initial_count``, ``final_count``, ``removed_count`` and
        ``type_counts``. ``type_counts`` maps each question ``type``
        (``"unknown"`` when missing) to its number of occurrences in the file
        as loaded, i.e. duplicates are included in that tally.
    """
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    questions = data.get("questions", [])

    seen_ids = set()
    unique_questions = []
    type_counts = {}
    for question in questions:
        # Tally the type of every loaded entry, duplicates included
        question_type = question.get("type", "unknown")
        type_counts[question_type] = type_counts.get(question_type, 0) + 1

        question_id = question.get("id")
        if question_id is not None:
            if question_id in seen_ids:
                continue  # a question with this id was already kept
            seen_ids.add(question_id)
        unique_questions.append(question)

    initial_count = len(questions)
    final_count = len(unique_questions)
    removed_count = initial_count - final_count

    # Overwrite the input file with the de-duplicated list
    data["questions"] = unique_questions
    with open(json_file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    return {
        "initial_count": initial_count,
        "final_count": final_count,
        "removed_count": removed_count,
        "type_counts": type_counts,
    }


# De-duplicate the `_min2` question file in place and report what changed
path = "/home/ubuntu/questions_answers_data/all_questions_in_system_min2.json"
result = remove_duplicates(path)

before, after, removed = (
    result[key] for key in ("initial_count", "final_count", "removed_count")
)
print(f"Entries before: {before}, Entries after: {after}, Duplicates removed: {removed}")
print("Entries per type:", result["type_counts"])



Entries before: 298, Entries after: 298, Duplicates removed: 0
Entries per type: {'factoid': 91, 'summary': 57, 'yesno': 87, 'list': 63}


In [3]:
import json


def remove_duplicates(json_file_path):
    """Drop questions with a repeated ``id`` from a JSON file, rewriting it in place.

    The file must contain a JSON object; its ``"questions"`` list (treated as
    empty when the key is absent) is de-duplicated by each question's ``"id"``.
    The first question seen for an ``id`` is kept and the original order is
    preserved. Questions without an ``id`` cannot be compared with each other,
    so every one of them is kept instead of being collapsed into a single entry.

    Args:
        json_file_path: Path of the JSON file to read and overwrite.

    Returns:
        A dict with ``initial_count``, ``final_count``, ``removed_count`` and
        ``type_counts``. ``type_counts`` maps each question ``type``
        (``"unknown"`` when missing) to its number of occurrences in the file
        as loaded, i.e. duplicates are included in that tally.
    """
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    questions = data.get("questions", [])

    seen_ids = set()
    unique_questions = []
    type_counts = {}
    for question in questions:
        # Tally the type of every loaded entry, duplicates included
        question_type = question.get("type", "unknown")
        type_counts[question_type] = type_counts.get(question_type, 0) + 1

        question_id = question.get("id")
        if question_id is not None:
            if question_id in seen_ids:
                continue  # a question with this id was already kept
            seen_ids.add(question_id)
        unique_questions.append(question)

    initial_count = len(questions)
    final_count = len(unique_questions)
    removed_count = initial_count - final_count

    # Overwrite the input file with the de-duplicated list
    data["questions"] = unique_questions
    with open(json_file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    return {
        "initial_count": initial_count,
        "final_count": final_count,
        "removed_count": removed_count,
        "type_counts": type_counts,
    }


# De-duplicate the main question file in place and report what changed
path = "/home/ubuntu/questions_answers_data/all_questions_in_system.json"
result = remove_duplicates(path)

before, after, removed = (
    result[key] for key in ("initial_count", "final_count", "removed_count")
)
print(f"Entries before: {before}, Entries after: {after}, Duplicates removed: {removed}")
print("Entries per type:", result["type_counts"])

Entries before: 4064, Entries after: 593, Duplicates removed: 3471
Entries per type: {'yesno': 1145, 'factoid': 1190, 'list': 900, 'summary': 829}


In [4]:
import json


def remove_duplicates(json_file_path):
    """Drop questions with a repeated ``id`` from a JSON file, rewriting it in place.

    The file must contain a JSON object; its ``"questions"`` list (treated as
    empty when the key is absent) is de-duplicated by each question's ``"id"``.
    The first question seen for an ``id`` is kept and the original order is
    preserved. Questions without an ``id`` cannot be compared with each other,
    so every one of them is kept instead of being collapsed into a single entry.

    Args:
        json_file_path: Path of the JSON file to read and overwrite.

    Returns:
        A dict with ``initial_count``, ``final_count``, ``removed_count`` and
        ``type_counts``. ``type_counts`` maps each question ``type``
        (``"unknown"`` when missing) to its number of occurrences in the file
        as loaded, i.e. duplicates are included in that tally.
    """
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    questions = data.get("questions", [])

    seen_ids = set()
    unique_questions = []
    type_counts = {}
    for question in questions:
        # Tally the type of every loaded entry, duplicates included
        question_type = question.get("type", "unknown")
        type_counts[question_type] = type_counts.get(question_type, 0) + 1

        question_id = question.get("id")
        if question_id is not None:
            if question_id in seen_ids:
                continue  # a question with this id was already kept
            seen_ids.add(question_id)
        unique_questions.append(question)

    initial_count = len(questions)
    final_count = len(unique_questions)
    removed_count = initial_count - final_count

    # Overwrite the input file with the de-duplicated list
    data["questions"] = unique_questions
    with open(json_file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    return {
        "initial_count": initial_count,
        "final_count": final_count,
        "removed_count": removed_count,
        "type_counts": type_counts,
    }


# Run the de-duplication on the main question file again and report what changed
path = "/home/ubuntu/questions_answers_data/all_questions_in_system.json"
result = remove_duplicates(path)

before, after, removed = (
    result[key] for key in ("initial_count", "final_count", "removed_count")
)
print(f"Entries before: {before}, Entries after: {after}, Duplicates removed: {removed}")
print("Entries per type:", result["type_counts"])

Entries before: 593, Entries after: 593, Duplicates removed: 0
Entries per type: {'yesno': 160, 'factoid': 179, 'list': 131, 'summary': 123}


In [1]:
import csv
import json


def read_pubmed_ids_from_csv(csv_path):
    """Collect the first column of a CSV file as a set of whitespace-stripped strings.

    Every non-blank row contributes its first cell. No header handling is done,
    so a header cell, if the file has one, ends up in the set as well. Blank
    lines and blank first cells are skipped: ``csv.reader`` yields an empty list
    for a blank line, which would otherwise raise ``IndexError``, and an empty
    string in the set could match a URL's empty last segment.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        Set of the non-empty first-column values as strings.
    """
    pubmed_ids = set()
    with open(csv_path, newline="") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue  # blank line
            value = row[0].strip()
            if value:
                pubmed_ids.add(value)
    return pubmed_ids


def filter_documents_in_json(json_path, pubmed_ids):
    """Load a question file and keep only the documents whose ID is in ``pubmed_ids``.

    The ID of a document is the last ``/``-separated segment of its URL, with any
    trailing ``/`` ignored, and is compared as a string. Questions without a
    ``"documents"`` key are treated as having no documents, in line with the
    ``.get('documents', [])`` handling used by the earlier cells. Nothing is
    written to disk here.

    Args:
        json_path: Path of the JSON file holding a ``"questions"`` list.
        pubmed_ids: Container of ID strings to keep (membership is tested with ``in``).

    Returns:
        Tuple ``(data, empty_document_count)``: the loaded data with each
        question's ``"documents"`` replaced by the filtered list, and the number
        of questions left with no documents.
    """
    with open(json_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    empty_document_count = 0

    for question in data.get("questions", []):
        filtered_documents = [
            doc
            for doc in question.get("documents", [])
            # rstrip: '.../pubmed/123/' would otherwise yield an empty last segment
            if doc.rstrip("/").split("/")[-1] in pubmed_ids
        ]
        question["documents"] = filtered_documents

        if not filtered_documents:
            empty_document_count += 1

    return data, empty_document_count


def save_updated_json(data, output_path):
    """Write ``data`` to ``output_path`` as JSON with a 2-space indent and print the path."""
    with open(output_path, "w") as file:
        file.write(json.dumps(data, indent=2))
    print(f"Updated JSON saved to {output_path}")


def main():
    """Filter the documents of the main question file by the matched PubMed IDs and save a copy.

    Reads the matched-ID CSV, strips every document whose ID is not listed from
    the questions in ``all_questions_in_system.json``, prints how many questions
    are left without documents, and writes the result to
    ``filtered_min1_questions_in_system.json``.
    """
    csv_path = "/home/ubuntu/questions_answers_data/bioASQ_data/trainings_sets/csv/matched_pubmed_ids.csv"
    json_path = "/home/ubuntu/questions_answers_data/all_questions_in_system.json"
    output_json_path = (
        "/home/ubuntu/questions_answers_data/filtered_min1_questions_in_system.json"
    )

    filtered_data, empty_count = filter_documents_in_json(
        json_path, read_pubmed_ids_from_csv(csv_path)
    )

    print(f"Number of questions with empty 'documents': {empty_count}")
    # The input file is left untouched; the filtered data goes to a separate file
    save_updated_json(filtered_data, output_json_path)


main()

Number of questions with empty 'documents': 0
Updated JSON saved to /home/ubuntu/questions_answers_data/filtered_min1_questions_in_system.json


In [2]:
import csv
import json


def read_pubmed_ids_from_csv(csv_path):
    """Collect the first column of a CSV file as a set of whitespace-stripped strings.

    Every non-blank row contributes its first cell. No header handling is done,
    so a header cell, if the file has one, ends up in the set as well. Blank
    lines and blank first cells are skipped: ``csv.reader`` yields an empty list
    for a blank line, which would otherwise raise ``IndexError``, and an empty
    string in the set could match a URL's empty last segment.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        Set of the non-empty first-column values as strings.
    """
    pubmed_ids = set()
    with open(csv_path, newline="") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue  # blank line
            value = row[0].strip()
            if value:
                pubmed_ids.add(value)
    return pubmed_ids


def filter_documents_in_json(json_path, pubmed_ids):
    """Load a question file and keep only the documents whose ID is in ``pubmed_ids``.

    The ID of a document is the last ``/``-separated segment of its URL, with any
    trailing ``/`` ignored, and is compared as a string. Questions without a
    ``"documents"`` key are treated as having no documents, in line with the
    ``.get('documents', [])`` handling used by the earlier cells. Nothing is
    written to disk here.

    Args:
        json_path: Path of the JSON file holding a ``"questions"`` list.
        pubmed_ids: Container of ID strings to keep (membership is tested with ``in``).

    Returns:
        Tuple ``(data, empty_document_count)``: the loaded data with each
        question's ``"documents"`` replaced by the filtered list, and the number
        of questions left with no documents.
    """
    with open(json_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    empty_document_count = 0

    for question in data.get("questions", []):
        filtered_documents = [
            doc
            for doc in question.get("documents", [])
            # rstrip: '.../pubmed/123/' would otherwise yield an empty last segment
            if doc.rstrip("/").split("/")[-1] in pubmed_ids
        ]
        question["documents"] = filtered_documents

        if not filtered_documents:
            empty_document_count += 1

    return data, empty_document_count


def save_updated_json(data, output_path):
    """Write ``data`` to ``output_path`` as JSON with a 2-space indent and print the path."""
    with open(output_path, "w") as file:
        file.write(json.dumps(data, indent=2))
    print(f"Updated JSON saved to {output_path}")


def main():
    """Filter the documents of the `_min2` question file by the matched PubMed IDs and save a copy.

    Reads the matched-ID CSV, strips every document whose ID is not listed from
    the questions in ``all_questions_in_system_min2.json``, prints how many
    questions are left without documents, and writes the result to
    ``filtered_min2_questions_in_system.json``.
    """
    csv_path = "/home/ubuntu/questions_answers_data/bioASQ_data/trainings_sets/csv/matched_pubmed_ids.csv"
    json_path = "/home/ubuntu/questions_answers_data/all_questions_in_system_min2.json"
    output_json_path = (
        "/home/ubuntu/questions_answers_data/filtered_min2_questions_in_system.json"
    )

    filtered_data, empty_count = filter_documents_in_json(
        json_path, read_pubmed_ids_from_csv(csv_path)
    )

    print(f"Number of questions with empty 'documents': {empty_count}")
    # The input file is left untouched; the filtered data goes to a separate file
    save_updated_json(filtered_data, output_json_path)


main()

Number of questions with empty 'documents': 0
Updated JSON saved to /home/ubuntu/questions_answers_data/filtered_min2_questions_in_system.json


Here is a script that extracts 20 list-type questions in order to find an optimal way to evaluate this kind of question.


In [3]:
import json

# Load the data from the JSON file
with open(
    "/home/ubuntu/questions_answers_data/filtered_min2_questions_in_system.json", "r"
) as file:
    data = json.load(file)

# Keep at most the first 20 entries whose type is "list". Slicing already copes
# with fewer than 20 matches, so no separate length check is needed; .get()
# skips entries that have no "type" instead of raising KeyError.
list_entries = [
    entry for entry in data["questions"] if entry.get("type") == "list"
][:20]

# Create a new dictionary with these entries
output_data = {"questions": list_entries}

# Save the filtered data to a new JSON file
with open("/home/ubuntu/questions_answers_data/listexample.json", "w") as outfile:
    json.dump(output_data, outfile, indent=4)

# Report the number actually written; it is below 20 when the input has fewer matches
print(
    f"File 'listexample.json' has been created with {len(list_entries)} 'list' type entries."
)

File 'listexample.json' has been created with 20 'list' type entries.


In [4]:
import json

# Load the data from the JSON file
with open(
    "/home/ubuntu/questions_answers_data/filtered_min2_questions_in_system.json", "r"
) as file:
    data = json.load(file)

# Keep at most the first 20 entries whose type is "factoid". Slicing already
# copes with fewer than 20 matches; .get() skips entries that have no "type"
# instead of raising KeyError.
factoid_entries = [
    entry for entry in data["questions"] if entry.get("type") == "factoid"
][:20]

# Create a new dictionary with these entries
output_data = {"questions": factoid_entries}

# Save the filtered data to a new JSON file
with open("/home/ubuntu/questions_answers_data/factoidexample.json", "w") as outfile:
    json.dump(output_data, outfile, indent=4)

# The message names the file that was really written and the real entry count
print(
    f"File 'factoidexample.json' has been created with {len(factoid_entries)} 'factoid' type entries."
)

File 'listexample.json' has been created with 20 'factoid' type entries.


In [2]:
import json
import os


def process_json(input_file, output_dir):
    """Split a question file into one JSON file per question type.

    Questions from ``input_file`` (a JSON object with a ``"questions"`` list) are
    grouped by their ``"type"`` into ``yesno``, ``list``, ``summary`` and
    ``factoid``. Each group is written to ``<output_dir>/<type>_questions.json``
    as a bare JSON list, also when the group is empty. Questions with any other
    type, or with no type at all (reported as ``'unknown'``), are skipped with a
    printed warning.

    Args:
        input_file: Path of the JSON file to read.
        output_dir: Directory for the per-type files; created if missing.
    """
    with open(input_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    # One bucket per supported question type
    questions_by_type = {"yesno": [], "list": [], "summary": [], "factoid": []}

    for question in data["questions"]:
        # A missing type is handled like any other unrecognized type instead of
        # aborting the whole split with a KeyError
        question_type = question.get("type", "unknown")
        if question_type in questions_by_type:
            questions_by_type[question_type].append(question)
        else:
            print(
                f"Warning: Unrecognized question type '{question_type}' found and will be ignored."
            )

    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(output_dir, exist_ok=True)

    # Write each group to a separate file
    for type_key, questions in questions_by_type.items():
        output_file_name = os.path.join(output_dir, f"{type_key}_questions.json")
        with open(output_file_name, "w", encoding="utf-8") as file:
            json.dump(questions, file, indent=4)
        print(f"Written {len(questions)} entries to {output_file_name}")


# Split the filtered `_min2` question file into one file per question type
process_json(
    input_file="/home/ubuntu/questions_answers_data/filtered_min2_questions_in_system.json",
    output_dir="/home/ubuntu/questions_answers_data/Experimental_data_min_2",
)

Written 87 entries to /home/ubuntu/questions_answers_data/Experimental_data_min_2/yesno_questions.json
Written 63 entries to /home/ubuntu/questions_answers_data/Experimental_data_min_2/list_questions.json
Written 57 entries to /home/ubuntu/questions_answers_data/Experimental_data_min_2/summary_questions.json
Written 91 entries to /home/ubuntu/questions_answers_data/Experimental_data_min_2/factoid_questions.json
