# Additional parsing of the dataset

In this notebook we further parse the data and combine the training13b.json and retrieved_articles_sampled.json files.
We also remove data that may not be relevant for extraction.

In [3]:
import json


# Path to the sampled retrieved-articles JSON file (its 'results' entry is read when the data is loaded)
retreived_articles_sampled_path = "../datasets/test/batch_1/retrieved_articles_sampled_test_batch_1.json"

# Path to the original file downloaded from the BioASQ site (its 'questions' entry is read when the data is loaded)
# NOTE(review): this path has no file extension although the file is parsed as JSON — confirm the on-disk name
original_downloaded_file_path = '../datasets/test/batch_1/BioASQ-task13bPhaseB-testset1'

# Path of the parsed JSON file that this notebook writes as its final output
parsed_data_path = "../datasets/test/batch_1/parsed_data_final_test_batch_1.json"

In [4]:
# Sampled retrieval output: parse the file, then keep what is stored under the 'results' key
with open(retreived_articles_sampled_path, encoding='utf-8') as sampled_file:
    sampled_payload = json.load(sampled_file)
retreived_articles_sampled_data = sampled_payload['results']

# File downloaded from the BioASQ site: parse it, then keep what is stored under the 'questions' key
with open(original_downloaded_file_path, encoding='utf-8') as bioasq_file:
    bioasq_payload = json.load(bioasq_file)
training13b_data = bioasq_payload['questions']

In [5]:
# Helper that turns a PubMed ID into a document link, in the same form as the links stored in training13b.json
def convert_pid_to_link(pid):
    """
    Convert a PubMed ID to a PubMed link.

    Parameters
    ----------
    pid : str or int or None
        A bare PubMed ID (e.g. "39555889" or 39555889) or a value that is
        already a link.

    Returns
    -------
    The link "http://www.ncbi.nlm.nih.gov/pubmed/<pid>" for a bare ID.
    Falsy values (None, "", 0) and values that are already links are
    returned unchanged.
    """
    link_prefix = "http://www.ncbi.nlm.nih.gov/pubmed/"

    # Nothing to convert: hand back None / "" / 0 exactly as received.
    if not pid:
        return pid

    # JSON may carry the ID as a number; string operations below need text.
    pid_text = str(pid)

    # Already a link (ours, or any other http/https URL): prefixing it again
    # would produce a malformed "prefix + full URL" string.
    if link_prefix in pid_text or pid_text.startswith(("http://", "https://")):
        return pid

    return link_prefix + pid_text

Now we will extract the relevant information from each dataset:
- retrieved_articles_sampled.json -> [question id, question, error_rate, ground truth list, articles (with info)]
- training13b.json -> [question id, ground truth snippets]

We will first create a dictionary from training13b.json, because we only need its ground truth snippets.


In [6]:
# Index the ground-truth snippets by question id so they can be looked up directly
training13b_gt_snippets = {
    question['id']: question['snippets'] for question in training13b_data
}

In [7]:
# Build one merged record per sampled question: its retrieval details with
# every PubMed ID rewritten as a link, plus the ground-truth snippets looked
# up by question id.
retreived_articles_sampled_relevant_info = []
# The loop variable is deliberately not called `object`: that would rebind the
# builtin for every cell executed afterwards in the same kernel.
for sampled_question in retreived_articles_sampled_data:
    # Copy each article with its pid converted to a link. Building new dicts
    # (rather than assigning into the loaded ones) leaves the loaded data
    # unchanged, so this cell can be re-run from the same input.
    relevant_articles = [
        {**article, 'pid': convert_pid_to_link(article['pid'])}
        for article in sampled_question['details']
    ]

    relevant_element = {
        'qid': sampled_question['qid'],
        'question': sampled_question['question'],
        'ground_truth_documents_pid': [
            convert_pid_to_link(gt_pid) for gt_pid in sampled_question['ground_truth']
        ],
        'error_rate': sampled_question['error_rate'],
        # Raises KeyError if the question id is absent from the BioASQ file
        'ground_truth_snippets': training13b_gt_snippets[sampled_question['qid']],
        'all_retreived_articles': relevant_articles
    }
    retreived_articles_sampled_relevant_info.append(relevant_element)

# Show the first three merged records as a sanity check
print(retreived_articles_sampled_relevant_info[0:3])

[{'qid': '67d74cde18b1e36f2e00003c', 'question': 'Describe RankMHC', 'ground_truth_documents_pid': ['http://www.ncbi.nlm.nih.gov/pubmed/39555889'], 'error_rate': {'value': 1.0, 'details': '1 found out of 1'}, 'ground_truth_snippets': [{'beginSection': 'title', 'endSection': 'title', 'text': 'RankMHC: Learning to Rank Class-I Peptide-MHC Structural Models.', 'document': 'http://www.ncbi.nlm.nih.gov/pubmed/39555889', 'offsetInBeginSection': 0, 'offsetInEndSection': 64}, {'beginSection': 'abstract', 'endSection': 'abstract', 'text': 'We present RankMHC, an LTR-based pMHC binding mode identification predictor, which is specifically trained to predict the most accurate ranking of an ensemble of pMHC conformations. RankMHC outperforms classical peptide-ligand scoring functions, as well as previous Machine Learning (ML)-based binding pose predictors. We further demonstrate that RankMHC can be used with many pMHC structural modeling tools that use different structural modeling protocols.', 'do

In [8]:
# Persist the merged records: wrap them under a top-level 'data' key and
# store them as JSON indented by 4 spaces at parsed_data_path.
parsed_data_final = dict(data=retreived_articles_sampled_relevant_info)

# Serialize first, so the output file is only opened once the JSON text exists
parsed_json_text = json.dumps(parsed_data_final, indent=4)

with open(parsed_data_path, mode='w') as parsed_file:
    parsed_file.write(parsed_json_text)
