In [None]:
import csv
import json
import re

# Input: question bank in JSON Lines format; it is opened and parsed line by line below
file_path = 'US_qbank.jsonl'
# Output: CSV file that receives the filtered questions
output_csv = 'output_questions.csv'

def fix_encoding(text):
    """Attempt to undo UTF-8 text that was mistakenly decoded as Latin-1.

    The string is re-encoded as Latin-1 and the resulting bytes are decoded
    as UTF-8. If either step fails, the input is returned unchanged.
    """
    try:
        raw_bytes = text.encode('latin1')
        repaired = raw_bytes.decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The round trip is not possible for this text; hand it back untouched
        return text
    else:
        return repaired
    
# Load the dataset and keep the questions whose text mentions the search term.
# The result list was originally called `asthma_questions` although the filter
# looks for 'endoscopy'; that name is kept as an alias for backward compatibility.
search_term = 'endoscopy'
matching_questions = []
asthma_questions = matching_questions  # legacy alias (same list object)

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line), which json.loads rejects
            if not line.strip():
                continue
            line = fix_encoding(line)
            question = json.loads(line)
            # Records without a string 'question' field cannot match; skip them
            # instead of failing with KeyError/AttributeError
            question_text = question.get('question') if isinstance(question, dict) else None
            if not isinstance(question_text, str):
                continue
            # Case-insensitive substring match on the question text
            if search_term in question_text.lower():
                matching_questions.append(question)
except UnicodeDecodeError:
    print("Encoding error: The file may contain non-UTF-8 encoded characters.")
    # Reading stopped at the undecodable bytes, so only part of the file was scanned
    print(f"Warning: only {len(matching_questions)} matching questions were collected "
          "before the error; the results below are incomplete.")

# Output the first few matching questions and total count
print(matching_questions[:5], len(matching_questions))  # Show first 5 entries and total count

# Write the matching questions to CSV
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['question', 'answer_detail', 'answer']  # Adjust field names based on JSON structure
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for question in matching_questions:
        # Tolerate a missing or non-dict 'options' value
        options = question.get('options')
        if not isinstance(options, dict):
            options = {}
        # Tolerate a missing or null 'answer' value
        raw_answer = question.get('answer')
        answer_label = '' if raw_answer is None else str(raw_answer).strip()
        # Look up the text of the option selected by the answer label
        answer = options.get(answer_label, '')
        writer.writerow({
            'question': question['question'],
            'answer_detail': answer,
            'answer': answer_label,
        })

print(f"Endoscopy-related questions saved to {output_csv}")


In [9]:
import csv
import json
import re
import pandas as pd

# Main question bank (JSON Lines); loaded into `df` with pd.read_json below
file_path = 'US_qbank.jsonl'
# CSV output path (same value as in the first cell)
output_csv = 'output_questions.csv'

# Second JSON Lines file; loaded into `df_additional` below to inspect its meta_info values
additional_path = 'US_qbank_extracted_phrases.jsonl'

In [3]:
# Read the JSON Lines question bank into a DataFrame, then preview its first rows
df = pd.read_json(path_or_buf=file_path, lines=True)
df.head(n=5)

Unnamed: 0,question,answer,options,meta_info
0,A 4670-g (10-lb 5-oz) male newborn is delivere...,D,"{'A': 'Nerve conduction study', 'B': 'Surgical...",step2
1,A 66-year-old man comes to the physician for a...,F,"{'A': 'Cirrhosis', 'B': 'Acute lymphoblastic l...",step2
2,A 40-year-old man comes to the physician becau...,E,"{'A': 'Asphyxia', 'B': 'Achlorhydria', 'C': 'T...",step2
3,A 4-year-old boy is brought to the emergency d...,E,{'A': 'Get consent from the patient's brother'...,step2
4,A 9-year-old girl is brought to the emergency ...,C,"{'A': 'Oculomotor nerve damage', 'B': 'Retrobu...",step2


In [8]:
# meta_info labels each question; list the distinct labels present in the main question bank
pd.unique(df['meta_info'])

array(['step2', 'step1'], dtype=object)

In [10]:
# The main file only contains 'step2' and 'step1'; load the additional file to see
# whether it has other meta_info labels
df_additional = pd.read_json(path_or_buf=additional_path, lines=True)
df_additional.head(n=5)

Unnamed: 0,question,answer,options,meta_info,answer_idx,metamap_phrases
0,A 23-year-old pregnant woman at 22 weeks gesta...,Nitrofurantoin,"{'A': 'Ampicillin', 'B': 'Ceftriaxone', 'C': '...",step2&3,E,"[23 year old, weeks presents, burning, urinati..."
1,A 3-month-old baby died suddenly at night whil...,Placing the infant in a supine position on a f...,{'A': 'Placing the infant in a supine position...,step2&3,A,"[3 month old baby died, night, asleep, died on..."
2,A mother brings her 3-week-old infant to the p...,Abnormal migration of ventral pancreatic bud,{'A': 'Abnormal migration of ventral pancreati...,step1,A,"[week old infant, pediatrician's office, born,..."
3,A pulmonary autopsy specimen from a 58-year-ol...,Thromboembolism,"{'A': 'Thromboembolism', 'B': 'Pulmonary ische...",step1,A,"[pulmonary autopsy, 58 year old woman, died, a..."
4,A 20-year-old woman presents with menorrhagia ...,Von Willebrand disease,"{'A': 'Factor V Leiden', 'B': 'Hemophilia A', ...",step1,E,"[20 year old woman presents, menorrhagia, past..."


In [11]:
# Distinct meta_info labels present in the additional file
pd.unique(df_additional['meta_info'])

array(['step2&3', 'step1'], dtype=object)

In [12]:
# The additional file's meta_info values are only 'step2&3' and 'step1', so it is
# ignored here and only the original file is searched.
# Select the questions whose text mentions any of the keywords below (case-insensitive).
# All keywords are combined into one pattern so each row is tested once and a question
# that mentions several keywords appears only once in the result.
keywords = ['colonoscopy', 'endoscopy', 'sigmoidoscopy', 'gastro']
# Escape each keyword so it is matched literally even if it contains regex metacharacters
pattern = '|'.join(re.escape(keyword) for keyword in keywords)
# na=False: a row with missing question text counts as "no match" instead of putting
# NaN into the mask, which would make the boolean indexing below fail
mask = df['question'].str.contains(pattern, case=False, na=False)
endoscopy_questions = df[mask]

# Output the first few endoscopy-related questions and total count
print(endoscopy_questions.head(), len(endoscopy_questions))  # Show first 5 entries and total count


                                              question answer  \
9    A 50-year-old man comes to the physician becau...      B   
16   A 65-year-old Asian woman comes to the physici...      C   
114  A gastroenterology fellow is interested in the...      C   
119  A 23-year-old woman with Ehlers-Danlos syndrom...      A   
171  A 60-year-old man comes to the physician for a...      D   

                                               options meta_info  
9    {'A': 'Injection sclerotherapy', 'B': 'Nadolol...     step2  
16   {'A': 'Cholesterol embolism', 'B': 'Traumatic ...     step2  
114  {'A': 'Random error', 'B': 'Effect modificatio...     step2  
119  {'A': 'Arthroscopy', 'B': 'Above knee cast', '...     step2  
171  {'A': 'Perform prostate biopsy', 'B': 'Obtain ...     step2   502


In [13]:
# Persist the filtered questions as JSON Lines: one record (row) per line
output_jsonl = 'endoscopy_questions.jsonl'
endoscopy_questions.to_json(
    path_or_buf=output_jsonl,
    orient='records',
    lines=True,
)
print(f"Endoscopy-related questions saved to {output_jsonl}")

Endoscopy-related questions saved to endoscopy_questions.jsonl


In [14]:
# Re-import the JSONL file to check that the export was successful
df_exported = pd.read_json(output_jsonl, lines=True)
# Verify the round trip explicitly instead of relying only on a visual check of the preview
assert len(df_exported) == len(endoscopy_questions), (
    f"Export check failed: {len(df_exported)} rows read back, "
    f"{len(endoscopy_questions)} rows expected"
)
print(df_exported.head())


                                            question answer  \
0  A 50-year-old man comes to the physician becau...      B   
1  A 65-year-old Asian woman comes to the physici...      C   
2  A gastroenterology fellow is interested in the...      C   
3  A 23-year-old woman with Ehlers-Danlos syndrom...      A   
4  A 60-year-old man comes to the physician for a...      D   

                                             options meta_info  
0  {'A': 'Injection sclerotherapy', 'B': 'Nadolol...     step2  
1  {'A': 'Cholesterol embolism', 'B': 'Traumatic ...     step2  
2  {'A': 'Random error', 'B': 'Effect modificatio...     step2  
3  {'A': 'Arthroscopy', 'B': 'Above knee cast', '...     step2  
4  {'A': 'Perform prostate biopsy', 'B': 'Obtain ...     step2  
