In [19]:
import jsonlines
import json
import os

# JSON Handling

#### Convert Spark's JsonLine format to valid Json
Spark creates a special kind of JSON output: multiple JSON objects are simply concatenated and separated only by a newline. Such a file is not valid JSON as a whole and can't be inspected easily: the outer body is missing and the whole structure is not an array. Moreover, these files are excessively big. Therefore we split these JsonLine files into multiple JSON files - one for each block (brackets) within. Having valid JSON files allows using tree viewers for inspection.

In [20]:
# Input location: the data root plus the sub-folder that holds Spark's raw
# JsonLine output files.
data_base_path, experiment_json_folder = "../../data/", "indicatorOutput/rawSpark"
raw_jsonline_input_path = os.path.join(data_base_path, experiment_json_folder)

In [21]:
# Output location: the split-up json files go into their own sub-folder of
# the same data root.
cleaned_json_folder = "indicatorOutput/refinedNotebook"
refined_json_output_path = os.path.join(
    data_base_path,
    cleaned_json_folder,
)

In [22]:
# Collect Spark's JsonLine output files. os.listdir returns entries in
# arbitrary order, so sort them to make the run reproducible.
file_list = sorted(
    os.path.join(raw_jsonline_input_path, file)
    for file in os.listdir(raw_jsonline_input_path)
    if file.endswith('.json')
)

# The output folder may not exist yet (e.g. on a fresh checkout); without it
# the open(..., 'w') below raises FileNotFoundError.
os.makedirs(refined_json_output_path, exist_ok=True)

# Write one json file per JsonLine record, named
# <paper_id>_p<paragraph_index>.json. A record with the same paper_id and
# paragraph_index as an earlier one overwrites that earlier file.
for file in file_list:
    with jsonlines.open(file, 'r') as reader:
        for item in reader:
            file_name = item['paper_id'] + '_p' + str(item['paragraph_index']) + '.json'
            with open(os.path.join(refined_json_output_path, file_name), 'w') as outfile:
                json.dump(item, outfile)

# HTML Conversion

In [73]:
# Sub-folder of the data root that receives the generated html files.
tagged_html_folder = "indicatorOutput/taggedHtml"
tagged_html_output_path = os.path.join(
    data_base_path,
    tagged_html_folder,
)
# The html conversion reads the refined json files written by the
# JSON-handling section above.
tagged_json_input_path = refined_json_output_path

In [74]:
# Collect the refined json files. os.listdir returns entries in arbitrary
# order, so sort them; otherwise the order of `data` (and therefore which
# paragraph ends up first) changes from run to run.
file_list = sorted(
    os.path.join(tagged_json_input_path, file)
    for file in os.listdir(tagged_json_input_path)
    if file.endswith('.json')
)

# Parse every file into a Python object; `data` keeps the order of file_list.
data = []
for file in file_list:
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(file, encoding='utf-8') as json_file:
        data.append(json.load(json_file))

In [75]:
open_tag = '<b>'
close_tag = '</b>'
open_html_tag = '<html>'
close_html_tag = '</html>'
open_body_tag = '<body>'
close_body_tag = '</body>'

def convertToHtml(data_dict):
    text = data_dict["text"]
    smoking_data = data_dict['medical_indicator_words']
    smoking_dicts = sorted(smoking_data, key=lambda k: k['begin'], reverse=True)
    
    for word in smoking_dicts:
        text = text.replace(word['word'], open_tag + word['word'] + close_tag)
        
    return {'paper_id': data_dict['paper_id'], 'paragraph_index': data_dict['paragraph_index'], 'html': open_html_tag + open_body_tag + text + close_body_tag + close_html_tag}

In [76]:
# Convert every loaded paragraph; html_list keeps the order of `data`.
html_list = [convertToHtml(paragraph) for paragraph in data]

In [77]:
# Spot-check the first converted paragraph (raises IndexError when html_list is empty).
print(html_list[0])

{'paper_id': '0a6ce864e9696cf6bc21a64bb8b1abda99d4744a', 'paragraph_index': 11, 'html': '<html><body><b>obesity</b> can reduce immune cell functionality, induce gut microbiome/virome imbalance, inflammatory cytokine phenotype and increase antiviral, antimicrobial and anticoagulant resistance as depicted in Fig. 1 . in over<b>weight</b> children, anti-tetanus igg antibodies were significantly lower compared to normal <b>weight</b> controls due to the chronic low grade inflammation expressed by the higher levels of il-6 (84). Similarly, researchers reported in a 2019 study that reduction or elimination of food in over<b>weight</b>/obese adults can lead to a decreased igg concentration over time (85) .</body></html>'}


In [80]:
# The output folder may not exist yet (e.g. on a fresh checkout); without it
# the open(..., 'w') below raises FileNotFoundError.
os.makedirs(tagged_html_output_path, exist_ok=True)

# Write one html file per converted paragraph.
# NOTE(review): the json files are named '<paper_id>_p<index>.json' while the
# html files use '<paper_id>p<index>.html' (no underscore). The name is kept
# as is because other tooling may rely on it - confirm and align if not.
for page in html_list:
    html_path = os.path.join(tagged_html_output_path, page['paper_id'] + 'p' + str(page['paragraph_index']) + '.html')
    # Explicit encoding: the paragraph text may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to handle them.
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(page['html'])