In [1]:
import pypandoc
from bs4 import BeautifulSoup
import re
import json
from pathlib import Path
from copy import deepcopy
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import pysbd

In [2]:
def extract_divs(html):
    """Return every ``<div>`` element found in ``html``.

    The markup is parsed with BeautifulSoup's built-in ``html.parser`` and the
    result of the CSS selector ``div`` is returned unchanged.
    """
    return BeautifulSoup(html, 'html.parser').select('div')

def clean_div(div):
    """Render ``div`` as text with superscripts rewritten as bracketed markers.

    ``div`` is converted with ``str()``; every ``<sup>X</sup>`` that sits on a
    single line becomes a space followed by ``[X]``; the result is returned
    with leading and trailing whitespace removed.
    """
    superscript = re.compile(r'<sup>(.*?)</sup>')
    return superscript.sub(r' [\1]', str(div)).strip()

def convert_to_markdown_sections(html_div):
    """Convert an HTML fragment to GitHub-flavoured Markdown without ``<div>`` tags.

    The fragment is passed to pypandoc (``html`` -> ``gfm``). Attribute-less
    ``<div>`` / ``</div>`` tags that remain in the output are then removed.
    """
    markdown = pypandoc.convert_text(html_div, 'gfm', 'html')
    # The literal "<div>\n" is removed first, on its own: the regex below also
    # consumes whitespace that follows a tag, so swapping the two steps would
    # strip more than the original did.
    without_opening_div = markdown.replace('<div>\n', '')
    return re.sub(r'</?div>[\s\n]*', '', without_opening_div)

def clean_sections(markdown_conversion):
    """Split Markdown text into sections, one per heading of level two or deeper.

    A new section starts at every line that begins with two or more ``#``
    characters. Text before the first such line forms its own leading section.

    Anchoring the split to the start of a line matters for two reasons:
    an inline ``##`` inside running text must not open a section, and a
    deeper heading such as ``#### Title`` must stay intact instead of being
    broken into a stray ``##`` section followed by a demoted ``## Title``.

    Args:
        markdown_conversion: Markdown text (str).

    Returns:
        list[str]: the sections in document order, each stripped of
        surrounding whitespace. A whitespace-only leading chunk is kept as
        an empty string; an empty input yields an empty list.
    """
    heading_starts = [
        match.start()
        for match in re.finditer(r'^#{2,}', markdown_conversion, flags=re.MULTILINE)
    ]
    # Consecutive boundaries delimit the sections; the leading 0 captures any
    # text that precedes the first heading.
    boundaries = [0] + heading_starts + [len(markdown_conversion)]
    pieces = (
        markdown_conversion[start:end]
        for start, end in zip(boundaries, boundaries[1:])
    )
    return [piece.strip() for piece in pieces if piece]

def condense_paragraphs(sentences):
    """Join segmented sentences into one passage, trimming line breaks.

    Each sentence is normalised by the first rule that applies:

    * its first non-whitespace character is ``-``: a newline is prepended and
      the sentence is otherwise left untouched;
    * it ends with a newline: exactly one trailing newline is removed (so a
      sentence ending in a blank line keeps a single newline);
    * it ends with a newline followed only by spaces: it is fully stripped;
    * otherwise it is kept as is.

    Empty and whitespace-only sentences are handled by the same rules rather
    than raising ``IndexError``.

    Args:
        sentences: iterable of str.

    Returns:
        str: the normalised sentences joined with single spaces.
    """
    condensed = []
    for sentence in sentences:
        # startswith/endswith are safe on empty strings, unlike indexing [0].
        if sentence.strip().startswith('-'):
            condensed.append('\n' + sentence)
        elif sentence.endswith('\n'):
            condensed.append(sentence[:-1])
        elif sentence.strip(' ').endswith('\n'):
            condensed.append(sentence.strip())
        else:
            condensed.append(sentence)
    return ' '.join(condensed)

def label_section(sections):
    """Tag each section string with a coarse type.

    A section containing ``#_Toc`` is a ``table_of_contents``; otherwise one
    containing a run of nine dashes is a ``header``; anything else is a plain
    ``section``.

    Returns:
        list[dict]: one ``{'text': ..., 'type': ...}`` dict per input section,
        in input order.
    """
    def _classify(text):
        # The table-of-contents marker takes precedence over the dash rule.
        if '#_Toc' in text:
            return 'table_of_contents'
        if '---------' in text:
            return 'header'
        return "section"

    return [{'text': section, 'type': _classify(section)} for section in sections]

def clean_passage(sentences):
    """Condense sentences into one passage and remove leftover markup.

    The sentences are first joined by ``condense_paragraphs``. The
    substitutions below then run in a fixed order; the ``strip()`` sits
    between the second and third substitution, so text exposed by the later
    removals is not re-stripped.
    """
    passage = condense_paragraphs(sentences)

    # An escaped bracket wrapping an anchor (``\[<a ...>word<...]``) collapses
    # to ``[word]``.
    passage = re.sub(r"\\\[<a.*?>(\w*)<.*?\]", r'[\1]', passage)
    # Whole <span>...</span> elements are dropped, including multi-line ones.
    passage = re.sub(r"<span.*?>.*?</span>", "", passage, flags=re.DOTALL)
    passage = passage.strip()

    removal_patterns = (
        r'\[\]\(#_.*?\)',                # links with empty text to "#_..." anchors
        r'\[.*?Table.*?\]\(#_.*?\)',     # "#_..." links whose text mentions Table
        r'\[.*?Figure.*?\]\(#_.*?\)',    # "#_..." links whose text mentions Figure
        r'\*{4,}',                       # runs of four or more asterisks
        r'<div.*?>',                     # opening div tags, with or without attributes
    )
    for pattern in removal_patterns:
        passage = re.sub(pattern, '', passage)
    return passage

def process_div(div):
    """Turn one HTML element into a list of cleaned Markdown passages.

    Pipeline: ``clean_div`` -> ``convert_to_markdown_sections`` ->
    ``clean_sections``. Each resulting section is then split into sentences
    with an English pysbd segmenter (``clean=False``) and passed through
    ``clean_passage``.

    Returns:
        list[str]: one cleaned passage per section, in section order.
    """
    segmenter = pysbd.Segmenter(language="en", clean=False)
    markdown = convert_to_markdown_sections(clean_div(div))
    return [
        clean_passage(segmenter.segment(section))
        for section in clean_sections(markdown)
    ]

def extract_unique_segments(html_segments):
    """Drop empty and duplicate segments, plus a trailing ``Footnotes`` segment.

    Segments are compared by their stripped ``get_text()`` value. The first
    segment carrying a given text is kept and later ones with the same text
    are discarded. If the last surviving segment's text is exactly
    ``Footnotes``, it is removed as well.

    Args:
        html_segments: iterable of objects exposing ``get_text()``.

    Returns:
        list: the surviving segments in their original order; an empty list
        when every segment is empty.
    """
    seen_texts = set()  # set membership keeps deduplication linear
    unique_segments = []
    for segment in html_segments:
        text = segment.get_text().strip()
        if text == '' or text in seen_texts:
            continue
        seen_texts.add(text)
        unique_segments.append(segment)
    # Guard against an empty result before looking at the last element.
    if unique_segments and unique_segments[-1].get_text().strip() == 'Footnotes':
        unique_segments = unique_segments[:-1]
    return unique_segments

def process_div_segments_to_text(unique_segments):
    """Convert the first segment into a list of cleaned text sections.

    Only ``unique_segments[0]`` is processed.
    NOTE(review): presumably the first ``<div>`` is the outermost one and
    already contains the others -- confirm against the input HTML.

    Passages consisting solely of ``#`` characters (bare heading markers) are
    discarded; the rest are returned stripped.

    Args:
        unique_segments: sequence of segments accepted by ``process_div``.

    Returns:
        list[str]: cleaned sections; an empty list when there are no segments.
    """
    if not unique_segments:
        return []
    passages = process_div(unique_segments[0])
    return [
        passage.strip()
        for passage in passages
        if not re.match(r'^\#+$', passage.strip())
    ]

def extract_json_information(report_json):
    """Flatten a report's JSON record into a single metadata dict.

    Copies the report-level fields, then the ``date``/``title``/``summary``
    of the version whose ``date`` sorts highest, stores that version's ``id``
    as ``doc_id``, and records the matching HTML file name under ``filename``.

    The HTML file is located by scanning the module-level ``html_folder`` for
    ``*.html`` files whose name contains the report ``id``. The file names
    listed under the version's ``formats`` entries are not consulted.
    ``filename`` is ``None`` when nothing matches.
    """
    document = {
        field: report_json[field]
        for field in ('id', 'type', 'typeId', 'number', 'active', 'topics')
    }

    # Descending sort by date; the first element is the version used.
    latest_version = sorted(
        report_json['versions'], key=lambda version: version['date'], reverse=True
    )[0]

    matching_names = [
        path.name
        for path in html_folder.glob('*.html')
        if report_json['id'] in path.name
    ]
    file_name = matching_names[0] if len(matching_names) > 0 else None

    for field in ('date', 'title', 'summary'):
        document[field] = latest_version[field]
    document['doc_id'] = latest_version['id']

    document['filename'] = file_name
    return document

def process_html_file(filename, min_len=30):
    """Read an HTML file from ``html_folder`` and strip tables and images.

    Args:
        filename: file name relative to the module-level ``html_folder``.
        min_len: accepted but not used by this function.

    Returns:
        str: the file content with every attribute-less ``<table>...</table>``
        replaced by a bare ``<div>`` and every ``<img ...>`` / ``</img ...>``
        tag removed.
    """
    # NOTE(review): tables are replaced by an opening <div> that is never
    # closed -- confirm this is intentional.
    flags = re.MULTILINE | re.DOTALL
    with open(html_folder / filename, 'r') as html_file:
        raw_html = html_file.read()

    without_tables = re.sub('<table>(.*?)</table>', '<div>', raw_html, flags=flags)
    return re.sub('(</?img.+?>)', '', without_tables, flags=flags)

def process_aname(div):
    """Convert one page segment into a single cleaned text string.

    The conversion is delegated to ``process_div`` so that both HTML layouts
    share one pipeline. Passages consisting solely of ``#`` characters are
    dropped and the remaining passages are joined with blank lines.

    All passages are kept: a segment that splits into several sections must
    not lose everything after the first one.

    Args:
        div: a segment accepted by ``process_div``.

    Returns:
        str: the cleaned text of the segment; an empty string when no passage
        remains (instead of raising ``IndexError``).
    """
    passages = [
        passage.strip()
        for passage in process_div(div)
        if not re.match(r'^\#+$', passage.strip())
    ]
    return '\n\n'.join(passages)

In [15]:
# Input locations: report metadata (JSON) and the HTML files.
report_folder = Path('../wonky_data/full_reports')
json_folder = report_folder.joinpath('reports')
html_folder = Path('../wonky_data/files')

# Output locations; created on first run.
save_folder = Path('../wonky_data/parsed_reports')
whole_folder = save_folder.joinpath('whole_doc')
section_folder = save_folder.joinpath('sections')
whole_folder.mkdir(parents=True, exist_ok=True)
section_folder.mkdir(parents=True, exist_ok=True)

# Outputs are written as "<report stem>.json", so an output with the same file
# name as a report means that report was already processed.
existing_files = [x.name for x in section_folder.glob('*.json')]

report_files = list(json_folder.glob('*.json'))

# A set makes the "already processed?" check constant-time per report instead
# of a scan over every existing output name.
_existing_names = set(existing_files)
new_files = [x for x in report_files if x.name not in _existing_names]

In [16]:
# Total number of report JSON files found in json_folder.
len(report_files)

22195

In [17]:
# Number of JSON outputs already present in section_folder.
len(existing_files)

21514

In [18]:
# Reports that have no same-named output yet, i.e. still to be processed.
len(new_files)

681

In [19]:
# Running count of reports whose HTML-to-text conversion raised an exception
# in the processing loop; reset here before the loop runs.
error_count = 0

In [20]:
sources = list()
# (source file name, repr of the exception) for every report whose conversion
# failed, so that failures can be inspected after the run instead of only
# being counted.
failed_reports = list()
for source_file in tqdm(sorted(new_files)):
    with open(source_file, 'r') as f:
        report = json.load(f)

    extracted_report = extract_json_information(report)
    if extracted_report['filename'] is None:
        # No HTML file matches this report id.
        continue

    extracted_report['source_file'] = source_file.name

    html = process_html_file(extracted_report['filename'])
    if len(html.strip()) == 0:
        continue
    try:
        if re.search(r'<a name=\d+>', html):
            # Layout with numbered "<a name=N>" anchors: segments are
            # separated by <hr/> and each one is converted on its own.
            html = html.replace('<br/>', '\n')
            html_segments = [BeautifulSoup(x, 'html.parser') for x in html.split('<hr/>')]
            text_segments = extract_unique_segments(html_segments)
            text_segments = [process_aname(segment) for segment in text_segments]
        else:
            # Layout structured with <div> elements.
            html_segments = extract_divs(html)
            html_segments = extract_unique_segments(html_segments)
            text_segments = process_div_segments_to_text(html_segments)
    except Exception as e:
        # Best effort: one unparsable report must not stop the batch, but the
        # failure is recorded rather than silently discarded.
        error_count += 1
        failed_reports.append((source_file.name, repr(e)))
        continue
    full_text = '\n\n'.join(text_segments)
    _number_of_sections = len(text_segments)
    # Sections are numbered from 1; json.dump writes the integer keys as strings.
    numbered_sections = {i: segment for i, segment in enumerate(text_segments, start=1)}
    metadata = deepcopy(extracted_report)
    metadata['text'] = full_text
    metadata['sections'] = numbered_sections
    if len(metadata['text'].strip()) == 0:
        continue
    with open(section_folder.joinpath(f"""{source_file.stem}.json"""), 'w') as f:
        json.dump(metadata, f)


  0%|          | 0/681 [00:00<?, ?it/s]

In [125]:
# Scratch check: element type of text_segments as left by the processing loop.
type(text_segments[0])

str

In [126]:
# Scratch check: show the HTML string currently held in `html`.
html

'\n<a name=1></a>\n\n<hr/>\n<a name=2></a>\n\n<hr/>\n<a name=3></a>\n\n<hr/>\n<a name=4></a>\n\n<hr/>\n<a name=5></a>\n\n<hr/>\n<a name=6></a>\n\n<hr/>\n<a name=7></a>\n\n<hr/>\n<a name=8></a>\n\n<hr/>\n<a name=9></a>\n\n<hr/>\n<a name=10></a>\n\n<hr/>\n<a name=11></a>\n\n<hr/>\n<a name=12></a>\n\n<hr/>\n<a name=13></a>\n\n<hr/>\n<a name=14></a>\n\n<hr/>\n<a name=15></a>\n\n<hr/>\n<a name=16></a>\n\n<hr/>\n<a name=17></a>\n\n<hr/>\n<a name=18></a>\n\n<hr/>\n<a name=19></a>\n\n<hr/>\n<a name=20></a>\n\n<hr/>\n<a name=21></a>\n\n<hr/>\n<a name=22></a>\n\n<hr/>\n<a name=23></a>\n\n<hr/>\n<a name=24></a>\n\n<hr/>\n<a name=25></a>\n\n<hr/>\n<a name=26></a>\n\n<hr/>\n<a name=27></a>\n\n<hr/>\n<a name=28></a>\n\n<hr/>\n<a name=29></a>\n\n<hr/>\n<a name=30></a>\n\n<hr/>\n<a name=31></a>\n\n<hr/>\n<a name=32></a>\n\n<hr/>\n<a name=33></a>\n\n<hr/>\n<a name=34></a>\n\n<hr/>\n<a name=35></a>\n\n<hr/>\n<a name=36></a>\n\n<hr/>\n<a name=37></a>\n\n<hr/>\n<a name=38></a>\n\n<hr/>\n<a name=39></a>\n\

In [127]:
# Scratch: save the current `html` string to test_html.html in the working
# directory for inspection.
with Path('test_html.html').open('w') as dump_file:
    dump_file.write(html)

In [128]:
# Scratch check: same anchor pattern as the processing loop, but without the
# closing '>' that the loop's pattern requires.
re.search(r'<a name=\d+', html)

<re.Match object; span=(1, 10), match='<a name=1'>

In [129]:
# Scratch check: plain substring test for an "<a name" anchor in `html`.
'<a name' in html

True

In [131]:
# Scratch check: HTML file name recorded for the report currently held in
# extracted_report.
extracted_report['filename']

'19840302_84-29S_071a5bb73087555a529778e4b292f7c980c586fe.html'

In [133]:
# Scratch: reload the HTML file straight from disk, bypassing the table and
# image stripping done by process_html_file.
with html_folder.joinpath(extracted_report['filename']).open('r') as raw_file:
    html = raw_file.read()

In [134]:
# Scratch: overwrite test_html.html in the working directory with the current
# `html` string.
with Path('test_html.html').open('w') as dump_file:
    dump_file.write(html)