In [1]:
import mammoth

import re
from pprint import pprint
import json
from bs4 import BeautifulSoup

In [2]:
def docx_to_html(file_path):
    """Convert a .docx file to an HTML string using mammoth.

    Args:
        file_path: Path of the .docx file to convert.

    Returns:
        The HTML produced by ``mammoth.convert_to_html`` (its ``value``).

    Note:
        The conversion result also carries ``messages`` (warnings or errors
        raised during conversion); they are not surfaced by this function.
    """
    # mammoth needs the raw bytes, hence binary mode.
    with open(file_path, "rb") as docx_file:
        return mammoth.convert_to_html(docx_file).value

docx_file_path = "register.docx"
html_output = docx_to_html(docx_file_path)

# Tables that span pages come out as two back-to-back tables. Remove the
# boundary together with the first row of the continuation table (presumably
# the repeated header row - confirm against the document) so they read as one.
# NOTE(review): this also merges any two tables that are genuinely adjacent.
page_break_pattern = r'</table><table><tr>(.*?)</tr>'
matches = re.findall(page_break_pattern, html_output)  # removed rows, kept for inspection
cleaned_html = re.sub(page_break_pattern, '', html_output)

# Join consecutive paragraphs with a single space so that a <td> holding
# several paragraphs ends up with one <p>.
cleaned_html = re.sub(r'</p><p>', ' ', cleaned_html)

# Save the HTML to a file. The encoding is explicit because the file is read
# back as UTF-8 later in the notebook; the platform default may differ.
with open("output.html", "w", encoding="utf-8") as html_file:
    html_file.write(cleaned_html)

In [9]:

def split_document_by_pattern(html_content):
    """Split the HTML into chunks that each end with the table after a TRUSTS heading.

    The delimiter is an ``<ol><li><strong>TRUSTS</strong></li></ol>`` heading
    followed by everything up to the first closing ``</table>``. Each returned
    chunk is the text preceding a delimiter plus the delimiter itself.

    Args:
        html_content: The HTML document as a string.

    Returns:
        A list of HTML strings. Text left over after the last delimiter is
        appended as a final chunk only when it is not empty or whitespace.
        If the delimiter never occurs, the whole (non-blank) input is
        returned as a single chunk.
    """
    # Split at table after TRUSTS
    pattern = r"(<ol>\s*<li>\s*<strong>TRUSTS</strong>\s*</li>\s*</ol>.*?</table>)"

    # With one capturing group, re.split returns an odd-length list that
    # alternates [text, match, text, match, ..., tail].
    pieces = re.split(pattern, html_content, flags=re.DOTALL)

    # Pair every text piece with the delimiter match that follows it.
    combined_sections = [
        before + trusts_block
        for before, trusts_block in zip(pieces[0::2], pieces[1::2])
    ]

    # The tail always exists in the split result; keep it only if it has content.
    tail = pieces[-1]
    if tail.strip():
        combined_sections.append(tail)

    return combined_sections

# Read the cleaned HTML back from disk, then cut it into chunks that each
# end with a TRUSTS table.
with open("output.html", mode="r", encoding="utf-8") as file:
    html_data = file.read()

sections = split_document_by_pattern(html_data)



In [13]:
# Section Patterns

# Section headings, listed in the order the patterns are stored in.
_SECTION_HEADINGS = (
    "SHARES AND OTHER FINANCIAL INTERESTS",
    "REMUNERATED EMPLOYMENT OR WORK OUTSIDE OF PARLIAMENT",
    "DIRECTORSHIPS AND PARTNERSHIPS",
    "CONSULTANCIES AND RETAINERSHIPS",
    "SPONSORSHIPS",
    "GIFTS AND HOSPITALITY",
    "BENEFITS AND INTERESTS FREE LOANS",
    "TRAVEL",
    "OWNERSHIP IN LAND AND PROPERTY",
    "PENSIONS",
    "RENTED PROPERTY",
    "INCOME GENERATING ASSETS",
    "TRUSTS",
)

# Maps "RAW-<heading>" to a regex that captures the heading's <ol> list item
# plus everything up to the first closing </table> after it.
sections_split = {
    "RAW-" + heading: (
        r"(<ol>\s*<li>\s*<strong>" + heading + r"</strong>\s*</li>\s*</ol>.*?</table>)"
    )
    for heading in _SECTION_HEADINGS
}

def parse_table_to_json(html_table, key_name):
    """Convert an HTML table into a list of row dicts keyed by the header row.

    The first ``<tr>`` supplies the column headers; every following ``<tr>``
    becomes one dict mapping header -> cell text.

    Args:
        html_table: HTML string containing the table.
        key_name: Section name. Accepted for call compatibility; not used.

    Returns:
        ``None`` when ``html_table`` is not a string, ``[]`` when the table
        has no rows, otherwise a list with one dict per data row. Cells
        missing at the end of a row are filled with ``""``.
    """
    if not isinstance(html_table, str):
        return None

    soup = BeautifulSoup(html_table, "html.parser")
    rows = soup.find_all("tr")
    if not rows:
        # A table without rows has no header to read; report "no entries"
        # instead of failing on rows[0].
        return []

    def cell_texts(row):
        # Read per cell rather than per <p>: an empty cell contains no <p>,
        # so collecting <p> tags would shift later columns to the left.
        return [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]

    # Extract headers from the first row
    headers = cell_texts(rows[0])

    # Extract data from the remaining rows
    data = []
    for row in rows[1:]:
        values = cell_texts(row)
        data.append({
            header: values[index] if index < len(values) else ""
            for index, header in enumerate(headers)
        })

    return data

def process_person(person_html, section_name):
    """Extract one section's table from a chunk of HTML and parse it.

    Args:
        person_html: HTML string to search.
        section_name: Section to extract, with or without the ``RAW-``
            prefix (e.g. ``"RAW-TRUSTS"`` or ``"TRUSTS"``).

    Returns:
        The result of ``parse_table_to_json`` for the section's table, or
        ``None`` when the section heading or its table is not found.

    Raises:
        KeyError: If the section has no entry in ``sections_split``.
    """
    key_name = section_name.replace("RAW-", "")
    pattern = sections_split["RAW-" + key_name]

    # Only the requested section is searched. DOTALL keeps the match working
    # when the HTML contains line breaks, as split_document_by_pattern does.
    section_match = re.search(pattern, person_html, flags=re.DOTALL)
    if section_match is None:
        return None

    # Drop the heading markup and any attributes on the <table> tag, keeping
    # only the table's inner HTML.
    table_match = re.search(
        r"<table.*?>(.*?)</table>", section_match.group(1), flags=re.DOTALL
    )
    if table_match is None:
        return None

    table_html = "<table>" + table_match.group(1) + "</table>"
    return parse_table_to_json(table_html, key_name)


people = []

for person in sections:
    if not isinstance(person, str):
        print(f"Skipping non-string person entry: {type(person)}")
        continue

    person_name = ""
    person_title = ""

    # First attempt: the name is the item of an <ol> nested in a <ul> item.
    name_match = re.search(r"<ul><li><ol><li>(.*?)</li></ol></li></ul>", person)
    if name_match:
        person_name = name_match.group(1)

    # If that lookup returned the first section heading instead of a name,
    # fall back to the inner item of a nested <ol>; leave the name empty when
    # that structure is missing too, rather than failing on the lookup.
    if person_name == "<strong>SHARES AND OTHER FINANCIAL INTERESTS</strong>":
        nested_match = re.search(r"<ol><li>(.*?)<ol><li>(.*?)</li></ol></li></ol>", person)
        person_name = nested_match.group(2) if nested_match else ""

    # The first paragraph of the chunk is taken as the party.
    party_match = re.search(r"<p>(.*?)</p>", person)
    person_party = party_match.group(1) if party_match else None

    # Names arrive as "Surname, Title Given Names"; rebuild "Given Names Surname".
    if person_name:
        parts = person_name.split(", ")
        surname = parts[0].strip()  # Always the first part
        if len(parts) > 1:
            tokens = parts[1].split()
            if tokens:
                person_title = tokens[0]  # Only the first word is the title
            given_names = " ".join(tokens[1:])  # Remaining words are given names
            person_name = f"{given_names} {surname}".strip()

    record = {
        "mp": person_name,
        "title": person_title,
        "party": person_party,
    }
    # One entry per register section, keyed by the section name without "RAW-".
    for raw_key in sections_split:
        record[raw_key.replace("RAW-", "")] = process_person(person, raw_key)
    people.append(record)

# Drop entries for which no name was found. The name defaults to "" (never
# None), so the check has to be on truthiness.
people = [person for person in people if person["mp"]]

with open("output.json", "w", encoding="utf-8") as outfile:
    json.dump(people, outfile)







In [11]:
# Just to debug final JSON

import json

def load_json_as_array(file_path):
    """Load a JSON file, warning when its top-level value is not a list.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed JSON value (a warning is printed if it is not a list), or
        ``None`` when the file does not exist or does not contain valid JSON;
        in both failure cases an error message is printed.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
        return None

    if not isinstance(data, list):
        print("Warning: JSON file does not contain an array of objects. Returning the loaded data as is.")
    return data

# Debug check: reload the JSON file written by the extraction step.
file_path = "output.json"
data = load_json_as_array(file_path)
