In [4]:
from bs4 import BeautifulSoup
import os
import json

def convert_html_to_json(html_file):
    """Turn one HTML file into a ``{'chapter_title', 'content'}`` record.

    Args:
        html_file: Path of the HTML file to read (decoded as UTF-8).

    Returns:
        dict: ``'chapter_title'`` is the file name without its directory and
        extension; ``'content'`` is the text of the parsed document as
        returned by ``get_text(separator='\\n', strip=True)``.

    Note:
        ``os.path.splitext`` treats a leading dot as part of the name, so a
        file literally called ``.html`` gets the title ``'.html'``.
    """
    with open(html_file, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    # Title comes from the file name, not from anything inside the document.
    title, _extension = os.path.splitext(os.path.basename(html_file))

    # Parse with the built-in 'html.parser' backend and flatten to plain text,
    # one text fragment per line.
    text = BeautifulSoup(markup, 'html.parser').get_text(separator='\n', strip=True)

    return {'chapter_title': title, 'content': text}


def process_mpep_html_files(directory, output_file='mpep_data.json'):
    """Convert every ``.html`` file in ``directory`` and save the records as JSON.

    Args:
        directory: Folder scanned (non-recursively) for entries whose name
            ends with ``.html``.
        output_file: Path of the JSON file to write. Defaults to
            ``'mpep_data.json'`` (relative to the current working directory),
            which is the location the function has always written to.

    Returns:
        None. The result is a JSON array written to ``output_file``; each
        element is the dict returned by ``convert_html_to_json``.
    """
    # os.listdir() yields names in arbitrary order; sorting makes the order of
    # the records in the JSON file reproducible from run to run.
    html_files = [
        os.path.join(directory, filename)
        for filename in sorted(os.listdir(directory))
        if filename.endswith('.html')
    ]

    # Skip anything that is not a regular file (e.g. a sub-directory whose
    # name happens to end in '.html'), which open() could not read anyway.
    json_data = [
        convert_html_to_json(html_file)
        for html_file in html_files
        if os.path.isfile(html_file)
    ]

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(json_data, file, indent=4)

# Specify the directory containing the MPEP HTML files.
# The path is built with os.path.join instead of the literal '.\MPEP_HTML':
# '\M' is not a valid string escape (Python warns about it), and a hard-coded
# backslash separator only works on Windows.
html_directory = os.path.join('.', 'MPEP_HTML')

# Process the HTML files and generate JSON
process_mpep_html_files(html_directory)


In [5]:
import json

# Load the records written by the conversion cell.
with open('mpep_data.json', 'r', encoding='utf-8') as file:
    mpep_data = json.load(file)

# Access and process the MPEP data
for chapter in mpep_data:
    chapter_title = chapter['chapter_title']
    chapter_content = chapter['content']

    # Find the start of the relevant chapter content.
    # The marker uses a real tab character: json.load() has already decoded
    # the JSON escapes, so searching for a literal backslash + "t" (as the
    # previous "Chapter\\t" did) could not match decoded text.
    # NOTE(review): confirm the chapter heading really is "Chapter" + TAB.
    start_index = chapter_content.find("Chapter\t")
    if start_index != -1:
        chapter_content = chapter_content[start_index:]

    # Find the end of the relevant chapter content: the first blank line
    # (two consecutive newline characters). The search starts at the
    # beginning of the possibly already trimmed text; reusing start_index as
    # the offset would skip part of the trimmed text (or be -1 when the start
    # marker is absent).
    end_index = chapter_content.find("\n\n")
    if end_index != -1:
        chapter_content = chapter_content[:end_index]

    # Perform desired operations with the chapter title and content
    print(f"Chapter: {chapter_title}")
    print(f"Content: {chapter_content[:100]}...")  # Print the first 100 characters of the content
    print("="*50)

Chapter: .html
Content: MPEP
Relevance
Outline
Help
Public Quick Reference Guide
Public User Manual
Switch Version
E8r8
e8r9...
Chapter: 0100  Secrecy Access National Security and Foreign Filing
Content: MPEP
Relevance
Outline
Help
Public Quick Reference Guide
Public User Manual
Switch Version
E8r8
e8r9...
Chapter: 0200  Types and Status of Application Benefit and Priority Claims
Content: MPEP
Relevance
Outline
Help
Public Quick Reference Guide
Public User Manual
Switch Version
E8r8
e8r9...
Chapter: 0300  Ownership and Assignment
Content: MPEP
Relevance
Outline
Help
Public Quick Reference Guide
Public User Manual
Switch Version
E8r8
e8r9...
Chapter: 0400  Representative of Applicant or Owner
Content: MPEP
Relevance
Outline
Help
Public Quick Reference Guide
Public User Manual
Switch Version
E8r8
e8r9...
Chapter: 0500  Receipt and Handling of Mail and Papers
Content: MPEP
Relevance
Outline
Help
Public Quick Reference Guide
Public User Manual
Switch Version
E8r8
e8r9...
Chapter: 0600  Par

In [6]:
import json
import re

def extract_chapter_content(content):
    """Return the text that follows the first ``Chapter <digits> -`` heading.

    The captured text runs from just after the heading up to (but not
    including) the next ``Chapter <digits> -`` heading, or to the end of the
    string when there is none. Newlines are allowed inside the captured text.

    Args:
        content: Text to search.

    Returns:
        str: The captured text with surrounding whitespace removed, or an
        empty string when no heading is found.
    """
    heading_match = re.compile(
        r"Chapter\s*\d+\s*-\s*(.+?)(?=Chapter\s*\d+\s*-|\Z)",
        flags=re.DOTALL,  # let '.' span line breaks
    ).search(content)

    if heading_match is None:
        return ""
    return heading_match.group(1).strip()

# Load the chapter records from 'mpep_data_clean.json'.
# NOTE(review): this file is not written by any cell visible here - confirm
# where it is produced before relying on Restart & Run All.
with open('mpep_data_clean.json', 'r', encoding='utf-8') as file:
    mpep_data = json.load(file)

# Show a short preview of each chapter's extracted body text.
for chapter in mpep_data:
    chapter_title, chapter_content = chapter['chapter_title'], chapter['content']

    # Keep only the text after the chapter heading (regex-based helper).
    extracted_content = extract_chapter_content(chapter_content)

    print(f"Chapter: {chapter_title}")
    # Only the first 500 characters of the extracted text are printed.
    print(f"Content: {extracted_content[:500]}...")
    print("="*50)

Chapter: .html
Content: ...
Chapter: 0100  Secrecy Access National Security and Foreign Filing
Content: Secrecy, Access, National Security, and Foreign Filing 101 General [R-07.2022] 35 U.S.C. 122 Confidential status of applications; publication of patent applications. (a) CONFIDENTIALITY.— Except as provided in subsection (b), applications for patents shall be kept in confidence by the Patent and Trademark Office and no information concerning the same given without authority of the applicant or owner unless necessary to carry out the provisions of an Act of Congress or in such special circumstanc...
Chapter: 0200  Types and Status of Application Benefit and Priority Claims
Content: Types and Status of Application; Benefit and Priority Claims 201 Types of Applications [R-07.2015] 35 U.S.C. 101 Inventions patentable. Whoever invents or discovers any new and useful process, machine, manufacture, or composition of matter, or any new and useful improvement thereof, may obtain a patent ther

##### Unused code: HTML structure analysis (commented out; the output shown below it is from an earlier run)

In [14]:


# def analyze_html_structure(html_file):
#     with open(html_file, 'r', encoding='utf-8') as file:
#         html_content = file.read()

#     soup = BeautifulSoup(html_content, 'html.parser')

#     print(f"Analyzing HTML file: {html_file}")
#     print("="*50)

#     # Find the chapter title element
#     chapter_title_element = soup.find('h2', class_='navTitleAnchor')
#     if chapter_title_element:
#         print("Chapter Title Element:")
#         print(chapter_title_element)
#         print("-"*50)
#     else:
#         print("Chapter Title Element not found.")
#         print("-"*50)

#     # Find section elements
#     sections = soup.find_all('div', class_='section')
#     if sections:
#         print(f"Number of Sections: {len(sections)}")
#         print("Section Elements:")
#         for i, section in enumerate(sections, start=1):
#             print(f"Section {i}:")
#             print(section)
#             print("-"*50)
#     else:
#         print("No Section Elements found.")
#         print("-"*50)

#     # Find subsection elements
#     subsections = soup.find_all('div', class_='subsection')
#     if subsections:
#         print(f"Number of Subsections: {len(subsections)}")
#         print("Subsection Elements:")
#         for i, subsection in enumerate(subsections, start=1):
#             print(f"Subsection {i}:")
#             print(subsection)
#             print("-"*50)
#     else:
#         print("No Subsection Elements found.")
#         print("-"*50)

#     print("="*50)
#     print()

# def analyze_mpep_html_files(directory):
#     for filename in os.listdir(directory):
#         if filename.endswith('.html'):
#             html_file = os.path.join(directory, filename)
#             analyze_html_structure(html_file)

# # Specify the directory containing the MPEP HTML files
# html_directory = 'MPEP_HTML'

# # Analyze the HTML files
# analyze_mpep_html_files(html_directory)

Analyzing HTML file: MPEP_HTML\.html
Chapter Title Element not found.
--------------------------------------------------
No Section Elements found.
--------------------------------------------------
No Subsection Elements found.
--------------------------------------------------

Analyzing HTML file: MPEP_HTML\0100  Secrecy Access National Security and Foreign Filing.html
Chapter Title Element not found.
--------------------------------------------------
No Section Elements found.
--------------------------------------------------
No Subsection Elements found.
--------------------------------------------------

Analyzing HTML file: MPEP_HTML\0200  Types and Status of Application Benefit and Priority Claims.html
Chapter Title Element not found.
--------------------------------------------------
No Section Elements found.
--------------------------------------------------
No Subsection Elements found.
--------------------------------------------------

Analyzing HTML file: MPEP_HTML\0300