In [1]:
from pathlib import Path
from tqdm import tqdm
import re
from loggers.loggers import get_custom_logger
from bs4 import BeautifulSoup

# Directory that the Metadata and Titles cells scan for RFC `*.html` files (relative to the notebook's working directory).
raw_dataset_dir = Path("../../data/raw_dataset")

# Metadata

In [2]:
# Project logger configured with ../../logs/parsing.log as its log path.
# NOTE(review): in this notebook it is only referenced by the commented-out <pre>-extraction experiment.
parsing_logger = get_custom_logger(name='parsing_logger', log_path="../../logs/parsing.log")
# everything_in_pre_pattern = r'<pre>(.*?)<span class="h1">'

# Regexes for the header fields probed in each RFC page, keyed by a descriptive name.
#
# Conventions:
# * Where a pattern has a label alternation, the more specific alternative (the one ending
#   in ":") comes first. Regex alternation takes the first alternative that matches, so a
#   bare "Related" listed before "Related:" would make the latter unreachable.
# * The optional colon is written as a plain `:?`. A possessive `:?+` matches exactly the
#   same text here but is a syntax error on Python < 3.11.
patterns = {
    # group(1) is the whole field, group(2) the value when the "Updates:" form is present.
    "updates_pattern": r"(\bUpdates:\s*(.*?)\n|Updates(?: RFC's)?)",
    "obsoletes_pattern": r"Obsoletes:?\s*(.*?)\n",
    # group(1) is the label, group(2) the value.
    "category_pattern": r"(Categories:|Categories|Category:|Category)\s*(.*?)\n",
    "issn_pattern": r"ISSN:?\s*(.*?)\n",
    "updated_by_pattern": r"Updated by:?\s*(.*?)\n",
    "bcp_pattern": r"BCP:?\s*(.*?)\n",
    "NIC_pattern": r"NIC:?\s*(.*?)\n",
    "obsoleted_by_pattern": r"Obsoleted by:?\s*(.*?)\n",
    # group(1) is the label, group(2) the value.
    "related_rfcs_pattern": r"(Related RFCs:|Related Functional Documents:|Related:|Related|References)\s*(.*?)\n"
}

# One hit counter per pattern, derived from `patterns` so the two key sets cannot drift apart.
pattern_indices = {pattern_name: 0 for pattern_name in patterns}

# html_file = Path("../../data/raw_dataset/rfc1720.html")
# Count, per pattern, how many RFC pages contain at least one match.
# The listing is materialised (and sorted) so tqdm knows the total and can show progress/ETA.
html_files = sorted(raw_dataset_dir.glob("*.html"))
for html_file in tqdm(html_files, desc="Processing RFCs", unit="file"):
    # read_text closes the file before any matching starts.
    html_content = html_file.read_text(encoding='utf-8')

    for pattern_name, pattern in patterns.items():
        match = re.search(pattern, html_content, re.DOTALL)
        # A file is counted when group(1) is non-blank.
        # NOTE(review): for patterns whose first group wraps the label or the whole field
        # (updates, category, related_rfcs) group(1) is non-blank on every match, so those
        # counters report "pattern occurs", not "field has a value".
        if match and match.group(1).strip():
            pattern_indices[pattern_name] += 1

    # Disabled experiment: log everything between <pre> and the first h1 span.
    # everything_in_pre_match = re.search(everything_in_pre_pattern, html_content, re.DOTALL)
    # everything_in_pre = everything_in_pre_match.group(1).strip() if everything_in_pre_match else f'NO PRE AT: {html_file}'
    # parsing_logger.info(everything_in_pre)

print(pattern_indices)

Processing RFCs: 9266file [00:30, 308.69file/s]

{'updates_pattern': 1670, 'obsoletes_pattern': 1239, 'category_pattern': 7982, 'issn_pattern': 3744, 'updated_by_pattern': 1103, 'bcp_pattern': 6635, 'NIC_pattern': 1353, 'obsoleted_by_pattern': 1327, 'related_rfcs_pattern': 8080}





# Titles

In [2]:
# Logger for the extracted titles; the bare "%(message)s" format is passed so each record is just the message.
titles_logger = get_custom_logger(name='titles_logger', log_path="../../logs/titles.log", format="%(message)s")

# Sorted so the lines logged for titles.log come out in a reproducible order (glob order is
# filesystem-dependent); a concrete list also lets tqdm display a total and an ETA.
html_files = sorted(raw_dataset_dir.glob("*.html"))
for html_file in tqdm(html_files, desc="Processing RFCs", unit="file"):
    # read_text closes the file before the (slow) HTML parse starts.
    html_content = html_file.read_text(encoding='utf-8')

    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.title
    if title_tag:
        title_content = title_tag.text
        titles_logger.info(title_content)
    else:
        # Record which file lacked a <title> so it can be inspected by hand.
        titles_logger.error(f'No title tag found in the HTML {html_file}')

Processing RFCs: 9266file [03:29, 44.29file/s] 


# Page content

In [2]:
# Single sample document used by the rest of this section.
# NOTE: despite the variable name `html_content`, this path points at a plain-text (.txt) file, not HTML.
test_file = Path('../../data/raw_dataset/txt/rfc1720.txt')
with test_file.open('r', encoding='utf-8') as file:
    html_content = file.read()

In [3]:
# Parse the content read from `test_file` with BeautifulSoup's 'html.parser' backend.
soup = BeautifulSoup(html_content, 'html.parser')

def extract_visible_text(soup):
    """Collect the text nodes of a parsed document that a reader would actually see.

    A text node is dropped when its parent is one of the non-rendered containers
    (style, script, head, title, meta, or the document root itself) or when the
    node is an HTML comment.

    Args:
        soup: A parsed BeautifulSoup document (anything exposing
            ``find_all(string=True)`` whose results have ``.parent.name``).

    Returns:
        A ``(joined_text, nodes)`` tuple: the kept nodes joined with single
        spaces and stripped, and the list of kept nodes in document order.
    """
    # Imported locally so this definition does not depend on edits to the import cell.
    from bs4 import Comment

    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    visible_texts = [
        text
        for text in soup.find_all(string=True)
        # Comments are returned by find_all(string=True) too, but are never rendered.
        if text.parent.name not in hidden_parents and not isinstance(text, Comment)
    ]
    return ' '.join(visible_texts).strip(), visible_texts

# Run the extractor on the parsed sample and print the joined text.
# NOTE(review): the recorded outputs for this input are empty (blank print, `[]` list) —
# presumably because the untagged plain text hangs directly off '[document]', which the filter excludes.
visible_text, visible_texts = extract_visible_text(soup)
print(visible_text)



In [4]:
# Display the list of kept text nodes as the cell output.
visible_texts

[]

In [5]:
from unstructured.partition.auto import partition

# Let `unstructured` split the sample file into elements, then show them
# one per paragraph (blank line between consecutive elements).
elements = partition(filename=test_file)
rendered_elements = map(str, elements)
print("\n\n".join(rendered_elements))

Network Working Group                        Internet Architecture Board Request for Comments: 1720                             J. Postel, Editor Obsoletes: RFCs 1610, 1600, 1540, 1500,                    November 1994 1410, 1360, 1280, 1250, 1100, 1083, 1130, 1140, 1200 STD: 1 Category: Standards Track

INTERNET OFFICIAL PROTOCOL STANDARDS

Status of this Memo

This memo describes the state of standardization of protocols used in the Internet as determined by the Internet Architecture Board (IAB). This memo is an Internet Standard. Distribution of this memo is unlimited.

Table of Contents

Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . 2 1. The Standardization Process  . . . . . . . . . . . . . . . . 3 2. The Request for Comments Documents . . . . . . . . . . . . . 5 3. Other Reference Documents  . . . . . . . . . . . . . . . . . 6 3.1. Assigned Numbers . . . . . . . . . . . . . . . . . . . . . 6 3.2. Gateway Requirements . . . . . . . . . . . . . . . . . . . 6 3.3.

In [24]:
# Matches lines of the form "RFC <number> ... <Month> <year>".
# NOTE(review): presumably the running header repeated at the top of each page — confirm against the file.
pattern = r'RFC \d+ .* (?:January|February|March|April|May|June|July|August|September|October|November|December) \d{4}'

# Same explicit encoding as the earlier read of this file, instead of the platform default.
with open(test_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

kept_lines = []
for line in lines:
    # Skip lines carrying a "[Page" marker (presumably the page footer).
    if '[Page' in line:
        continue
    # Skip lines matching the header pattern; re.search stops at the first hit.
    if re.search(pattern, line):
        continue
    stripped_line = line.strip()
    kept_lines.append(stripped_line)
    print(stripped_line)

# Join with newlines: concatenating the stripped lines directly would fuse the last word
# of one line with the first word of the next ("used in" + "the" -> "used inthe").
raw_text = '\n'.join(kept_lines)







Network Working Group                        Internet Architecture Board
Request for Comments: 1720                             J. Postel, Editor
Obsoletes: RFCs 1610, 1600, 1540, 1500,                    November 1994
1410, 1360, 1280, 1250, 1100, 1083,
1130, 1140, 1200
STD: 1
Category: Standards Track


INTERNET OFFICIAL PROTOCOL STANDARDS


Status of this Memo

This memo describes the state of standardization of protocols used in
the Internet as determined by the Internet Architecture Board (IAB).
This memo is an Internet Standard.  Distribution of this memo is
unlimited.

Table of Contents

Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . 2
1.  The Standardization Process  . . . . . . . . . . . . . . . . 3
2.  The Request for Comments Documents . . . . . . . . . . . . . 5
3.  Other Reference Documents  . . . . . . . . . . . . . . . . . 6
3.1.  Assigned Numbers . . . . . . . . . . . . . . . . . . . . . 6
3.2.  Gateway Requirements . . . . . . . . . . . . . . . 

In [28]:
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a single space.
# A single `\s+` pass is enough: `\s` already matches newlines and tabs, so separate
# newline-collapsing and tab-replacing passes would not change the result.
raw_text = re.sub(r'\s+', ' ', raw_text)
raw_text



In [30]:
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

# Load the sample file through LangChain; the encoding is pinned to match the other
# reads of this file rather than relying on the platform default.
loader = TextLoader(test_file, encoding='utf-8')

In [31]:
# Read the raw sample file through the loader and display the result as the cell output.
loader.load()



In [32]:
# Wrap the cleaned `raw_text` (built by the header-stripping and whitespace cells) in a Document,
# bypassing the loader; those cells must have been run first.
doc = Document(page_content=raw_text)

In [33]:
# Display the Document built from the cleaned text.
doc

