In [1]:
from pathlib import Path
from tqdm import tqdm
import re
from loggers.loggers import get_custom_logger
from bs4 import BeautifulSoup

# Directory that the cells below glob for raw RFC "*.html" files (relative path).
raw_dataset_dir = Path("../../data/raw_dataset/html")

# Metadata

In [2]:
parsing_logger = get_custom_logger(name='parsing_logger', log_path="../../logs/parsing.log")
# everything_in_pre_pattern = r'<pre>(.*?)<span class="h1">'

# Regexes for the header fields of an RFC page, keyed by field name.
# Group 1 of each pattern is what gets collected (for "updates_pattern" that is
# the whole "Updates...: ...\n" line, because of the outer parentheses).
#
# The optional colon is written `:?` rather than the possessive `:?+`: both
# match identically here (whatever follows can always match without
# backtracking into the colon), but `?+` only compiles on Python 3.11+.
#
# NOTE(review): "ISSN" and "NIC" carry no word boundary, so they also match
# inside longer words -- the inspection outputs further down show fragments such
# as 's,' and 'AST, MULTICAST, AND BROADCAST'. Consider `\b` anchors once the
# cleaning rules are settled.
patterns = {
    "updates_pattern": r"(Updates(?: RFC's)?:\s*(.*?)\n)",
    "obsoletes_pattern": r"Obsoletes:?\s*(.*?)\n",
    "category_pattern": r"\b(?:Categories?|Category):?\s*(.*?)\n",
    "issn_pattern": r"ISSN:?\s*(.*?)\n",
    "updated_by_pattern": r"Updated by:?\s*(.*?)\n",
    # "bcp_pattern": r"BCP:?\s*(.*?)\n",
    "NIC_pattern": r"NIC:?\s*(.*?)\n",
    "obsoleted_by_pattern": r"Obsoleted by:?\s*(.*?)\n",
    "related_rfcs_pattern": r"\b(?:Related\s*(?:RFCs|Functional\s*Documents?)?|References)\s*:\s*(.*?)(?:\n|$)"
}

# Compiled once instead of once per file. No flags are needed: every capture is
# a lazy `.*?` that stops at the first newline, so re.DOTALL changed nothing.
compiled_patterns = {name: re.compile(regex) for name, regex in patterns.items()}

# Raw captures per field; the trailing comments record the cleaning each field still needs.
pattern_contents = {
    "updates_pattern": [],  # needs numbers only (excluding MONTH 4digit number)
    "obsoletes_pattern": [],  # needs numbers only (excluding MONTH 4digit number)
    "category_pattern": [],  # needs all words before 3 or more spaces
    "issn_pattern": [],  # 4 digit - 4 digit
    "updated_by_pattern": [],  # needs numbers only
    # "bcp_pattern": [],  # needs numbers only (first)
    "NIC_pattern": [],  # needs numbers only (excluding MONTH 4digit number)
    "obsoleted_by_pattern": [],  # needs numbers only (excluding MONTH 4digit number)
    "related_rfcs_pattern": [],  # needs numbers only (excluding MONTH 4digit number)
}

# html_file = Path("../../data/raw_dataset/rfc1720.html")
for html_file in tqdm(raw_dataset_dir.glob("*.html"), desc="Processing RFCs", unit="file"):
    # Read the whole page up front so the file is closed before matching starts.
    html_content = html_file.read_text(encoding='utf-8')

    for pattern_name, compiled_pattern in compiled_patterns.items():
        # Only the first occurrence in each file is considered.
        match = compiled_pattern.search(html_content)

        # Missing matches and empty captures are both skipped.
        matching_element = match.group(1) if match else None
        if matching_element:
            pattern_contents[pattern_name].append(matching_element)

Processing RFCs: 9266file [00:27, 337.58file/s]


In [3]:
# Inspect the raw "Updates" captures: each is the full header line, including
# whatever sits in the right-hand column (date or author) and the trailing newline.
pattern_contents['updates_pattern']

['Updates: RFC <a href="./rfc732">732</a>                                           February 1988\n',
 'Updates: RFCs <a href="./rfc1034">1034</a>, <a href="./rfc1035">1035</a>                                      April 1989\n',
 'Updates:  RFC <a href="./rfc976">976</a>                                          December 1989\n',
 'Updates: RFCs <a href="./rfc822">822</a>, <a href="./rfc987">987</a>, <a href="./rfc1026">1026</a>                               December 1989\n',
 'Updates: RFCs <a href="./rfc822">822</a>, <a href="./rfc987">987</a>, <a href="./rfc1026">1026</a>, <a href="./rfc1138">1138</a>                            March 1990\n',
 'Updates: RFC <a href="./rfc908">908</a>                                              R. Hinden\n',
 'Updates: <a href="./rfc99">99</a>\n',
 'Updates: RFCs <a href="./rfc1034">1034</a>, <a href="./rfc1035">1035</a>                                      L. Mamakos\n',
 'Updates: RFC <a href="./rfc907">907</a>                                      

In [6]:
from dataset.regexes import NUMBERS_PATTERN

# For every captured "Updates" line, show the raw capture, the part before the
# first run of four spaces, and what NUMBERS_PATTERN (from dataset.regexes, not
# shown in this notebook) finds in that part.
pattern = NUMBERS_PATTERN
for raw_line in tqdm(pattern_contents['updates_pattern'], desc="Processing stuff", unit="file"):
    # Four spaces separate the field from the right-hand header column.
    field_text = raw_line.partition('    ')[0]
    found = re.findall(pattern, field_text, re.DOTALL)
    print(f'{raw_line, field_text, found}\n\n')

Processing stuff: 100%|██████████| 1170/1170 [00:00<00:00, 97514.82file/s]

('Updates: RFC <a href="./rfc732">732</a>                                           February 1988\n', 'Updates: RFC <a href="./rfc732">732</a>', ['732'])


('Updates: RFCs <a href="./rfc1034">1034</a>, <a href="./rfc1035">1035</a>                                      April 1989\n', 'Updates: RFCs <a href="./rfc1034">1034</a>, <a href="./rfc1035">1035</a>', ['1034', '1035'])


('Updates:  RFC <a href="./rfc976">976</a>                                          December 1989\n', 'Updates:  RFC <a href="./rfc976">976</a>', ['976'])


('Updates: RFCs <a href="./rfc822">822</a>, <a href="./rfc987">987</a>, <a href="./rfc1026">1026</a>                               December 1989\n', 'Updates: RFCs <a href="./rfc822">822</a>, <a href="./rfc987">987</a>, <a href="./rfc1026">1026</a>', ['822', '987', '1026'])


('Updates: RFCs <a href="./rfc822">822</a>, <a href="./rfc987">987</a>, <a href="./rfc1026">1026</a>, <a href="./rfc1138">1138</a>                            March 1990\n', 'Updates: RFCs




In [18]:
from collections import Counter

# One label per captured "Category" value: the text before the first run of
# four spaces (which drops the right-hand header column).
categories = [
    entry.split('    ')[0]
    for entry in tqdm(pattern_contents['category_pattern'], desc="Processing stuff", unit="file")
]

# Frequency of each label, most common first.
element_counts = Counter(categories)
sorted_elements = element_counts.most_common()

# Print each element and its count on a new line
for element, count in sorted_elements:
    print(f'Element: {element} ------- Count: {count}')

Processing stuff: 100%|██████████| 7981/7981 [00:00<00:00, 1141197.29file/s]

Element: Standards Track ------- Count: 3688
Element: Informational ------- Count: 2538
Element: </dt> ------- Count: 809
Element: Experimental ------- Count: 471
Element: Best Current Practice ------- Count: 283
Element: Historic ------- Count: 26
Element: F, G.3 ------- Count: 14
Element: B.1 ------- Count: 10
Element: D.6 ------- Count: 5
Element: Standard Track ------- Count: 5
Element: D.4, D.7 ------- Count: 4
Element: D.4, D.5, and D.7 ------- Count: 4
Element: A.2 ------- Count: 3
Element: D.1 ------- Count: 3
Element: D.7 ------- Count: 3
Element: D.3 ------- Count: 3
Element: Proposed Standard ------- Count: 3
Element: Updates: ------- Count: 3
Element: G.3 ------- Count: 2
Element: C.4 ------- Count: 2
Element: G.2 ------- Count: 2
Element: D ------- Count: 2
Element: D.4 ------- Count: 2
Element: F ------- Count: 2
Element: Telnet ------- Count: 2
Element: F. G.3 ------- Count: 2
Element: TELNET ------- Count: 2
Element: Protocols, TELNET ------- Count: 2
Element: Socket Nu




In [3]:
from dataset.regexes import ISSN_CLEAN_PATTERN
# For every captured "ISSN" fragment, show the raw capture, the part before the
# first run of four spaces, and the matches of ISSN_CLEAN_PATTERN (imported
# from dataset.regexes; its definition is not shown in this notebook).
pattern = ISSN_CLEAN_PATTERN
for captured in tqdm(pattern_contents['issn_pattern'], desc="Processing stuff", unit="file"):
    leading_part = captured.partition('    ')[0]
    hits = re.findall(pattern, leading_part, re.DOTALL)
    print(f'{captured, leading_part, hits}\n\n')

Processing stuff: 100%|██████████| 3744/3744 [00:00<00:00, 197066.91file/s]

('0894-5926),', '0894-5926),', ['0894-5926'])


('0894-5926),', '0894-5926),', ['0894-5926'])


('1055-4769)', '1055-4769)', ['1055-4769'])


('2070-1721', '2070-1721', ['2070-1721'])


('1201-0758.', '1201-0758.', ['1201-0758'])


('for serials.', 'for serials.', [])


('s,', 's,', [])


('and SICI) can be supported within the URN framework', 'and SICI) can be supported within the URN framework', [])


('and SICI) can be supported within the URN framework and the', 'and SICI) can be supported within the URN framework and the', [])


('s are both collections of identifiers used', 's are both collections of identifiers used', [])


('s,', 's,', [])


('(International Serial Standard Number) as URN (Uniform Resource Names) within an ISSN-URN Namespace "/>', '(International Serial Standard Number) as URN (Uniform Resource Names) within an ISSN-URN Namespace "/>', [])


('(International', '(International', [])


('and SICI) as URNs.  This document', 'and SICI) as URNs.  This document', [])




In [4]:
from dataset.regexes import NIC_PATTERN
# For every captured "NIC" fragment, show the raw capture, the part before the
# first run of four spaces, and the matches of NIC_PATTERN (imported from
# dataset.regexes; its definition is not shown in this notebook).
pattern = NIC_PATTERN
for captured in tqdm(pattern_contents['NIC_pattern'], desc="Processing stuff", unit="file"):
    leading_part = captured.partition('    ')[0]
    hits = re.findall(pattern, leading_part, re.DOTALL)
    print(f'{captured, leading_part, hits}\n\n')

Processing stuff: 100%|██████████| 1353/1353 [00:00<00:00, 150348.21file/s]

('5761                                                26 February 1971', '5761', ['5761'])


('/NLS Service', '/NLS Service', [])


('AST, MULTICAST, AND BROADCAST                     55', 'AST, MULTICAST, AND BROADCAST', [])


('host table that', 'host table that', [])


('-50004, NIC-50005, NIC-50006,', '-50004, NIC-50005, NIC-50006,', ['50004', '50005', '50006'])


('5762                                              February 23, 1971', '5762', ['5762'])


('.  Other collections of older or', '.  Other collections of older or', [])


('DPH USE', 'DPH USE', [])


('4687), UCLA/NMC,', '4687), UCLA/NMC,', ['4687'])


('AL APPROACH', 'AL APPROACH', [])


('#5763                                            22, 23 February 1971', '#5763', ['5763'])


(').', ').', [])


('5764', '5764', ['5764'])


(')', ')', [])


(') of Defense Data Network (DDN), and', ') of Defense Data Network (DDN), and', [])


('.', '.', [])


(') in a single file (HOSTS.TXT) which', ') in a single file (HOSTS.TXT) wh




In [7]:
# For every captured "Obsoleted by" value, show the raw capture, the part
# before the first run of four spaces, and what NUMBERS_PATTERN finds in it.
# NUMBERS_PATTERN is the name imported in an earlier cell of this notebook.
pattern = NUMBERS_PATTERN
for raw_value in tqdm(pattern_contents['obsoleted_by_pattern'], desc="Processing stuff", unit="file"):
    field_text = raw_value.partition('    ')[0]
    found = re.findall(pattern, field_text, re.DOTALL)
    print(f'{raw_value, field_text, found}\n\n')

Processing stuff: 100%|██████████| 1327/1327 [00:00<00:00, 102114.29file/s]

('<a href="/rfc/rfc16" target="_blank">16</a>                                                 UNKNOWN</span><br /><span class="pre noprint docinfo">Updated by: <a href="/rfc/rfc24" target="_blank">24</a>, <a href="/rfc/rfc27" target="_blank">27</a>, <a href="/rfc/rfc30" target="_blank">30</a>                                                  </span><pre>Network Working Group                                           S. Crocker', '<a href="/rfc/rfc16" target="_blank">16</a>', ['16'])


('<a href="./rfc10">RFC 10</a>).', '<a href="./rfc10">RFC 10</a>).', ['10'])


('<a href="/rfc/rfc1812" target="_blank">1812</a>                                              HISTORIC</span><br /><span class="pre noprint docinfo">                                                                        </span><pre>Network Working Group                                          R. Braden', '<a href="/rfc/rfc1812" target="_blank">1812</a>', ['1812'])


('<a href="/rfc/rfc1060" target="_blank">1060</a>           




In [8]:
# For every captured "Related ..."/"References" value, show the raw capture,
# the part before the first run of four spaces, and what NUMBERS_PATTERN finds
# in it. NUMBERS_PATTERN is the name imported in an earlier cell of this notebook.
pattern = NUMBERS_PATTERN
for raw_value in tqdm(pattern_contents['related_rfcs_pattern'], desc="Processing stuff", unit="file"):
    field_text = raw_value.partition('    ')[0]
    found = re.findall(pattern, field_text, re.DOTALL)
    print(f'{raw_value, field_text, found}\n\n')

Processing stuff: 100%|██████████| 149/149 [00:00<00:00, 149474.12file/s]

('</span>', '</span>', [])


('[INTRO:9]  "A Protocol for Packet Network Intercommunication," V. Cerf', '[INTRO:9]  "A Protocol for Packet Network Intercommunication," V. Cerf', ['9'])


('Supported.', 'Supported.', [])


('Supported.', 'Supported.', [])


('Supported.', 'Supported.', [])


('<a href="./rfc129">RFC-129</a> (NIC-5845)', '<a href="./rfc129">RFC-129</a> (NIC-5845)', ['129', '5845'])


('IPMS.Heading.related-IPMs', 'IPMS.Heading.related-IPMs', [])


("Krol, Ed. (1992) The Whole Internet User's Guide and Catalog, 400", "Krol, Ed. (1992) The Whole Internet User's Guide and Catalog, 400", ['1992', '400'])


('As of the publication of this RFC, a version of [12], titled', 'As of the publication of this RFC, a version of [12], titled', ['12'])


('#147, #129</span>', '#147, #129</span>', ['147', '129'])


('134', '134', ['134'])


('109, 110, 105, 158', '109, 110, 105, 158', ['109', '110', '105', '158'])


('[<a id="ref-1">1</a>] IANA, "Class A Subnet Experiment", <a href="./rf




# Titles

In [6]:
titles_logger = get_custom_logger(name='titles_logger', log_path="../../logs/titles.log", format="%(message)s")
from dataset.regexes import COLON_SPACE_PATTERN

# Log one message per HTML file that has a <title>: group 1 of
# COLON_SPACE_PATTERN applied to the title text (the pattern itself lives in
# dataset.regexes and is not shown in this notebook).
for html_file in tqdm(raw_dataset_dir.glob("*.html"), desc="Processing RFCs", unit="file"):
    html_content = html_file.read_text(encoding='utf-8')

    title_tag = BeautifulSoup(html_content, 'html.parser').title
    if not title_tag:
        # Files without a <title> are skipped silently.
        continue

    title_match = re.search(COLON_SPACE_PATTERN, title_tag.text, re.DOTALL)
    # A title the pattern does not match is still logged, as None.
    titles_logger.info(title_match.group(1) if title_match else None)

Processing RFCs: 9266file [03:36, 42.87file/s] 


# Page content

In [7]:
# Sample document used by the cells below: the .txt copy of RFC 1720.
test_file = Path('../../data/raw_dataset/txt/rfc1720.txt')
# Despite the variable name, this holds the contents of a .txt file.
html_content = test_file.read_text(encoding='utf-8')

In [16]:
# NOTE(review): assuming the cells ran in order, html_content was last read from
# rfc1720.txt, so this parses plain text rather than HTML markup -- confirm that
# is intended.
soup = BeautifulSoup(html_content, 'html.parser')


def extract_visible_text(soup):
    """Collect the text nodes of ``soup`` that do not sit in a non-rendered container.

    Args:
        soup: Object offering ``find_all(string=True)`` whose results are
            string-like and expose ``parent.name`` (e.g. a BeautifulSoup tree).

    Returns:
        tuple: ``(joined, nodes)`` -- ``nodes`` is the list of kept text nodes in
        document order, ``joined`` is those nodes joined with single spaces and
        stripped of leading/trailing whitespace.
    """
    # Text whose direct parent is one of these is dropped.
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    kept_nodes = [
        node
        for node in soup.find_all(string=True)
        if node.parent.name not in hidden_parents
    ]
    return ' '.join(kept_nodes).strip(), kept_nodes


# NOTE(review): the list printed below is empty -- presumably because the
# plain-text input has no tags, so all of its text sits directly under the
# '[document]' root that extract_visible_text excludes; verify.
visible_text, visible_texts_not_joined = extract_visible_text(soup)
print(visible_texts_not_joined)

[]


In [13]:
from unstructured.partition.auto import partition

# Partition the sample file with `unstructured` and print the string form of
# each element, separated by blank lines.
elements = partition(filename=test_file)
print("\n\n".join(map(str, elements)))

Network Working Group                        Internet Architecture Board Request for Comments: 1720                             J. Postel, Editor Obsoletes: RFCs 1610, 1600, 1540, 1500,                    November 1994 1410, 1360, 1280, 1250, 1100, 1083, 1130, 1140, 1200 STD: 1 Category: Standards Track

INTERNET OFFICIAL PROTOCOL STANDARDS

Status of this Memo

This memo describes the state of standardization of protocols used in the Internet as determined by the Internet Architecture Board (IAB). This memo is an Internet Standard. Distribution of this memo is unlimited.

Table of Contents

Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . 2 1. The Standardization Process  . . . . . . . . . . . . . . . . 3 2. The Request for Comments Documents . . . . . . . . . . . . . 5 3. Other Reference Documents  . . . . . . . . . . . . . . . . . 6 3.1. Assigned Numbers . . . . . . . . . . . . . . . . . . . . . 6 3.2. Gateway Requirements . . . . . . . . . . . . . . . . . . . 6 3.3.

In [14]:
# Running page header of the text rendering: "RFC <number> <title> <Month> <year>".
pattern = r'RFC \d+ .* (?:January|February|March|April|May|June|July|August|September|October|November|December) \d{4}'
page_header_re = re.compile(pattern)

# Explicit UTF-8, matching how the same file is read earlier in the notebook.
with open(test_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

kept_lines = []
for line in lines:
    # Page footers carry a "[Page N]" marker.
    if '[Page' in line:
        continue
    if page_header_re.search(line):
        continue
    stripped_line = line.strip()
    kept_lines.append(stripped_line)
    print(stripped_line)

# Join with newlines: concatenating the stripped lines directly would fuse the
# last word of each line with the first word of the next one.
raw_text = '\n'.join(kept_lines)







Network Working Group                        Internet Architecture Board
Request for Comments: 1720                             J. Postel, Editor
Obsoletes: RFCs 1610, 1600, 1540, 1500,                    November 1994
1410, 1360, 1280, 1250, 1100, 1083,
1130, 1140, 1200
STD: 1
Category: Standards Track


INTERNET OFFICIAL PROTOCOL STANDARDS


Status of this Memo

This memo describes the state of standardization of protocols used in
the Internet as determined by the Internet Architecture Board (IAB).
This memo is an Internet Standard.  Distribution of this memo is
unlimited.

Table of Contents

Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . 2
1.  The Standardization Process  . . . . . . . . . . . . . . . . 3
2.  The Request for Comments Documents . . . . . . . . . . . . . 5
3.  Other Reference Documents  . . . . . . . . . . . . . . . . . 6
3.1.  Assigned Numbers . . . . . . . . . . . . . . . . . . . . . 6
3.2.  Gateway Requirements . . . . . . . . . . . . . . . 

In [15]:
# Collapse every whitespace run into a single space. `\s` already covers
# newlines and tabs, so separate `\n+` and `\t` passes add nothing.
raw_text = re.sub(r'\s+', ' ', raw_text)
raw_text



In [30]:
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

# Read the sample file as UTF-8, like the direct read of the same file earlier
# in the notebook, instead of relying on the platform's default encoding.
loader = TextLoader(test_file, encoding='utf-8')

In [31]:
# Load the sample file through the loader; the result is shown as the cell output.
loader.load()



In [32]:
# Wrap raw_text (built in the cells above) in a single Document.
doc = Document(page_content=raw_text)

In [33]:
# Display the Document created in the previous cell.
doc

