In [1]:
import re
import json

import requests
from bs4 import BeautifulSoup

from IPython.display import JSON

In [2]:
document_relationships = {
    "informatively references": {
        "desc": "Informative Reference",
        "inverse": "is informatively referenced by"
    },
    "normatively references": {
        "desc": "Normative Reference",
        "inverse": "is normatively referenced by"
    },
    "Reference": {
        "desc": "A reference found in a document which does not have split normative/informative reference sections",
        "inverse": "Referenced by"
    },
    "Possible Reference": {
        "desc": "Reference of unknown type, likely found in the text of the document",
        "inverse": "Possibly Referenced By"
    }
}

In [3]:
def _find_by_text(soup, tag, text):
    tags = soup.find_all(tag)
    thetag = None
    for tag in tags:
        if tag.find(text=re.compile(text)):
            thetag = tag
    return thetag

In [4]:
def parse_page(html):
    soup = BeautifulSoup(html, "html.parser")
    title, rfc_id = list(soup.find("h2").stripped_strings)
    last_updated = soup.find('th', string=re.compile("Last updated")).find_next("td", class_=None).text.strip()

    aka_div = _find_by_text(soup, 'div', "Also known as")
    aka = [{"uri": a.get("href"), "name": a.text} for a in aka_div.find_all("a")] if aka_div else []
    
    updates_div = _find_by_text(soup, 'div', "Updates")
    updates = [{"uri": a.get("href"), "name": a.text} for a in updates_div.find_all("a")] if updates_div else []
    
    updated_by_div = _find_by_text(soup, 'div', "Updated by")
    updated_by = [{"uri": a.get("href"), "name": a.text} for a in updated_by_div.find_all("a")] if updated_by_div else []

    obsoletes_div = _find_by_text(soup, 'div', "Obsoletes")
    obsoletes = [{"uri": a.get("href"), "name": a.text} for a in obsoletes_div.find_all("a")] if obsoletes_div else []
    
    obsoleted_by_div = _find_by_text(soup, 'div', "Obsoleted by")
    obsoleted_by = [{"uri": a.get("href"), "name": a.text} for a in obsoleted_by_div.find_all("a")] if obsoleted_by_div else []
            
    authors = [
        {"email": x.get("href"), "name": x.text}
        for x in soup.find('th', string=re.compile("Author"))
        .find_next("td", class_=None)
        .find_all("a", string=re.compile(".+"))
    ]
    
    return {
        "title": title,
        "text": _find_by_text(soup, 'a', 'plain text').get("href"),
        "pdf": _find_by_text(soup, 'a', 'pdf').get("href"),
        "html": _find_by_text(soup, 'a', 'html').get("href"),
        "label": rfc_id,
        "aka": aka,
        "last_updated": last_updated,
        "updates": updates,
        "updated_by": updated_by,
        "obsoletes": obsoletes,
        "obsoleted_by": obsoleted_by,
        "authors": authors
    }

In [5]:
def parse_references(html):
    result = []
    soup = BeautifulSoup(html, 'html.parser')
    try:
        rows = soup.find("h1").find_next("table").findAll('tr')[1:]
        for row in rows:
            col = row.find_all("td")
            doc = col[0].text.strip().split("\n")[0]
            status = col[2].text.strip()
            Type = col[3].text.strip()
            downref = col[4].text.strip()
            result.append({
                "name": doc,
                "status": status,
                "type": Type,
                "downref": downref
            })
    except AttributeError:
        pass
    return result
    

In [6]:
DATATRACKER_URL = "datatracker.ietf.org"
def get_datatracker_doc_info(uri, url=DATATRACKER_URL):
    r = requests.get(f"https://{url}/doc/{uri}/")
    
    main = parse_page(r.content)

    ref = requests.get(f"https://{url}/doc/{main['label'].lower().replace(' ', '')}/references/")
    refd = requests.get(f"https://{url}/doc/{main['label'].lower().replace(' ', '')}/referencedby/")
    
    main = parse_page(r.content)
    refs = parse_references(ref.content)
    refds = parse_references(refd.content)
    main["references"] = refs
    main["referenced_by"] = refds
    
    return main

In [7]:
docs = []
parsed = []
to_parse = ["rfc3261"]

In [16]:
from urllib.parse import urlparse

In [23]:
def parse_one(doc):
    print(f"Parsing {doc}")
    result = get_datatracker_doc_info(doc)
    related = []
    
    with open(f"files/{doc}.txt", "wb") as f:
        f.write(requests.get(result["text"]).content)
    
    for link in [*result["updates"], *result["updated_by"], *result["obsoletes"], *result["obsoleted_by"], *result["references"], *result["referenced_by"]]:
        i = link["name"].lower().replace(" ", "")
        if i not in parsed:
            related.append(i)
            
    return result, related

In [24]:
parse_one("rfc3261")

Parsing rfc3261


({'title': 'SIP: Session Initiation Protocol',
  'text': 'https://www.rfc-editor.org/rfc/rfc3261.txt',
  'pdf': 'https://www.rfc-editor.org/rfc/pdfrfc/rfc3261.txt.pdf',
  'html': '/doc/html/rfc3261',
  'label': 'RFC 3261',
  'aka': [],
  'last_updated': '2020-01-21',
  'updates': [],
  'updated_by': [{'uri': '/doc/rfc5393/', 'name': 'RFC 5393'},
   {'uri': '/doc/rfc6141/', 'name': 'RFC 6141'},
   {'uri': '/doc/rfc5630/', 'name': 'RFC 5630'},
   {'uri': '/doc/rfc8217/', 'name': 'RFC 8217'},
   {'uri': '/doc/rfc5954/', 'name': 'RFC 5954'},
   {'uri': '/doc/rfc5621/', 'name': 'RFC 5621'},
   {'uri': '/doc/rfc3853/', 'name': 'RFC 3853'},
   {'uri': '/doc/rfc7463/', 'name': 'RFC 7463'},
   {'uri': '/doc/rfc7462/', 'name': 'RFC 7462'},
   {'uri': '/doc/rfc3265/', 'name': 'RFC 3265'},
   {'uri': '/doc/rfc4320/', 'name': 'RFC 4320'},
   {'uri': '/doc/rfc5626/', 'name': 'RFC 5626'},
   {'uri': '/doc/rfc6026/', 'name': 'RFC 6026'},
   {'uri': '/doc/rfc6665/', 'name': 'RFC 6665'},
   {'uri': '/do

In [32]:
from multiprocessing.dummy import Pool as ThreadPool

def collect_data_about_rfc(name):
    json_result = {}
    center_rfc, related = parse_one(name)
    json_result[name] = center_rfc
    
    pool = ThreadPool(20)
    results = pool.map(parse_one, related)
    pool.close()
    pool.join()
    
    json_result.update(dict(zip(related, [r[0] for r in results])))
    
    with open("data.json", "w+") as f:
        json.dump(json_result, f)
    

In [33]:
collect_data_about_rfc("rfc3261")

Parsing rfc3261
Parsing rfc5393
Parsing rfc5954
Parsing rfc7462
Parsing rfc6026
Parsing rfc4916
Parsing rfc6878
Parsing rfc1123
Parsing rfc1889
Parsing rfc2183
Parsing rfc2326
Parsing rfc2401
Parsing rfc2617
Parsing rfc2822
Parsing rfc2976
Parsing rfc3264
Parsing std1
Parsing rfc3263
Parsing rfc3313
Parsing rfc3325
Parsing rfc3351
Parsing rfc3372
Parsing rfc3015
Parsing rfc2543
Parsing rfc6665
Parsing rfc6141
Parsing rfc5922
Parsing rfc5621
Parsing rfc3319
Parsing rfc3265
Parsing rfc3326
Parsing rfc3265
Parsing rfc2234
Parsing rfc2630
Parsing rfc2327
Parsing rfc2046
Parsing rfc2849
Parsing rfc3388
Parsing rfc5630
Parsing rfc3204
Parsing rfc3323
Parsing rfc8996
Parsing rfc3327
Parsing rfc8898
Parsing rfc3853
Parsing rfc2426
Parsing rfc3268
Parsing rfc4320
Parsing bcp14
Parsing rfc2633
Parsing std3
Parsing rfc1321
Parsing rfc2914
Parsing rfc3310
Parsing rfc2368
Parsing rfc3398
Parsing rfc2246
Parsing rfc7463
Parsing rfc8217
Parsing rfc3329
Parsing rfc3263
Parsing rfc761
Parsing rfc8591
P