In [2]:
import re
import json

import requests
from bs4 import BeautifulSoup

from IPython.display import JSON

In [5]:
# Lookup table of document-to-document relationship kinds.
# Each key is a relationship name; its value carries a human-readable
# description ("desc") and the name of the same relationship when read from
# the other document's side ("inverse").
document_relationships = {
    name: {"desc": desc, "inverse": inverse}
    for name, desc, inverse in (
        (
            "informatively references",
            "Informative Reference",
            "is informatively referenced by",
        ),
        (
            "normatively references",
            "Normative Reference",
            "is normatively referenced by",
        ),
        (
            "Reference",
            "A reference found in a document which does not have split normative/informative reference sections",
            "Referenced by",
        ),
        (
            "Possible Reference",
            "Reference of unknown type, likely found in the text of the document",
            "Possibly Referenced By",
        ),
    )
}

In [6]:
# Fetch the three datatracker pages for RFC 3261: the document page itself,
# the list of documents it references, and the list of documents that
# reference it.
r = requests.get("https://datatracker.ietf.org/doc/rfc3261/")
ref = requests.get("https://datatracker.ietf.org/doc/rfc3261/references/")
# Fixed: this previously requested ".../references" a second time, so `refd`
# was just a duplicate of `ref` instead of the "referenced by" listing.
refd = requests.get("https://datatracker.ietf.org/doc/rfc3261/referencedby/")

In [7]:
def _links_in_last_div_mentioning(soup, label):
    """Return the links inside the last <div> whose text matches ``label``.

    Every <div> is visited in document order and the *last* one containing a
    text node that matches ``label`` (used as a regular expression) wins.
    Each <a> inside that div becomes ``{"uri": href, "name": link text}``.
    Returns an empty list when no div matches.
    """
    pattern = re.compile(label)
    matching_div = None
    for div in soup.find_all("div"):
        # `string=` is the current spelling of the deprecated `text=` argument.
        if div.find(string=pattern):
            matching_div = div
    if matching_div is None:
        return []
    return [{"uri": a.get("href"), "name": a.text} for a in matching_div.find_all("a")]


def _value_cell_for_header(soup, label):
    """Return the first class-less <td> following the <th> matching ``label``, or None."""
    header = soup.find("th", string=re.compile(label))
    if header is None:
        return None
    return header.find_next("td", class_=None)


def parse_page(html):
    """Extract the basic metadata from a document page.

    Parameters
    ----------
    html : str or bytes
        Markup of the page; it is handed to ``BeautifulSoup`` unchanged.

    Returns
    -------
    dict
        ``title`` and ``label`` (the two strings found in the first <h2>),
        ``last_updated`` (text of the "Last updated" row, or ``None`` when the
        row is absent), ``updated_by`` and ``obsoletes`` (lists of
        ``{"uri", "name"}`` dicts, empty when nothing is found) and
        ``authors`` (list of ``{"email", "name"}`` dicts, empty when the
        "Authors" row is absent).

    Raises
    ------
    ValueError
        If the page has no <h2>, or the <h2> does not hold exactly two
        non-empty strings.
    """
    soup = BeautifulSoup(html, "html.parser")

    heading = soup.find("h2")
    heading_parts = list(heading.stripped_strings) if heading is not None else []
    if len(heading_parts) != 2:
        raise ValueError(
            f"expected an <h2> holding a title and a document label, found {heading_parts!r}"
        )
    title, rfc_id = heading_parts

    last_updated_cell = _value_cell_for_header(soup, "Last updated")
    last_updated = last_updated_cell.text.strip() if last_updated_cell is not None else None

    updated_by = _links_in_last_div_mentioning(soup, "Updated by")
    obsoletes = _links_in_last_div_mentioning(soup, "Obsoletes")

    authors_cell = _value_cell_for_header(soup, "Authors")
    if authors_cell is None:
        authors = []
    else:
        # Only links with non-empty text are kept; the "email" value is the
        # link's raw href (presumably a mailto: URI - confirm against the page).
        authors = [
            {"email": x.get("href"), "name": x.text}
            for x in authors_cell.find_all("a", string=re.compile(".+"))
        ]

    return {
        "title": title,
        "label": rfc_id,
        "last_updated": last_updated,
        "updated_by": updated_by,
        "obsoletes": obsoletes,
        "authors": authors
    }

In [8]:
def parse_references(html):
    """Parse a references / referenced-by listing page into a list of dicts.

    The first <table> after the page's first <h1> is read; its first row is
    treated as the header and skipped. For every remaining row the 1st, 3rd,
    4th and 5th cells are returned (whitespace-stripped) as ``doc``,
    ``status``, ``type`` and ``downref``.

    Parameters
    ----------
    html : str or bytes
        Markup of the listing page; handed to ``BeautifulSoup`` unchanged.

    Returns
    -------
    list of dict
        One ``{"doc", "status", "type", "downref"}`` dict per data row.
        Empty when the page has no <h1> or no table after it.
    """
    soup = BeautifulSoup(html, 'html.parser')

    heading = soup.find("h1")
    table = heading.find_next("table") if heading is not None else None
    if table is None:
        return []

    result = []
    # `find_all` replaces the deprecated camelCase alias `findAll`.
    for row in table.find_all('tr')[1:]:
        col = row.find_all("td")
        if len(col) < 5:
            # Not a full data row (e.g. a header-only or spacer row); skip it
            # instead of failing with an IndexError.
            continue
        result.append({
            "doc": col[0].text.strip(),
            "status": col[2].text.strip(),
            "type": col[3].text.strip(),
            "downref": col[4].text.strip()
        })
    return result
    

In [9]:
# Parse the RFC 3261 references page fetched above; the resulting list of
# dicts is displayed as the cell output.
parse_references(ref.content)

[{'doc': 'BCP 14',
  'status': 'Best Current Practice',
  'type': 'normatively references',
  'downref': ''},
 {'doc': 'BCP 18',
  'status': 'Best Current Practice',
  'type': 'normatively references',
  'downref': ''},
 {'doc': 'RFC 1123',
  'status': 'Internet Standard',
  'type': 'Possible Reference',
  'downref': ''},
 {'doc': 'RFC 1321',
  'status': 'Informational',
  'type': 'informatively references',
  'downref': ''},
 {'doc': 'RFC 1750',
  'status': 'Informational',
  'type': 'normatively references',
  'downref': 'Downref'},
 {'doc': 'RFC 1847',
  'status': 'Proposed Standard',
  'type': 'normatively references',
  'downref': ''},
 {'doc': 'RFC 1889',
  'status': 'Proposed Standard',
  'type': 'informatively references',
  'downref': ''},
 {'doc': 'RFC 2046',
  'status': 'Draft Standard',
  'type': 'normatively references',
  'downref': ''},
 {'doc': 'RFC 2069',
  'status': 'Proposed Standard',
  'type': 'informatively references',
  'downref': ''},
 {'doc': 'RFC 2076',
  'st

In [10]:
# Host name of the datatracker instance that all document URLs are built from.
DATATRACKER_URL = "datatracker.ietf.org"
def get_datatracker_doc_info(uri, timeout=30):
    """Fetch and parse a document page together with both reference listings.

    Three pages are requested: ``/doc/{uri}/``, ``/doc/{uri}/references/``
    and ``/doc/{uri}/referencedby/``.

    Parameters
    ----------
    uri : str
        Document name as it appears in the URL path, e.g. ``"rfc3261"``.
    timeout : float, optional
        Seconds to wait on each HTTP request. Without a timeout ``requests``
        can block indefinitely on an unresponsive server.

    Returns
    -------
    dict
        The result of ``parse_page`` for the document page, extended with
        ``references`` and ``referenced_by`` (each the result of
        ``parse_references`` on the matching listing page).

    Raises
    ------
    requests.HTTPError
        If any of the three requests returns a 4xx/5xx status, so an error
        page is never passed on to the parsers.
    """
    base = f"https://{DATATRACKER_URL}/doc/{uri}"

    r = requests.get(f"{base}/", timeout=timeout)
    r.raise_for_status()
    ref = requests.get(f"{base}/references/", timeout=timeout)
    ref.raise_for_status()
    refd = requests.get(f"{base}/referencedby/", timeout=timeout)
    refd.raise_for_status()

    main = parse_page(r.content)
    main["references"] = parse_references(ref.content)
    main["referenced_by"] = parse_references(refd.content)

    return main

In [11]:
# Fetch everything for RFC 3261 and pretty-print the combined dict as JSON
# (print is used so the indented text is shown rather than an escaped string repr).
print(json.dumps(get_datatracker_doc_info("rfc3261"), indent=4))

        "type": "normatively references",
            "downref": "Downref"
        },
        {
            "doc": "RFC 5627",
            "status": "Proposed Standard",
            "type": "normatively references",
            "downref": ""
        },
        {
            "doc": "RFC 5629",
            "status": "Proposed Standard",
            "type": "normatively references",
            "downref": ""
        },
        {
            "doc": "RFC 5631",
            "status": "Informational",
            "type": "normatively references",
            "downref": ""
        },
        {
            "doc": "RFC 5806",
            "status": "Historic",
            "type": "normatively references",
            "downref": ""
        },
        {
            "doc": "RFC 5853",
            "status": "Informational",
            "type": "normatively references",
            "downref": ""
        },
        {
            "doc": "RFC 5898",
            "status": "Proposed Standard",
            