In [1]:
import re
import json

import requests
from bs4 import BeautifulSoup

from IPython.display import JSON

In [2]:
document_relationships = {
    "informatively references": {
        "desc": "Informative Reference",
        "inverse": "is informatively referenced by"
    },
    "normatively references": {
        "desc": "Normative Reference",
        "inverse": "is normatively referenced by"
    },
    "Reference": {
        "desc": "A reference found in a document which does not have split normative/informative reference sections",
        "inverse": "Referenced by"
    },
    "Possible Reference": {
        "desc": "Reference of unknown type, likely found in the text of the document",
        "inverse": "Possibly Referenced By"
    }
}

In [121]:
def _find_by_text(soup, tag, text):
    tags = soup.find_all(tag)
    thetag = None
    for tag in tags:
        if tag.find(text=re.compile(text)):
            thetag = tag
    return thetag

In [122]:
def parse_page(html):
    soup = BeautifulSoup(html, "html.parser")
    title, rfc_id = list(soup.find("h2").stripped_strings)
    last_updated = soup.find('th', string=re.compile("Last updated")).find_next("td", class_=None).text.strip()

    aka_div = _find_by_text(soup, 'div', "Also known as")
    aka = [{"uri": a.get("href"), "name": a.text} for a in aka_div.find_all("a")] if aka_div else []
    
    updates_div = _find_by_text(soup, 'div', "Updates")
    updates = [{"uri": a.get("href"), "name": a.text} for a in updates_div.find_all("a")] if updates_div else []
    
    updated_by_div = _find_by_text(soup, 'div', "Updated by")
    updated_by = [{"uri": a.get("href"), "name": a.text} for a in updated_by_div.find_all("a")] if updated_by_div else []

    obsoletes_div = _find_by_text(soup, 'div', "Obsoletes")
    obsoletes = [{"uri": a.get("href"), "name": a.text} for a in obsoletes_div.find_all("a")] if obsoletes_div else []
    
    obsoleted_by_div = _find_by_text(soup, 'div', "Obsoleted by")
    obsoleted_by = [{"uri": a.get("href"), "name": a.text} for a in obsoleted_by_div.find_all("a")] if obsoleted_by_div else []
            
    authors = [
        {"email": x.get("href"), "name": x.text}
        for x in soup.find('th', string=re.compile("Author"))
        .find_next("td", class_=None)
        .find_all("a", string=re.compile(".+"))
    ]
    
    return {
        "title": title,
        "text": _find_by_text(soup, 'a', 'plain text').get("href"),
        "pdf": _find_by_text(soup, 'a', 'pdf').get("href"),
        "html": _find_by_text(soup, 'a', 'html').get("href"),
        "label": rfc_id,
        "aka": aka,
        "last_updated": last_updated,
        "updates": updates,
        "updated_by": updated_by,
        "obsoletes": obsoletes,
        "obsoleted_by": obsoleted_by,
        "authors": authors
    }

In [123]:
def parse_references(html):
    result = []
    soup = BeautifulSoup(html, 'html.parser')
    try:
        rows = soup.find("h1").find_next("table").findAll('tr')[1:]
        for row in rows:
            col = row.find_all("td")
            doc = col[0].text.strip().split("\n")[0]
            status = col[2].text.strip()
            Type = col[3].text.strip()
            downref = col[4].text.strip()
            result.append({
                "name": doc,
                "status": status,
                "type": Type,
                "downref": downref
            })
    except AttributeError:
        pass
    return result
    

In [124]:
DATATRACKER_URL = "datatracker.ietf.org"
def get_datatracker_doc_info(uri, url=DATATRACKER_URL):
    r = requests.get(f"https://{url}/doc/{uri}/")
    
    main = parse_page(r.content)

    ref = requests.get(f"https://{url}/doc/{main['label'].lower().replace(' ', '')}/references/")
    refd = requests.get(f"https://{url}/doc/{main['label'].lower().replace(' ', '')}/referencedby/")
    
    main = parse_page(r.content)
    refs = parse_references(ref.content)
    refds = parse_references(refd.content)
    main["references"] = refs
    main["referenced_by"] = refds
    
    return main

In [None]:
docs = []
parsed = []
to_parse = ["rfc3261"]

In [151]:
def parse_one(doc):
    print(f"Parsing {doc}")
    if doc in parsed:
        return
    result = get_datatracker_doc_info(doc)
    docs.append(result)
    parsed.append(doc)
    for aka in result["aka"]:
        parsed.append(aka["name"].lower().replace(" ", ""))
    
    for link in [*result["updates"], *result["updated_by"], *result["obsoletes"], *result["obsoleted_by"], *result["references"], *result["referenced_by"]]:
        i = link["name"].lower().replace(" ", "")
        if i not in parsed:
            to_parse.append(i)

In [144]:
x = 100
while x:
    doc = to_parse.pop(0)
    print(f"Parsing {doc}")
    if doc in parsed:
        continue
    result = get_datatracker_doc_info(doc)
    docs.append(result)
    parsed.append(doc)
    for aka in result["aka"]:
        parsed.append(aka["name"].lower().replace(" ", ""))
    
    for link in [*result["updates"], *result["updated_by"], *result["obsoletes"], *result["obsoleted_by"], *result["references"], *result["referenced_by"]]:
        i = link["name"].lower().replace(" ", "")
        if i not in parsed:
            to_parse.append(i)
    
    x -= 1

Parsing rfc4189
Parsing rfc4215
Parsing rfc4235
Parsing rfc4240
Parsing rfc4244
Parsing rfc4245
Parsing rfc4320
Parsing rfc4354
Parsing rfc4376
Parsing rfc4411
Parsing rfc4412
Parsing rfc4453
Parsing rfc4457
Parsing rfc4458
Parsing rfc4474
Parsing rfc4483
Parsing rfc4484
Parsing rfc4485
Parsing rfc4488
Parsing rfc4497
Parsing rfc4508
Parsing rfc4538
Parsing rfc4567
Parsing rfc4569
Parsing rfc4575
Parsing rfc4583
Parsing rfc4590
Parsing rfc4660
Parsing rfc4662
Parsing rfc4722
Parsing rfc4730
Parsing rfc4740
Parsing rfc4759
Parsing rfc4769
Parsing rfc4780
Parsing rfc4825
Parsing rfc4826
Parsing rfc4904
Parsing rfc4916
Parsing rfc4964
Parsing rfc4967
Parsing rfc4975
Parsing rfc4976
Parsing rfc5025
Parsing rfc5027
Parsing rfc5031
Parsing rfc5049
Parsing rfc5079
Parsing rfc5168
Parsing rfc5194
Parsing rfc5245
Parsing rfc5263
Parsing rfc5264
Parsing rfc5359
Parsing rfc5363
Parsing rfc5365
Parsing rfc5367
Parsing rfc5368
Parsing rfc5369
Parsing rfc5370
Parsing rfc5373
Parsing rfc5456
Parsing 

In [149]:
from multiprocessing.dummy import Pool as ThreadPool

In [152]:
pool = ThreadPool(20)
results = pool.map(parse_one, to_parse)

pool.close()
pool.join()

Parsing rfc4081
Parsing rfc5025Parsing rfc6314
Parsing rfc6881
Parsing rfc7647
Parsing rfc5263
Parsing rfc5359Parsing rfc7748
Parsing rfc3043


Parsing rfc5362
Parsing rfc5802
Parsing rfc2955
Parsing bcp9
Parsing rfc6055
Parsing draft-ietf-httpbis-rfc6265bis
Parsing rfc3464
Parsing rfc2964
Parsing rfc1898
Parsing rfc1320
Parsing rfc2653
Parsing draft-ietf-acme-email-smime
Parsing rfc8710
Parsing rfc4356
Parsing rfc3603
Parsing rfc3605
Parsing rfc3611
Parsing draft-ietf-lisp-ecdsa-auth
Parsing rfc3044
Parsing draft-ietf-extra-imap4rev2
Parsing rfc2993
Parsing rfc1421
Parsing rfc1983
Parsing rfc5367
Parsing rfc5369
Parsing rfc5552
Parsing rfc8048
Parsing rfc2821
Parsing rfc4119
Parsing rfc2045
Parsing rfc8032
Parsing rfc2476
Parsing rfc2959
Parsing rfc6055
Parsing rfc6365
Parsing rfc3613
Parsing rfc6919
Parsing rfc3507
Parsing rfc7255
Parsing draft-ietf-regext-rfc7482bis
Parsing draft-ietf-jmap-jscontact
Parsing rfc3120
Parsing rfc1422
Parsing rfc3022
Parsing draft-ietf-rum-rue
Parsing r

Parsing rfc7804
Parsing rfc3805
Parsing rfc3808
Parsing rfc3108
Parsing rfc4122
Parsing rfc4230
Parsing rfc1533
Parsing bcp14
Parsing rfc2119
Parsing rfc2617
Parsing rfc5234
Parsing rfc6151
Parsing rfc7234
Parsing rfc5411
Parsing rfc1725
Parsing std66
Parsing std68
Parsing rfc7849
Parsing rfc3983
Parsing rfc8141
Parsing rfc5923
Parsing rfc5924
Parsing rfc6216
Parsing rfc3983
Parsing rfc2183
Parsing rfc2184
Parsing rfc4566
Parsing rfc4047
Parsing rfc3748
Parsing rfc3862
Parsing rfc3863
Parsing rfc3863
Parsing rfc3920
Parsing rfc3931
Parsing rfc7810
Parsing rfc4253
Parsing rfc3164
Parsing rfc1750
Parsing rfc1751
Parsing rfc1545
Parsing rfc3987
Parsing draft-ietf-rum-rue
Parsing rfc3608
Parsing bcp14
Parsing bcp85
Parsing rfc2119
Parsing rfc2327
Parsing rfc2543
Parsing rfc3327Parsing rfc7616

Parsing rfc3515
Parsing rfc3608Parsing rfc5661
Parsing rfc6230
Parsing rfc3725
Parsing rfc3891
Parsing rfc3893
Parsing rfc3911

Parsing rfc6406
Parsing rfc4168
Parsing rfc4244
Parsing rfc4474
Parsing

Parsing rfc5751
Parsing rfc4460
Parsing rfc7866
Parsing rfc8224
Parsing rfc8396
Parsing rfc3508
Parsing rfc2245
Parsing rfc2058Parsing rfc3280

Parsing rfc952
Parsing bcp13Parsing rfc4234
Parsing bcp14
Parsing rfc2045

Parsing rfc2046
Parsing rfc2119
Parsing rfc2183
Parsing rfc2387
Parsing rfc2205
Parsing rfc1846
Parsing rfc8435
Parsing rfc2480
Parsing rfc7678
Parsing rfc2506
Parsing rfc5415
Parsing rfc3002
Parsing rfc4796
Parsing rfc5466
Parsing rfc3617
Parsing rfc5412
Parsing rfc4660
Parsing rfc7621
Parsing rfc2253
Parsing draft-ietf-lamps-rfc5751-bis
Parsing rfc2059
Parsing rfc1849
Parsing rfc1869
Parsing rfc822
Parsing rfc7766
Parsing rfc2392
Parsing rfc2552
Parsing rfc8780
Parsing rfc5965
Parsing rfc7719
Parsing rfc4896
Parsing rfc4882
Parsing rfc3624
Parsing bcp13
Parsing bcp14
Parsing bcp26Parsing rfc5466
Parsing bcp67Parsing rfc2782
Parsing bcp98
Parsing rfc2119
Parsing rfc2779

Parsing rfc1948
Parsing rfc2002
Parsing rfc2065
Parsing rfc5610

Parsing rfc2255
Parsing rfc2065
Par

Parsing rfc7788
Parsing rfc2083
Parsing rfc1052
Parsing std1
Parsing std7
Parsing std8
Parsing rfc7111Parsing rfc5321

Parsing rfc5452
Parsing rfc5754
Parsing rfc2286
Parsing rfc2320
Parsing rfc6533
Parsing rfc7877
Parsing rfc2919
Parsing rfc2930
Parsing rfc6787
Parsing rfc7051
Parsing rfc2435
Parsing rfc2924
Parsing rfc5347
Parsing rfc2331
Parsing rfc2506
Parsing rfc2530
Parsing rfc2083
Parsing rfc2110
Parsing rfc2131
Parsing rfc2132
Parsing rfc2518
Parsing rfc3318
Parsing rfc3023
Parsing rfc7252
Parsing rfc2289
Parsing rfc3539
Parsing rfc3560
Parsing rfc3631
Parsing rfc3712
Parsing rfc1065
Parsing rfc6121
Parsing std9
Parsing draft-ietf-tls-oldversions-deprecate
Parsing rfc7869Parsing rfc5507

Parsing rfc2326
Parsing rfc2327
Parsing rfc2344
Parsing rfc2448
Parsing rfc2943
Parsing rfc7877
Parsing rfc8881
Parsing rfc7587
Parsing rfc5389
Parsing rfc5438
Parsing rfc5486
Parsing rfc2976
Parsing rfc2995
Parsing rfc2533
Parsing rfc2336
Parsing rfc3331
Parsing rfc3746
Parsing rfc2311
Parsing

Parsing rfc2806
Parsing rfc2808
Parsing rfc4175
Parsing rfc5272
Parsing rfc3170
Parsing rfc8700
Parsing rfc3514
Parsing rfc2431
Parsing rfc2434
Parsing rfc4278Parsing rfc4162

Parsing rfc1211
Parsing rfc8216
Parsing rfc2521
Parsing rfc5760
Parsing rfc5888
Parsing rfc5945
Parsing rfc3322
Parsing rfc2467
Parsing rfc733
Parsing rfc5198
Parsing rfc5280
Parsing rfc5280
Parsing rfc5284
Parsing rfc4306
Parsing rfc2812
Parsing rfc4189
Parsing rfc4215
Parsing rfc4317
Parsing rfc4435
Parsing rfc4442
Parsing rfc4463
Parsing rfc4473
Parsing rfc4542
Parsing rfc6466
Parsing rfc4252
Parsing rfc2573
Parsing rfc3189
Parsing rfc1327
Parsing rfc4168
Parsing rfc4217
Parsing rfc4235
Parsing rfc4244
Parsing rfc4261
Parsing rfc1122
Parsing rfc1175
Parsing rfc3562
Parsing rfc4297
Parsing rfc8225
Parsing rfc2525
Parsing rfc2617
Parsing rfc2630
Parsing rfc2633
Parsing rfc2693
Parsing rfc2716
Parsing rfc5993
Parsing rfc3362
Parsing rfc3435
Parsing rfc3508
Parsing rfc3639
Parsing rfc2471
Parsing rfc734
Parsing rf

Parsing rfc6876
Parsing rfc1738
Parsing rfc1808
Parsing rfc2047
Parsing rfc822
Parsing std1
Parsing std11
Parsing rfc3696
Parsing rfc4355
Parsing rfc5281
Parsing rfc2045
Parsing rfc2047
Parsing rfc2048
Parsing rfc2049
Parsing rfc821
Parsing rfc822
Parsing rfc934
Parsing rfc5996
Parsing rfc2446
Parsing rfc2447
Parsing rfc2652
Parsing rfc2660
Parsing rfc2798
Parsing rfc2562
Parsing rfc3154
Parsing rfc861
Parsing rfc4046
Parsing rfc4067
Parsing rfc4767
Parsing rfc6585
Parsing rfc3041
Parsing rfc3075
Parsing rfc3110
Parsing rfc3824
Parsing rfc3830
Parsing rfc3840
Parsing rfc3862
Parsing rfc3863
Parsing rfc3875
Parsing rfc3881
Parsing rfc2720
Parsing rfc2616
Parsing rfc2652
Parsing rfc2661
Parsing rfc2683
Parsing rfc3367
Parsing rfc1600
Parsing rfc5905
Parsing rfc959
Parsing std1
Parsing std11
Parsing draft-bhutton-json-schema-validation
Parsing rfc5422
Parsing rfc5436
Parsing rfc7525
Parsing rfc3199
Parsing rfc3792
Parsing rfc7273
Parsing rfc2564
Parsing rfc3162
Parsing rfc882
Parsing rfc4

Parsing rfc2068
Parsing rfc2070
Parsing rfc6415
Parsing rfc8551
Parsing rfc2798
Parsing rfc3792
Parsing rfc3851Parsing rfc2327

Parsing rfc2632
Parsing rfc2361
Parsing rfc2588
Parsing rfc2705
Parsing rfc2775
Parsing rfc3303
Parsing rfc3322
Parsing rfc3435
Parsing rfc3266
Parsing rfc4566
Parsing rfc1035
Parsing rfc1305
Parsing rfc1630
Parsing rfc1641
Parsing rfc2748
Parsing rfc2757
Parsing rfc4157
Parsing rfc4340
Parsing rfc4409
Parsing rfc3423
Parsing draft-bhutton-json-schema-validation
Parsing draft-ietf-dnsop-rfc8499bis
Parsing rfc1475
Parsing rfc1952
Parsing rfc4168
Parsing rfc4460
Parsing rfc4497
Parsing rfc4820
Parsing rfc7235
Parsing rfc7612
Parsing rfc7615
Parsing rfc3157
Parsing rfc3165
Parsing rfc3196
Parsing rfc3205
Parsing rfc3229
Parsing rfc3230
Parsing rfc3234
Parsing rfc3239
Parsing rfc5335
Parsing rfc5337
Parsing draft-hoehlhubmer-https-addon
Parsing draft-ietf-emailcore-as
Parsing rfc3487
Parsing rfc5411
Parsing rfc3499
Parsing rfc7984
Parsing rfc8226
Parsing rfc2680
P

Parsing rfc3795
Parsing rfc4324
Parsing rfc4483
Parsing rfc5070
Parsing rfc3836
Parsing rfc3954
Parsing rfc6690
Parsing rfc950
Parsing rfc7604
Parsing rfc7605
Parsing rfc2820
Parsing rfc4047
Parsing rfc4130
Parsing rfc4227
Parsing rfc7567
Parsing draft-ietf-i2nsf-nsf-monitoring-data-model
Parsing rfc6140
Parsing rfc6271
Parsing rfc6461
Parsing rfc3235
Parsing rfc3274
Parsing rfc5027
Parsing rfc5104
Parsing rfc5124
Parsing rfc5188
Parsing rfc5194
Parsing rfc5215
Parsing rfc4765
Parsing rfc3850
Parsing rfc5365
Parsing rfc5389
Parsing rfc5408
Parsing rfc6121
Parsing rfc7860
Parsing rfc6350
Parsing rfc6648
Parsing rfc2626
Parsing rfc2739
Parsing rfc5056
Parsing rfc4166
Parsing rfc4629
Parsing rfc5781
Parsing rfc2822
Parsing rfc2829
Parsing rfc2869
Parsing rfc2926
Parsing rfc2927
Parsing draft-omar-ipmix
Parsing rfc4236Parsing draft-ietf-intarea-gue

Parsing rfc6950
Parsing rfc6712Parsing rfc7245

Parsing rfc7720
Parsing rfc3268
Parsing rfc3303
Parsing rfc3354
Parsing rfc5359
Parsing rfc536

Parsing draft-ietf-sipcore-rfc4028bis
Parsing draft-jennings-rtcweb-deps
Parsing rfc7100
Parsing rfc2249Parsing rfc3126

Parsing rfc8636
Parsing rfc3554
Parsing rfc1035
Parsing rfc2045
Parsing rfc2047
Parsing rfc2048
Parsing rfc2049
Parsing rfc2119
Parsing rfc2821
Parsing rfc822
Parsing rfc974
Parsing std13
Parsing draft-ietf-openpgp-crypto-refresh
Parsing rfc4682
Parsing rfc1392
Parsing draft-irtf-pearg-numeric-ids-history
Parsing rfc5552
Parsing rfc5576
Parsing rfc4774
Parsing rfc7118
Parsing rfc7199
Parsing draft-morton-tsvwg-sce
Parsing draft-kunze-ark
Parsing rfc3380
Parsing rfc6795
Parsing rfc7252
Parsing rfc7728
Parsing rfc7825
Parsing rfc7826
Parsing rfc7850
Parsing rfc7143
Parsing rfc8376
Parsing rfc3262
Parsing rfc3311
Parsing rfc3388
Parsing rfc3611
Parsing rfc3665
Parsing rfc3666
Parsing rfc3725
Parsing rfc3959
Parsing rfc3960
Parsing rfc3984
Parsing rfc4028
Parsing rfc4040
Parsing rfc4060
Parsing rfc4092
Parsing rfc4103
Parsing rfc4145
Parsing rfc4184Parsing rfc762

Parsin

Parsing rfc5273
Parsing rfc4294
Parsing rfc3819Parsing draft-morand-http-digest-2g-aka
Parsing rfc3485
Parsing rfc4083
Parsing rfc4080

Parsing rfc4367
Parsing rfc4590
Parsing rfc5090
Parsing rfc5411
Parsing rfc3968
Parsing draft-ietf-sipcore-rfc4028bis
Parsing rfc3725
Parsing rfc3841
Parsing rfc4028
Parsing rfc4235
Parsing rfc4412
Parsing rfc4497
Parsing rfc4538
Parsing rfc5009
Parsing rfc5552
Parsing rfc5876
Parsing rfc5853Parsing rfc6230

Parsing rfc6337
Parsing rfc6442Parsing rfc5888
Parsing rfc5939
Parsing rfc5956

Parsing rfc6794
Parsing rfc6917
Parsing draft-ietf-rum-rue
Parsing draft-ietf-stir-servprovider-oob
Parsing draft-peterson-stir-messaging
Parsing draft-peterson-stir-rfc4916-update
Parsing rfc3313
Parsing rfc3398
Parsing rfc3455
Parsing rfc3485
Parsing rfc3891
Parsing rfc3911
Parsing rfc3960
Parsing rfc4032
Parsing rfc4353
Parsing rfc4485
Parsing rfc4568
Parsing rfc4964
Parsing rfc4975
Parsing rfc5027
Parsing rfc5411
Parsing rfc6050
Parsing rfc6236
Parsing rfc3828
Parsi

Parsing rfc2369
Parsing rfc2371
Parsing rfc2590
Parsing rfc2628
Parsing rfc2661
Parsing rfc2663
Parsing rfc820
Parsing rfc6794
Parsing rfc7315
Parsing rfc7702
Parsing rfc2205
Parsing rfc2747
Parsing rfc3398
Parsing rfc3725
Parsing rfc4244
Parsing rfc4411
Parsing rfc4575
Parsing rfc5552
Parsing rfc6044
Parsing rfc5905
Parsing rfc5936
Parsing rfc5966
Parsing rfc6012
Parsing rfc5842
Parsing rfc4850
Parsing rfc4196
Parsing rfc5390
Parsing rfc7141
Parsing rfc1122
Parsing rfc1323
Parsing rfc6056
Parsing rfc6064
Parsing rfc6088
Parsing rfc6849
Parsing rfc5216
Parsing rfc4505
Parsing rfc4719
Parsing rfc4083
Parsing rfc4215
Parsing rfc6157
Parsing rfc6632
Parsing rfc7227
Parsing rfc1595
Parsing rfc2374
Parsing rfc2381
Parsing rfc2390
Parsing rfc3525
Parsing rfc1006
Parsing rfc1118
Parsing rfc6228
Parsing rfc6432
Parsing rfc4969
Parsing rfc5198
Parsing rfc5218
Parsing rfc5411
Parsing rfc6086
Parsing rfc3968
Parsing rfc3969
Parsing rfc5727
Parsing bcp9
Parsing rfc2026
Parsing rfc2434
Parsing rfc4

Parsing rfc2861
Parsing rfc773
Parsing rfc780
Parsing rfc7195
Parsing rfc7206
Parsing rfc7261
Parsing rfc5015
Parsing rfc7865
Parsing rfc3968
Parsing bcp9
Parsing rfc2026
Parsing rfc2223
Parsing rfc7082
Parsing rfc1657
Parsing rfc2888
Parsing rfc2518
Parsing rfc2534
Parsing rfc923
Parsing rfc6462
Parsing rfc4215
Parsing rfc4339
Parsing rfc6450
Parsing rfc793
Parsing rfc896
Parsing rfc6012Parsing rfc5785
Parsing rfc6077

Parsing rfc5839
Parsing rfc5890
Parsing rfc5927
Parsing rfc5934
Parsing rfc5638
Parsing rfc6442
Parsing rfc6447
Parsing rfc6881
Parsing rfc7248
Parsing rfc8048
Parsing rfc3857
Parsing rfc4079
Parsing rfc7273
Parsing rfc7310
Parsing rfc7989
Parsing rfc7989
Parsing rfc8197
Parsing rfc8197
Parsing rfc8445
Parsing rfc8445
Parsing rfc8506
Parsing rfc3690Parsing rfc760

Parsing rfc6450
Parsing rfc762
Parsing draft-asai-tsvwg-transport-review
Parsing draft-fairhurst-tsvwg-udp-options-dplpmtud
Parsing draft-gandhi-spring-sr-enhanced-plm
Parsing draft-gandhi-spring-stamp-srpm
Pa

Parsing rfc7701
Parsing rfc7798
Parsing rfc7866
Parsing rfc8035
Parsing rfc6706
Parsing rfc6881
Parsing rfc7195
Parsing rfc7200
Parsing rfc7544
Parsing rfc7566
Parsing rfc7643
Parsing rfc7749
Parsing rfc7852
Parsing rfc8224
Parsing draft-haluska-sipping-directory-assistance
Parsing draft-ietf-jmap-jscontact
Parsing draft-ietf-rum-rue
Parsing rfc4083
Parsing rfc4474
Parsing rfc4479
Parsing rfc4504
Parsing rfc4575
Parsing rfc5012
Parsing rfc5031
Parsing rfc5222
Parsing rfc5361
Parsing rfc5435
Parsing rfc2677
Parsing rfc2688
Parsing rfc2728
Parsing rfc2733
Parsing rfc2739
Parsing rfc2740
Parsing rfc6118
Parsing bcp78
Parsing rfc1034
Parsing rfc2119
Parsing rfc2915
Parsing rfc2916
Parsing rfc3219
Parsing rfc3401
Parsing rfc5044
Parsing rfc2960
Parsing rfc2983
Parsing draft-ietf-tcpm-rfc793bis
Parsing rfc5502
Parsing rfc5876
Parsing rfc6050
Parsing rfc7316
Parsing rfc8498
Parsing draft-haluska-sipping-directory-assistance
Parsing rfc3325
Parsing rfc3455
Parsing rfc3487
Parsing rfc6404
Parsi

Parsing rfc5406
Parsing rfc4590
Parsing rfc4694
Parsing rfc4715
Parsing rfc961
Parsing rfc6887Parsing rfc6271

Parsing bcp26
Parsing bcp31Parsing rfc6936

Parsing bcp78
Parsing rfc2434
Parsing rfc2506
Parsing rfc2533
Parsing rfc2703
Parsing rfc2738
Parsing rfc2913
Parsing rfc2987
Parsing rfc3053
Parsing rfc2023
Parsing rfc1697
Parsing rfc2745
Parsing rfc961
Parsing rfc6401
Parsing rfc6463
Parsing rfc2821
Parsing rfc3066
Parsing rfc3087
Parsing rfc3530
Parsing rfc3550
Parsing rfc3986
Parsing rfc4234
Parsing std64
Parsing std66
Parsing rfc4722
Parsing rfc5022
Parsing rfc5616
Parsing rfc5707
Parsing draft-haluska-sipping-directory-assistance
Parsing rfc4313
Parsing rfc4458
Parsing rfc5411
Parsing rfc5552
Parsing rfc5567
Parsing rfc5627
Parsing rfc5850
Parsing rfc6086
Parsing rfc6231
Parsing rfc6917
Parsing rfc7044
Parsing rfc7131
Parsing rfc6501
Parsing rfc6504
Parsing rfc8331
Parsing bcp78
Parsing rfc2119
Parsing rfc2460
Parsing rfc3420
Parsing std5
Parsing std7
Parsing draft-ietf-tls-ol

Parsing rfc2119
Parsing rfc5989
Parsing rfc6910
Parsing rfc7082
Parsing rfc5411

Parsing rfc5027
Parsing rfc5245
Parsing rfc5898Parsing rfc6914
Parsing rfc8262
Parsing bcp119
Parsing bcp78

Parsing rfc8839
Parsing rfc5411
Parsing rfc5945
Parsing bcp78
Parsing rfc2119
Parsing rfc2119
Parsing rfc2486
Parsing rfc2828
Parsing rfc2833
Parsing rfc2916
Parsing rfc3312
Parsing rfc7315
Parsing rfc5245
Parsing bcp78
Parsing rfc4092
Parsing rfc4092Parsing rfc2392
Parsing rfc3420
Parsing rfc4579
Parsing rfc5364
Parsing rfc5411
Parsing rfc8262
Parsing bcp78
Parsing rfc3238
Parsing rfc3850
Parsing rfc4566
Parsing rfc5246
Parsing rfc5370
Parsing rfc5370
Parsing rfc5850
Parsing rfc7092
Parsing rfc7245
Parsing bcp78
Parsing rfc2119

Parsing rfc5245
Parsing rfc4215
Parsing rfc3850
Parsing rfc4566
Parsing rfc5246
Parsing rfc5360
Parsing rfc5207Parsing rfc5366
Parsing rfc8849
Parsing rfc7092
Parsing rfc6503
Parsing rfc8856

Parsing rfc2119
Parsing rfc5234
Parsing std68
Parsing rfc5411
Parsing rfc5850
Pars

Parsing rfc7876
Parsing rfc7876
Parsing rfc4738
Parsing rfc4771
Parsing rfc1848
Parsing rfc1869
Parsing rfc1870
Parsing rfc5064
Parsing rfc5321
Parsing rfc5451
Parsing rfc6056
Parsing rfc6128
Parsing rfc7200
Parsing rfc7339
Parsing bcp78
Parsing rfc3428
Parsing rfc5360
Parsing rfc5363
Parsing bcp67
Parsing bcp78
Parsing rfc3588
Parsing rfc5234
Parsing rfc5457Parsing rfc4234

Parsing rfc5002
Parsing rfc8119
Parsing bcp78
Parsing rfc4234
Parsing rfc5411
Parsing rfc5627
Parsing rfc5850
Parsing rfc6044
Parsing rfc7044
Parsing rfc7131
Parsing rfc7544
Parsing rfc8119
Parsing rfc8224
Parsing bcp78
Parsing rfc2119
Parsing rfc2585
Parsing rfc2818
Parsing rfc3280
Parsing rfc3370
Parsing rfc3548
Parsing rfc3761
Parsing rfc4234
Parsing rfc4475
Parsing draft-haluska-sipping-directory-assistance
Parsing rfc4662
Parsing rfc4975
Parsing rfc5079
Parsing rfc5361
Parsing rfc5370
Parsing rfc5373
Parsing rfc5638
Parsing rfc5853
Parsing rfc5898
Parsing rfc6072
Parsing rfc6080
Parsing rfc6216
Parsing rfc6881

Parsing rfc7913
Parsing rfc7976
Parsing rfc8496
Parsing rfc6390
Parsing rfc8842
Parsing rfc2119
Parsing rfc6443
Parsing rfc6466
Parsing rfc2810
Parsing rfc6505
Parsing rfc6569
Parsing rfc8876
Parsing bcp78
Parsing rfc2119
Parsing rfc2141
Parsing rfc3308
Parsing draft-ietf-opsec-v6
Parsing rfc1903
Parsing rfc8899
Parsing rfc5761
Parsing rfc6051
Parsing rfc7160
Parsing rfc8899
Parsing rfc2026
Parsing rfc2104
Parsing rfc2119
Parsing rfc2196
Parsing rfc8861
Parsing rfc8862
Parsing rfc8864
Parsing rfc8865
Parsing rfc6714
Parsing bcp26
Parsing rfc2119
Parsing rfc2132
Parsing rfc2818
Parsing rfc3361
Parsing rfc4122
Parsing rfc4510
Parsing rfc1165
Parsing rfc7239Parsing rfc6354
Parsing rfc7240

Parsing rfc1165
Parsing rfc8269
Parsing rfc5770
Parsing rfc6659
Parsing rfc3320
Parsing rfc3485
Parsing rfc4077
Parsing rfc4122
Parsing rfc4234
Parsing rfc4346
Parsing rfc4896
Parsing draft-ietf-tls-oldversions-deprecate
Parsing rfc5411
Parsing bcp78
Parsing rfc2119
Parsing rfc5057
Parsing rfc8599
Parsi


Parsing rfc3994
Parsing draft-kunze-coinrg-transport-issues
Parsing rfc1914
Parsing draft-kunze-coinrg-transport-issues
Parsing rfc5371
Parsing rfc5391
Parsing rfc5404
Parsing rfc5450
Parsing rfc7983
Parsing rfc4383
Parsing rfc4563
Parsing rfc4650
Parsing rfc4738
Parsing rfc5374
Parsing rfc6071
Parsing rfc6407
Parsing rfc6849
Parsing rfc6881
Parsing rfc6884
Parsing rfc6904
Parsing rfc8594
Parsing rfc8601
Parsing draft-mcquistin-augmented-udp-example
Parsing rfc1283
Parsing rfc5438
Parsing rfc5547
Parsing rfc6914
Parsing rfc7572
Parsing bcp26
Parsing bcp78
Parsing bcp9
Parsing rfc1036
Parsing rfc1496
Parsing rfc1505
Parsing rfc1737
Parsing rfc7989
Parsing rfc8088
Parsing rfc8404
Parsing rfc1864
Parsing rfc1945
Parsing rfc2026
Parsing rfc2045
Parsing rfc2141
Parsing rfc1283
Parsing rfc2227
Parsing rfc2231
Parsing rfc2369
Parsing rfc2434
Parsing rfc2518
Parsing rfc2557
Parsing rfc2821
Parsing rfc2912
Parsing rfc2919
Parsing rfc2965
Parsing rfc3282
Parsing rfc3798
Parsing rfc3801
Parsing 

Parsing rfc3252
Parsing rfc2122
Parsing rfc7310
Parsing rfc7509
Parsing rfc7838
Parsing rfc7201
Parsing rfc7295
Parsing rfc7362
Parsing rfc7376
Parsing rfc2125
Parsing rfc7604
Parsing rfc7605
Parsing rfc7657
Parsing rfc7826
Parsing rfc7890
Parsing rfc8085
Parsing rfc8261
Parsing rfc7587
Parsing rfc7655
Parsing rfc7657
Parsing rfc7667
Parsing rfc7714
Parsing rfc7728
Parsing rfc7741
Parsing rfc7912
Parsing rfc2132
Parsing rfc2136
Parsing rfc7798
Parsing rfc7826
Parsing rfc7866
Parsing rfc8083
Parsing rfc8269
Parsing rfc8286
Parsing rfc8451
Parsing rfc7912
Parsing rfc8460
Parsing rfc8445
Parsing rfc8825
Parsing rfc8829
Parsing rfc8833
Parsing rfc8834
Parsing rfc8825
Parsing rfc8830
Parsing rfc2138
Parsing rfc2155
Parsing rfc8460
Parsing rfc4021
Parsing rfc4229
Parsing rfc7681
Parsing rfc7681
Parsing bcp78
Parsing rfc2156
Parsing rfc2165
Parsing rfc2184
Parsing rfc2198
Parsing rfc2205
Parsing rfc2206
Parsing rfc8836
Parsing rfc8839
Parsing rfc8841
Parsing rfc2207
Parsing rfc8843
Parsing rf

ValueError: not enough values to unpack (expected 2, got 1)

In [153]:
len(docs)

3194

In [154]:
len(parsed)

3360

In [155]:
len(to_parse)

81800

In [162]:
len(set(to_parse))

10798

In [143]:
set(parsed).intersection(set(to_parse))

{'bcp14',
 'bcp76',
 'bcp85',
 'bcp98',
 'bcp99',
 'rfc1123',
 'rfc1321',
 'rfc1750',
 'rfc1889',
 'rfc2046',
 'rfc2069',
 'rfc2076',
 'rfc2183',
 'rfc2246',
 'rfc2279',
 'rfc2326',
 'rfc2327',
 'rfc2368',
 'rfc2396',
 'rfc2401',
 'rfc2426',
 'rfc2543',
 'rfc2616',
 'rfc2617',
 'rfc2630',
 'rfc2633',
 'rfc2806',
 'rfc2822',
 'rfc2849',
 'rfc2914',
 'rfc2960',
 'rfc2976',
 'rfc3015',
 'rfc3204',
 'rfc3262',
 'rfc3263',
 'rfc3264',
 'rfc3268',
 'rfc3310',
 'rfc3311',
 'rfc3313',
 'rfc3319',
 'rfc3323',
 'rfc3325',
 'rfc3327',
 'rfc3329',
 'rfc3372',
 'rfc3388',
 'rfc3398',
 'rfc3427',
 'rfc3455',
 'rfc3459',
 'rfc3487',
 'rfc3515',
 'rfc3521',
 'rfc3578',
 'rfc3581',
 'rfc3603',
 'rfc3608',
 'rfc3665',
 'rfc3666',
 'rfc3680',
 'rfc3725',
 'rfc3764',
 'rfc3824',
 'rfc3840',
 'rfc3841',
 'rfc3842',
 'rfc3853',
 'rfc3856',
 'rfc3857',
 'rfc3858',
 'rfc3891',
 'rfc3892',
 'rfc3893',
 'rfc3903',
 'rfc3910',
 'rfc3911',
 'rfc3944',
 'rfc3959',
 'rfc3960',
 'rfc3966',
 'rfc3968',
 'rfc3969',
 '

In [163]:
with open("data.json", "w") as f:
    json.dump(docs, f)
with open("parsed.txt", "w") as f:
    f.write("\n".join(parsed))
with open("to_parse.txt", "w") as f:
    f.write("\n".join(to_parse))

In [168]:
len(set([x["label"] for x in docs]))

3031

In [169]:
added = []
removed_duplicates = []
for x in docs:
    if x["label"] not in added:
        added.append(x["label"])
        removed_duplicates.append(x)

In [170]:
len(removed_duplicates)

3031

In [171]:
with open("data.json", "w") as f:
    json.dump(removed_duplicates, f)
with open("parsed.txt", "w") as f:
    f.write("\n".join(parsed))
with open("to_parse.txt", "w") as f:
    f.write("\n".join(to_parse))

In [176]:
removed_duplicates[3]

{'title': "Actions Addressing Identified Issues with the Session Initiation Protocol's (SIP) Non-INVITE Transaction",
 'text': 'https://www.rfc-editor.org/rfc/rfc4320.txt',
 'pdf': 'https://www.rfc-editor.org/rfc/pdfrfc/rfc4320.txt.pdf',
 'html': 'https://tools.ietf.org/html/rfc4320',
 'label': 'RFC 4320',
 'aka': [],
 'last_updated': '2015-10-14',
 'updates': [{'uri': '/doc/rfc3261/', 'name': 'RFC 3261'}],
 'updated_by': [],
 'obsoletes': [],
 'obsoleted_by': [],
 'authors': [{'email': '/person/rjsparks@nostrum.com',
   'name': 'Robert Sparks'}],
 'references': [{'name': 'BCP 78',
   'status': 'Best Current Practice',
   'type': 'normatively references',
   'downref': ''},
  {'name': 'RFC 3261',
   'status': 'Proposed Standard',
   'type': 'normatively references',
   'downref': ''},
  {'name': 'RFC 3263',
   'status': 'Proposed Standard',
   'type': 'normatively references',
   'downref': ''},
  {'name': 'RFC 4321',
   'status': 'Informational',
   'type': 'normatively references',
 