In [143]:
import os
import re

import wget
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm


def concat_rfc_lines(lines):
    """
    Given a list of lines where a same RFC is described on multiple lines, concat
    the lines describing the same RFC.
    """
    rfc_lines = []
    current_rfc = ''
    for line in lines:
        if line.startswith('RFC'):
            rfc_lines.append(current_rfc)  # End of previous RFC, append it to list.
            current_rfc = line  # Get beginning of new rfc.
        else:
            current_rfc += line
    return rfc_lines


def remove_multiple_spaces(text):
    """
    Given a string, replace all multiple spaces in it by a single space.
    """
    text = re.sub('\s{2,}', ' ', text)
    text = text.lstrip().rstrip()  # Remove whitespaces in beginning or end of string.
    return text
    

def get_rfc_lines(page):
    """
    Given the result of an url request, get the text of interest.
    """
    # Load the page with BeautifulSoup.
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # Get the text of interest (the index is in <pre>...</pre>).
    body = soup('pre')[0]
    
    # Get plain text.
    content = body.get_text() 
    
    # Remove all text before the line beginning by 'RFC1' (beginning of the index).
    content = content.split('RFC1 ')[1]
    content = 'RFC1 ' + content
    
    # Split raw text to lines.
    lines = content.splitlines()
    lines = [line for line in lines if line != ''] # remove empty lines.
    
    # Concat lines describing the same RFC.
    rfc_lines = concat_rfc_lines(lines)
    
    # Remove multiple spaces.
    rfc_lines = [remove_multiple_spaces(line) for line in rfc_lines]
    
    # Remove all 'Non Issued' RFCs.
    rfc_lines = [line for line in rfc_lines if 'Not Issued' not in line]
    
    return rfc_lines[1:]


def create_dataframe(rfc_lines):
    """
    Given the lines describing each RFC, create a dataframe.
    """
    # Init lists.
    names = []
    titles = []
    authors = []
    dates = []
    formats = []
    obsolotes = []
    obsoloted = []
    updates = []
    updated = []
    also = []
    status = []
    dois = []
    
    # Process each line.
    for i, line in enumerate(tqdm(rfc_lines)):
        
        # Get all attributes within brackets.
        brackets = re.findall(r"\((.*?)\)", line)

        # Get individual attributes.
        form = None
        obs = None
        obs_by = None
        up = None
        up_by = None
        al = None
        stat = None
        doi = None
        for att in brackets:
            if att.startswith('Format: '):
                form = att.split('Format: ')[1]
            elif att.startswith('Obsolotes '):
                obs = att.split('Obsolotes ')[1]
            elif att.startswith('Obsoleted by '):
                obs_by = att.split('Obsoleted by ')[1]
            elif att.startswith('Updates '):
                up = att.split('Updates ')[1]
            elif att.startswith('Updated by '):
                up_by = att.split('Updated by ')[1]
            elif att.startswith('Also '):
                al = att.split('Also ')[1]
            elif att.startswith('Status: '):
                stat = att.split('Status: ')[1]
            elif att.startswith('DOI: '):
                doi = att.split('DOI: ')[1]
        line = line.split('(Format')[0].rstrip()  # Remove bracket attributes from the line.

        # Get the date of publication.
        split_line = line.split(".")
        split_line = [l for l in split_line if l != '']
        date = split_line[-1].lstrip()
        line = line.replace(date + '.', '')  # Remove date from line.

        # Get name of RFC.
        name = line.split()[0]
        line = line.replace(name, '')  # Remove name from line.

        # Get title of RFC.
        title = line.split('.')[0].lstrip()
        line = line.replace(title + '.', '')  # Remove title from line.

        # Get authors.
        aut = line.lstrip().rstrip()[:-1]

        # Append all info to corresponding list.
        names.append(name)
        titles.append(title)
        authors.append(aut)
        dates.append(date)
        formats.append(form)
        obsolotes.append(obs)
        obsoloted.append(obs_by)
        updates.append(up)
        updated.append(up_by)
        also.append(al)
        status.append(stat)
        dois.append(doi)

    # Create dataframe.
    d = {'Name':names,
         'Ttile':titles,
         'Authors':authors,
         'Date':dates,
         'Formats':formats,
         'Obsolotes':obsolotes,
         'Obsoloted_by':obsoloted,
         'Updates':updates,
         'Updated_by':updated,
         'Also_FYI':also,
         'Status':status,
         'DOI':dois}
    df = pd.DataFrame(d)
    return df
        
    
    
def main(url):
    """
    """
    # Download the page.
    page = requests.get(url)
    
    # Get all RFC lines.
    rfc_lines = get_rfc_lines(page)
    
    # Create dataframe.
    df = create_dataframe(rfc_lines)
    return df

In [144]:
df = main(url='https://tools.ietf.org/rfc/index')

HBox(children=(FloatProgress(value=0.0, max=8574.0), HTML(value='')))




In [145]:
df

Unnamed: 0,Name,Ttile,Authors,Date,Formats,Obsolotes,Obsoloted_by,Updates,Updated_by,Also_FYI,Status,DOI
0,RFC1,Host Software,S. Crocker,April 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0001
1,RFC2,Host software,B. Duvall,April 1969,"TXT, PDF, HTML",,,,,,UNKNOWN,10.17487/RFC0002
2,RFC3,Documentation conventions,S.D. Crocker,April 1969,"TXT, HTML",,RFC0010,,,,UNKNOWN,10.17487/RFC0003
3,RFC4,Network timetable,E.B. Shapiro,March 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0004
4,RFC5,Decode Encode Language (DEL),J. Rulifson,June 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0005
...,...,...,...,...,...,...,...,...,...,...,...,...
8569,RFC8767,Serving Stale Data to Improve DNS Resiliency,"D. Lawrence, W. Kumari, P. Sood",March 2020,"HTML, TXT, PDF, XML",,,"RFC1034, RFC1035, RFC2181",,,PROPOSED STANDARD,10.17487/RFC8767
8570,RFC8768,Constrained Application Protocol (CoAP) Hop-Li...,"M. Boucadair, T. Reddy.K, J. Shallow",March 2020,"HTML, TXT, PDF, XML",,,,,,PROPOSED STANDARD,10.17487/RFC8768
8571,RFC8769,Cryptographic Message Syntax (CMS) Content Typ...,J. Schaad,March 2020,"HTML, TXT, PDF, XML",,,,,,INFORMATIONAL,10.17487/RFC8769
8572,RFC8771,The Internationalized Deliberately Unreadable ...,"A. Mayrhofer, J. Hague",1 April 2020,"HTML, TXT, PDF, XML",,,,,,EXPERIMENTAL,10.17487/RFC8771


In [6]:
total = 10 #8774
base_url = 'https://tools.ietf.org/rfc/rfc'
outdir = '/raid/antoloui/Master-thesis/_data/search/rfc'

# Get all urls.
urls = [base_url + str(i+1) + '.txt' for i in range(total)]

# Create output directory if not exists.
os.makedirs(outdir, exist_ok=True)

# Download all RFC files.
for url in urls:
    wget.download(url, outdir)

HTTPError: HTTP Error 404: Not Found