# RFC Index Dataframe
Create a clean dataframe containing information about all RFCs, parsed from the index page at https://tools.ietf.org/rfc/index.

In [30]:
import os
import re
import pickle

import wget
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm



def concat_rfc_lines(lines):
    """
    Given a list of lines where a same RFC is described on multiple lines, concat
    the lines describing the same RFC.

    A line starting with 'RFC' opens a new entry; every other line is appended
    (without any separator) to the entry currently being built.

    When at least one 'RFC' line is present, the first element of the returned
    list is the text found before the first 'RFC' line (an empty string if the
    input starts with an 'RFC' line) and each following element is one complete
    RFC entry. When no line starts with 'RFC', an empty list is returned.
    """
    rfc_lines = []
    current_rfc = ''
    for line in lines:
        if line.startswith('RFC'):
            rfc_lines.append(current_rfc)  # End of previous RFC, append it to list.
            current_rfc = line  # Get beginning of new rfc.
        else:
            current_rfc += line
    # Flush the entry still being built when the input ends; without this the
    # last RFC of the index would be lost. Only done once an 'RFC' line was seen.
    if rfc_lines:
        rfc_lines.append(current_rfc)
    return rfc_lines


def remove_multiple_spaces(text):
    """
    Given a string, replace every run of two or more whitespace characters
    (spaces, tabs, newlines, ...) by a single space, then remove the whitespace
    at the beginning and at the end of the string.

    A single whitespace character inside the string is left untouched.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()
    

def get_rfc_lines(page, errors_filepath):
    """
    Given the result of an url request, get the text of interest.

    Args:
        page: response object whose `content` attribute holds the HTML of the
            RFC index page.
        errors_filepath: path to a pickle file containing the numbers (ints) of
            the RFCs to leave out, i.e. those reported as 404 during download.

    Returns:
        A list of strings, one per kept RFC, each holding the whole index entry
        on a single line with whitespace runs collapsed.

    Raises:
        IndexError: if the page has no <pre> element or no 'RFC1 ' marker.
    """
    # Load the page with BeautifulSoup.
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get the plain text of interest (the index is in the first <pre>...</pre>).
    content = soup('pre')[0].get_text()

    # Remove all text before the first 'RFC1 ' (beginning of the index).
    # maxsplit=1 keeps everything after that first marker, even if the same
    # text shows up again further down the page.
    content = 'RFC1 ' + content.split('RFC1 ', 1)[1]

    # Split raw text to lines and remove the empty ones.
    lines = [line for line in content.splitlines() if line != '']

    # Concat lines describing the same RFC; the first element returned by
    # concat_rfc_lines is the text before the first RFC, so it is dropped.
    rfc_lines = concat_rfc_lines(lines)
    rfc_lines = rfc_lines[1:]

    # Remove multiple spaces in lines.
    rfc_lines = [remove_multiple_spaces(line) for line in rfc_lines]

    # Remove all 'Not Issued' RFC lines.
    rfc_lines = [line for line in rfc_lines if 'Not Issued' not in line]

    # Remove RFC that were not downloaded (404 error).
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # errors_filepath at a file produced by a trusted process.
    with open(errors_filepath, "rb") as f:
        errors = pickle.load(f)
    missing_numbers = set(errors)  # Built once instead of once per line.
    rfc_lines = [line for line in rfc_lines
                 if int(line.split(' ', 1)[0].split('RFC')[1]) not in missing_numbers]
    print("  The following RFC's were not found (404) during download so are removed here: {}".format(str(errors)))

    return rfc_lines


# Columns of the resulting dataframe, in order. 'Ttile', 'Obsolotes' and
# 'Obsoloted_by' are misspelled, but they are kept unchanged because they are
# the column names exposed by the dataframe and written to the CSV file.
_RFC_COLUMNS = ('Name', 'Ttile', 'Authors', 'Date', 'Formats', 'Obsolotes',
                'Obsoloted_by', 'Updates', 'Updated_by', 'Also_FYI', 'Status', 'DOI')

# Leading text of each bracketed attribute -> column storing the rest of it.
_RFC_BRACKET_PREFIXES = (
    ('Format: ', 'Formats'),
    ('Obsoletes ', 'Obsolotes'),
    ('Obsoleted by ', 'Obsoloted_by'),
    ('Updates ', 'Updates'),
    ('Updated by ', 'Updated_by'),
    ('Also ', 'Also_FYI'),
    ('Status: ', 'Status'),
    ('DOI: ', 'DOI'),
)


def _starts_author_list(segment):
    """
    Tell whether a dot-separated segment marks the end of the title: it is
    either whitespace only, or a space followed by a single uppercase letter
    (an author initial such as ' S').
    """
    if segment.isspace():
        return True
    return len(segment) == 2 and segment[0].isspace() and segment[1].isupper()


def _parse_rfc_line(line):
    """
    Parse one RFC index line into a dict keyed by the names in _RFC_COLUMNS.
    Attributes that are absent from the line are left to None.
    """
    record = dict.fromkeys(_RFC_COLUMNS)

    # Get all attributes within brackets and store the recognised ones.
    for att in re.findall(r"\((.*?)\)", line):
        for prefix, column in _RFC_BRACKET_PREFIXES:
            if att.startswith(prefix):
                record[column] = att[len(prefix):]
                break
    line = line.split('(Format')[0].rstrip()  # Remove bracket attributes from the line.

    # Get the date of publication: the last non-empty dot-separated segment.
    segments = [segment for segment in line.split('.') if segment != '']
    date = segments[-1].lstrip()
    # Remove only the trailing occurrence, so that the same text appearing
    # earlier in the line (e.g. inside the title) is left alone.
    head, found, tail = line.rpartition(date + '.')
    if found:
        line = head + tail

    # Get name of RFC and remove it from the start of the line only; the same
    # characters may legitimately appear again (e.g. 'RFC1' within 'RFC1122').
    name = line.split()[0]
    line = line.replace(name, '', 1)

    # Get title of RFC. A title may itself contain dots, so segments are added
    # to it until one that looks like the start of the author list is found.
    segments = line.split('.')
    title = segments.pop(0)
    while segments and not _starts_author_list(segments[0]):
        title += '.' + segments.pop(0)

    # Get authors: whatever is left after the title, without the final dot.
    authors = '.'.join(segments).strip()
    if authors.endswith('.'):
        authors = authors[:-1]

    record['Name'] = name
    record['Ttile'] = title.strip()
    record['Authors'] = authors
    record['Date'] = date
    return record


def create_dataframe(rfc_lines):
    """
    Given the lines describing each RFC, create a dataframe.

    Each line is parsed into: the RFC name (first word), its title, its authors,
    its publication date (last dot-terminated field before the bracketed
    attributes) and the bracketed attributes 'Format: ', 'Obsoletes ',
    'Obsoleted by ', 'Updates ', 'Updated by ', 'Also ', 'Status: ' and 'DOI: '.

    Returns a dataframe with one row per line and the columns listed in
    _RFC_COLUMNS; a bracketed attribute missing from a line is stored as None.
    """
    # Init one list per column.
    columns = {column: [] for column in _RFC_COLUMNS}

    # Process each line (tqdm displays a progress bar).
    for line in tqdm(rfc_lines):
        record = _parse_rfc_line(line)
        for column in _RFC_COLUMNS:
            columns[column].append(record[column])

    # Create dataframe.
    df = pd.DataFrame(columns)
    return df
        

def main(url, errors_filepath, outdir):
    """
    Download the RFC index page, turn it into a dataframe and save it as CSV.

    Args:
        url: address of the RFC index page to download.
        errors_filepath: path to the pickle file listing the RFC numbers to
            leave out; passed as is to get_rfc_lines.
        outdir: path of the CSV file to write (despite its name, this is used
            as a file path, not as a directory).

    Returns:
        The dataframe that was written to `outdir`.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    print("\nDownload the index page at {}...".format(url))
    # A timeout avoids hanging forever on an unresponsive server.
    page = requests.get(url, timeout=60)
    # Fail here rather than trying to parse an error page as the index.
    page.raise_for_status()

    print("\nExtract all RFC lines...")
    rfc_lines = get_rfc_lines(page, errors_filepath)

    print("\nProcess lines and create dataframe...")
    df = create_dataframe(rfc_lines)

    print("\nSave dataframe to {}...".format(outdir))
    df.to_csv(outdir, sep=',', encoding='utf-8', float_format='%.10f', decimal='.')
    print("\nDone.")
    return df

In [37]:
# Build the RFC index dataframe from the online index and save it as CSV.
run_args = {
    'url': 'https://tools.ietf.org/rfc/index',
    'errors_filepath': '/raid/antoloui/Master-thesis/_data/search/rfc/errors404',
    'outdir': '/raid/antoloui/Master-thesis/_data/search/rfc/info.csv',
}
df = main(**run_args)
df  # Last expression of the cell: displays the dataframe in the notebook.


Download the index page at https://tools.ietf.org/rfc/index...

Extract all RFC lines...
  The following RFC's were not found (404) during download so are removed here: [8, 9, 51, 418, 530, 598, 3333, 3350, 3399, 3699, 3799, 3800, 3899, 3900, 3907, 3908, 3999, 4000, 4099, 4100, 4199, 4200, 4232, 4299, 4300, 4399, 4400, 4499, 4500, 4599, 4600, 4658, 4699, 4700, 4751, 4799, 4800, 4899, 4900, 4921, 4922, 4989, 4999, 5099, 5100, 5108, 5199, 5200, 5299, 5300, 5312, 5313, 5314, 5315, 5319, 5399, 5400, 5499, 5500, 5599, 5600, 5699, 5700, 5799, 5800, 5809, 5821, 5822, 5823, 5899, 5900, 5999, 6000, 6099, 6100, 6102, 6103, 6199, 6200, 6299, 6300, 6399, 6400, 6499, 6500, 6523, 6524, 6599, 6600, 6634, 6699, 6700, 6799, 6800, 6899, 6900, 6966, 6995, 6999, 7000, 7099, 7327, 7907, 8523, 8524, 8535, 8566, 8626, 8644, 8646, 8647, 8648, 8723, 8724, 8726, 8727, 8744, 8753, 8758, 8761, 8763, 8764, 8765, 8766, 8772]

Process lines and create dataframe...


HBox(children=(FloatProgress(value=0.0, max=8569.0), HTML(value='')))



Save dataframe to /raid/antoloui/Master-thesis/_data/search/rfc/index.csv...

Done.


Unnamed: 0,Name,Ttile,Authors,Date,Formats,Obsolotes,Obsoloted_by,Updates,Updated_by,Also_FYI,Status,DOI
0,RFC1,Host Software,S. Crocker,April 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0001
1,RFC2,Host software,B. Duvall,April 1969,"TXT, PDF, HTML",,,,,,UNKNOWN,10.17487/RFC0002
2,RFC3,Documentation conventions,S.D. Crocker,April 1969,"TXT, HTML",,RFC0010,,,,UNKNOWN,10.17487/RFC0003
3,RFC4,Network timetable,E.B. Shapiro,March 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0004
4,RFC5,Decode Encode Language (DEL),J. Rulifson,June 1969,"TXT, HTML",,,,,,UNKNOWN,10.17487/RFC0005
...,...,...,...,...,...,...,...,...,...,...,...,...
8564,RFC8768,Constrained Application Protocol (CoAP) Hop-L...,"M. Boucadair, T. Reddy.K, J. Shallow",March 2020,"HTML, TXT, PDF, XML",,,,,,PROPOSED STANDARD,10.17487/RFC8768
8565,RFC8769,Cryptographic Message Syntax (CMS) Content Ty...,J. Schaad,March 2020,"HTML, TXT, PDF, XML",,,,,,INFORMATIONAL,10.17487/RFC8769
8566,RFC8770,Host Router Support for OSPFv2,"K. Patel, P. Pillay-Esnault, M. Bhardwaj, S. B...",April 2020,"HTML, TXT, PDF, XML",,,RFC6987,,,PROPOSED STANDARD,10.17487/RFC8770
8567,RFC8771,The Internationalized Deliberately Unreadable...,"A. Mayrhofer, J. Hague",1 April 2020,"HTML, TXT, PDF, XML",,,,,,EXPERIMENTAL,10.17487/RFC8771
