### This code extracts documents and their related comments for documents in different categories. There are 10 categories on regulations.gov.

In [1]:
import os
import urllib.request
from pickle import dump, load

import pandas
import PyPDF2
import requests
from bs4 import BeautifulSoup
# SECURITY: an API key should not be committed to source control. Supply it via
# the REGULATIONS_GOV_API_KEY environment variable. The literal below is kept
# only as a fallback so existing runs keep working; it should be rotated and
# removed once the environment variable is in place.
api_key = os.environ.get('REGULATIONS_GOV_API_KEY', 'vT4R3vZ8RpZhnCpgeCPx1LdWRSZS8yxHHGquPrxm')

In [2]:
def read_file_get_docid(filepath):
    """Load a pickled document list and keep only documents with comments.

    Args:
        filepath: Path to a pickle dump of a DataFrame that has the columns
            ``numberOfCommentsReceived``, ``documentId`` and ``documentType``.

    Returns:
        A two-element list ``[doc_id, doc_types]`` where ``doc_id`` is the
        ``documentId`` Series of rows with at least one comment and
        ``doc_types`` is the set of ``documentType`` values of those rows.
    """
    # NOTE: unpickling can execute arbitrary code; only load dump files that
    # were produced by a trusted source.
    with open(filepath, 'rb') as dump_file:  # closed even if load() fails
        dump_df = load(dump_file)
    df_with_comments = dump_df[dump_df.numberOfCommentsReceived > 0]
    doc_id = df_with_comments.documentId
    doc_type = df_with_comments.documentType
    return [doc_id, set(doc_type)]

In [3]:
# Pick the category dump file to process and load its commented documents.
doc_id_list, types = read_file_get_docid('data/BFS_doc_list')
print(types)
# An ID made of four dash-separated parts (three dashes) is a document;
# an ID made of three parts is a docket, which is skipped here.
doc_ids = [candidate for candidate in doc_id_list if candidate.count('-') == 3]
len(doc_ids)

{'Other', 'Rule', 'Notice', 'Proposed Rule'}


761

### Using regulations.gov API
* We need to use the API to retrieve each document's content. The API is queried with the document_id values that we extracted from the file above.
* For each document_id, we will need to construct the comment_id values based on the total number of comments on it.

In [48]:
def download_file(download_url, file_path="document.pdf"):
    """Download the resource at ``download_url`` and save it to disk.

    Args:
        download_url: URL to fetch with ``urllib.request.urlopen``.
        file_path: Destination path; defaults to ``"document.pdf"`` in the
            current working directory. An existing file is overwritten.
    """
    # Context managers release the connection and the file handle even when
    # the read or the write raises.
    with urllib.request.urlopen(download_url) as response:
        payload = response.read()
    with open(file_path, 'wb') as file:
        file.write(payload)
    
def get_attached_comments(comment_id, key=api_key):
    """Return the text extracted from a comment's first attachment.

    The document record for ``comment_id`` is fetched from the API, the first
    file format of its first attachment is downloaded to ``document.pdf`` and
    the file is read page by page with PyPDF2.

    Args:
        comment_id: Document ID of the comment whose attachment is wanted.
        key: API key appended to both the metadata and the download request.

    Returns:
        The concatenated text of all pages that could be read. If the PDF
        cannot be parsed, a "cant read" message is printed and whatever text
        was extracted before the failure (possibly ``""``) is returned.

    Raises:
        KeyError, IndexError: if the response has no attachment or the
            attachment has no file format.
    """
    # Ask the API for the document record, which carries the attachment URLs.
    url = "http://api.data.gov:80/regulations/v3/document.json?api_key="+key+"&documentId="+comment_id
    response = requests.get(url)
    if response.status_code != 200:
        print(response.status_code)
    data = response.json()
    # Assumes there is only one attachment and that its first file format is
    # the PDF link; any further attachments are ignored.
    link = data["attachments"][0]["fileFormats"][0]
    access_link = link+'&api_key='+key
    # Download the file (written to document.pdf) and read it page by page.
    download_file(access_link)
    page_texts = []
    with open('document.pdf', 'rb') as pdf_file:  # handle is always closed
        try:
            # NOTE(review): PdfFileReader/numPages/getPage/extractText are the
            # legacy PyPDF2 names - confirm the installed version still
            # provides them.
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            for page_number in range(pdf_reader.numPages):
                page_texts.append(pdf_reader.getPage(page_number).extractText())
        except Exception:
            # Best effort: report the unreadable file and keep what was
            # extracted so far. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            print("cant read "+comment_id)
    return "".join(page_texts)

In [49]:
def get_document_comments_from_api(docketId,key=api_key):
    """Fetch every comment record of a docket as a DataFrame.

    A counts-only request determines how many records exist; the records are
    then fetched page by page and concatenated.

    Args:
        docketId: Docket ID whose comments are listed.
        key: API key sent with every request.

    Returns:
        A ``pandas.DataFrame`` with one row per record returned by the API.
        It has no rows and no columns when the docket has no matching
        records.

    Raises:
        KeyError: if a response lacks ``totalNumRecords`` or ``documents``.
    """
    # The same filter must be used for counting and for paging; otherwise the
    # paging bound does not match the paged result set. ``dct=PS`` presumably
    # selects the "Public Submission" document type - confirm against the
    # API documentation.
    base_url = "http://api.data.gov:80/regulations/v3/documents.json?api_key="+key+"&dct=PS&dktid="+docketId
    response = requests.get(base_url+"&countsOnly=1")
    if response.status_code != 200:
        print(response.status_code)
    data = response.json()
    total = data['totalNumRecords']

    page_size = 500  # records requested per page (rpp)
    com_list = []
    for page_offset in range(0, total, page_size):
        page_url = base_url+"&countsOnly=0&rpp="+str(page_size)+"&po="+str(page_offset)
        response = requests.get(page_url)
        if response.status_code != 200:
            print(response.status_code)
        com_list += response.json()['documents']
    return pandas.DataFrame(com_list)

def get_document_content_from_api(docId,key=api_key):
    """Fetch a document's content together with the comments on its docket.

    Args:
        docId: Document ID; its first three dash-separated parts are used as
            the docket ID.
        key: API key used for every request made here, including the nested
            comment and attachment lookups.

    Returns:
        A dict with two entries: ``"text"`` holds the ``<pre>`` elements
        found in the document's HTML (the result of ``soup.find_all``), and
        ``"comment_list"`` holds the comment strings - inline comment texts
        first, then the text extracted from each comment attachment.

    Raises:
        KeyError, IndexError: if the document record has no second entry in
            ``fileFormats``.
    """
    url = "http://api.data.gov:80/regulations/v3/document.json?api_key="+key+"&documentId="+docId
    response = requests.get(url)
    if response.status_code != 200:
        print(response.status_code)
    data = response.json()

    # Get HTML for document content.
    # Assumes the second link is the document in HTML format.
    link = data['fileFormats'][1]
    access_link = link+'&api_key='+key

    with urllib.request.urlopen(access_link) as html_response:
        html = html_response.read()

    # We are interested in the pre tags of the HTML content.
    soup = BeautifulSoup(html, "lxml")
    content = soup.find_all('pre')

    # The docket ID is the first three parts of the document ID. Pass the
    # caller's key on, so a non-default key is honoured by the nested calls.
    docket_id = '-'.join(docId.split('-')[:3])
    comment_df = get_document_comments_from_api(docket_id, key)

    # A docket without comments yields a frame with no columns, so check
    # that each column exists before reading it.
    comment_list = []
    if 'commentText' in comment_df.columns:
        # Inline comment text, where present.
        comment_text = comment_df[comment_df.commentText.notnull()].commentText
        comment_list = comment_text.tolist()
    if 'attachmentCount' in comment_df.columns:
        # IDs of records with at least one attachment; fetch each one once.
        c_ids = comment_df[comment_df.attachmentCount > 0].documentId
        for each_id in c_ids.unique():
            comment_list.append(get_attached_comments(each_id, key))

    doc_dict = {
        "text":content,
        "comment_list":comment_list
    }
    return doc_dict

#### Running it on one document

In [50]:
# Try the pipeline on a single document: the first ID in doc_ids.
resp = get_document_content_from_api(doc_ids[0])
# Bare expression so the notebook displays the returned dictionary.
resp

cant read ASC-2016-0004-0011




cant read ASC-2016-0004-0046


{'comment_list': ['- I have added additional commentary to my original comment, with the full comments below:\n- How much discussion has there been regarding a flat fee option, rather than being based on the number of appraisers doing business with the AMC? Commentary and discussion have likened this ASC fee to the fee embedded in appraiser\'s state renewal fees; however, their is a significant difference: appraisers are not paying their ASC fee based on how much they work - it is a one-sized flat fee. While the impact of a per-appraiser fee could easily be absorbed by large AMCs, it could have significant negative impact on smaller, local, and regional AMCs that provide service to lenders, banks, and credit unions. Additionally, a flat one-sized fee, rather than a per-appraiser fee, would be more easily calculated, enforced, and collected, and would have less impact on AMCs.\n- Nevertheless, in the absence of an apparent consideration of a flat one-sized fee option, the option (third)

### next steps
1. Run it on, say, the top ten document IDs (ordered by number of attachments, descending) for each category.
2. Each output of the get_document_content_from_api function is a dictionary; create a data frame from all the documents and comments you extract (say, a data frame of 10 rows).
3. Dump the data frame to a file, like:

```python
output = open('LES_doc_content', 'wb')
dump(df, output, -1)
output.close()
```