### This code extracts documents and related comments from documents in different categories. There are 10 categories in regulation.gov

In [1]:
from pickle import dump, load
import pandas
import requests
import urllib.request
from bs4 import BeautifulSoup
import PyPDF2
api_key = 'vT4R3vZ8RpZhnCpgeCPx1LdWRSZS8yxHHGquPrxm'

In [2]:
def read_file_get_docid(filepath):
    dump_df = load(open(filepath,'rb'))
    df_with_comments = dump_df[dump_df.numberOfCommentsReceived > 0]
    df_with_comments =df_with_comments.sort(['numberOfCommentsReceived'], ascending=[False])
    doc_id = df_with_comments.documentId
    doc_type = df_with_comments.documentType
    return [doc_id,set(doc_type)]
    

In [3]:
[doc_id_list,types] = read_file_get_docid('data/AD_doc_list') #choose the category dump file
print(types)
# document ID with 4 parts represent documents. 3 parts represent dockets 
doc_ids = [doc_id for doc_id in doc_id_list if len(doc_id.split('-')) == 4]
len(doc_ids)

{'Proposed Rule', 'Other', 'Rule', 'Notice'}




4421

### Using regulations.gov API
* We need to use the API to retrieve each document content. This API will use document_id that we extracted from the file above.
* For each document_id, we will need to construct comment_id based on the total number of comments on it.

In [4]:
def download_file(download_url):
    try:
        response = urllib.request.urlopen(download_url)
        file = open("document.pdf", 'wb')
        file.write(response.read())
        file.close()
    except:
        print("(downloading the pdf exception)error log" +  download_url)
    
def get_attached_comments(comment_id, key=api_key):
    #print(each_id) # fro debugging
    #open the api to get file url
    url = "http://api.data.gov:80/regulations/v3/document.json?api_key="+key+"&documentId="+comment_id
    try:
        response = requests.get(url)
    except:
        print("(api opening of attached comment exception)error log" +  url)
    if response.status_code != 200:
        print("status code " +str(response.status_code)+" (get_attached_comments) program will break at this point which is ok because we dont need inconsistent data. Run again ")
    data = response.json()
    att_count = len(data["attachments"][0]["fileFormats"])
    comment_text =""
    for i in range(att_count):
        if data["attachments"][0]["fileFormats"][i].endswith("pdf"):
            link = data["attachments"][0]["fileFormats"][i] 
            access_link = link+'&api_key='+key
            #download file(pdf) and read pdf (page by page)
            download_file(access_link)
            pdfFileObj = open('document.pdf','rb')     #'rb' for read binary mode
            try:
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                pno = pdfReader.numPages
                for i in range(pno):
                    pageObj = pdfReader.getPage(i)          #'i' is the page number
                    comment_text += pageObj.extractText()
            except:
                print("(pdf exception)cant read "+comment_id ) # prints in case we are not able to read file
            break # execute the whole thing for 1st found pdf
    return comment_text

In [19]:
def get_document_comments_from_api(docketId,key=api_key):
    offset=0
    url = "http://api.data.gov:80/regulations/v3/documents.json?api_key="+key+"&countsOnly=1&dct=PS&dktid="+docketId
    try:
        response = requests.get(url)
    except:
        print("(api opening comment count)error log"+url) # prints in case we are not able to read file
    if response.status_code != 200:
        print("status code"+str(response.status_code) + " (get_document_comments_from_api) program will break at this point which is ok because we dont need inconsistent data. Run again")
    data = response.json()
    total = data['totalNumRecords']
    com_list =[]
    for i in range(0,total,500):
        url = "http://api.data.gov:80/regulations/v3/documents.json?api_key="+key+"&countsOnly=0&&rpp=500&po="+str(i)+"&dct=PS&dktid="+docketId
        try:
            response = requests.get(url)
        except:
            print("(api opening actual comments)error log"+url) # prints in case we are not able to read file
        #print("Offset:"+str(i)+" Code:"+str(response.status_code))
        if response.status_code != 200:
            print(response.status_code)
        data = response.json()
        com_list += data['documents']
    com_df = pandas.DataFrame(com_list)
    return com_df

def get_document_content_from_api(docId,key=api_key):
    url = "http://api.data.gov:80/regulations/v3/document.json?api_key="+key+"&documentId="+docId
    try:
        response = requests.get(url)
    except:
         print("(api opening doc exception) error log"+url)   
    if response.status_code != 200:
        print("status code "+str(response.status_code)+" (get_document_content_from_api) program will break at this point which is ok because we dont need inconsistent data. Run again")
    data = response.json()
    
    # Get HTML for document content
    link = data['fileFormats'][1] # The second link is the document in HTML format
    access_link = link+'&api_key='+key
    
    try:
        with urllib.request.urlopen(access_link) as response:
            html = response.read()
    except:
        print("doc file opening exception")
    
    # We are interested in the pre tag of the HTML content
    soup = BeautifulSoup(html, "lxml")
    content = soup.find_all('pre')
    
    # Now we need to construct comment_id from document_id
    docket_id = '-'.join(docId.split('-')[:3])
    comment_df = get_document_comments_from_api(docket_id)
    # get comment text where exists
    comment_list =[]
    if not comment_df.empty:
        if "commentText" in df:
            comment_text =comment_df[comment_df.commentText.notnull()].commentText
            comment_list =comment_text.tolist()
        #get doc id where there is attchment
        c_ids = comment_df[comment_df.attachmentCount>0].documentId
        # get comment for each id in list
        for each_id in c_ids.unique():
            comment_list.append(get_attached_comments(each_id))
    doc_dict = {
        "text":content,
        "comment_list":comment_list
    }
    return doc_dict

#### Running it on one document

In [20]:
doc_collection = []

In [16]:
## testing cell
df = get_document_comments_from_api("NHTSA-1996-1698")
##"commentText" in df

False

In [21]:
for i in range(30,40):
    print(str(i)+' Calling APIs for ',doc_ids[i])
    resp = get_document_content_from_api(doc_ids[i])
    doc_collection.append(resp)

30 Calling APIs for  NHTSA-1996-1698-0001




(pdf exception)cant read NHTSA-1996-1698-0192
(pdf exception)cant read NHTSA-1996-1698-0440
(pdf exception)cant read NHTSA-1996-1698-0193
(pdf exception)cant read NHTSA-1996-1698-0441
(pdf exception)cant read NHTSA-1996-1698-0194
(pdf exception)cant read NHTSA-1996-1698-0838
(pdf exception)cant read NHTSA-1996-1698-0442
(pdf exception)cant read NHTSA-1996-1698-0195
(pdf exception)cant read NHTSA-1996-1698-0078
(pdf exception)cant read NHTSA-1996-1698-0993
(pdf exception)cant read NHTSA-1996-1698-0839
(pdf exception)cant read NHTSA-1996-1698-0210
(pdf exception)cant read NHTSA-1996-1698-0459
(pdf exception)cant read NHTSA-1996-1698-0443
(pdf exception)cant read NHTSA-1996-1698-0196
(pdf exception)cant read NHTSA-1996-1698-0994
(pdf exception)cant read NHTSA-1996-1698-0014
(pdf exception)cant read NHTSA-1996-1698-0084
(pdf exception)cant read NHTSA-1996-1698-0046
(pdf exception)cant read NHTSA-1996-1698-0197
(pdf exception)cant read NHTSA-1996-1698-0752
(pdf exception)cant read NHTSA-199

KeyError: 'attachments'

### next steps
1. Run it on say top ten document id (based on number of attachment in desc order) for each category
2. Each doc output from the get_document_content_from_api file will be a dictionary, create a data frame from all the document and comments you extract (say a data frame of 10 row)
3. dump the data frame in a file like

```output = open('LES_doc_content', 'wb')
dump(df, output, -1)
output.close()```

In [None]:
collection_df = pandas.DataFrame(doc_collection)
output = open('BFS_doc_content', 'wb')
dump(collection_df, output, -1)
output.close()

In [16]:
df = get_document_comments_from_api('CDFI-2016-0001')

In [18]:
df.empty


True

In [7]:
read_file_get_docid('data/AD_doc_list')



Unnamed: 0,agencyAcronym,allowLateComment,attachmentCount,commentDueDate,commentStartDate,commentText,docketId,docketTitle,docketType,documentId,documentStatus,documentType,frNumber,numberOfCommentsReceived,openForComment,organization,postedDate,rin,submitterName,title
1447,EPA,False,0,2014-11-14T23:59:59-05:00,2014-04-21T00:00:00-04:00,,EPA-HQ-OW-2011-0880,"Definition of ""Waters of the United States"" Un...",Rulemaking,EPA-HQ-OW-2011-0880-0001,Posted,Proposed Rule,2014-07142,1128055,False,,2014-04-21T00:00:00-04:00,2040-AF30,,Clean Water Act; Definitions: Waters of the Un...
15231,FAA,True,0,2014-07-25T23:59:59-04:00,2014-06-25T00:00:00-04:00,,FAA-2014-0396,Interpretation of the Special Rule for Model A...,Nonrulemaking,FAA-2014-0396-0001,Posted,Rule,2014-14948,32348,True,,2014-06-25T00:00:00-04:00,,,Interpretation of the Special Rule for Model A...
18416,FMCSA,True,0,2011-03-04T23:59:59-05:00,2010-12-29T00:00:00-05:00,,FMCSA-2004-19608,Hours of Service of Drivers,Rulemaking,FMCSA-2004-19608-4095,Posted,Proposed Rule,2010-32251,23563,True,,2010-12-29T00:00:00-05:00,2126-AB14,,"Hours of Service of Drivers, Proposed Rule, 75..."
18091,FMCSA,True,0,2000-12-15T23:59:59-05:00,,,FMCSA-1997-2350,Notice of Proposed Rulemaking (NPRM) - Hours o...,Rulemaking,FMCSA-1997-2350-0001,Posted,Rule,,23371,True,,1996-11-15T00:00:00-05:00,2126-AA23,,Advanced Notice of Proposed Rulemaking - Hours...
17417,FHWA,True,0,2014-06-09T23:59:59-04:00,2014-03-11T00:00:00-04:00,,FHWA-2013-0020,National Performance Management Measures,Rulemaking,FHWA-2013-0020-0003,Posted,Proposed Rule,2014-05152,11196,True,,2014-03-11T00:00:00-04:00,Not Assigned,,National Performance Management Measures; High...
17436,FHWA,True,0,2016-08-20T23:59:59-04:00,2016-04-22T00:00:00-04:00,,FHWA-2013-0054,National Performance Management Measures; Asse...,Rulemaking,FHWA-2013-0054-0092,Posted,Proposed Rule,2016-08014,8884,True,,2016-04-22T00:00:00-04:00,2125-AF54,,National Performance Management Measures: Asse...
18382,FMCSA,True,0,2001-05-04T23:59:59-04:00,,,FMCSA-2000-7382,Notice of Proposed Rulemaking (NPRM); Request ...,Rulemaking,FMCSA-2000-7382-0012,Posted,Rule,,6948,True,,2001-05-04T00:00:00-04:00,Not Assigned,,Notice of Proposed Rulemaking (NPRM); Request ...
29242,TSA,True,0,2008-12-29T23:59:59-05:00,2008-10-30T00:00:00-04:00,,TSA-2008-0021,"Large Aircraft Security Program, Other Aircraf...",Rulemaking,TSA-2008-0021-0001,Posted,Rule,E8-23685,6695,True,,2008-10-30T00:00:00-04:00,1652-AA53,,"NPRM: Large Aircraft Security Program, Other ..."
22428,NHTSA,True,0,2001-09-28T23:59:59-04:00,,,NHTSA-2001-8885,"U.S. DOT/NHTSA - Glare from Headlamps, Federal...",Rulemaking,NHTSA-2001-8885-0106,Posted,Rule,,5788,True,,2001-09-28T00:00:00-04:00,2127-AH81,,U.S. DOT/NHTSA - Request for Comments
16578,FAA,False,0,2016-01-15T23:59:59-05:00,2015-12-16T00:00:00-05:00,,FAA-2015-7396,Registration and Marking Requirements for Smal...,Rulemaking,FAA-2015-7396-0001,Posted,Rule,2015-31750,5577,False,,2015-12-16T00:00:00-05:00,2120-AK82,,Registration and Marking Requirements for Smal...


In [15]:
doc_id_list[:5]

1447     EPA-HQ-OW-2011-0880-0001
15231          FAA-2014-0396-0001
18416       FMCSA-2004-19608-4095
18091        FMCSA-1997-2350-0001
17417         FHWA-2013-0020-0003
Name: documentId, dtype: object

In [18]:
doc_ids[:5]

['FAA-2014-0396-0001',
 'FMCSA-2004-19608-4095',
 'FMCSA-1997-2350-0001',
 'FHWA-2013-0020-0003',
 'FHWA-2013-0054-0092']