# Summary
This notebook downloads comments attachments from the regulations.gov website. The attachments are converted to text and replace the data in the comment's "comment_body" column.

The data is then exported for use in other notebooks.

In [1]:
import json
import pandas
import urllib.request
import PyPDF2
import tempfile
import os
from subprocess import (PIPE, Popen)

In [2]:
# Import the cleaned comments
data = pandas.read_json('./data/comments_cleaned.json', orient='records', dtype='false')

# Get a subset of only documents containing attachments.
has_attachment = data.dropna(subset=['doc.attachment_download -href'])

print("There are", len(has_attachment), "comments with attachments")
display(has_attachment)

There are 93 comments with attachments


Unnamed: 0,doc.attachment_download,doc.attachment_download -href,doc.attachment_name,doc.category,doc.city,doc.comment_body,doc.country,doc.name,doc.state,doc.zip
8174,,https://www.regulations.gov/contentStreamer?do...,,Academic/Think Tank,Maple Grove,Education Liberty Watch is submitting the foll...,United States,Karen Effrem,MN,
8197,,https://www.regulations.gov/contentStreamer?do...,,National Coalition of Anti-Violence Programs,New York,"Dear Secretary DeVos,\n\nThe National Coalitio...",United States,Anonymous Anonymous,NY,
8204,,https://www.regulations.gov/contentStreamer?do...,,EveryChild Solutions,Cary,"Dear Ms. Malawer:\n\nI am a parent, an educati...",United States,Cynthia Daniels-Hall,NC,
8219,,https://www.regulations.gov/contentStreamer?do...,,YWCA USA,Washington,"September 20, 2017\n\nThe Honorable Betsy DeVo...",United States,Alejandra Castillo,DC,
8238,,https://www.regulations.gov/contentStreamer?do...,,,Individual,I am writing to urge the Department of Educati...,,Tiffany Hsiang,,
8251,,https://www.regulations.gov/contentStreamer?do...,,,Institution of Higher Education,Please accept these comments regarding the eva...,,Shannon Sheppard,,
8262,,https://www.regulations.gov/contentStreamer?do...,,,National Education Association,Enclosed are the comments of the National Educ...,Union,Donna Harris-Aikens,,
8271,,https://www.regulations.gov/contentStreamer?do...,,Individual,Zimmerman,My name is Elizabeth Marsh and I have a 7 year...,United States,Elizabeth Marsh,MN,
8294,,https://www.regulations.gov/contentStreamer?do...,,,Community Organization,"At Day One, we partner with youth to end datin...",,Day One Anonymous,,
8295,,https://www.regulations.gov/contentStreamer?do...,,National Advocacy Organization,Washington,These 60 constituents representing Louisiana j...,United States,Anne Hedgepeth,DC,


In [3]:
# Header information for downloading the PDF attachment.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
headers = {'User-Agent': user_agent}

In [4]:
# Download all attachments as .pdf or .docx
def download_attachments(row):
    global counter
    url = str(row['doc.attachment_download -href'])
    
    extension = '.pdf'
    if (url[-5:] == 'msw12'):
        extension = '.docx'
        
    name = str(counter) + extension
    
    try:
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request).read()        
        file = open(name, 'wb+')
        file.write(response)
        file.close()
        
    except:
        print("failed to download", str(counter))
    
    counter = counter + 1

In [5]:
# Counter for filenames in download_attachments
counter = 1

# Save files in data/attachments
os.chdir("./data/attachments")
has_attachment.apply(download_attachments, axis=1)
os.chdir("../..")

In [5]:
def get_attachment_text(url):
    global counter
    
    command = 'pdf2txt.py ' + str(counter) + '.pdf'
    if (url[-5:] == 'msw12'):
        command = 'pandoc -t latex ' + str(counter) + '.docx'
        
    counter = counter + 1
    
    try:
        text = Popen(command, stdout=PIPE, shell=True).stdout.read().decode("utf-8")
        print(counter)
        return text

    except:
        # Return no text if the attachment cannot be read.
        return ''

In [6]:
os.chdir("./data/attachments")

counter = 1
has_attachment['doc.comment_body'] = has_attachment['doc.attachment_download -href'].map(get_attachment_text)
    
os.chdir("../..")

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [31]:
# Merge back with main dataframe
data['doc.comment_body'].update(has_attachment['doc.comment_body'])

data.to_json('./data/comments_with_attachments.json', orient='records')