
added/cleaned up docs in fec scrape tutorial

1 parent d1230cf commit b2ac9e62e8a9d2ebbcd6807a2e548e5dda48bb3c Serdar Tumgoren committed Jan 25, 2012
Showing with 29 additions and 17 deletions.
+29 −17 tutorials/webscraping101/fec_efiles_scrape.py
@@ -9,29 +9,34 @@
http://fec.gov/finance/disclosure/efile_search.shtml
+HELPFUL LINKS:
+ * http://www.crummy.com/software/BeautifulSoup/documentation.html
+ * http://docs.python-requests.org/en/latest/user/quickstart/
+ * http://en.wikipedia.org/wiki/List_of_HTTP_status_codes
+
"""
-#TODO: add documentation links for language features and libs
import sys
import requests
from BeautifulSoup import BeautifulSoup
# Build a dictionary containing our form field values
+# http://docs.python.org/tutorial/datastructures.html#dictionaries
form_data = {
'name':'Romney', # committee name field
'type':'P', # committee type is P for Presidential
'frmtype':'F3P', # form type
}
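# A quick aside: requests serializes this dictionary into a standard
# application/x-www-form-urlencoded POST body. A minimal Python 2 sketch
# of the equivalent encoding (illustration only, not part of the scraper):
#
#   import urllib
#   print urllib.urlencode(form_data)
#   # e.g. name=Romney&type=P&frmtype=F3P (key order may vary)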
-# Make the POST request by passing in our form data. This should
-# return a response object that contains status codes for your request and the
-# raw HTML of the page.
+# Make the POST request with the form dictionary. This should
+# return a response object containing the status of the request (i.e.,
+# whether or not it was successful) and the raw HTML of the returned page.
response = requests.post('http://query.nictusa.com/cgi-bin/dcdev/forms/', data=form_data)
-# If the response is OK, then process the HTML
+# If the request was successful, then process the HTML
if response.status_code == 200:
- # the raw HTML is stored in the response object's "text" attribute
+ # The raw HTML is stored in the response object's "text" attribute
soup = BeautifulSoup(response.text)
links = soup.findAll('a')
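# Each item in "links" is a BeautifulSoup Tag object. A quick sketch of
# how to inspect one (assuming the page returned at least one link):
#
#   first = links[0]
#   print first.text         # the anchor text, e.g. 'Download'
#   print first.get('href')  # the link's URL path, or None if absent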
@@ -41,19 +46,26 @@
if link.text == 'Download':
download_links.append(link)
- #NOTE: You can tighten up the above code by leveraging BeautifulSoup's
- # more advanced features, which allow you to filter the results of the
- # "findAll" method by using regular expressions or lambda functions.
- #
- # Below, we use a lambda function to filter for links with "href"
- # attributes starting with a certain URL path:
+ """
+ NOTE: We could replace the 4 lines of code above with the single line below:
+
+ download_links = soup.findAll('a', href=lambda path: path and path.startswith('/cgi-bin/dcdev/forms/DL/'))
- #download_links = soup.findAll('a', href=lambda path: path.startswith('/cgi-bin/dcdev/forms/DL/'))
+ This one-liner leverages one of BeautifulSoup's more advanced features:
+ the ability to filter the "findAll" method's results by applying
+ regular expressions or lambda functions.
+
+ Above, we used a lambda function to filter for links with "href"
+ attributes starting with a certain URL path.
- # To learn more:
- # http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)
- # http://stackoverflow.com/questions/890128/python-lambda-why
- # http://docs.python.org/howto/regex.html
+ To learn more:
+
+ * http://www.crummy.com/software/BeautifulSoup/documentation.html
+ * http://stackoverflow.com/questions/890128/python-lambda-why
+ * http://docs.python.org/howto/regex.html
+ """
+
+ #TODO: download the newest filing
else:
# Gracefully exit the program if response code is not 200
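# A minimal sketch of the exit described above, assuming the script's
# "import sys" (the actual else body falls outside this diff's context):
#
#   print 'Request failed with status code: %s' % response.status_code
#   sys.exit(1)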
