
Commit

added/cleaned up docs in fec scrape tutorial
Serdar Tumgoren committed Jan 25, 2012
1 parent d1230cf commit b2ac9e6
46 changes: 29 additions & 17 deletions tutorials/webscraping101/fec_efiles_scrape.py
@@ -9,29 +9,34 @@
http://fec.gov/finance/disclosure/efile_search.shtml
HELPFUL LINKS:
* http://www.crummy.com/software/BeautifulSoup/documentation.html
* http://docs.python-requests.org/en/latest/user/quickstart/
* http://en.wikipedia.org/wiki/List_of_HTTP_status_codes
"""
#TODO: add documentation links for language features and libs
import sys

import requests
from BeautifulSoup import BeautifulSoup

# Build a dictionary containing our form field values
# http://docs.python.org/tutorial/datastructures.html#dictionaries
form_data = {
'name':'Romney', # committee name field
'type':'P', # committee type is P for Presidential
'frmtype':'F3P', # form type
}

# Make the POST request with the form dictionary. This should
# return a response object containing the status of the request -- i.e.,
# whether or not it was successful -- and raw HTML for the returned page.
response = requests.post('http://query.nictusa.com/cgi-bin/dcdev/forms/', data=form_data)

# If the request was successful, then process the HTML
if response.status_code == 200:

    # The raw HTML is stored in the response object's "text" attribute
    soup = BeautifulSoup(response.text)
    links = soup.findAll('a')

@@ -41,19 +46,26 @@
        if link.text == 'Download':
            download_links.append(link)

    """
    NOTE: We could replace the 4 lines of code above with the single line below:

    #download_links = soup.findAll('a', href=lambda path: path.startswith('/cgi-bin/dcdev/forms/DL/'))

    This one-liner leverages one of BeautifulSoup's more advanced features -- specifically, the
    ability to filter the "findAll" method's results by applying regular expressions or
    lambda functions. Above, we used a lambda function to filter for links with "href"
    attributes starting with a certain URL path.

    To learn more:

    * http://www.crummy.com/software/BeautifulSoup/documentation.html
    * http://stackoverflow.com/questions/890128/python-lambda-why
    * http://docs.python.org/howto/regex.html
    """

    #TODO: download the newest filing
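
    # One possible approach to the TODO above -- a sketch, which assumes
    # (unverified) that the first Download link in the results corresponds
    # to the newest filing and that its "href" is a path on query.nictusa.com:
    #
    #if download_links:
    #    newest = download_links[0]
    #    filing = requests.get('http://query.nictusa.com' + newest['href'])
    #    with open('newest_filing.txt', 'w') as outfile:
    #        outfile.write(filing.text.encode('utf-8'))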

else:
    # Gracefully exit the program if response code is not 200
