
partial work on fec efiling scrape tutorial that demonstrates POST request
1 parent 56c5bf1 commit d1230cf4265f05dcdb0dd034ee41d9b81b19b95a Serdar Tumgoren committed Jan 25, 2012
Showing with 61 additions and 0 deletions.
  1. +61 −0 tutorials/webscraping101/fec_efiles_scrape.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+"""
+The third scrape in our series demonstrates how to fetch data from
+a remote server by making a POST request.
+
+For this scrape, we'll request a list of campaign finance filing
+links from the Federal Election Commission. The search form for
+these electronic filings is found at the link below:
+
+ http://fec.gov/finance/disclosure/efile_search.shtml
+
+"""
+# Documentation links for the libraries used below:
+#   requests:      http://docs.python-requests.org/
+#   BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/documentation.html
+import sys
+
+import requests
+from BeautifulSoup import BeautifulSoup
+
+# Build a dictionary containing our form field values
+form_data = {
+    'name': 'Romney',    # committee name field
+    'type': 'P',         # committee type is P for Presidential
+    'frmtype': 'F3P',    # form type
+}
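+# (These field names match the "name" attributes of the inputs in the
+# search form's HTML, which you can find by viewing the page source at
+# the fec.gov link above.)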
+
+# Make the POST request by passing in our form data. This should return
+# a response object that contains the status code of our request and
+# the raw HTML of the page.
+response = requests.post('http://query.nictusa.com/cgi-bin/dcdev/forms/', data=form_data)
+
+# If the response is OK, then process the HTML
+if response.status_code == 200:
+
+    # the raw HTML is stored in the response object's "text" attribute
+    soup = BeautifulSoup(response.text)
+    links = soup.findAll('a')
+
+    # Extract the download links
+    download_links = []
+    for link in links:
+        if link.text == 'Download':
+            download_links.append(link)
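+
+    # The same filtering can be written more concisely as a list
+    # comprehension, the idiomatic Python approach:
+    #
+    # download_links = [link for link in links if link.text == 'Download']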
+
+    #NOTE: You can tighten up the above code by leveraging BeautifulSoup's
+    # more advanced features, which allow you to filter the results of the
+    # "findAll" method by using regular expressions or lambda functions.
+    #
+    # Below, we use a lambda function to filter for links whose "href"
+    # attributes start with a certain URL path. The "path and" guard
+    # skips anchor tags that have no "href" attribute, since BeautifulSoup
+    # passes None to the lambda for those:
+
+    #download_links = soup.findAll('a', href=lambda path: path and path.startswith('/cgi-bin/dcdev/forms/DL/'))
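+
+    # The same filter expressed with a regular expression instead of a
+    # lambda (a sketch using the same URL path as above):
+    #
+    # import re
+    # download_links = soup.findAll('a', href=re.compile(r'^/cgi-bin/dcdev/forms/DL/'))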
+
+    # To learn more:
+    # http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)
+    # http://stackoverflow.com/questions/890128/python-lambda-why
+    # http://docs.python.org/howto/regex.html
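+
+    # One way to use the results: print each filing's full download URL.
+    # The "href" values are relative paths, so we prepend the site's
+    # domain, inferred from the POST URL above:
+    for link in download_links:
+        print 'http://query.nictusa.com' + link['href']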
+
+else:
+    # Gracefully exit the program if the response code is not 200
+    sys.exit("Response code not OK: %s" % response.status_code)
+
