Adding election scrape to the webscraping101 folder.
jackiekazil committed Jan 28, 2012
1 parent 1ee010b commit d75d1a9
Showing 2 changed files with 168 additions and 20 deletions.
40 changes: 20 additions & 20 deletions tutorials/webscraping101/fec_efiles_scrape.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
"""
This scrape demonstrates how to "fill out" an
online form to fetch data from a remote server.
More accurately, we'll show how to make a POST request
@@ -48,8 +48,8 @@
'frmtype':'F3P', # form type
}

# Make the POST request with the form dictionary. This should
# return a response object containing the status of the request -- ie
# whether or not it was successful -- and raw HTML for the returned page.
response = requests.post('http://query.nictusa.com/cgi-bin/dcdev/forms/', data=form_data)

@@ -58,9 +58,9 @@

# The raw HTML is stored in the response object's "text" attribute
soup = BeautifulSoup(response.text)
links = soup.findAll('a')

# Extract the download links
download_links = []
for link in links:
if link.text == 'Download':
@@ -71,14 +71,14 @@
download_links = soup.findAll('a', href=lambda path: path.startswith('/cgi-bin/dcdev/forms/DL/'))
This one-liner leverages one of BeautifulSoup's more advanced features -- specifically, the
ability to filter the "findAll" method's results by applying regular expressions or
lambda functions.
Above, we used a lambda function to filter for links with "href"
attributes starting with a certain URL path.
To learn more:
* http://www.crummy.com/software/BeautifulSoup/documentation.html
* http://stackoverflow.com/questions/890128/python-lambda-why
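For example, the same filter could also be written with a regular expression
instead of a lambda. This variant is only a sketch and assumes the "re" module
has been imported:

    download_links = soup.findAll('a', href=re.compile('^/cgi-bin/dcdev/forms/DL/'))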
@@ -87,8 +87,8 @@

# Now that we have our target links, we can download CSVs for further processing.

# Below is the base URL for FEC Filing CSV downloads.
# Notice the "%s" format character at the end.
BASE_URL = 'http://query.nictusa.com/comma/%s.fec'
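# For example, interpolating a filing ID into the template (the ID below is
# just a made-up placeholder) produces a complete download URL:
#   BASE_URL % '767168'   # -> 'http://query.nictusa.com/comma/767168.fec'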

# To get at the raw data for each filing, we'll combine the above BASE_URL with
@@ -115,25 +115,25 @@
# Create a list of data rows by splitting on the line terminator character
data_rows = response.text.split('\n')

# Use the CSV module to parse the comma-separated rows of data. Calling
# the built-in "list" function causes csv to parse our data strings
# into lists of distinct data points (the same as if they were
# in a spreadsheet or database table).
# http://docs.python.org/library/csv.html
data = list(csv.reader(data_rows))
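# To make that concrete, here is a tiny example with made-up rows (not FEC data):
#   list(csv.reader(['a,b,c', 'd,"e,f"']))  ->  [['a', 'b', 'c'], ['d', 'e,f']]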

# The first row in the FEC data contains useful info about the format of
# the remaining rows in the file.
version = data[0][2] # e.g., 8.0
print "Downloaded Electronic filing with File Format Version %s" % version

### WHAT'S NEXT? ###
# In a normal script you would use the version number to fetch
# the appropriate file formats, which could then be used to process
# the remaining data in the file.

# But we know you get the picture -- and we want to be kind to
# the FEC's servers -- so we'll exit the program early and assign
# the rest of the script as homework :-)
sys.exit("Exited script after processing one link.")

148 changes: 148 additions & 0 deletions tutorials/webscraping101/la_election_scrape.py
@@ -0,0 +1,148 @@
#!/usr/bin/env python
"""
This scrape demonstrates how to 'page through' links and builds on other
scripts in the PyJournos webscraping tutorial folder, located here:
https://github.com/PythonJournos/LearningPython/tree/master/tutorials/webscraping101
The site that we are using for this example can be found here:
http://staticresults.sos.la.gov/
USAGE:
You can run this scrape by going to the command line, navigating to the
directory containing this script, and typing the command below:
python la_election_scrape.py
HELPFUL LINKS:
Python Modules used in this script:
* BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/documentation.html
* CSV: http://docs.python.org/library/csv.html
* requests: http://docs.python-requests.org/en/latest/user/quickstart/
HTTP codes
* http://en.wikipedia.org/wiki/List_of_HTTP_status_codes
"""
import csv
import requests

from BeautifulSoup import BeautifulSoup

URL = 'http://staticresults.sos.la.gov/'

response = requests.get(URL)

# Create empty lists to collect bad links & race links
bad_links = []
races_links = []

if response.status_code == 200:

# Parse the HTML into a form that's easy to use
soup = BeautifulSoup(response.text)

# Use BeautifulSoup's API to extract your data
# This page is clean & simple. All links are links we want to crawl.
# So, let's grab them all.
links = []
for tag in soup.table:

# soup.table is made of h1 tags & links.
# only save the links, which have a tag name equal to 'a'
if tag.name == 'a':

# 'href' is an attribute of the tag
relative_link = tag['href']

# the election date is the tag's text, so let's grab that to associate
# with the link
date = tag.text

# we need a complete link to follow, so let's create that
absolute_link = URL + relative_link

# now we add the date & abs link to our list
links.append((date, absolute_link))

'''
Note: at this point, we have a list of links that looks something like this:
[
(u'04051986', u'http://staticresults.sos.la.gov/04051986/Default.html')
(u'02011986', u'http://staticresults.sos.la.gov/02011986/Default.html')
(u'01181986', u'http://staticresults.sos.la.gov/01181986/Default.html')
(u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
...
]
'''

# Now we apply the same logic that we used for the first page,
# except this time we apply it to each link in a for loop.
# Let's pull out the links for all of the race types on each page.

for item in links:

# to clarify which item is which in each tuple
# this is extra code for demo purposes
# Example item: (u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
date = item[0]
link = item[1]

# this looks familiar
response = requests.get(link)

# while we do not explain functions in this demo, this would be a good use for one:
# if you are feeling adventurous, you should try to turn the code at
# the start of the script into a function, then call that function
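# A minimal sketch of what such a helper might look like (the name
# fetch_soup is hypothetical and not part of this script):
#
#     def fetch_soup(url):
#         """Return (status_code, BeautifulSoup or None) for a URL."""
#         resp = requests.get(url)
#         if resp.status_code == 200:
#             return resp.status_code, BeautifulSoup(resp.text)
#         return resp.status_code, None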

if response.status_code == 200:
soup = BeautifulSoup(response.text)

# more familiar stuff
races_tags = soup.table.findAll('a')
for races_tag in races_tags:
relative_link = races_tag['href']
absolute_link = URL + relative_link

# now let's add the date, races_type, and absolute_link to our list as a tuple
races_type = races_tag.text
races_links.append((date, races_type, absolute_link))

else:
bad_links.append((response.status_code, link))


################################################################################

# THE RESULTS:
# This is for easy viewing of the new list & not required for this script
# The slice to 50 is used to limit the output.
for link in races_links[:50]:
    print "Election date: %s, Races link type: %s, Link: %s" % (link[0], link[1], link[2])

# Let's see which links failed
for bad_link in bad_links:
print "Response code: %s, Link: %s" % (bad_link[0], bad_link[1])


'''
End Result looks something like this:
[
(u'10/22/2011', u'All Races in a Parish', u'http://staticresults.sos.la.gov/10222011_Parishes.html')
(u'07/16/2011', u'All Races in a Parish', u'http://staticresults.sos.la.gov/07162011_Parishes.html')
(u'04/30/2011', u'LA Legislature Races', u'http://staticresults.sos.la.gov/04302011_Legislative.html')
(u'04/30/2011', u'Multi-Parish Races', u'http://staticresults.sos.la.gov/04302011_MultiParish.html')
....
]
These are the bad links that came back:
[(404, u'http://staticresults.sos.la.gov/11021982/Default.html'),
(404, u'http://staticresults.sos.la.gov/09111982/Default.html')]
'''
