Skip to content
Browse files

Slightly updating script.

  • Loading branch information...
1 parent 734a572 commit 77a854ddb74d42f90145ad39dac86fb81539d728 @jackiekazil jackiekazil committed Jan 30, 2012
Showing with 43 additions and 39 deletions.
  1. +43 −39 tutorials/webscraping101/la_election_scrape.py
View
82 tutorials/webscraping101/la_election_scrape.py
@@ -17,6 +17,9 @@
python la_election_scrape.py
+This script assumes that you learned about the requests library from the
+fec_efiles_scrape.py file. Also, please note that this script can take more
+than 30 seconds to run. Be patient.
HELPFUL LINKS:
@@ -41,6 +44,7 @@
# Create an empty link to identify bad links & race links
bad_links = []
races_links = []
+date_links = []
if response.status_code == 200:
@@ -50,7 +54,7 @@
# Use BeautifulSoup's API to extract your data
# This page is clean & simple. All links are links we want to crawl.
# So, let's grab them all.
- links = []
+
for tag in soup.table:
# soup.table is made of h1 tags & links.
@@ -68,61 +72,61 @@
absolute_link = URL + relative_link
# now we add the date & abs link to our list
- links.append((date, absolute_link))
+ date_links.append((date, absolute_link))
- '''
- Note: at this point, we have a list links that looks something like this:
- [
- (u'04051986', u'http://staticresults.sos.la.gov/04051986/Default.html')
- (u'02011986', u'http://staticresults.sos.la.gov/02011986/Default.html')
- (u'01181986', u'http://staticresults.sos.la.gov/01181986/Default.html')
- (u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
- ...
- ]
- '''
+'''
+Note: at this point, we have a list of links that looks something like this:
+[
+(u'04051986', u'http://staticresults.sos.la.gov/04051986/Default.html')
+(u'02011986', u'http://staticresults.sos.la.gov/02011986/Default.html')
+(u'01181986', u'http://staticresults.sos.la.gov/01181986/Default.html')
+(u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
+...
+]
+'''
- # Now, we would apply the same logic as we are approaching the first page,
- # except for now, we would apply that logic to each link in a for loop.
- # Let's pull out links all of the race types on each page
+# Now, we would apply the same logic as we are approaching the first page,
+# except for now, we would apply that logic to each link in a for loop.
+# Let's pull out the links for all of the race types on each page
- for item in links:
+for item in date_links:
- # to clarify which item is which in each tuple
- # this is extra code for demo purposes
- # Example item: (u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
- date = item[0]
- link = item[1]
+ # to clarify which item is which in each tuple
+ # this is extra code for demo purposes
+ # Example item: (u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
+ date = item[0]
+ link = item[1]
- # this looks familar
- response = requests.get(link)
+    # this looks familiar
+ response = requests.get(link)
- # while we do not explain functions in this demo, this would be a good use
- # if you are feeling adventurous, you should try to turn & the code at
- # the start of the script into a funciton, then call that function
+    # while we do not explain functions in this demo, this would be a good use
+    # for one. If you are feeling adventurous, you should try to turn the code
+    # at the start of the script into a function, then call that function
- if response.status_code == 200:
- soup = BeautifulSoup(response.text)
+ if response.status_code == 200:
+ soup = BeautifulSoup(response.text)
- # more familar stuff
- races_tags = soup.table.findAll('a')
- for races_tag in races_tags:
- relative_link = races_tag['href']
- absolute_link = URL + relative_link
+    # more familiar stuff
+ races_tags = soup.table.findAll('a')
+ for races_tag in races_tags:
+ relative_link = races_tag['href']
+ absolute_link = URL + relative_link
- # now let's add the date, races_type, and races_link to the tuple
- races_type = races_tag.text
- races_links.append((date, races_type, absolute_link))
+ # now let's add the date, races_type, and races_link to the tuple
+ races_type = races_tag.text
+ races_links.append((date, races_type, absolute_link))
- else:
- bad_links.append((response.status_code, link))
+ else:
+ bad_links.append((response.status_code, link))
################################################################################
# THE RESULTS:
# This is for easy viewing of the new list & not required for this script
count = 0
-while count < 50: # The number 50 is used to limit the output.
+while count < 20: # The number 20 is used to limit the output.
for link in races_links:
print "Election date: %s, Races link type: %s, Link: %s" % (link[0], link[1], link[2])
count+=1

0 comments on commit 77a854d

Please sign in to comment.
Something went wrong with that request. Please try again.