
Slightly updating script.

commit 77a854ddb74d42f90145ad39dac86fb81539d728 (1 parent: 734a572)
Authored by Jackie Kazil (jackiekazil)

Showing 1 changed file with 43 additions and 39 deletions.

+43 −39 tutorials/webscraping101/la_election_scrape.py
@@ -17,6 +17,9 @@
 
     python la_election_scrape.py
 
+This script assumes that you learned about the requests library from the
+fec_efiles_scrape.py file. Also, please note that this script can take more than
+30 seconds to run. Be patient.
 
 HELPFUL LINKS:
 
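The new docstring leans on the requests pattern introduced in fec_efiles_scrape.py. For readers landing here first, a minimal sketch of that pattern (the URL below is illustrative only):

    import requests

    # Fetch a page and confirm the request succeeded before parsing it.
    response = requests.get('http://staticresults.sos.la.gov/Default.html')
    if response.status_code == 200:
        html = response.text  # raw HTML, ready to hand off to BeautifulSoup
    else:
        print "Request failed with status code %s" % response.status_code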
@@ -41,6 +44,7 @@
 # Create empty lists to identify bad links & race links
 bad_links = []
 races_links = []
+date_links = []
 
 if response.status_code == 200:
 
@@ -50,7 +54,7 @@
     # Use BeautifulSoup's API to extract your data
     # This page is clean & simple. All links are links we want to crawl.
     # So, let's grab them all.
-    links = []
+
     for tag in soup.table:
 
         # soup.table is made of h1 tags & links.
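Condensed, the link-gathering loop above amounts to the sketch below; html and URL are assumed from the script's earlier setup, and the full loop also pairs each link with its election date, which this sketch omits:

    from BeautifulSoup import BeautifulSoup  # bs3-era import; newer setups use bs4

    soup = BeautifulSoup(html)
    for tag in soup.table.findAll('a'):
        relative_link = tag['href']          # e.g. u'04051986/Default.html'
        absolute_link = URL + relative_link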
@@ -68,53 +72,53 @@
         absolute_link = URL + relative_link
 
         # now we add the date & abs link to our list
-        links.append((date, absolute_link))
+        date_links.append((date, absolute_link))
 
-    '''
-    Note: at this point, we have a list of links that looks something like this:
-    [
-    (u'04051986', u'http://staticresults.sos.la.gov/04051986/Default.html')
-    (u'02011986', u'http://staticresults.sos.la.gov/02011986/Default.html')
-    (u'01181986', u'http://staticresults.sos.la.gov/01181986/Default.html')
-    (u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
-    ...
-    ]
-    '''
+'''
+Note: at this point, we have a list of links that looks something like this:
+[
+(u'04051986', u'http://staticresults.sos.la.gov/04051986/Default.html')
+(u'02011986', u'http://staticresults.sos.la.gov/02011986/Default.html')
+(u'01181986', u'http://staticresults.sos.la.gov/01181986/Default.html')
+(u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
+...
+]
+'''
 
-    # Now we apply the same logic as we did for the first page,
-    # except that now we apply it to each link in a for loop.
-    # Let's pull out links to all of the race types on each page.
+# Now we apply the same logic as we did for the first page,
+# except that now we apply it to each link in a for loop.
+# Let's pull out links to all of the race types on each page.
 
-    for item in links:
+for item in date_links:
 
-        # to clarify which item is which in each tuple
-        # this is extra code for demo purposes
-        # Example item: (u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
-        date = item[0]
-        link = item[1]
+    # to clarify which item is which in each tuple
+    # this is extra code for demo purposes
+    # Example item: (u'03301985', u'http://staticresults.sos.la.gov/03301985/Default.html')
+    date = item[0]
+    link = item[1]
 
-        # this looks familiar
-        response = requests.get(link)
+    # this looks familiar
+    response = requests.get(link)
 
-        # While we do not explain functions in this demo, this would be a good
-        # use for one. If you are feeling adventurous, try to turn the code at
-        # the start of the script into a function, then call that function.
+    # While we do not explain functions in this demo, this would be a good
+    # use for one. If you are feeling adventurous, try to turn the code at
+    # the start of the script into a function, then call that function.
 
-        if response.status_code == 200:
-            soup = BeautifulSoup(response.text)
+    if response.status_code == 200:
+        soup = BeautifulSoup(response.text)
 
-            # more familiar stuff
-            races_tags = soup.table.findAll('a')
-            for races_tag in races_tags:
-                relative_link = races_tag['href']
-                absolute_link = URL + relative_link
+        # more familiar stuff
+        races_tags = soup.table.findAll('a')
+        for races_tag in races_tags:
+            relative_link = races_tag['href']
+            absolute_link = URL + relative_link
 
-                # now let's add the date, races_type, and races_link to the tuple
-                races_type = races_tag.text
-                races_links.append((date, races_type, absolute_link))
+            # now let's add the date, races_type, and races_link to the tuple
+            races_type = races_tag.text
+            races_links.append((date, races_type, absolute_link))
 
-        else:
-            bad_links.append((response.status_code, link))
+    else:
+        bad_links.append((response.status_code, link))
 
 
 ################################################################################
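The comment in this hunk invites turning the fetch-and-parse steps into a function. One possible shape, sketched with an invented name (get_soup) on top of the imports the script already has:

    def get_soup(link):
        # Return parsed soup for link, or None after recording a bad response.
        # The name and return convention here are invented for illustration.
        response = requests.get(link)
        if response.status_code == 200:
            return BeautifulSoup(response.text)
        bad_links.append((response.status_code, link))
        return None

    for date, link in date_links:
        soup = get_soup(link)
        if soup is None:
            continue
        for races_tag in soup.table.findAll('a'):
            races_links.append((date, races_tag.text, URL + races_tag['href']))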
@@ -122,7 +126,7 @@
 # THE RESULTS:
 # This is for easy viewing of the new list & not required for this script
 count = 0
-while count < 50: # The number 50 is used to limit the output.
+while count < 20: # The number 20 is used to limit the output.
     for link in races_links:
         print "Election date: %s, Races link type: %s, Link: %s" % (link[0], link[1], link[2])
         count += 1
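One caveat the new limit does not address: the while condition is only re-checked after the inner for loop has walked all of races_links, so a single pass can print far more than 20 lines. A tighter version, sketched with a slice:

    # Print at most 20 results; slicing avoids the while/for interaction above.
    for date, races_type, link in races_links[:20]:
        print "Election date: %s, Races link type: %s, Link: %s" % (date, races_type, link)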
