Commit
Serdar Tumgoren committed on Jan 23, 2012
1 parent 14080d3 · commit a925f97
Showing 2 changed files with 84 additions and 0 deletions.
@@ -0,0 +1,3 @@
ipython
requests
yolk
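Assuming this first file is the project's pip requirements file (the commit view doesn't show the filename, but requirements.txt is the convention), all three dependencies can be installed in one step with: pip install -r requirements.txt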
failed_banks_scrape.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python
"""
This is the first example scrape in our series.
In this scrape, we'll demonstrate some Python basics
using the FDIC's Failed Banks List.
USAGE:
You can run this scrape by going to the command line, navigating to the
directory containing this script, and typing the below command:
    python failed_banks_scrape.py
NOTE:
The original FDIC data is located at the below URL:
    http://www.fdic.gov/bank/individual/failed/banklist.html
In order to be considerate to the FDIC's servers, we're scraping
a copy of the page stored on Amazon S3.
"""
# Import a built-in library for working with data on the Web
# DOCS: http://docs.python.org/library/urllib.html
import urllib

# Import a 3rd-party library to help extract data from raw HTML
# DOCS: http://www.crummy.com/software/BeautifulSoup/documentation.html
from BeautifulSoup import BeautifulSoup

# URL of the page we're going to scrape. This is a copy of the real
# FDIC page, hosted on S3 so we're kind to the FDIC's servers.
URL = 'https://s3.amazonaws.com/python-journos/FDIC_Failed_Bank_List.html'

# Open a network connection using the "urlopen" method.
# This returns a network "object".
# DOCS: http://docs.python.org/library/urllib.html#high-level-interface
web_cnx = urllib.urlopen(URL)
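# (An optional aside, not part of the original script: the connection
# object also exposes response metadata. For example, web_cnx.getcode()
# returns the HTTP status code, so you can confirm you got a 200
# before reading the page.)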

# Use the network object to download, or "read", the page's HTML
html = web_cnx.read()

# Parse the HTML into a form that's easy to use
soup = BeautifulSoup(html)

# Use BeautifulSoup's API to extract your data
# 1) Fetch the table by ID
table = soup.find(id='table')
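# (Note: find() returns None when nothing matches, so if the page's
# table ever loses its id, the findAll call below would fail with an
# AttributeError. That's the first place to look when debugging.)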

# 2) Grab the table's rows
rows = table.findAll('tr')

# 3) Get header names from the first row
headers = rows[0].findAll('th')

# Extract the column names and add them to a list
columns = []
for header in headers:
    columns.append(header.text)
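# (The same thing as a one-liner, if you prefer list comprehensions:
# columns = [header.text for header in headers])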

# Use the tab character's 'join' method to concatenate
# the column names into a single, tab-separated string.
# Then print out the header row.
print '\t'.join(columns)
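# The header row should print roughly like this (columns separated by tabs):
# Bank Name  City  ST  CERT  Acquiring Institution  Closing Date  Updated Date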

# 4) Process the data, skipping the initial header row
for row in rows[1:]:

    # Extract the data points from the table row and print them
    data = row.findAll('td')
    bank_name = data[0].text
    city = data[1].text
    state = data[2].text
    cert_num = data[3].text
    ai = data[4].text
    closed_on = data[5].text
    updated = data[6].text
    print "\t".join([bank_name, city, state, cert_num, ai, closed_on, updated])