# Chapter 9. Getting Data

## stdin adn stdout

<h3>To run the code below</h3>

Save each script separately with the name shown on the first comment.
All scripts can be found at the examples folder.
Then run it using the following command lines:

<h4><li>Windows</li></h4>

```bash
type SomeFile.txt | python egrep.py "[0-9]" | python line_count.py
```

<h4><li>Unix</li></h4>

```bash
cat SomeFile.txt | python egrep.py "[0-9]" | python line_count.py
```


```python
# egrep.py
import sys, re

# sys.argv is the list of command-line arguments
# sys.argv[0] is the name of the program itself
# sys.argv[1] will be the regex specified at the command line
regex = sys.argv[1]

# For every line passed into the script
for line in sys.stdin:
    # If it matches the regex, write it to stdout
    if re.search(regex, line):
        sys.stdout.write(line)
```

```python
# line_count.py
import sys

count = 0
for line in sys.stdin:
    count += 1

# print goes to sys.stdout
print(count)
```

<h3>To run the code below</h3>

Save each script separately with the name shown on the first comment.
All scripts can be found at the examples folder.
Then run it using the following command lines:

<h4><li>Windows</li></h4>

```bash
type The_Bible.txt | python most_common_words.py
```

<h4><li>Unix</li></h4>

```bash
cat The_Bible.txt | python most_common_words.py
```

```python
# most_common_words.py
import sys
from collections import Counter

# Pass in number of words as first argument
try:
    num_words = int(sys.argv[1])
except:
    print("usage: most_common_words.py num_words")
    sys.exit(1)  # non-zero exit code indicates error

counter = Counter(word.lower()                      # Lowercase words
                  for line in sys.stdin
                  for word in line.strip().split()  # Split on spaces
                  if word)                          # Skip empty 'words'

for word, count in counter.most_common(num_words):
    sys.stdout.write(str(count))
    sys.stdout.write("\t")
    sys.stdout.write(word)
    sys.stdout.write("\n")
```

```bash
$ cat the_bible.txt | python most_common_words.py 10
64299   the
51378   and
34809   of
13663   to
12798   that
12563   in
10263   he
9840    shall
8987    unto
8837    for
```

## Reading Files

### The Basics of Text Files

In [4]:
# 'r' means read-only, it's assumed if you leave it out
file_for_reading = open('examples/09 - Getting Data/reading_file.txt', 'r')
file_for_reading2 = open('examples/09 - Getting Data/reading_file.txt')

# 'w' is write -- will destroy the file if it already exists!
file_for_writing = open('examples/09 - Getting Data/writing_file.txt', 'w')

# 'a' is append -- for adding to the end of the file
file_for_appending = open('examples/09 - Getting Data/appending_file.txt', 'a')

# Don't forget to close your files when you're done
file_for_writing.close()

In [4]:
# The following was inserted to run the code below
filename = 'examples/09 - Getting Data/reading_file.txt'

def function_that_gets_data_from(file):
    print(file)

def process(*args):
    print(*args)

In [6]:
with open(filename) as f:
    data = function_that_gets_data_from(f)

# At this point f has already been closed, so don't try to use it
process(data)

<_io.TextIOWrapper name='examples/09 - Getting Data/reading_file.txt' mode='r' encoding='cp1252'>
None


In [11]:
import re

starts_with_hash = 0

with open('examples/09 - Getting Data/input.txt') as f:
    for line in f:                  # Look at each line in the file
        if re.match('^#', line):    # Use a regex to see if it starts with '#'
            starts_with_hash += 1   # If it does, add 1 to the count

starts_with_hash

1

In [12]:
def get_domain(email_address: str) -> str:
    """Split on '@' and return the last piece"""
    return email_address.lower().split('@')[-1]

# A couple of tests
assert get_domain('joelgrus@gmail.com') == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

from collections import Counter

with open('examples/09 - Getting Data/email_addresses.txt', 'r') as f:
    domain_counts = Counter(get_domain(line.strip())
                            for line in f
                            if '@' in line)
    
domain_counts

Counter({'gmail.com': 1, 'm.datasciencester.com': 1})

### Delimited Files

In [5]:
import csv

with open('examples/09 - Getting Data/tab_delimited_stock_prices.txt') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        process(date, symbol, closing_price)

6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT 41.51
6/19/2014 FB 64.34


In [7]:
with open ('examples/09 - Getting Data/colon_delimited_stock_prices.txt') as f:
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row['date']
        symbol = dict_row['symbol']
        closing_price = float(dict_row['closing_price'])
        process(date, symbol, closing_price)

6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5


In [9]:
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5}

with open('examples/09 - Getting Data/comma_delimited_stock_prices.txt', 'w') as f:
    csv_writer = csv.writer(f)
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

In [10]:
results = [['test1', 'success', 'Monday'],
           ['test2', 'success, kind of', 'Tuesday'],
           ['test3', 'failure, kind of', 'Wednesday'],
           ['test4', 'failure, utter', 'Thursday']]

# Don't do this!
with open('examples/09 - Getting Data/bad_csv.txt', 'w') as f:
    for row in results:
        f.write(','.join(map(str, row)))    # Might have too many commas in it!
        f.write('\n')                       # Row might have newlines as well!

## Scraping the Web

### HTML and the Parsing Thereof

In [None]:
# Use 'python -m pip install beautifulsoup4 requests html5lib' when using the command line
%pip install beautifulsoup4 requests html5lib

In [14]:
from bs4 import BeautifulSoup
import requests

# Joel Grus kindly uploaded the relevant HTML file on his GitHub.
# In order to fit the URL in the book pages, he had to split it across two lines.
# Recall that whitespace-separated strings get concetaned.

url = ('https://raw.githubusercontent.com/'
       'joelgrus/data/master/getting-data.html')
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')
soup

<!DOCTYPE html>
<html lang="en-US"><head>
    <title>Getting Data</title>
    <meta charset="utf-8"/>
</head>
<body>
    <h1>Getting Data</h1>
    <div class="explanation">
        This is an explanation.
    </div>
    <div class="comment">
        This is a comment.
    </div>
    <div class="content">
        <p id="p1">This is the first paragraph.</p>
        <p class="important">This is the second paragraph.</p>
    </div>
    <div class="signature">
        <span id="name">Joel</span>
        <span id="twitter">@joelgrus</span>
        <span id="email">joelgrus-at-gmail</span>
    </div>


</body></html>

In [16]:
first_paragraph = soup.find('p')    # Or just soup.p
first_paragraph

<p id="p1">This is the first paragraph.</p>

In [17]:
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

print(first_paragraph_text)
print(first_paragraph_words)

This is the first paragraph.
['This', 'is', 'the', 'first', 'paragraph.']


In [20]:
first_paragraph_id = soup.p['id']       # Raises KeyError if no 'id'
first_paragraph_id2 = soup.p.get('id')  # Returns None if no 'id'

print(first_paragraph_id)
print(first_paragraph_id2)

p1
p1


In [21]:
all_paragraphs = soup.find_all('p')     # Or just soup('p')
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]

print(all_paragraphs)
print(paragraphs_with_ids)

[<p id="p1">This is the first paragraph.</p>, <p class="important">This is the second paragraph.</p>]
[<p id="p1">This is the first paragraph.</p>]


In [22]:
important_paragraphs = soup('p', {'class': 'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p')
                         if 'important' in p.get('class', [])]

print(important_paragraphs)
print(important_paragraphs2)
print(important_paragraphs3)

[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]
[<p class="important">This is the second paragraph.</p>]


In [28]:
# Warning: this will return the same <span> multiple times
# if it sits inside multiple <div>s.
# Be more clever if that's the case.
spans_inside_divs = [span
                     for div in soup('div')     # For each <div> on the page
                     for span in div('span')]   # Find each <span> inside it

spans_inside_divs

[<span id="name">Joel</span>,
 <span id="twitter">@joelgrus</span>,
 <span id="email">joelgrus-at-gmail</span>]

### Example: Keeping Tabs on Congress

In [30]:
from bs4 import BeautifulSoup
import requests

url = 'https://www.house.gov/representatives'
text = requests.get(url).text
soup = BeautifulSoup(text, 'html5lib')

all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

print(len(all_urls))    # 967 for me, way too many

967


In [32]:
import re

# Must start with http:// or https://
# Must end with .house.gov or .house.gov/
regex = r'^https?://.*\.house\.gov/?$'

# Let's write some tests!
assert re.match(regex, 'http://joel.house.gov')
assert re.match(regex, 'https://joel.house.gov')
assert re.match(regex, 'http://joel.house.gov/')
assert re.match(regex, 'https://joel.house.gov/')
assert not re.match(regex, 'joel.house.gov')
assert not re.match(regex, 'http://joel.house.com')
assert not re.match(regex, 'https://joel.house.gov/biography')

# And now apply
good_urls = [url for url in all_urls if re.match(regex, url)]

print(len(good_urls))   # Still 874 for me


874


In [34]:
good_urls = list(set(good_urls))

print(len(good_urls))   # Only 437 for me

437


In [35]:
html = requests.get('https://jayapal.house.gov').text
soup = BeautifulSoup(html, 'html5lib')

# Use a set because the links might appear multiple times.
links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
print(links)    # {'/media/press-releases'}

{'https://jayapal.house.gov/category/news/', 'https://jayapal.house.gov/category/press-releases/'}


In [36]:
from typing import Dict, Set

press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    html = requests.get(house_url).text
    soup = BeautifulSoup(html, 'html5lib')
    pr_links = {a['href'] for a in soup('a') if 'press releases' in a.text.lower()}
    
    print(f'{house_url}: {pr_links}')
    press_releases[house_url] = pr_links

https://reschenthaler.house.gov/: set()
https://thompson.house.gov: {'/media-center/press-releases'}
https://wittman.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=2670'}
https://kuster.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://langworthy.house.gov: {'/media/press-releases'}
https://williams.house.gov: {'/media-center/press-releases'}
https://matsui.house.gov: {'/media/press-releases'}
https://mcclain.house.gov: {'/press-releases'}
https://feenstra.house.gov: {'/node/1119', '/media/press-releases'}
https://moskowitz.house.gov: set()
https://kustoff.house.gov: {'/media/press-releases'}
https://mikegarcia.house.gov/: set()
https://mikethompson.house.gov/: {'/newsroom/press-releases'}
https://sherman.house.gov: {'/media-center/press-releases'}
https://kamlager-dove.house.gov: {'/media/press-releases'}
https://mcclintock.house.gov/: {'/newsroom/press-releases'}
https://peltola.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://kaptur.house.gov/

In [37]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Returns True if a <p> inside the text mentions {keyword}
    """
    soup = BeautifulSoup(text, 'html5lib')
    paragraphs = [tag.get_text() for tag in soup('p')]
    
    return any(keyword.lower() in paragraph.lower()
               for paragraph in paragraphs)

In [38]:
text = """<body><h1>Facebook</h1><p>Twitter</p>"""
assert paragraph_mentions(text, 'twitter')      # Is inside <p>
assert not paragraph_mentions(text, 'facebook') # Not inside <p>

<div class="alert alert-block alert-warning">
<b>Warning:</b> The code below might not return all representatives that mentioned "data" on press releases.

This happens due to the fact that some of the links inside the `press_releases` dict are not relative links, being absolute.
When executed, those links will be requested as `https://representative.house.gov/https://representative.house.gov/media/press-releases`, which will return the status code <b>"410 - Gone"</b> and no text to be analyzed by BeautifulSoup.
</div>

In [45]:
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        url = f'{house_url}/{pr_link}'
        text = requests.get(url).text
        
        if paragraph_mentions(text, 'data'):
            print(f'{house_url}')
            break    # Done with this house_url

https://obernolte.house.gov
https://fallon.house.gov
https://schrier.house.gov
https://phillips.house.gov/
https://schakowsky.house.gov
https://golden.house.gov
https://molinaro.house.gov
https://biggs.house.gov
https://curtis.house.gov/
https://delbene.house.gov
https://edwards.house.gov
https://grothman.house.gov
https://mfume.house.gov/
https://cartwright.house.gov
https://luetkemeyer.house.gov/
https://collins.house.gov
https://bucshon.house.gov/
https://beyer.house.gov
https://delauro.house.gov/
https://danbishop.house.gov


## Using APIs

### JSON and XML

<h4>JavaScript Object Notation (JSON)</h4>

```json
{
    "title": "Data Science Book",
    "author": "Joel Grus",
    "publicationYear": 2019,
    "topics": ["data", "science", "data science"]
}
```

In [46]:
import json
serialized = """{ "title": "Data Science Book",
                  "author": "Joel Grus",
                  "publicationYear": 2019,
                  "topics": ["data", "science", "data science"] }"""

# Parse the JSON to create a Python dict
deserialized = json.loads(serialized)
assert deserialized['publicationYear'] == 2019
assert 'data science' in deserialized['topics']

deserialized

{'title': 'Data Science Book',
 'author': 'Joel Grus',
 'publicationYear': 2019,
 'topics': ['data', 'science', 'data science']}

<h4>Extensible Markup Language (XML)</h4>

```xml
<Book>
    <Title>Data Science Book</Title>
    <Author>Joel Grus</Author>
    <PublicationYear>2014</PublicationYear>
    <Topics>
        <Topic>data</Topic>
        <Topic>science</Topic>
        <Topic>data science</Topic>
    </Topics>
</Book>
```

### Using an Unauthenticated API

In [49]:
import requests, json

github_user = 'joelgrus'
endpoint = f'https://api.github.com/users/{github_user}/repos'

repos = json.loads(requests.get(endpoint).text)

repos

[{'id': 112873601,
  'node_id': 'MDEwOlJlcG9zaXRvcnkxMTI4NzM2MDE=',
  'name': 'advent2017',
  'full_name': 'joelgrus/advent2017',
  'private': False,
  'owner': {'login': 'joelgrus',
   'id': 1308313,
   'node_id': 'MDQ6VXNlcjEzMDgzMTM=',
   'avatar_url': 'https://avatars.githubusercontent.com/u/1308313?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/joelgrus',
   'html_url': 'https://github.com/joelgrus',
   'followers_url': 'https://api.github.com/users/joelgrus/followers',
   'following_url': 'https://api.github.com/users/joelgrus/following{/other_user}',
   'gists_url': 'https://api.github.com/users/joelgrus/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/joelgrus/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/joelgrus/subscriptions',
   'organizations_url': 'https://api.github.com/users/joelgrus/orgs',
   'repos_url': 'https://api.github.com/users/joelgrus/repos',
   'events_url': 'https://api.github.com/users/j

In [None]:
# Use 'python -m pip install python-dateutil' when using the command line
%pip install python-dateutil

In [51]:
from collections import Counter
from dateutil.parser import parse

dates = [parse(repo['created_at']) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)

print(dates)
print(month_counts)
print(weekday_counts)

[datetime.datetime(2017, 12, 2, 20, 13, 49, tzinfo=tzutc()), datetime.datetime(2018, 11, 30, 22, 41, 16, tzinfo=tzutc()), datetime.datetime(2019, 12, 1, 2, 57, 18, tzinfo=tzutc()), datetime.datetime(2020, 11, 21, 16, 21, 49, tzinfo=tzutc()), datetime.datetime(2021, 11, 24, 13, 53, 23, tzinfo=tzutc()), datetime.datetime(2022, 11, 22, 2, 25, 22, tzinfo=tzutc()), datetime.datetime(2023, 12, 2, 3, 15, 48, tzinfo=tzutc()), datetime.datetime(2018, 2, 23, 15, 51, 4, tzinfo=tzutc()), datetime.datetime(2017, 12, 19, 0, 12, 40, tzinfo=tzutc()), datetime.datetime(2018, 1, 31, 23, 51, 16, tzinfo=tzutc()), datetime.datetime(2018, 12, 19, 19, 44, 45, tzinfo=tzutc()), datetime.datetime(2018, 9, 5, 2, 43, 52, tzinfo=tzutc()), datetime.datetime(2019, 2, 1, 20, 25, 46, tzinfo=tzutc()), datetime.datetime(2013, 7, 5, 2, 2, 28, tzinfo=tzutc()), datetime.datetime(2023, 3, 19, 20, 15, 39, tzinfo=tzutc()), datetime.datetime(2017, 5, 10, 17, 22, 45, tzinfo=tzutc()), datetime.datetime(2013, 11, 15, 5, 33, 22, t

In [52]:
last_5_repositories = sorted(repos,
                             key=lambda r: r['created_at'],
                             reverse=True)[:5]

last_5_languages = [repo['language']
                    for repo in last_5_repositories]

print(last_5_repositories)
print(last_5_languages)

[{'id': 726318877, 'node_id': 'R_kgDOK0q_HQ', 'name': 'advent2023', 'full_name': 'joelgrus/advent2023', 'private': False, 'owner': {'login': 'joelgrus', 'id': 1308313, 'node_id': 'MDQ6VXNlcjEzMDgzMTM=', 'avatar_url': 'https://avatars.githubusercontent.com/u/1308313?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/joelgrus', 'html_url': 'https://github.com/joelgrus', 'followers_url': 'https://api.github.com/users/joelgrus/followers', 'following_url': 'https://api.github.com/users/joelgrus/following{/other_user}', 'gists_url': 'https://api.github.com/users/joelgrus/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/joelgrus/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/joelgrus/subscriptions', 'organizations_url': 'https://api.github.com/users/joelgrus/orgs', 'repos_url': 'https://api.github.com/users/joelgrus/repos', 'events_url': 'https://api.github.com/users/joelgrus/events{/privacy}', 'received_events_url': 'https://api.github.co

### Finding APIs

* Look for "developers" or "API" section of the site for details;
* Search the web for "python \<sitename\> api"
* List of Python API Wrappers, by [Real Python](https://github.com/realpython/list-of-python-api-wrappers)

## Example: Using the Twitter APIs

<div class="alert alert-block alert-warning">
<b>Warning:</b> This section is highly outdated due to restrictions in access to X.
Please wait until further updates.
</div>

In [None]:
# Use 'python -m pip install twython' when using the command line
%pip install twython

To get the credentials:

1. Go to https://developer.twitter.com/.
2. If you are not signed in, click “Sign in” and enter your Twitter username and password.
3. Click Apply to apply for a developer account.
4. Request access for your own personal use.
5. Fill out the application. It requires 300 words (really) on why you need access, so to get over the limit you could tell them about this book and how much you’re enjoying it.
6. Wait some indefinite amount of time.
7. If you know someone who works at Twitter, email them and ask them if they can expedite your application. Otherwise, keep waiting.
8. Once you get approved, go back to developer.twitter.com, find the “Apps” section, and click “Create an app.”
9. Fill out all the required fields (again, if you need extra characters for the description, you could talk about this book and how edifying you’re finding it).
10. Click CREATE.

<div class="alert alert-block alert-danger">
<b>Caution:</b> Don’t share the keys, don’t publish them in your book, and don’t check them into your public GitHub repository. One simple solution is to store them in a credentials.json file that doesn’t get checked in, and to have your code use json.loads to retrieve them.

Another solution is to store them in environment variables and use os.environ to retrieve them.

A third option would be using the <b>Keyring</b> module, which stores your credentials on your OS credentials vault.
</div>

In [54]:
import os

# Feel free to plug in your key and secret in directly
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET')

In [None]:
import webbrowser
from twython import Twython

# Get a temporary client to retrieve an authentication URL
temp_client = Twython(CONSUMER_KEY, CONSUMER_SECRET)
temp_creds = temp_client.get_authentication_tokens()
url = temp_creds['auth_url']

# Now visit that URL to authorize the application and get a PIN
print(f'Go visit {url} and get the PIN code and paste it below')
webbrowser.open(url)
PIN_CODE = input('Please enter the PIN code: ')

# Now we use that PIN_CODE to get the actual tokens
auth_client = Twython(CONSUMER_KEY,
                      CONSUMER_SECRET,
                      temp_creds['oauth_token'],
                      temp_creds['oauth_token_secret'])
final_step = auth_client.get_authorized_tokens(PIN_CODE)
ACCESS_TOKEN = final_step['oauth_token']
ACCESS_TOKEN_SECRET = final_step['oauth_token_secret']

# And get a new Twython instance using them
twitter = Twython(CONSUMER_KEY,
                  CONSUMER_SECRET,
                  ACCESS_TOKEN,
                  ACCESS_TOKEN_SECRET)

<div class="alert alert-block alert-info">
<b>Tip:</b> At this point you may want to consider saving the <tt>ACCESS_TOKEN</tt> and <tt>ACCESS_TOKEN_SECRET</tt> somewhere safe, so that next time you don’t have to go through this rigmarole.
</div>

In [None]:
# Search for tweets containing the phrase "data science"
for status in twitter.search(q='data science')['statuses']:
    user = status['user']['screen_name']
    text = status['text']
    print(f'{user}: {text}\n')

We'll be using now the [Streaming API](https://developer.twitter.com/en/docs/tutorials/consuming-streaming-data). In order to access it, we need to define a class that inherits from <tt>TwythonStreamer</tt> and that overrides its <tt>on_success</tt> method, and possibly its <tt>on_error</tt> method.

In [None]:
from twython import TwythonStreamer

# Appending data to a global variable is pretty poor form
# but it makes the example much simpler.
tweets = []

class MyStreamer(TwythonStreamer):
    def on_success(self, data):
        """
        What do we do when Twitter sends us data?
        Here data will be a Python dict representing a tweet.
        """
        # We only want to collect English-language tweets
        if data.get('lang') == 'en':
            tweets.append(data)
            print(f'Received tweet #{len(tweets)}')

        # Stop when we've collected enough
        if len(tweets) >= 1000:
            self.disconnect()

    def on_error(self, status_code, data):
        print(status_code, data)
        self.disconnect()

In [None]:
stream = MyStreamer(CONSUMER_KEY,
                    CONSUMER_SECRET,
                    ACCESS_TOKEN,
                    ACCESS_TOKEN_SECRET)

# Starts consuming public statuses that contain the keyword 'data'
stream.statuses.filter(track='data')

# If instead we wanted to start consuming a sample of *all* public statuses
# stream.statuses.sample()

In [None]:
top_hashtags = Counter(hashtag['text'].lower()
                       for tweet in tweets
                       for hashtag in tweet['entities']['hashtags'])

print(top_hashtags.most_common(5))