In [1]:
# These are the ways to get data

## stdin and stdout

In [None]:
# You can pipe data using sys.stdin and sys.stdout
# This reads in lines of text and spits back out the ones that match regular expressions
# Use these as script files but you can code them here
#####################################################
# egrep.py
import sys, re
# re, regular expressions

# Command-line arguments arrive in sys.argv: index 0 is the script's own
# name, index 1 is the regex the caller wants to filter by.
regex = sys.argv[1]

# Copy to stdout only those stdin lines in which the regex is found
# (re.search looks anywhere in the line, not just at its start).
sys.stdout.writelines(line for line in sys.stdin if re.search(regex, line))
#####################################################
# line_count.py
import sys

# Tally the lines arriving on stdin; only how many there are matters, not their text.
count = 0
for line in sys.stdin:
    count += 1

# print is a function in Python 3 (the bare `print count` statement is a
# SyntaxError there); its output goes to sys.stdout.
print(count)
# chained after egrep.py, this counts how many lines of a file match your regular expression
# example in Unix command line
# cat Somefile.txt | python egrep.py "[0-9]" | python line_count.py
# | = output of left = input of right
# cat = read file sequentially, use this python script looking for numbers, and count how many lines have numbers
#####################################################
# most_common_words.py
import sys
from collections import Counter

# The number of words to report is the first command-line argument.
try:
    num_words = int(sys.argv[1])
except (IndexError, ValueError):
    # IndexError: no argument was given; ValueError: it was not an integer.
    # Catching only these two keeps Ctrl-C and genuine bugs from being
    # reported as a usage error, which a bare `except:` would do.
    print("usage: most_common_words.py num_words")
    sys.exit(1)   # non-zero exit code indicates error

# Count every whitespace-separated token read from stdin, case-insensitively.
counter = Counter(word.lower()                        # lowercase words
                  for line in sys.stdin               # for every line of input
                  for word in line.strip().split()    # split on whitespace
                  if word)                            # defensive: split() yields no empty strings

# Emit one "count<TAB>word" line per word, most frequent first.
for word, count in counter.most_common(num_words):
    sys.stdout.write("{}\t{}\n".format(count, word))
    
# example in command line
# cat the_bible.txt | python most_common_words.py 10 
# read the_bible.txt and give the 10 most common words


## Reading Files

In [None]:
# You can explicitly read from and write to files directly in your code
# The Basics of Text Files
# Step 1: obtain a file object using open 
# command open('filename', 'method')

# 'r' means read-only
file_for_reading = open('reading_file.txt', 'r')

# 'w' is write -- will destroy the file if it already exists (truncate)
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append -- for adding to the end of the file
file_for_appending = open('appending_file.txt', 'a')

# A handle obtained from a bare open() is not closed for you; close each one
# when finished so buffered writes are flushed and the descriptor is released.
file_for_reading.close()
file_for_writing.close()
file_for_appending.close()

# Prefer a with block: the file is closed automatically when the block exits,
# even if an exception is raised inside it.
# (filename, function_that_gets_data_from and process are not defined in the
# visible cells -- presumably placeholders for your own names.)
with open(filename, 'r') as f:
    data = function_that_gets_data_from(f)

# f is already closed at this point, so work only with data, never with f
process(data)

# Reading a whole text file: iterate over the file object with a for
# statement to receive one line at a time.
starts_with_has = 0

with open('input.txt', 'r') as f:
    for line in f:                            # look at each line in the opened file
        if re.match("^#", line):              # use a regex to see if it starts with '#'
            starts_with_has += 1              # add 1 to count if it does

# for ex: if you have a file full of email addresses, 1 / line and want to make a histogram of domains

def get_domain(email_address):
    """Return the lowercased text that follows the final '@' of email_address.

    An address containing no '@' is returned whole (lowercased).
    """
    _, _, domain = email_address.lower().rpartition("@")
    return domain

with open('email_addresses.txt', 'r') as f:
    # Histogram of domains: one increment per line that contains an address.
    domain_counts = Counter()
    for raw_line in f:                            # go through every line
        if "@" not in raw_line:
            continue                              # no @ symbol, so not counted
        # strip() gets rid of the trailing \n before the domain is extracted
        domain_counts[get_domain(raw_line.strip())] += 1
                           

## Delimited Files (Comma-Separated, Tab-Separated)

In [None]:
# These files have lots of data on each line, ',' or tabs indicate where the next field starts
# Use Python's csv module or pandas library
# In Python 3, open csv files in text mode with newline='' (binary mode -- a b after the r or w -- was the Python 2 convention)
# You probably want each row as a list if you have no file headers
# Use csv.reader to iterate over the rows, each of which will be an appropriately split list
#####################################################
import csv

# Python 3's csv module wants a text-mode file opened with newline='' so the
# reader can handle line endings itself; a binary ('rb') handle yields bytes,
# which csv.reader rejects.
with open('tab_delimited_stock_prices.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')                         # tab separation in file
    for row in reader:                                             # each row is a list of strings
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])                              # float value of prices
        process(date, symbol, closing_price)
#####################################################
# if you had headers though such as 
# date:symbol:closing_price
# 6/20/2014:AAPL:90.91
# 6/20/2014:MSFT:41.68
# 6/20/2014:FB:64.5

# You can skip the header row (with next(reader)) or get each row as a dict and use headers as keys
# dict method uses csv.DictReader:

# DictReader takes its keys from the file's first row. Python 3's csv module
# needs a text-mode handle opened with newline=''; 'rb' would feed it bytes,
# which it rejects.
with open('colon_delimited_stock_prices.txt', 'r', newline='') as f:   # with opens file, grabs data, closes file
    reader = csv.DictReader(f, delimiter=':')                      # csv.DictReader on the file using : separator
    for row in reader:                                             # go through each row
        date = row["date"]                                         # date column
        symbol = row["symbol"]                                     # symbol column
        closing_price = float(row["closing_price"])                # price as float
        process(date, symbol, closing_price)                       # hand the three fields on

# Can pass keys as fieldnames parameter as well
# write out delimited data using csv.writer

# Sample data to write out: closing price keyed by ticker symbol.
# (Previously commented out, which left the loop below with an undefined name.)
today_prices = {'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5}

# Python 3's csv.writer emits str, so the file must be opened in text mode;
# newline='' stops the file object from translating the writer's own line endings.
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',')                          # write comma-separated rows
    for stock, price in today_prices.items():                      # for every key:value in this dict
        writer.writerow([stock, price])                            # one "symbol,price" row each
        

## Scraping the Web

In [None]:
# Getting meaningful structured information out of scraped web pages isn't so easy
# HTML and the Parsing Thereof
# HTML is not generally well-formed or annotated
# Use of BeautifulSoup for a simple interface for accessing HTML Data
# Use of requests library for HTTP requests 

# pip install beautifulsoup4
# pip install requests
# pip install html5lib

# Pass data through the BeautifulSoup() function
# this will be the result of a call to requests.get
from bs4 import BeautifulSoup
import requests
# Download the page, then hand its markup to BeautifulSoup, asking for the
# html5lib parser.
response = requests.get("http://www.example.com")
html = response.text
soup = BeautifulSoup(html, 'html5lib')

# Work with Tag objects, these represent the structure of an HTML page
# finding the first <p> tag and its contents
first_paragraph = soup.find('p')
# text contents of a Tag using its text property
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()
# Extract a tag's attributes by treating it like a dict
try:
    first_paragraph_id = soup.p['id']                # raises KeyError if no 'id'
except KeyError:
    # Without this guard a first <p> that lacks an id stops the cell here
    # and none of the lookups below ever run.
    first_paragraph_id = None
first_paragraph_id2 = soup.p.get('id')               # returns None if no 'id'
# multiple tags at once
all_paragraphs = soup.find_all('p')
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]          # keep paragraphs whose id is truthy
# finding tags with a specific class -- three spellings of the same query
important_paragraphs = soup('p', {'class' : 'important'})
important_paragraphs2 = soup('p', 'important')
important_paragraphs3 = [p for p in soup('p')
                        if 'important' in p.get('class', [])]

## Using APIs
### Application Programming Interfaces

In [1]:
# Allows us to explicitly request data in a structured format

### JSON (and XML)

In [None]:
# Data requested through API needs to be serialized into a string format
# JSON = JavaScript Object Notation
# Can parse JSON using Python's json module
# use its loads function which deserializes a string representing a JSON object into a Python object
import json
# A JSON document held in a Python string.
serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2014,
                  "topics" : ["data", "science", "data science"]}"""

# parse the JSON to create a Python dict
deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:
    # print is a function in Python 3; the statement form `print deserialized`
    # is a SyntaxError there
    print(deserialized)
# Use BeautifulSoup to get data from XML

### Using an Unauthenticated API

In [3]:
import requests, json
# Public endpoint listing one user's repositories; no credentials are sent.
endpoint = "https://api.github.com/users/sephfire05/repos"

# Fetch the response body as text, then deserialize the JSON it carries.
response = requests.get(endpoint)
repos = json.loads(response.text)
print(repos)
# Responses in unicode string

[{'id': 142181835, 'node_id': 'MDEwOlJlcG9zaXRvcnkxNDIxODE4MzU=', 'name': 'data-science-from-scratch', 'full_name': 'Sephfire05/data-science-from-scratch', 'owner': {'login': 'Sephfire05', 'id': 36177057, 'node_id': 'MDQ6VXNlcjM2MTc3MDU3', 'avatar_url': 'https://avatars2.githubusercontent.com/u/36177057?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/Sephfire05', 'html_url': 'https://github.com/Sephfire05', 'followers_url': 'https://api.github.com/users/Sephfire05/followers', 'following_url': 'https://api.github.com/users/Sephfire05/following{/other_user}', 'gists_url': 'https://api.github.com/users/Sephfire05/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/Sephfire05/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/Sephfire05/subscriptions', 'organizations_url': 'https://api.github.com/users/Sephfire05/orgs', 'repos_url': 'https://api.github.com/users/Sephfire05/repos', 'events_url': 'https://api.github.com/users/Sephfire05/event

In [11]:
# Use a date parser
from collections import Counter
from dateutil.parser import parse

# Turn each repo's created_at value into a datetime via dateutil's parse.
dates = [parse(entry["created_at"]) for entry in repos]
# Tally creations per month number and per weekday number (Monday is 0).
month_counts = Counter(created.month for created in dates)
weekday_counts = Counter(created.weekday() for created in dates)
print(dates)
print(month_counts)
print(weekday_counts)
# Order on the raw created_at value, descending, then keep the first five.
repos_newest_first = sorted(repos, key=lambda r: r["created_at"], reverse=True)
last_5_repositories = repos_newest_first[:5]

[datetime.datetime(2018, 7, 24, 15, 59, 10, tzinfo=tzutc()), datetime.datetime(2018, 4, 26, 19, 3, 31, tzinfo=tzutc()), datetime.datetime(2018, 2, 6, 0, 41, 39, tzinfo=tzutc()), datetime.datetime(2018, 5, 7, 20, 44, tzinfo=tzutc()), datetime.datetime(2018, 2, 6, 0, 40, 28, tzinfo=tzutc())]
Counter({2: 2, 7: 1, 4: 1, 5: 1})
Counter({1: 3, 3: 1, 0: 1})


### Finding APIs

#### Search the web for "python ___ api" to find a library
#### For a list of APIs that have Python wrappers, 2 directories are at Python API (http://www.pythonapi.com) and Python for Beginners (http://bit.ly/IL35VOR)

## Pandas is the primary library that data science types use for working with (and, in particular, importing) data.