<h1><span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Processing-Data-Files" data-toc-modified-id="Processing-Data-Files-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Processing Data Files</a></span></li><li><span><a href="#Fun-with-Files-and-Directories" data-toc-modified-id="Fun-with-Files-and-Directories-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Fun with Files and Directories</a></span></li><li><span><a href="#Parsing-and-Processing-Data" data-toc-modified-id="Parsing-and-Processing-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Parsing and Processing Data</a></span></li><li><span><a href="#Processing-Infinite-Data" data-toc-modified-id="Processing-Infinite-Data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Processing Infinite Data</a></span></li><li><span><a href="#Feeding-the-Pipeline" data-toc-modified-id="Feeding-the-Pipeline-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Feeding the Pipeline</a></span></li><li><span><a href="#Extending-the-pipeline" data-toc-modified-id="Extending-the-pipeline-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Extending the pipeline</a></span></li><li><span><a href="#Advanced-Data-Routing" data-toc-modified-id="Advanced-Data-Routing-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Advanced Data Routing</a></span></li><li><span><a href="#Various-Programming-Tricks-(And-Debugging)" data-toc-modified-id="Various-Programming-Tricks-(And-Debugging)-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Various Programming Tricks (And Debugging)</a></span></li><li><span><a href="#Parsing-and-Printing" data-toc-modified-id="Parsing-and-Printing-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Parsing and Printing</a></span></li><li><span><a href="#Co-routines" data-toc-modified-id="Co-routines-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Co-routines</a></span></li></ul></div>

# Processing Data Files

In [10]:
# nongenlog.py
#
# Sum up the number of bytes transferred in an Apache log file
# using a simple for-loop.   We're not using generators here.

wwwlog = open("access-log")
total = 0
for line in wwwlog:
    bytestr = line.rsplit(None, 1)[1]
    if bytestr != "-":
        total += int(bytestr)

print("Total", total)

Total 20329184


In [11]:
# genlog.py
#
# Sum up the bytes transferred in an Apache server log using
# generator expressions

wwwlog = open("access-log")
bytecolumn = (line.rsplit(None, 1)[1] for line in wwwlog)
bytes = (int(x) for x in bytecolumn if x != "-")

print("Total", sum(bytes))

Total 20329184


In [8]:
# Make a big log file for testing

import sys

if len(sys.argv) != 2:
    print("Usage : makebig.py repetitions", file=sys.stderr)
    raise SystemExit(1)

data = open("access-log").read()

f = open("big-access-log", "w")
for i in xrange(int(sys.argv[1])):
    f.write(data)


Usage : makebig.py repetitions


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# Fun with Files and Directories

In [13]:
% ls ./www

[0m[01;34mbar[0m/  [01;34mfoo[0m/


In [14]:
%ls ./www/bar

[0m[01;36maccess-log[0m*  [01;36maccess-log-0108.bz2[0m*  [01;36maccess-log-0208.bz2[0m*


In [15]:
% ls ./www/foo

[0m[01;36maccess-log[0m*  [01;36maccess-log-0108.gz[0m*  [01;36maccess-log-0208.gz[0m*


In [16]:
# genfind.py
#
# A function that generates files that match a given filename pattern

import os
import fnmatch


def gen_find(filepat, top):
    for path, dirlist, filelist in os.walk(top):
        for name in fnmatch.filter(filelist, filepat):
            yield os.path.join(path, name)


# Example use

if __name__ == "__main__":
    lognames = gen_find("access-log*", "www")
    for name in lognames:
        print(name)

www/foo/access-log
www/foo/access-log-0108.gz
www/foo/access-log-0208.gz
www/bar/access-log
www/bar/access-log-0208.bz2
www/bar/access-log-0108.bz2


In [17]:
# genopen.py
#
# Takes a sequence of filenames as input and yields a sequence of file
# objects that have been suitably open

import gzip, bz2


def gen_open(filenames):
    for name in filenames:
        if name.endswith(".gz"):
            yield gzip.open(name)
        elif name.endswith(".bz2"):
            yield bz2.BZ2File(name)
        else:
            yield open(name)


# Example use

if __name__ == "__main__":
    from genfind import gen_find

    lognames = gen_find("access-log*", "www")
    logfiles = gen_open(lognames)
    for f in logfiles:
        print(f)

<_io.TextIOWrapper name='www/foo/access-log' mode='r' encoding='UTF-8'>
<gzip _io.BufferedReader name='www/foo/access-log-0108.gz' 0x7f0f6c67f588>
<gzip _io.BufferedReader name='www/foo/access-log-0208.gz' 0x7f0f6c67f940>
<_io.TextIOWrapper name='www/bar/access-log' mode='r' encoding='UTF-8'>
<bz2.BZ2File object at 0x7f0f6c67f978>
<bz2.BZ2File object at 0x7f0f6c67fc88>


In [20]:
# gencat.py
#
# Concatenate multiple generators into a single sequence

import itertools

def gen_cat(sources):
    for s in sources:
        for item in s:
            yield item


# Example use

if __name__ == "__main__":
    from genfind import gen_find
    from genopen import gen_open

    lognames = gen_find("access-log*", "www")
    logfiles = gen_open(lognames)
    loglines = gen_cat(logfiles)
    for line in itertools.islice(loglines, 10):
        print(line)

140.180.132.213 - - [24/Feb/2008:00:08:59 -0600] "GET /ply/ply.html HTTP/1.1" 200 97238

140.180.132.213 - - [24/Feb/2008:00:08:59 -0600] "GET /favicon.ico HTTP/1.1" 404 133

75.54.118.139 - - [24/Feb/2008:00:15:40 -0600] "GET / HTTP/1.1" 200 4447

75.54.118.139 - - [24/Feb/2008:00:15:41 -0600] "GET /images/Davetubes.jpg HTTP/1.1" 200 60025

75.54.118.139 - - [24/Feb/2008:00:15:42 -0600] "GET /favicon.ico HTTP/1.1" 404 133

75.54.118.139 - - [24/Feb/2008:00:15:49 -0600] "GET /software.html HTTP/1.1" 200 3163

75.54.118.139 - - [24/Feb/2008:00:16:10 -0600] "GET /ply/index.html HTTP/1.1" 200 8018

75.54.118.139 - - [24/Feb/2008:00:16:11 -0600] "GET /ply/bookplug.gif HTTP/1.1" 200 23903

213.145.165.82 - - [24/Feb/2008:00:16:19 -0600] "GET /ply/ HTTP/1.1" 200 8018

128.143.38.83 - - [24/Feb/2008:00:31:39 -0600] "GET /favicon.ico HTTP/1.1" 404 133



In [21]:
# gengrep.py
#
# Grep a sequence of lines that match a re pattern

import re
import itertools


def gen_grep(pat, lines):
    patc = re.compile(pat)
    for line in lines:
        try:
            line.decode()
        except AttributeError:
            if patc.search(line):
                yield line


# Example use

if __name__ == "__main__":
    from genfind import gen_find
    from genopen import gen_open
    from gencat import gen_cat

    lognames = gen_find("access-log*", "www")
    logfiles = gen_open(lognames)
    loglines = gen_cat(logfiles)

    # Look for ply downloads (PLY is my own Python package)
    plylines = gen_grep(r"ply-.*\.gz", loglines)
    for line in itertools.islice(plylines, 10):
        print(line)

131.107.0.112 - - [24/Feb/2008:03:02:22 -0600] "GET /ply/ply-2.2.tar.gz HTTP/1.1" 200 142210

74.6.8.73 - - [24/Feb/2008:03:09:17 -0600] "GET /ply/ply-1.2.tar.gz HTTP/1.0" 200 64334

74.6.8.73 - - [24/Feb/2008:03:34:07 -0600] "GET /ply/ply-2.1.tar.gz HTTP/1.0" 200 107720

61.230.94.215 - - [24/Feb/2008:04:45:55 -0600] "GET /ply/ply-1.0.tar.gz HTTP/1.1" 200 60130

61.230.94.215 - - [24/Feb/2008:04:48:05 -0600] "GET /ply/ply-2.3.tar.gz HTTP/1.1" 200 115318

150.210.155.167 - - [24/Feb/2008:09:22:11 -0600] "GET /ply/ply-2.3.tar.gz HTTP/1.1" 200 115318

74.6.8.73 - - [24/Feb/2008:10:34:02 -0600] "GET /ply/ply-1.3.1.tar.gz HTTP/1.0" 304 -

201.141.81.60 - - [24/Feb/2008:13:33:31 -0600] "GET /ply/ply-2.3.tar.gz HTTP/1.1" 200 115318

74.6.22.143 - - [24/Feb/2008:14:57:30 -0600] "GET /ply/ply-1.4.tar.gz HTTP/1.0" 200 66002

189.13.184.120 - - [24/Feb/2008:14:59:40 -0600] "GET /ply/ply-2.3.tar.gz HTTP/1.1" 200 115318



In [22]:
# bytesgen.py
#
# An example of chaining together different generators into a processing
# pipeline.

from genfind import *
from genopen import *
from gencat import *
from gengrep import *

pat = r"ply-.*\.gz"
logdir = "www"

filenames = gen_find("access-log*", logdir)
logfiles = gen_open(filenames)
loglines = gen_cat(logfiles)
patlines = gen_grep(pat, loglines)
bytecol = (line.rsplit(None, 1)[1] for line in patlines)
bytes = (int(x) for x in bytecol if x != "-")

print("Total", sum(bytes))

Total 37891196


# Parsing and Processing Data

access log

```
140.180.132.213 - - [24/Feb/2008:00:08:59 -0600] "GET /ply/ply.html HTTP/1.1" 200 97238
140.180.132.213 - - [24/Feb/2008:00:08:59 -0600] "GET /favicon.ico HTTP/1.1" 404 133
75.54.118.139 - - [24/Feb/2008:00:15:40 -0600] "GET / HTTP/1.1" 200 4447
75.54.118.139 - - [24/Feb/2008:00:15:41 -0600] "GET /images/Davetubes.jpg HTTP/1.1" 200 60025
75.54.118.139 - - [24/Feb/2008:00:15:42 -0600] "GET /favicon.ico HTTP/1.1" 404 133
75.54.118.139 - - [24/Feb/2008:00:15:49 -0600] "GET /software.html HTTP/1.1" 200 3163
75.54.118.139 - - [24/Feb/2008:00:16:10 -0600] "GET /ply/index.html HTTP/1.1" 200 8018
75.54.118.139 - - [24/Feb/2008:00:16:11 -0600] "GET /ply/bookplug.gif HTTP/1.1" 200 23903
213.145.165.82 - - [24/Feb/2008:00:16:19 -0600] "GET /ply/ HTTP/1.1" 200 8018
```

In [24]:
# retuple.py
#
# Read a sequence of log lines and parse them into a sequence of tuples

loglines = open("access-log")

import re
import itertools

logpats = r"(\S+) (\S+) (\S+) \[(.*?)\] " r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat = re.compile(logpats)

groups = (logpat.match(line) for line in loglines)
tuples = (g.groups() for g in groups if g)

if __name__ == "__main__":
    for t in itertools.islice(tuples, 10):
        print(t)

('71.57.91.136', '-', '-', '28/Feb/2008:12:39:34 -0600', 'GET', '/favicon.ico', 'HTTP/1.1', '404', '133')
('71.57.91.136', '-', '-', '28/Feb/2008:12:39:34 -0600', 'GET', '/dynamic/06FilesAndText.pdf', 'HTTP/1.1', '206', '181019')
('128.135.24.9', '-', '-', '28/Feb/2008:12:41:14 -0600', 'GET', '/', 'HTTP/1.1', '200', '4447')
('128.135.24.9', '-', '-', '28/Feb/2008:12:41:14 -0600', 'GET', '/images/Davetubes.jpg', 'HTTP/1.1', '200', '60025')
('128.135.24.9', '-', '-', '28/Feb/2008:12:41:14 -0600', 'GET', '/favicon.ico', 'HTTP/1.1', '404', '133')
('128.135.24.9', '-', '-', '28/Feb/2008:12:41:19 -0600', 'GET', '/dynamic/index.html', 'HTTP/1.1', '200', '5313')
('128.135.24.9', '-', '-', '28/Feb/2008:12:41:23 -0600', 'GET', '/dynamic/07Functional.pdf', 'HTTP/1.1', '200', '133908')
('208.97.218.10', '-', '-', '28/Feb/2008:12:50:17 -0600', 'GET', '/python.html', 'HTTP/1.1', '200', '18870')
('208.97.218.10', '-', '-', '28/Feb/2008:12:50:17 -0600', 'GET', '/images/NerdRanchEurope.jpg', 'HTTP/1.1'

In [25]:
# redict.py
#
# Read a sequence of log lines and parse them into a sequence of dictionaries

loglines = open("access-log")

import re
import itertools

logpats = r"(\S+) (\S+) (\S+) \[(.*?)\] " r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat = re.compile(logpats)

groups = (logpat.match(line) for line in loglines)
tuples = (g.groups() for g in groups if g)

colnames = (
    "host",
    "referrer",
    "user",
    "datetime",
    "method",
    "request",
    "proto",
    "status",
    "bytes",
)

log = (dict(list(zip(colnames, t))) for t in tuples)

if __name__ == "__main__":
    for x in  itertools.islice(log, 10):
        print(x)

{'host': '71.57.91.136', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:39:34 -0600', 'method': 'GET', 'request': '/favicon.ico', 'proto': 'HTTP/1.1', 'status': '404', 'bytes': '133'}
{'host': '71.57.91.136', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:39:34 -0600', 'method': 'GET', 'request': '/dynamic/06FilesAndText.pdf', 'proto': 'HTTP/1.1', 'status': '206', 'bytes': '181019'}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/', 'proto': 'HTTP/1.1', 'status': '200', 'bytes': '4447'}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/images/Davetubes.jpg', 'proto': 'HTTP/1.1', 'status': '200', 'bytes': '60025'}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/favicon.ico', 'proto': 'HTTP/1.1', 'status': '404', 'bytes': '133'}
{'host

In [26]:
# fieldmap.py
#
# Take a sequence of dictionaries and remap one of the fields


def field_map(dictseq, name, func):
    for d in dictseq:
        d[name] = func(d[name])
        yield d


# Example

if __name__ == "__main__":

    loglines = open("access-log")

    import re
    import itertools
    
    logpats = r"(\S+) (\S+) (\S+) \[(.*?)\] " r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

    logpat = re.compile(logpats)

    groups = (logpat.match(line) for line in loglines)
    tuples = (g.groups() for g in groups if g)

    colnames = (
        "host",
        "referrer",
        "user",
        "datetime",
        "method",
        "request",
        "proto",
        "status",
        "bytes",
    )

    log = (dict(list(zip(colnames, t))) for t in tuples)

    log = field_map(log, "status", int)
    log = field_map(log, "bytes", lambda s: int(s) if s != "-" else 0)

    for x in  itertools.islice(log, 10):
        print(x)

{'host': '71.57.91.136', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:39:34 -0600', 'method': 'GET', 'request': '/favicon.ico', 'proto': 'HTTP/1.1', 'status': 404, 'bytes': 133}
{'host': '71.57.91.136', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:39:34 -0600', 'method': 'GET', 'request': '/dynamic/06FilesAndText.pdf', 'proto': 'HTTP/1.1', 'status': 206, 'bytes': 181019}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/', 'proto': 'HTTP/1.1', 'status': 200, 'bytes': 4447}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/images/Davetubes.jpg', 'proto': 'HTTP/1.1', 'status': 200, 'bytes': 60025}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/favicon.ico', 'proto': 'HTTP/1.1', 'status': 404, 'bytes': 133}
{'host': '128.135.24.9', '

In [27]:
# linesdir.py
#
# Generate a sequence of lines from files in a directory

from genfind import *
from gencat import *
from genopen import *
import itertools

def lines_from_dir(filepat, dirname):
    names = gen_find(filepat, dirname)
    files = gen_open(names)
    lines = gen_cat(files)
    return lines


# Example use

if __name__ == "__main__":
    loglines = lines_from_dir("access-log*", "www")
    for line in  itertools.islice(loglines, 10):
        print(line, end=" ")

140.180.132.213 - - [24/Feb/2008:00:08:59 -0600] "GET /ply/ply.html HTTP/1.1" 200 97238
 140.180.132.213 - - [24/Feb/2008:00:08:59 -0600] "GET /favicon.ico HTTP/1.1" 404 133
 75.54.118.139 - - [24/Feb/2008:00:15:40 -0600] "GET / HTTP/1.1" 200 4447
 75.54.118.139 - - [24/Feb/2008:00:15:41 -0600] "GET /images/Davetubes.jpg HTTP/1.1" 200 60025
 75.54.118.139 - - [24/Feb/2008:00:15:42 -0600] "GET /favicon.ico HTTP/1.1" 404 133
 75.54.118.139 - - [24/Feb/2008:00:15:49 -0600] "GET /software.html HTTP/1.1" 200 3163
 75.54.118.139 - - [24/Feb/2008:00:16:10 -0600] "GET /ply/index.html HTTP/1.1" 200 8018
 75.54.118.139 - - [24/Feb/2008:00:16:11 -0600] "GET /ply/bookplug.gif HTTP/1.1" 200 23903
 213.145.165.82 - - [24/Feb/2008:00:16:19 -0600] "GET /ply/ HTTP/1.1" 200 8018
 128.143.38.83 - - [24/Feb/2008:00:31:39 -0600] "GET /favicon.ico HTTP/1.1" 404 133
 

In [28]:
# apachelog.py
#
# Parse an apache log file into a sequence of dictionaries

from fieldmap import *

import re
import itertools

logpats = r"(\S+) (\S+) (\S+) \[(.*?)\] " r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat = re.compile(logpats)


def skip_byte_line(lines):
    for line in lines:
        try:
            line.decode()
        except AttributeError:
            yield line

def apache_log(lines):
    lines =  skip_byte_line(lines)
    groups = (logpat.match(line) for line in lines)
    tuples = (g.groups() for g in groups if g)

    colnames = (
        "host",
        "referrer",
        "user",
        "datetime",
        "method",
        "request",
        "proto",
        "status",
        "bytes",
    )

    log = (dict(list(zip(colnames, t))) for t in tuples)
    log = field_map(log, "status", int)
    log = field_map(log, "bytes", lambda s: int(s) if s != "-" else 0)

    return log


# Example use:

if __name__ == "__main__":
    from linesdir import *

    lines = lines_from_dir("access-log*", "www")
    log = apache_log(lines)
    for r in itertools.islice(loglines, 10):
        print(r)


128.143.38.83 - - [24/Feb/2008:00:31:51 -0600] "GET /favicon.ico HTTP/1.1" 404 133

86.132.71.214 - - [24/Feb/2008:00:37:55 -0600] "GET /python.html HTTP/1.1" 200 18870

86.132.71.214 - - [24/Feb/2008:00:37:55 -0600] "GET /images/NerdRanchEurope.jpg HTTP/1.1" 200 99542

86.132.71.214 - - [24/Feb/2008:00:37:56 -0600] "GET /favicon.ico HTTP/1.1" 404 133

86.132.71.214 - - [24/Feb/2008:00:37:56 -0600] "GET /favicon.ico HTTP/1.1" 404 133

86.132.71.214 - - [24/Feb/2008:00:38:35 -0600] "GET /favicon.ico HTTP/1.1" 404 133

74.6.25.144 - - [24/Feb/2008:00:48:16 -0600] "GET /dynamic/01Introduction.pdf HTTP/1.0" 200 3110734

74.6.7.122 - - [24/Feb/2008:00:56:36 -0600] "GET /python/tutorial/beazley_intro_python/Slides/SLIDE113.HTM HTTP/1.0" 200 1095

125.25.238.64 - - [24/Feb/2008:01:04:47 -0600] "GET /ply/ HTTP/1.1" 200 8018

125.25.238.64 - - [24/Feb/2008:01:04:49 -0600] "GET /ply/bookplug.gif HTTP/1.1" 200 12382



In [2]:
# query404.py
#
# Find the set of all documents that 404 in a log file

from linesdir import *
from apachelog import *

import itertools

lines = lines_from_dir("access-log*", "www")
log = apache_log(lines)

stat404 = set(r["request"] for r in log if r["status"] == 404)

for r in  itertools.islice(sorted(stat404), 10):
    print(r)

/02WorkingWithData.pdf
/06FilesAndText.pdf
/07Functional.pdf
/Doc/index.html
/PLYTalk.pdf
/Perl98/swigperl.htm
/Py96/python96.html
/Py97/beazley.html
/Python2001/python.html
/README


In [3]:
# largefiles.py
#
# Find all transfers over a megabyte

from linesdir import *
from apachelog import *
import itertools

lines = lines_from_dir("access-log*", "www")
log = apache_log(lines)

large = (r for r in log if r["bytes"] > 1000000)

for r in  itertools.islice(large, 10):
    print((r["request"], r["bytes"]))


('/dynamic/01Introduction.pdf', 3110734)
('/dynamic/ffcache.zip', 4919642)
('/dynamic/ffcache.zip', 4919642)
('/dynamic/ffcache.zip', 4919642)
('/dynamic/01Introduction.pdf', 3108482)
('/dynamic/ffcache.zip', 4919642)
('/dynamic/02WorkingWithData.pdf', 3246437)
('/dynamic/ffcache.zip', 4919642)
('/dynamic/ffcache.zip', 4919642)
('/dynamic/02WorkingWithData.pdf', 2935451)


In [3]:
# largest.py
#
# Find the largest file

from linesdir import *
from apachelog import *

lines = lines_from_dir("access-log*", "www")
log = apache_log(lines)

print("%d %s" % max((r["bytes"], r["request"]) for r in log))


4919642 /dynamic/ffcache.zip


In [4]:
# hosts.py
#
# Find unique host IP addresses

from linesdir import *
from apachelog import *
import itertools

lines = lines_from_dir("access-log*", "www")
log = apache_log(lines)

hosts = set(r["host"] for r in log)
for h in  itertools.islice(hosts, 10):
    print(h)


81.255.238.189
203.73.43.189
198.54.202.210
203.166.87.218
62.153.70.82
80.229.38.64
84.110.221.201
210.245.52.8
83.204.240.53
74.6.26.198


In [2]:
# downloads.py
#
# Find out how many downloads of a specific request

from linesdir import *
from apachelog import *

lines = lines_from_dir("access-log*", "www")
log = apache_log(lines)

request = "ply/ply-2.3.tar.gz"

total = sum(1 for r in log if r["request"] == "/ply/ply-2.3.tar.gz")

print("Total", total)


Total 690


In [5]:
# robots.py
#
# Find out who has been hitting robots.txt

from linesdir import *
from apachelog import *
import itertools

lines = lines_from_dir("access-log*", "www")
log = apache_log(lines)

addrs = set(r["host"] for r in log if "robots.txt" in r["request"])

import socket

for addr in  itertools.islice(addrs, 10):
    try:
        print(socket.gethostbyaddr(addr)[0])
    except socket.herror:
        print(addr)


msnbot-65-55-212-77.search.msn.com
9.80-202-87.nextgentel.com
www.whois.sc
72.36.114.239
222.122.236.43
64.124.85.71.broadcast.zip.zayo.com
64.124.85.75.broadcast.zip.zayo.com
crawl-b04-s3.orangebot.orange.fr
static-88.131.106.15.addr.tdc.se
65.55.232.15


In [None]:
# なんかうまく動かない
# robotsfast.py
#
# Find out who has been hitting robots.txt

from linesdir import *
from apachelog import *

lines = lines_from_dir("access-log*", "www")
lines = (line for line in lines if "robots.txt" in line)
log = apache_log(lines)

addrs = set(r["host"] for r in log if "robots.txt" in r["request"])

import socket

for addr in addrs:
    try:
        print(socket.gethostbyaddr(addr)[0])
    except socket.herror:
        print(addr)


# Processing Infinite Data

In [None]:
# follow.py
#
# Follow a file like tail -f.

import time


def follow(thefile):
    thefile.seek(0, 2)
    while True:
        line = thefile.readline()
        if not line:
            time.sleep(0.1)
            continue
        yield line


# Example use
# Note : This example requires the use of an apache log simulator.
#
# Go to the directory run/foo and run the program 'logsim.py' from
# that directory.   Run this program as a background process and
# leave it running in a separate window.  We'll write program
# that read the output file being generated
#

if __name__ == "__main__":
    logfile = open("run/foo/access-log", "r")
    loglines = follow(logfile)
    for line in loglines:
        print(line)

In [None]:
# realtime404.py
#
# Print all 404 requests as they happen in the log

from apachelog import *
from follow import *

logfile  = open("run/foo/access-log")
loglines = follow(logfile)
log      = apache_log(loglines)

r404 = (r for r in log if r['status'] == 404)

for r in r404:
    print(r['host'], r['datetime'], r['request'])


# Feeding the Pipeline

In [None]:
# genreceive.py
#
# A generator that yields connections to a TCP socket

import socket


def receive_connections(addr):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind(addr)
    s.listen(5)
    while True:
        client = s.accept()
        yield client


# Example use

if __name__ == "__main__":
    for c, a in receive_connections(("", 9000)):
        print("Got connection from", a)
        c.send("Hello World\n")
        c.close()

In [None]:
# genmessages.py
#
# A generator that yields messages on a UDP socket

import socket


def receive_messages(addr, maxsize):
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    s.bind(addr)
    while True:
        msg = s.recvfrom(maxsize)
        yield msg


# Example use
# To send a message to this generator, use the code "msgtest.py"

if __name__ == "__main__":
    for msg, addr in receive_messages(("", 10000), 1024):
        print((msg, "from", addr))


# Extending the pipeline


In [None]:
# genpickle.py
#
# Turn a sequence of objects into a sequence of pickle strings

import pickle


def gen_pickle(source):
    for item in source:
        yield pickle.dumps(item)


def gen_unpickle(infile):
    while True:
        try:
            item = pickle.load(infile)
            yield item
       except EOFError:
            return


In [None]:
# recivefrom.py
#
# Receive objects from a different machine

import socket
from genpickle import *


def receivefrom(addr):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind(addr)
    s.listen(5)
    c, a = s.accept()
    for item in gen_unpickle(c.makefile()):
        yield item
    c.close()


# Example use:
if __name__ == "__main__":
    for r in receivefrom(("127.0.0.1", 15000)):
        print(r["host"], r["request"])

In [None]:
# sendto.py
#
# Send items to a remote machine

import socket
from genpickle import *


def sendto(source, addr):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(addr)
    for pitem in gen_pickle(source):
        s.sendall(pitem)
    s.close()


# Example use.   This requires you to run receivefrom.py
# in a different process/window

if __name__ == "__main__":
    from apachelog import *
    from follow import *

    lines = follow(open("run/foo/access-log"))
    log = apache_log(lines)
    sendto(log, ("127.0.0.1", 15000))

In [None]:
# genqueue.py
#
# Generate a sequence of items that put onto a queue


def sendto_queue(source, thequeue):
    for item in source:
        thequeue.put(item)
    thequeue.put(StopIteration)


def genfrom_queue(thequeue):
    while True:
        item = thequeue.get()
        if item is StopIteration:
            break
        yield item


# Example
if __name__ == "__main__":

    # A consumer.   Prints out 404 records.
    def print_r404(log_q):
        log = genfrom_queue(log_q)
        r404 = (r for r in log if r["status"] == 404)
        for r in r404:
            print(r["host"], r["datetime"], r["request"])

    import queue, threading
    from follow import *
    from apachelog import *

    log_q = queue.Queue()
    log_thr = threading.Thread(target=print_r404, args=(log_q,))
    log_thr.setDaemon(True)
    log_thr.start()

    # Feed the consumer thread
    lines = follow(open("run/foo/access-log"))
    log = apache_log(lines)
    sendto_queue(log, log_q)

# Advanced Data Routing


In [None]:
# genmultiplex.py

import threading, queue
from genqueue import *
from gencat import *


def multiplex(sources):
    in_q = queue.Queue()
    consumers = []
    for s in sources:
        thr = threading.Thread(target=sendto_queue, args=(s, in_q))
        thr.start()
        consumers.append(genfrom_queue(in_q))
    return gen_cat(consumers)


if __name__ == "__main__":
    import follow

    foo_log = follow.follow(open("run/foo/access-log"))
    bar_log = follow.follow(open("run/bar/access-log"))
    for line in multiplex([foo_log, bar_log]):
        print(line)

In [None]:
# broadcast.py
#
# Broadcast a generator source to a collection of consumers


def broadcast(source, consumers):
    for item in source:
        for c in consumers:
            c.send(item)


# Example
if __name__ == "__main__":

    class Consumer(object):
        def send(self, item):
            print(self, "got", item)

    c1 = Consumer()
    c2 = Consumer()
    c3 = Consumer()

    from follow import *

    lines = follow(open("run/foo/access-log"))
    broadcast(lines, [c1, c2, c3])

In [None]:
# netsend.py
#
# Consume items and send them to a remote machine

import socket, pickle


class NetConsumer(object):
    def __init__(self, addr):
        self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.s.connect(addr)

    def send(self, item):
        pitem = pickle.dumps(item)
        self.s.sendall(pitem)

    def close(self):
        self.s.close()


# Example use.  This requires you to run receivefrom.py first.

if __name__ == "__main__":
    from broadcast import *
    from follow import *
    from apachelog import *

    # A class that sends 404 requests to another host
    class Stat404(NetConsumer):
        def send(self, item):
            if item["status"] == 404:
                NetConsumer.send(self, item)

    stat404 = Stat404(("", 15000))

    lines = follow(open("run/foo/access-log"))
    log = apache_log(lines)
    broadcast(log, [stat404])


In [None]:
# consthread.py

import queue, threading
from genqueue import genfrom_queue


class ConsumerThread(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.setDaemon(True)
        self.in_q = queue.Queue()
        self.target = target

    def send(self, item):
        self.in_q.put(item)

    def run(self):
        self.target(genfrom_queue(self.in_q))


# Example use
if __name__ == "__main__":
    from follow import *
    from apachelog import *
    from broadcast import *

    def find_404(log):
        for r in (r for r in log if r["status"] == 404):
            print(r["status"], r["datetime"], r["request"])

    def bytes_transferred(log):
        total = 0
        for r in log:
            total += r["bytes"]
            print("Total bytes", total)

    c1 = ConsumerThread(find_404)
    c1.start()
    c2 = ConsumerThread(bytes_transferred)
    c2.start()

    lines = follow(open("run/foo/access-log"))  # Follow a log
    log = apache_log(lines)  # Turn into records
    broadcast(log, [c1, c2])  # Broadcast to consumers


# Various Programming Tricks (And Debugging)

In [32]:
# gentrace.py
#
# Trace a generator by printing items received
import itertools


def trace(source):
    for item in source:
        print(item)
        yield item


# Example use
if __name__ == "__main__":
    from apachelog import *

    lines = open("access-log")
    log = trace(apache_log(lines))
    r404 = itertools.islice((r for r in log if r["status"] == 404), 3)

    for r in r404:
        pass

{'host': '71.57.91.136', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:39:34 -0600', 'method': 'GET', 'request': '/favicon.ico', 'proto': 'HTTP/1.1', 'status': 404, 'bytes': 133}
{'host': '71.57.91.136', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:39:34 -0600', 'method': 'GET', 'request': '/dynamic/06FilesAndText.pdf', 'proto': 'HTTP/1.1', 'status': 206, 'bytes': 181019}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/', 'proto': 'HTTP/1.1', 'status': 200, 'bytes': 4447}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/images/Davetubes.jpg', 'proto': 'HTTP/1.1', 'status': 200, 'bytes': 60025}
{'host': '128.135.24.9', 'referrer': '-', 'user': '-', 'datetime': '28/Feb/2008:12:41:14 -0600', 'method': 'GET', 'request': '/favicon.ico', 'proto': 'HTTP/1.1', 'status': 404, 'bytes': 133}
{'host': '128.135.24.9', '

In [None]:
# storelast.py
#
# An iterator that stores the last value returned.  

class storelast(object):
    def __init__(self,source):
        self.source = source
    def __next__(self):
        item = next(self.source)
        self.last = item
        return item
    def __iter__(self):
        return self

# Example
if __name__ == '__main__':
    from follow import *
    from apachelog import *

    lines = storelast(follow(open("run/foo/access-log")))
    log   = apache_log(lines)

    for r in log:
        print(r)
        print(lines.last)

In [None]:
# genshutdown.py
#
# Example of shutting down a generator
#
# Requires you to run run/foo/logsim.py to get a real-time source

from follow import *

lines = follow(open("run/foo/access-log"))
for i, line in enumerate(lines):
    print(line, end=" ")
    if i == 10:
        lines.close()

# Parsing and Printing


In [44]:
# genrecord.py
import struct


def gen_records(record_format, thefile):
    record_size = struct.calcsize(record_format)
    while True:
        raw_record = thefile.read(record_size)
        if not raw_record:
            print('==== no more records ===')
            break
        yield struct.unpack(record_format, raw_record)


# Example use
if __name__ == "__main__":
    f = open("stockdata.bin", "rb")
    for name, shares, price in gen_records("<8sif", f):
        print("%10s %10d %10.2f" % (name.decode(), shares, price))


  AA              100      32.20
  IBM              50      91.10
  CAT             150      83.44
  MSFT            200      51.23
  GE               95      40.37
  MSFT             50      65.10
  IBM             100      70.44
==== no more records ===


In [49]:
# print_count.py


def print_count(n):
    yield "Hello World\n"
    yield "\n"
    yield "Look at me count to %d\n" % n
    for i in range(n):
        yield "   %d\n" % i
    yield "I'm done!\n"


# Example:

if __name__ == "__main__":
    out = print_count(10)
    print("".join(out))

    # Route to a file
    out = print_count(5)
    f = open("count.txt", "wb")
    print("")
    for chunk in out:
        f.write(chunk.encode('utf-8'))
    f.close()

Hello World

Look at me count to 10
   0
   1
   2
   3
   4
   5
   6
   7
   8
   9
I'm done!


inside of count.txt


In [51]:
%cat count.txt

Hello World

Look at me count to 5
   0
   1
   2
   3
   4
I'm done!


# Co-routines

In [52]:
# recvcount.py
#
# Example of a co-routine


def recv_count():
    try:
        while True:
            n = (yield) # <= co-routine
            print("T-minus", n)
    except GeneratorExit:
        print("Kaboom!")


r = recv_count()
next(r)
for i in range(5, 0, -1):
    r.send(i)

r.close()

T-minus 5
T-minus 4
T-minus 3
T-minus 2
T-minus 1
Kaboom!


In [53]:
# consumer.py
#
# consumer decorator and co-routine example


def consumer(func):
    def start(*args, **kwargs):
        c = func(*args, **kwargs)
        next(c)
        return c

    return start


# Example
if __name__ == "__main__":

    @consumer # <= 上と違って、next() しなくて良くなる
    def recv_count():
        try:
            while True:
                n = (yield)
                print("T-minus", n)
        except GeneratorExit:
            print("Kaboom!")

    r = recv_count()
    for i in range(5, 0, -1):
        r.send(i)

    r.close()

T-minus 5
T-minus 4
T-minus 3
T-minus 2
T-minus 1
Kaboom!


In [None]:
# logcoroutine.py
#
# An example of using co-routines to define consumers for the Apache log data

from consumer import *
from apachelog import *
from follow import *
from broadcast import *

@consumer
def find_404():
    while True:
        r = (yield)
        if r['status'] == 404:
            print(r['status'],r['datetime'],r['request'])

@consumer
def bytes_transferred():
    total = 0
    while True:
        r = (yield)
        total += r['bytes']
        print("Total bytes", total)

lines = follow(open("run/foo/access-log"))
log   = apache_log(lines)

broadcast(log, [find_404(),bytes_transferred()])