# Scrape haikus from different places on the internet

URLs used:

* [http://www.hsa-haiku.org/hendersonawards/henderson.htm](http://www.hsa-haiku.org/hendersonawards/henderson.htm)
* [http://www.hsa-haiku.org/bradyawards/brady.htm](http://www.hsa-haiku.org/bradyawards/brady.htm)
* [http://www.hsa-haiku.org/museumhaikuliteratureawards/museumhaikuliterature-award.htm](http://www.hsa-haiku.org/museumhaikuliteratureawards/museumhaikuliterature-award.htm)
* [http://www.hsa-haiku.org/virgilioawards/virgilio.htm](http://www.hsa-haiku.org/virgilioawards/virgilio.htm)
* [http://sacred-texts.com/shi/jh/index.htm](http://sacred-texts.com/shi/jh/index.htm)
* [https://www.thehaikufoundation.org/per-diem-archive/](https://www.thehaikufoundation.org/per-diem-archive/)
* [https://www.ahapoetry.com/aadoh/h_dictionary.htm](https://www.ahapoetry.com/aadoh/h_dictionary.htm)

This notebook writes the haikus to separate `*.txt` files, one haiku per line, with the lines of each haiku separated by tabs.

Note that the `scrape_data.py` file in this folder is the final result of this notebook; the notebook itself was only used to prototype the scraping interactively.

In [27]:
import string
from requests_html import HTMLSession

In [76]:
# Characters kept in ASCII mode: a-z plus newline and space, so that line
# breaks and word boundaries survive the filtering.
ALPHABET = frozenset(string.ascii_lowercase) | {'\n', ' '}


def preprocess(text, use_ascii=True):
    """
        Lowercase ``text`` and lazily drop every character that is not kept.

        With ``use_ascii=True`` (the default) a character is kept when it is a
        member of ``ALPHABET`` (a-z, newline, space). With ``use_ascii=False``
        ``str.isalpha`` decides instead, which keeps non-ASCII letters but
        also drops spaces and newlines.

        Returns a ``filter`` iterator over single characters; join it to get
        a string.

        Examples:
        >>> text = 'ABC.,#'
        >>> ''.join(preprocess(text))
        'abc'
        >>> text = 'ÈÆÖÉEAEOE,.%'
        >>> ''.join(preprocess(text, use_ascii=False))
        'èæöéeaeoe'
    """
    keep = ALPHABET.__contains__ if use_ascii else str.isalpha
    return filter(keep, text.lower())

## Get Henderson awarded haikus from [http://www.hsa-haiku.org/hendersonawards/henderson.htm](http://www.hsa-haiku.org/hendersonawards/henderson.htm)

In [30]:
# Create the HTTP session; the later cells reuse it for every request.
session = HTMLSession()
# Download the Henderson awards page; the next cell queries `r.html`.
r = session.get('http://www.hsa-haiku.org/hendersonawards/henderson.htm')

In [75]:
# Clean the text of every element matched by 'td > blockquote > p' (each one
# is treated as a single haiku) and store the results in henderson.txt.
haikus = []
for paragraph in r.html.find('td > blockquote > p'):
    # Lowercase, keep only a-z / space / newline, then trim both ends.
    cleaned = ''.join(preprocess(paragraph.text)).strip()
    if len(cleaned) <= 1:
        continue  # nothing usable left after cleaning
    # Skip raw lines of at most one character, collapse whitespace runs in
    # the remaining lines and join them with tabs.
    rows = [' '.join(row.split()) for row in cleaned.split('\n') if len(row) > 1]
    haikus.append('\t'.join(rows))

# One haiku per output line.
with open('henderson.txt', 'w') as f:
    f.writelines(haiku + '\n' for haiku in haikus)

## Get Brady awarded haikus from [http://www.hsa-haiku.org/bradyawards/brady.htm](http://www.hsa-haiku.org/bradyawards/brady.htm)

In [81]:
# Download the Brady awards page with the existing session; the next cell queries `r.html`.
r = session.get('http://www.hsa-haiku.org/bradyawards/brady.htm')

In [82]:
# Clean the text of every element matched by 'td > blockquote > p' (each one
# is treated as a single haiku) and store the results in brady.txt.
haikus = []
for paragraph in r.html.find('td > blockquote > p'):
    # Lowercase, keep only a-z / space / newline, then trim both ends.
    cleaned = ''.join(preprocess(paragraph.text)).strip()
    if len(cleaned) <= 1:
        continue  # nothing usable left after cleaning
    # Skip raw lines of at most one character, collapse whitespace runs in
    # the remaining lines and join them with tabs.
    rows = [' '.join(row.split()) for row in cleaned.split('\n') if len(row) > 1]
    haikus.append('\t'.join(rows))

# One haiku per output line.
with open('brady.txt', 'w') as f:
    f.writelines(haiku + '\n' for haiku in haikus)

## Get Museum Haiku awarded haikus from [http://www.hsa-haiku.org/museumhaikuliteratureawards/museumhaikuliterature-award.htm](http://www.hsa-haiku.org/museumhaikuliteratureawards/museumhaikuliterature-award.htm)

In [122]:
# Download the Museum of Haiku Literature awards page; the next cell queries `r.html`.
r = session.get('http://www.hsa-haiku.org/museumhaikuliteratureawards/museumhaikuliterature-award.htm')

In [123]:
# Clean the text of every <p class="haiku"> element (each one is treated as
# a single haiku) and store the results in museum.txt.
haikus = []
for paragraph in r.html.find('p.haiku'):
    # Lowercase, keep only a-z / space / newline, then trim both ends.
    cleaned = ''.join(preprocess(paragraph.text)).strip()
    if len(cleaned) <= 1:
        continue  # nothing usable left after cleaning
    # Skip raw lines of at most one character, collapse whitespace runs in
    # the remaining lines and join them with tabs.
    rows = [' '.join(row.split()) for row in cleaned.split('\n') if len(row) > 1]
    haikus.append('\t'.join(rows))

# One haiku per output line.
with open('museum.txt', 'w') as f:
    f.writelines(haiku + '\n' for haiku in haikus)

## Get Virgilio awarded haikus from [http://www.hsa-haiku.org/virgilioawards/virgilio.htm](http://www.hsa-haiku.org/virgilioawards/virgilio.htm)

In [124]:
# Download the Virgilio awards page with the existing session; the next cell queries `r.html`.
r = session.get('http://www.hsa-haiku.org/virgilioawards/virgilio.htm')

In [125]:
# Clean the text of every element with class "haiku" (each one is treated as
# a single haiku) and store the results in virgilio.txt.
haikus = []
for element in r.html.find('.haiku'):
    # Lowercase, keep only a-z / space / newline, then trim both ends.
    cleaned = ''.join(preprocess(element.text)).strip()
    if len(cleaned) <= 1:
        continue  # nothing usable left after cleaning
    # Skip raw lines of at most one character, collapse whitespace runs in
    # the remaining lines and join them with tabs.
    rows = [' '.join(row.split()) for row in cleaned.split('\n') if len(row) > 1]
    haikus.append('\t'.join(rows))

# One haiku per output line.
with open('virgilio.txt', 'w') as f:
    f.writelines(haiku + '\n' for haiku in haikus)

## Get haikus from [http://sacred-texts.com/shi/jh/index.htm](http://sacred-texts.com/shi/jh/index.htm)

In [132]:
# Sub-pages jh02.htm .. jh07.htm of the sacred-texts collection.
SUB_URLS = [f'http://sacred-texts.com/shi/jh/jh0{i}.htm' for i in range(2, 8)]

all_haikus = []
for url in SUB_URLS:
    r = session.get(url)
    for paragraph in r.html.find('td > p'):
        # Lowercase, keep only a-z / space / newline, then trim both ends.
        cleaned = ''.join(preprocess(paragraph.text)).strip()
        # Only paragraphs longer than 12 characters are kept; presumably this
        # weeds out short non-haiku paragraphs -- TODO confirm the threshold.
        if len(cleaned) <= 12:
            continue
        # Skip raw lines of at most one character, collapse whitespace runs
        # in the remaining lines and join them with tabs.
        rows = [' '.join(row.split()) for row in cleaned.split('\n') if len(row) > 1]
        all_haikus.append('\t'.join(rows))

# One haiku per output line.
with open('sacred.txt', 'w') as f:
    f.writelines(haiku + '\n' for haiku in all_haikus)

## Get haikus from [https://www.thehaikufoundation.org/per-diem-archive/](https://www.thehaikufoundation.org/per-diem-archive/)

In [149]:
# Fetch the archive index and lazily build the absolute URL of every
# 'li > a' link whose href contains 'IDcat'.
r = session.get('https://www.thehaikufoundation.org/per-diem-archive/')
hrefs = (a.attrs['href'] for a in r.html.find('li > a'))
# NOTE: `urls` is a one-shot generator; it is exhausted after one pass.
urls = (f'https://www.thehaikufoundation.org{u}' for u in hrefs if 'IDcat' in u)

In [150]:
# Visit every per-diem archive page and collect the cleaned text of each
# 'td > pre' element (treated as one haiku each) into `all_haikus`.
all_haikus = []
for url in urls:
    r = session.get(url)
    try:
        haikus = r.html.find('td > pre')
        haikus = (h.text for h in haikus)
        # Lowercase, keep only a-z / space / newline, then trim both ends.
        haikus = (''.join(preprocess(h)).strip() for h in haikus)
        haikus = filter(lambda x: len(x) > 1, haikus)
        haikus = (h.split('\n') for h in haikus)
        # Skip raw lines of at most one character, collapse whitespace runs
        # in the remaining lines and join them with tabs.
        haikus = ('\t'.join(' '.join(line.split()) for line in h if len(line) > 1) for h in haikus)

        # Materialise first so a failure part-way through adds nothing.
        all_haikus += list(haikus)
    except Exception as err:
        # Best-effort scrape: a page that cannot be processed is skipped, but
        # the skip is reported instead of being silently swallowed. Catching
        # Exception rather than using a bare `except` lets KeyboardInterrupt
        # and SystemExit stop the loop.
        print(f'Skipping {url}: {err!r}')

In [147]:
# Store the collected per-diem haikus, one per output line.
with open('perdiem.txt', 'w') as f:
    f.writelines(haiku + '\n' for haiku in all_haikus)

## Get haikus from [https://www.ahapoetry.com/aadoh/h_dictionary.htm](https://www.ahapoetry.com/aadoh/h_dictionary.htm)

In [167]:
# Fetch the dictionary index and lazily build an absolute URL from the href
# of every 'p > a' link.
r = session.get('https://www.ahapoetry.com/aadoh/h_dictionary.htm')
hrefs = (a.attrs['href'] for a in r.html.find('p > a'))
# NOTE: `urls` is a one-shot generator; it is exhausted after one pass.
urls = (f'https://www.ahapoetry.com/aadoh/{u}' for u in hrefs)

In [168]:
def key(x):
    """Is a given x a haiku?

    An element counts as a haiku when its ``attrs`` mapping has an ``align``
    entry equal to ``'center'``.

    Returns ``False`` when ``x`` has no ``attrs``, when ``attrs`` has no
    ``'align'`` key, or when ``attrs`` is not subscriptable.
    """
    try:
        return x.attrs['align'] == 'center'
    except (AttributeError, KeyError, TypeError):
        # Only lookup failures mean "not a haiku"; anything else (for example
        # KeyboardInterrupt) propagates instead of being swallowed.
        return False

# Visit every dictionary page, keep the <p> elements that `key` accepts,
# clean their text and store the results in aadoh.txt.
all_haikus = []
for url in urls:
    r = session.get(url)
    for paragraph in r.html.find('p'):
        if not key(paragraph):
            continue
        # Lowercase, keep only a-z / space / newline, then trim both ends.
        cleaned = ''.join(preprocess(paragraph.text)).strip()
        if len(cleaned) <= 1:
            continue  # nothing usable left after cleaning
        # Skip raw lines of at most one character, collapse whitespace runs
        # in the remaining lines and join them with tabs.
        rows = [' '.join(row.split()) for row in cleaned.split('\n') if len(row) > 1]
        all_haikus.append('\t'.join(rows))

# One haiku per output line.
with open('aadoh.txt', 'w') as f:
    f.writelines(haiku + '\n' for haiku in all_haikus)