In [9]:
'''Scrapes the Florida Senate and House member rosters and writes them to a CSV file (currently pols2.csv).'''

# coding: utf-8
import requests
from pyquery import PyQuery as pq
from nameparser import HumanName
from slugify import slugify #for python3 requirements module is now python-slugify, but keep from slugify import slugify

import re #place before import requests
from operator import itemgetter #place before import requests
import csv #place before import requests
from time import sleep

In [10]:
# Site roots, prepended to the relative links scraped from each chamber's pages.
HOUSEPRE = "http://www.myfloridahouse.gov"
SENATEPRE = "http://www.flsenate.gov"

# Roster pages that list the members of each chamber.
HOUSEURL = HOUSEPRE + "/Sections/Representatives/representatives.aspx"
SENATEURL = "https://www.flsenate.gov/Senators"   # note: https here, http in SENATEPRE

# Seconds to sleep before each per-member page request.
naptime = 0.1


In [11]:
def cleaner(text):
    """Normalize a scraped name string.

    Collapses all runs of whitespace (including CRLF line breaks) to single
    spaces, removes double-quoted one-word nicknames, drops "Dr." / "MD"
    honorifics, repairs one known mis-encoded surname, and strips the result.

    Args:
        text: Raw name text as extracted from the page.

    Returns:
        The cleaned name string.
    """
    text = text.replace("\r\n", " ")
    text = " ".join(text.split())           # Collapse random internal spaces
    text = text.replace(" , ", ", ")
    text = re.sub(r'\"\w+\"', '', text)    # Kill off nicknames.
    # Removing a nickname can leave a doubled space or a dangling " , " behind,
    # so tidy the spacing again before matching the honorific patterns.
    text = " ".join(text.split()).replace(" , ", ", ")
    text = text.replace(", MD, ", ", ")     # Sorry, Dr. Ralph E. Massullo
    text = text.replace(", Dr. ", ", ")     # Sorry Bush, Dr. James
    # Escape the dot: an unescaped "." would also strip prefixes like "Dre ".
    text = re.sub(r'^Dr\. ', '', text)
    text = re.sub(r', MD$', '', text)
    text = text.replace('FernÃ¡ndez', 'Fernández')   # Undo a mis-decoded UTF-8 surname
    return text.strip()

In [14]:
# Accumulates one row (a list of fields) per legislator across both chambers.
REPLIST = list()

# Browser-like User-Agent sent with the scraping requests.
HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) '
        'Gecko/20100101 Firefox/33.0'
    ),
}

In [13]:
# Scrape the Senate roster: one row of the table#Senators table per senator,
# plus one follow-up request per senator for the city and photo URL.
# The timeout keeps a stalled connection from hanging the cell indefinitely.
SENATE = requests.get(SENATEURL, headers=HEADERS, timeout=30).content
print(f"Processing senators at {SENATEURL}")
SENATORS = pq(SENATE)("table#Senators")
for senator in pq(SENATORS)("tr")[1:-1]:    # Skip header row and weird sorta footer row
    # Counties are taken from the row's class attribute: every token after the
    # first, with underscores standing in for spaces ("St_" for "St. ").
    countiesraw = pq(senator)("tr").attr('class') or ""
    counties = "|".join(sorted(countiesraw.split()[1:]))
    counties = counties.replace("St_", "St. ").replace("_", " ")
    personurl = SENATEPRE + pq(senator)("a").attr('href')
    alphaname = cleaner(pq(senator)("a").text())
    if alphaname == "Vacant":
        party = "vacant"
    else:
        # "Last, Jr., First" -> "Last, First, Jr." so the suffix comes last.
        # Only rearrange when all three parts are present; a name without
        # commas used to raise IndexError here.
        nameparts = alphaname.split(", ")
        if len(nameparts) >= 3 and nameparts[1] in ("Jr.", "Sr."):
            alphaname = nameparts[0] + ", " + nameparts[2] + ", " + nameparts[1]
        party = pq(senator)("td")[1].text.strip()[:1]   # Take just the first letter
    print("\t" + alphaname)
    district = pq(senator)("td")[0].text_content().strip()
    title = "Sen."
    chamber = "Senate"
    parsedname = HumanName(alphaname)
    first = parsedname.first
    last = parsedname.last
    middle = parsedname.middle
    suffix = parsedname.suffix
    if len(first) == 2: # fix for W. Travis
        first = first + " " + middle
        middle = ""
    name = first + " " + middle + " " + last + " " + suffix
    name = " ".join(name.split())       # Replace multiple spaces with one, via Jeremy Bowers and rdmurphy
    slug = slugify(title + " " + first + " " + last + " " + district)
    slug = slug.replace("NuÃ±ez", "Nunez").replace(u"Nuñez", "Nunez")
    slug = slug.lower()
    sleep(naptime)
    personhtml = requests.get(personurl, headers=HEADERS, timeout=30).content
    biohtml = str(pq(personhtml)('div#sidebar'))
    # City is whatever precedes ", FL " on the first sidebar line that has it.
    # Leave it blank when no such line exists instead of crashing on None.
    m = re.search('(^.+?)(, FL )', biohtml, re.MULTILINE)
    city = m.group(1).strip() if m else ""
    # A sidebar without an <img src> yields an empty photo URL rather than a TypeError.
    photosrc = pq(biohtml)("img").attr('src')
    photourl = SENATEPRE + photosrc if photosrc else ""
    memberstuff = [alphaname, name, first, last, slug, title, chamber, personurl, photourl, district, party, city, counties]
    REPLIST.append(memberstuff)
print("Done scraping senators")

Processing senators at https://www.flsenate.gov/Senators
	Albritton, Ben
	Baxley, Dennis
	Bean, Aaron
	Benacquisto, Lizbeth
	Berman, Lori
	Book, Lauren
	Bracy, Randolph
	Bradley, Rob
	Brandes, Jeff
	Braynon, Oscar II
	Broxson, Doug
	Cruz, Janet


KeyboardInterrupt: 

In [None]:
# Download the House roster page once; later cells parse it from HOUSE.
# The timeout keeps a stalled connection from hanging the cell indefinitely.
HOUSE = requests.get(HOUSEURL, headers=HEADERS, timeout=30).content

In [None]:
# Scrape the House roster: one div.team-box per representative inside the
# first div.team-page, plus one follow-up request per member for the
# canonical name and city of residence.
print(f"Processing representatives at {HOUSEURL}")
REPS = pq(pq(HOUSE)("div.team-page")[0])
for rep in pq(REPS)("div.team-box"):
    title = "Rep."
    chamber = "House"
    alphaname = cleaner(pq(rep)("h5").text().strip())
    print(f"\t{alphaname}")
    try:
        party = pq(rep)("p")[0].text.strip().split("—")[0].strip()[:1]   # Take just the first letter
    except (IndexError, AttributeError):
        # No <p> in the box, or the <p> has no leading text: party unknown.
        party = ""
    if "Pending District" in alphaname:
        party = "vacant"
    district = pq(pq(rep)("p")[0])("span").text().split(":")[1].strip()   # Inside the span is "District: 8". Get the 8.
    personurl = HOUSEPRE + pq(pq(rep)("a")[0]).attr('href')
    # An <img> without data-src yields an empty photo URL rather than a TypeError.
    photosrc = pq(pq(rep)("img")).attr('data-src')
    photourl = HOUSEPRE + photosrc if photosrc else ""
    # Reduce phrases like "Part of X and parts of Y" to a plain comma-separated list.
    countiesraw = pq(pq(rep)("p.rep-counties")).text()
    countiesraw = countiesraw.replace(" and part of", ",")
    countiesraw = countiesraw.replace(" and parts of", ",")
    countiesraw = countiesraw.replace("Part of ", "")
    countiesraw = countiesraw.replace("Parts of ", "")
    countiesraw = [county.strip() for county in countiesraw.split(",")]
    counties = "|".join(sorted(countiesraw))
    city = ""
    sleep(naptime)
    # Send the same headers as every other request in this notebook, with a
    # timeout so a stalled connection cannot hang the loop.
    personhtml = requests.get(personurl, headers=HEADERS, timeout=30).content
    persondoc = pq(personhtml)          # Parse the member page once and reuse it
    basename = cleaner(persondoc("h1")[0].text)
    parsedname = HumanName(basename)
    first = parsedname.first
    last = parsedname.last
    middle = parsedname.middle
    suffix = parsedname.suffix
    if len(first) == 2: # fix for W. Travis
        first = first + " " + middle
        middle = ""
    name = first + " " + middle + " " + last + " " + suffix
    name = " ".join(name.split())       # Replace multiple spaces with one, via Jeremy Bowers and rdmurphy
    # The city is the text of the span that follows the "City of Residence:" label span.
    infospans = pq(persondoc("div.mi-content"))("span")
    for i, span in enumerate(infospans):
        if 'City of Residence:' in pq(span).text() and i + 1 < len(infospans):
            city = cleaner(infospans[i + 1].text or "")
    slug = slugify(title + " " + first + " " + last + " " + district)
    slug = slug.replace("NuÃ±ez", "Nunez").replace(u"Nuñez", "Nunez")
    slug = slug.lower()
    memberstuff = [alphaname, name, first, last, slug, title, chamber, personurl, photourl, district, party, city, counties]
    REPLIST.append(memberstuff)
    

In [None]:
# Write every collected member row to disk, sorted by the alphaname column.
print("Writing CSV.")
SORTEDREPS = sorted(REPLIST, key=itemgetter(0))   # Sort by last name then first, using alphaname field
# Explicit UTF-8 so accented names are written the same way on every platform
# instead of depending on the locale's default encoding.
with open('pols2.csv', 'w', newline='', encoding='utf-8') as f: #creating temp file to test
    WRITER = csv.writer(f)
    WRITER.writerow(["alphaname", "name", "first", "last", "slug", "title", "chamber", "personurl", "photourl", "district", "party", "city", "counties"])
    WRITER.writerows(SORTEDREPS)