In [3]:
#Vanilla imports + Soup
from bs4 import BeautifulSoup
import urllib
import requests
import re
import datetime
import os

#Plotly
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
plotly.tools.set_credentials_file(
    username=os.getenv('PLOTLY_USER'), 
    api_key=os.getenv('PLOTLY_API_KEY')
)
init_notebook_mode(connected=True)


#BeautifulSoup warnings get annoying
import warnings
warnings.filterwarnings("ignore")

In [4]:
#Pull in Justice Dept. Civil Division's Housing Case page
page = urllib.request.urlopen('https://www.justice.gov/crt/housing-and-civil-enforcement-section-cases-1')
soup = BeautifulSoup(page)

#Reduce down to inner div and extract headers, lists of cases
reducedSoup = soup.find('div', attrs={'class' : 'field__item even'})
headers = [elem for elem in reducedSoup.findAll('h2')]
headers

uls = [elem for elem in reducedSoup.findAll('ul')]

assert(len(headers) == len(uls))

In [5]:
missed_date_counter = 0
total_counter = 0

global missed_date_counter
global total_counter


def get_li_date(li):
    global missed_date_counter
    global total_counter
    
    total_counter += 1
    
    #Maintain a list of dates, pick the earliest
    dates = list()
    
    #Get date from anchor text containing 'complaint'
    for href in li.findAll('a'):
        if not 'complaint' in href.contents[0].lower():
            continue
        
        #Extract the date
        match = re.search(r'\d+/\d+/\d+', href.contents[0])
        if match is None:
            continue
        else:
            month, day, year = match.group().split('/')
            if len(year) < 2:
                continue
            
            #Years are truncated as '99, '00, '01; fix it
            if int(year) >= 0 and int(year) < 20:
                year = '20' + year
            else:
                year = '19' + year
                
            if int(year) < 2000:
                continue
                
            #Sometimes they screw up and day/month/year,
            #Can blow up the date object
            try:
                date = datetime.date(int(year), int(month), int(day))
            except ValueError:
                continue
            dates.append(date)       

    #Woops, couldn't get a date from anchor text with 'complaint' in it
    if not dates:    
        missed_date_counter +=1
        return None
    
    #Yay, at least one piece of anchor text w/ 'complaint' and a date in it
    else:
        return min(dates)
    

#Pair case types with the dates
cases = dict((
              headers[x].get_text(),
              [get_li_date(li) for li in uls[x].findAll('li') if not get_li_date(li) is None]
            ) for x in range(len(headers)))
cases
print(missed_date_counter)
print(total_counter)

205
1849


Cool, only lost 100 of 2054 unique cases, which isn't too bad considering how janky this process realistically is!

In [6]:
cases

{'Cases Developed Through Testing Program': [datetime.date(2007, 9, 28),
  datetime.date(2011, 9, 21),
  datetime.date(2000, 4, 13),
  datetime.date(2009, 7, 21),
  datetime.date(2003, 11, 19),
  datetime.date(2007, 9, 27),
  datetime.date(2010, 12, 6),
  datetime.date(2004, 4, 15),
  datetime.date(2008, 8, 13),
  datetime.date(2003, 4, 11),
  datetime.date(2000, 12, 4),
  datetime.date(2005, 1, 18),
  datetime.date(2006, 1, 30),
  datetime.date(2005, 9, 3),
  datetime.date(2004, 6, 29),
  datetime.date(2002, 9, 19),
  datetime.date(2009, 11, 23),
  datetime.date(2007, 10, 2),
  datetime.date(2001, 1, 18),
  datetime.date(2015, 5, 18),
  datetime.date(2016, 9, 30),
  datetime.date(2002, 6, 5),
  datetime.date(2013, 8, 13),
  datetime.date(2015, 4, 30),
  datetime.date(2016, 9, 28),
  datetime.date(2010, 10, 21),
  datetime.date(2013, 11, 25),
  datetime.date(2002, 11, 15),
  datetime.date(2016, 2, 29),
  datetime.date(2010, 7, 15),
  datetime.date(2008, 9, 3),
  datetime.date(2007, 9, 

In [8]:
#Recommend popping low-count classes of housing suits, otherwise the graph is overly busy
#You can undo that masking by commenting out the next few lines
# MASKED_CASETYPES = [
#     'Servicemembers Civil Relief Act Cases',
#     'Discrimination Based Upon Religion',
#     'Religious Land Use Cases',
#     'Public Accommodations Cases',
#     'Discrimination Based Upon Sex']
# cases = {k : v for k,v in cases.items() if not k in MASKED_CASETYPES }

#Find oldest and newest case
oldest_case = None
newest_case = None
for casedates in cases.values():
    local_oldest_case = min(casedates)
    local_newest_case = max(casedates)
    if oldest_case is None or local_oldest_case.year < oldest_case.year:
        oldest_case = local_oldest_case
    if newest_case is None or local_newest_case.year > newest_case.year:
        newest_case = local_newest_case
        
print(oldest_case)
print(newest_case)

#Group the types by year
traces = []
for casetype,casedates in cases.items():
    #Index all cases by full year range
    casedate_year_counts = {}
    for year in range(oldest_case.year, newest_case.year + 1):
        casedate_year_counts[year] = 0
    for casedate in casedates:
        casedate_year_counts[casedate.year] += 1
    
    traces.append(go.Bar(
        x=[i for i in range(oldest_case.year, newest_case.year + 1)],
        y=[num for num in casedate_year_counts.values()],
        name=casetype
    ))

layout = go.Layout(
    barmode='stack'
)

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='stacked-bar')

2000-03-17
2018-04-11
High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~spencernorris.datascience/0 or inside your plot.ly account where it is named 'stacked-bar'


Want to figure out how many cases were lost by type and year (e.g. how badly did we skew the data?).

In [11]:
#Pair case types with the dates
linted_cases = dict((
              headers[x].get_text(),
              [get_li_date(li) for li in uls[x].findAll('li') if get_li_date(li) is None]
            ) for x in range(len(headers)))
linted_cases = {k : len(v) for k,v in linted_cases.items()}
linted_cases

{'Cases Developed Through Testing Program': 9,
 'Discrimination Based Upon Disability': 40,
 'Discrimination Based Upon Familial Status': 13,
 'Discrimination Based Upon National Origin': 31,
 'Discrimination Based Upon Race or Color': 56,
 'Discrimination Based Upon Religion': 3,
 'Discrimination Based Upon Sex': 12,
 'Fair Lending Enforcement': 24,
 'Public Accommodations Cases': 7,
 'Religious Land Use Cases': 7,
 'Servicemembers Civil Relief Act Cases': 3}

Looks like the hog's share of lost cases are disability cases and race or color cases. Not surprising since they're consistently two of the most common types of cases in any given year. National origin is skewed down a bit too. Anyhow it's surprising how many Justice Dept. housing cases aren't race based but actually about disabilities.

Let's try and drill down into specifically disability cases and see what pops out. Questions right now:
- Are there recurring defendents?
- What are some of the most common unique words that pop out of the complaints?
    - Based on this, can we determine if certain conditions appear to receive less consideration than others?
        - From the Dept. of Justice **or** from the defendent?
- 