In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [222]:
# These are the new functions to work around poor extraction quality

def decision_number_html_extractor(url):
    # Extracts the HTML with the decision numbers
    response = requests.get(url)
    html = BeautifulSoup(response.text, "html.parser")
    details = html.find_all("div", class_="field field--name-field-product-number field--type-string field--label-hidden field__item")
    return(details)
    
def splitter(html_output, results):
    # Splits the decision number out of the HTML
    html_ouput = html_output
    
    for i in range(0, len(html_output)):
        output_string = str(html_ouput[i])
        step1 = output_string.split(">",1)[1]
        step2 = step1.split("</div>",1)[0]
        results.append(step2)
    
    return(results)

def new_url_generator(base_url, decision_no):
    # Creates a new URL with decision numbers
    new_list = str(decision_no).split(",")
    addition = "%2C".join(new_list).lower()
    new_url = base_url + addition
    return(new_url)

def highlight_extractor(x):
    # Cleans up the highlight paragraph
    new_string = str(x)
    first_pass = new_string.split("<!--HTML--><html><body><p>", 1)[1]
    second_pass = first_pass.split("</p>\n</body></html>\n", 1)[0]
    return(second_pass)

def general_splitter(x, patterns):
    new_string = str(x)
    first_pass = new_string.split(patterns[0], 1)[1]
    second_pass = first_pass.split(patterns[1], 1)[0]
    return(second_pass)

def decision_extractor(x, potential_patterns, patterns):
    if str(x).find("DECISION") != -1:
        for p in range(0, len(potential_patterns)):
            if str(x).find(potential_patterns[p]) != -1:
                patterns[0] = potential_patterns[p]
        return(general_splitter(x, patterns))
    else:
        return("No Decision")

def highlight_extractor(x):
    # Cleans up the highlight paragraph
    new_string = str(x)
    item_found = new_string.find("Highlights", 1)
    if item_found != -1:
        first_pass = new_string.split("<html><body><p", 1)[1]
        second_pass = first_pass.split("</p>\n</body></html>", 1)[0]
        return(second_pass)
    elif item_found != -1:
        first_pass = new_string.split("<html><body><div>\n\t", 1)[1]
        second_pass = first_pass.split("</div>", 1)[0]
        return(second_pass)
    else:
        return("No Highlights")


    
def firm_name_extractor(x, patterns):
    item_found = str(x).find(patterns[0], 1)
    if item_found != -1:
        return(general_splitter(x, patterns))
    else:
        return("Check this entry")
    
def date_extractor(x, patterns):
    item_found = str(x).find(patterns[0], 1)
    if item_found != -1:
        return(general_splitter(x, patterns))
    elif item_found == "The decision issued on the date below was subject to a GAO Protective Order.":
        return("The decision issued on the date below was subject to a GAO Protective Order.")
    else:
        return("Check this entry")
    
def status_extractor(x, patterns):
    try:
        return(general_splitter(x, patterns))
    except:
        return("Check this entry")

In [3]:
recursive_url = "https://www.gao.gov/search?f%5B0%5D=content_type%3ABid%20Protest%20Decision&f%5B1%5D=date%3Astart%2B2000-01-01%2Bend%2B2020-01-01&sort_by=docdate&sort_order=DESC&keyword=&f%5B0%5D=content_type%3ABid%20Protest%20Decision&f%5B1%5D=date%3Astart%2B2000-01-01%2Bend%2B2020-01-01&page="

results = []

for p in range(0, 543):
    new_url = recursive_url+str(p)
    html = decision_number_html_extractor(new_url)
    splitter(html, results)

In [4]:
base_url_input = "https://www.gao.gov/products/"

aggregate_results = {}

for d in range(0, len(results)):
    url = new_url_generator(base_url_input, results[d])
    response = requests.get(url)
    html = BeautifulSoup(response.text, "html.parser")
    status = html.find("div", class_ = "status highlighted-status")
    highlights = html.find_all("div", class_ = "clearfix text-formatted field field--name-product-highlights-custom field--type-text-long field--label-above quickedit-field")
    document = html.find_all("div", class_ = "js-endpoint-view-decision field field--name-field-html-block field--type-text-long field--label-above")
    protest_dict = {'result_no' : str(results[d]), 'status' : str(status), 'highlights' : str(highlights), "document": str(document), "url": url}
    aggregate_results[d] = protest_dict

In [217]:
firm_patterns = ["Matter of:\xa0</strong> ", "</p>\n<p><strong>File:"]
date_patterns = ["\n<p><strong>Date:</strong>\xa0 ", "</p>\n<div>\n<p>"]
decision_paragraph_patterns = ["<div>\n<p>DECISION</p>\n</div>\n<p>" , "</p>"]
potential_d_patterns = ["<div>\n<p>DECISION</p>\n</div>\n<p>", 
                        "<div>\n<p><strong>DECISION</strong></p>\n</div>\n<p>", 
                        "<div><strong>DECISION</strong></div>",
                       "<p><strong>DECISION</strong></p>",
                       "</p>\n<p>DECISION</p>\n<p>",
                       "</p>\n<p>\n\tDECISION</p>\n<p>",
                       "<p>\n\t\t\tDECISION</p>\n<p>",
                       "\n<div>\n<p>\n\t\t\tDECISION</p>\n</div>\n<p>\n\t\t",
                       "<p>\n<strong>DECISION</strong></p>\n<p>\n\t\t",
                       "<p>\n<strong>DECISION</strong></p>\n<p>\n\t",
                        "<p>\n<strong>DECISION\xa0 </strong></p>\n</div>\n<p>\n\t\t",
                       "<div>\n<p>\n<strong>DECISION</strong></p>\n</div>\n<p>\n\t\t",
                       "<p>\n\t\t\tDECISION\xa0</p>\n</div>\n<p>\n\t\t",
                       "<p>\n\t\tDECISION</p>\n</div>\n<p>\n\t",
                       "<p>DECISION</p>\n<p>",
                       "<p>\n\t\tDECISION</p>\n<p>\n\t\t",
                        "<p>\n\t\t\tDECISION</p>\n</div>\n<p>\n\t\t",
                       "<p>\n<strong>DECISION</strong></p>\n</div>\n<p>\n\t",
                        "DECISION</p>\n</div>\n<p class=\"MsoBodyText\">\n\t\t",
                        "DECISION</p>\n</div>\n<p class=\"MsoNormal\">\n\t\t",
                        "DECISION</strong></p>\n</div>\n<p class=\"MsoNormal\">\n\t\t",
                       "DECISION</p>\n</div>\n<p>\n\t\t",
                       "DECISION</p>\n</div>\n<p class=\"Date\">\n\t\t",
                       "DECISION</strong></p>\n<p class=\"MsoNormal\">\n\t\t\t",
                       "DECISION</p>\n</div>\n<p class=\"MatterOf\">\n\t\t",
                       "DECISION</p>\n</div>\n<p class=\"MsoBodyText\">\n\t",
                       "DECISION</p>\n</div>\n<p class=\"Default\">\n\t\t",
                       "DECISION</p>\n</div>\n<p class=\"Decision1\">\n\t\t",
                       "DECISION</p>\n<p class=\"Decision1\">\n\t\t\t",
                       "DECISION</strong></p>\n<p class=\"MsoBodyText\">\n\t\t",
                        "DECISION</p>\n</div>\n<p class=\"matterof0\">\n\t\t",
                       "DECISION</p>\n</div>\n<p class=\"MsoBodyText\">\n\t\t"]

status_patterns = ['<div class="status highlighted-status">\n', '.\n      </div>']

In [223]:
good_df = pd.DataFrame()

for d in range(0, len(results)):
    firm = firm_name_extractor(aggregate_results[d]['document'], firm_patterns)
    date = date_extractor(aggregate_results[d]['document'], date_patterns)
    status = status_extractor(aggregate_results[d]['status'], status_patterns)
    decision_paragraph = decision_extractor(aggregate_results[d]['document'], potential_d_patterns, decision_paragraph_patterns)
    highlights = highlight_extractor(aggregate_results[d]['highlights'])
    series = {"Firm" :str(firm), "Date" : str(date), "Status" : str(status), "Decision_Paragraph" : str(decision_paragraph), "Highlights_Paragraph" : str(highlights), "URL" : aggregate_results[d]['url']}
    good_df = good_df.append(series, ignore_index = True)

IndexError: list index out of range

In [226]:
print(good_df.shape[0])

good_df.to_csv("Output_new.csv")

2327


In [228]:
aggregate_results[345]['document']

'[<div class="js-endpoint-view-decision field field--name-field-html-block field--type-text-long field--label-above">\n<header class="field__label">View Decision</header>\n<div class="field-items-wrapper">\n<div class="field__item"><p>\n\tDOCUMENT FOR PUBLIC RELEASE<br/>\n\tThe decision issued on the date below was subject to a GAO Protective Order. This version has been approved for public release.</p>\n<div>\n<p>\n<strong>Decision</strong></p>\n<p>\n<strong>Matter of:\xa0</strong> AOC Connect, LLC -- Reconsideration</p>\n<p>\n<strong>File:\xa0</strong> B-416658.3</p>\n<p>\n<strong>Date:</strong>\xa0 February 12, 2019</p>\n<div>\n<p>\n\t\t\tDavid B. Dempsey, Esq., Dempsey Fontana, PLLC, for the protester.<br/>\n\t\t\tMeredith Skowronski, Esq., Library of Congress, for the agency.<br/>\n\t\t\tHeather Self, Esq., and Edward Goldstein, Esq., Office of the General Counsel, GAO, participated in the preparation of the decision.</p>\n</div>\n<p>\n\t\tDIGEST</p>\n<p>\n\t\tRequest for reconsid

In [8]:
import json

with open("aggregate_results.json", "w") as fp:
    json.dump(aggregate_results, fp)