In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [96]:
# These are the new functions to work around poor extraction quality

def decision_number_html_extractor(url, timeout=30):
    """Download a page and return the elements holding decision numbers.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the call indefinitely; defaults to 30 seconds.

    Returns
    -------
    The result of ``find_all`` for ``<div>`` tags carrying the
    product-number field classes (empty when nothing matches).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    response = requests.get(url, timeout=timeout)
    html = BeautifulSoup(response.text, "html.parser")
    details = html.find_all("div", class_="field field--name-field-product-number field--type-string field--label-hidden field__item")
    return details
    
def splitter(html_output, results=None):
    """Pull the text of each element out of its surrounding ``<div>`` markup.

    Each item of ``html_output`` is converted with ``str()``; everything up
    to and including the first ``>`` is dropped, and the remainder is cut at
    the first ``</div>``.

    Parameters
    ----------
    html_output : iterable
        Elements (or strings) whose string form looks like
        ``<div ...>TEXT</div>``.
    results : list, optional
        Accumulator the extracted strings are appended to. A fresh list is
        created when omitted, so callers that do not need to accumulate
        across calls can simply use the return value.

    Returns
    -------
    list
        ``results`` (the same object that was passed in, if any) with the
        extracted strings appended.

    Raises
    ------
    IndexError
        If an item's string form contains no ``>`` at all.
    """
    if results is None:
        results = []

    for element in html_output:
        markup = str(element)
        after_open_tag = markup.split(">", 1)[1]
        results.append(after_open_tag.split("</div>", 1)[0])

    return results

def new_url_generator(base_url, decision_no):
    """Build a URL by appending an escaped decision number to ``base_url``.

    ``decision_no`` is converted with ``str()``, every comma is swapped for
    ``%2C``, and the resulting suffix is lower-cased (so the escape itself
    ends up as ``%2c``) before being concatenated onto ``base_url``.
    """
    suffix = str(decision_no).replace(",", "%2C").lower()
    return base_url + suffix

def highlight_extractor(x):
    """Return the highlight text found between the wrapper markup in ``str(x)``.

    The text after the first opening marker is kept and then truncated at
    the first closing marker. If the closing marker is absent, everything
    after the opening marker is returned. Raises ``IndexError`` when the
    opening marker is not present.
    """
    opening = "<!--HTML--><html><body><p>"
    closing = "</p>\n</body></html>\n"
    remainder = str(x).split(opening, 1)[1]
    return remainder.split(closing, 1)[0]

def general_splitter(x, patterns):
    """Return the part of ``str(x)`` between two marker strings.

    ``patterns[0]`` is the start marker and ``patterns[1]`` the end marker.
    The text after the first occurrence of the start marker is kept, then
    cut at the first occurrence of the end marker (or left whole if the end
    marker never appears). Raises ``IndexError`` when the start marker is
    missing.
    """
    text = str(x)
    after_start = text.split(patterns[0], 1)[1]
    return after_start.split(patterns[1], 1)[0]

In [3]:
# Search URL for bid protest decisions; the page index is appended to the
# trailing "page=" query parameter.
recursive_url = "https://www.gao.gov/search?f%5B0%5D=content_type%3ABid%20Protest%20Decision&f%5B1%5D=date%3Astart%2B2000-01-01%2Bend%2B2020-01-01&sort_by=docdate&sort_order=DESC&keyword=&f%5B0%5D=content_type%3ABid%20Protest%20Decision&f%5B1%5D=date%3Astart%2B2000-01-01%2Bend%2B2020-01-01&page="

# Accumulates the decision numbers found on every search page.
results = []

for p in range(0, 543):
    # Append the page index itself: concatenating the literal "p" requested
    # the same URL on every iteration instead of walking the result pages.
    new_url = recursive_url + str(p)
    html = decision_number_html_extractor(new_url)
    splitter(html, results)

In [87]:
# Prefix of every product page; the escaped decision number is appended.
base_url_input = "https://www.gao.gov/products/"

# Maps the position of each decision in `results` to the raw markup
# (as strings) scraped from its product page.
aggregate_results = {}

for d, decision_no in enumerate(results):
    url = new_url_generator(base_url_input, decision_no)
    # A timeout keeps a single stalled response from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    html = BeautifulSoup(response.text, "html.parser")
    status = html.find("div", class_="status highlighted-status")
    highlights = html.find_all("div", class_="clearfix text-formatted field field--name-product-highlights-custom field--type-text-long field--label-above quickedit-field")
    document = html.find_all("div", class_="js-endpoint-view-decision field field--name-field-html-block field--type-text-long field--label-above")
    # Everything is stored via str() so the dictionary can be dumped to JSON.
    protest_dict = {
        'result_no': str(decision_no),
        'status': str(status),
        'highlights': str(highlights),
        "document": str(document),
    }
    aggregate_results[d] = protest_dict

In [88]:
import json

# Save the scraped markup so later cells can be re-run without re-crawling.
# Note: JSON object keys are always strings, so the integer keys of
# aggregate_results are written out as "0", "1", ...
with open("aggregate_results.json", "w") as out_file:
    json.dump(aggregate_results, out_file)

In [97]:
# Each pattern list is [start_marker, end_marker] as consumed by
# general_splitter: the text between the two markers is what gets extracted.
firm_patterns = [
    "Matter of:\xa0</strong> ",
    "</p>\n<p><strong>File:",
]
date_patterns = [
    "\n<p><strong>Date:</strong>\xa0 ",
    "</p>\n<div>\n<p>",
]
decision_paragraph_patterns = [
    "<div>\n<p>DECISION</p>\n</div>\n<p>",
    "</p>",
]
status_patterns = [
    '<div class="status highlighted-status">\n',
    '.\n      </div>',
]

# Smoke-test the patterns against the first scraped decision.
test_result = aggregate_results[0]

firm = general_splitter(test_result["document"], firm_patterns)
date = general_splitter(test_result["document"], date_patterns)
decision_paragraph = general_splitter(test_result["document"], decision_paragraph_patterns)
# Last expression of the cell: its value is displayed as the cell output.
general_splitter(test_result["status"], status_patterns)

'        We sustain the protest'

In [120]:
# Collect one record per decision and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# re-copied the whole frame on every iteration.
records = []

for d in range(0, len(results)):
    page = aggregate_results[d]
    firm = general_splitter(page['document'], firm_patterns)
    date = general_splitter(page['document'], date_patterns)
    status = general_splitter(page['status'], status_patterns)
    decision_paragraph = general_splitter(page['document'], decision_paragraph_patterns)
    highlights = highlight_extractor(page['highlights'])
    records.append({
        "Firm": str(firm),
        "Date": str(date),
        "Status": str(status),
        "Decision": str(decision_paragraph),
        "Highlights": str(highlights),
    })

# Explicit columns keep the schema stable even when no records were collected.
good_df = pd.DataFrame(records, columns=["Firm", "Date", "Status", "Decision", "Highlights"])

In [122]:
# Write the assembled table of parsed decisions to output.csv in the working directory.
good_df.to_csv("output.csv")