# Code for scraping executive orders

The following code scrapes executive orders from the Federal Register. NOTE: The csv files containing the links to the executive orders can be downloaded here: https://www.federalregister.gov/presidential-documents/executive-orders.

In [22]:
import pandas as pd
import regex as re
import requests
from bs4 import BeautifulSoup

In [13]:
def gen_text(html_link):
    html = requests.get(html_link)
    soup = BeautifulSoup(html.text)
    xml_links = []
    for link in soup.body.findAll('a', href=re.compile(r'^https://www.federalregister.+.xml?')):
        xml_links.append(link['href'])
    if not xml_links:
        print('no xml links found')
        print('associated html link is: ', html_link)

    elif len(xml_links) > 1:
        print('more than one xml link')
        print('associated html link is: ', html_link)
    else:
        return get_xml_text(xml_links[0])

In [14]:
def get_xml_text(xml_link):
    txt = requests.get(xml_link)
    txt.encoding = txt.apparent_encoding
    soup = BeautifulSoup(txt.text)    # txt is simply the a string with your XML file
    return soup.get_text()

In [40]:
def gen_text_no_xml(html_link):
    '''
    Retrieves text from older executive orders that don't have associated xml files.
    Returns a strong with the text of the executive order.
    Inputs:
        html_link: a string containing the html link.
    '''
    html = requests.get(html_link)
    soup = BeautifulSoup(html.text)
    htm_links = []
    for link in soup.body.findAll('a', href=re.compile(r'^https://www.govinfo.+.htm?')):
        #print(link['href'])
        htm_links.append(link['href'])
        #print(type(link['href']))
    if not htm_links:
        print('no htm links found')
        print('associated html link is: ', html_link)

    elif len(htm_links) > 1:
        print('more than one htm link')
        print('associated html link is: ', html_link)
    else:
        return get_xml_text(htm_links[0])

In [1]:
files = ['/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/documents_signed_by_barack_obama_of_type_presidential_document_and_of_presidential_document_type_executive_order.csv',
         '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/documents_signed_by_george_w_bush_of_type_presidential_document_and_of_presidential_document_type_executive_order.csv',
         '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/documents_signed_by_william_j_clinton_of_type_presidential_document_and_of_presidential_document_type_executive_order.csv',
         '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/documents_signed_by_donald_trump_of_type_presidential_document_and_of_presidential_document_type_executive_order.csv']

In [None]:
df_list = []
for filename in files:
    df = pd.read_csv(filename, index_col=None, header=0)
    df['president'] = re.search(r'_by_(\w+)_of_type', filename)[1] # put president name in df
    df_list.append(df)
big_df = pd.concat(df_list, axis=0, ignore_index=True)

In [2]:
dest_file = '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/exec_orders_w_text'

In [None]:
big_df.loc[:, 'order_text'] = big_df.html_url.apply(lambda x: gen_text(x))
big_df.to_csv(dest_file)

In [3]:
exec_orders = pd.read_csv('exec_orders_w_text')

After scraping the files, I had to go back to the ones where no XML file was available and extract text using the `gen_text_no_xml()` function defined above.

In [7]:
exec_orders[exec_orders.order_text.isnull()].groupby('president').count()

Unnamed: 0_level_0,Unnamed: 0,citation,document_number,end_page,html_url,pdf_url,type,subtype,publication_date,signing_date,start_page,title,disposition_notes,executive_order_number,order_text
president,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
barack_obama,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
william_j_clinton,202,202,202,202,202,201,202,202,202,202,202,202,151,201,0


In [16]:
null_orders = exec_orders[exec_orders.order_text.isnull()]

In [9]:
clinton = '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/documents_signed_by_william_j_clinton_of_type_presidential_document_and_of_presidential_document_type_executive_order.csv'

In [11]:
clinton = pd.read_csv(clinton)

In [41]:
dest_file = '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/early_clinton_orders.csv'

In [42]:
null_orders.loc[:, 'order_text'] = null_orders.html_url.apply(lambda x: gen_text_no_xml(x))
#null_orders.to_csv(dest_file)

more than one htm link
associated html link is:  https://www.federalregister.gov/documents/1998/05/19/98-13552/federalism
more than one htm link
associated html link is:  https://www.federalregister.gov/documents/1997/02/14/97-3992/advisory-committee-on-high-performance-computing-and-communications-information-technology-and-the


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [43]:
null_orders.to_csv(dest_file)

In [45]:
not_null = exec_orders[exec_orders.order_text.notnull()]

I then added Biden's orders (up until March 7, 2021), concatenated that to my previous dataset, and saved it as a new file.

In [49]:
# now add Biden orders (as of March 7, 2021)
filename = '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/documents_signed_by_joe_biden_of_type_presidential_document_and_of_presidential_document_type_executive_order.csv'

In [50]:
biden_df = pd.read_csv(filename, index_col=None, header=0)
biden_df['president'] = re.search(r'_by_(\w+)_of_type', filename)[1] # put president name in df
#big_df = pd.concat(df_list, axis=0, ignore_index=True)


In [52]:
biden_df.loc[:, 'order_text'] = biden_df.html_url.apply(lambda x: gen_text(x))

In [54]:
biden_file = '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/biden_mar_7_21.csv'

In [55]:
biden_df.to_csv(biden_file)

In [64]:
full_exec_orders = pd.concat([not_null, null_orders, biden_df], axis=0, ignore_index=True)

In [65]:
# delete two clinton orders that didn't have text readily available
full_exec_orders[full_exec_orders.order_text.isnull()]

Unnamed: 0.1,Unnamed: 0,citation,document_number,end_page,html_url,pdf_url,type,subtype,publication_date,signing_date,start_page,title,disposition_notes,executive_order_number,president,order_text
900,680.0,63 FR 27651,98-13552,27654,https://www.federalregister.gov/documents/1998...,https://www.govinfo.gov/content/pkg/FR-1998-05...,Presidential Document,Executive Order,05/19/1998,05/14/1998,27651,Federalism,"Supplements: EO 12866, September 30, 1993; EO ...",13083.0,william_j_clinton,
949,729.0,62 FR 7131,97-3992,7132,https://www.federalregister.gov/documents/1997...,https://www.govinfo.gov/content/pkg/FR-1997-02...,Presidential Document,Executive Order,02/14/1997,02/11/1997,7131,Advisory Committee on High-Performance Computi...,"Amended by: EO 13092, July 24, 1998; EO 13113,...",13035.0,william_j_clinton,


In [66]:
full_exec_orders = full_exec_orders[full_exec_orders.order_text.notnull()]

In [67]:
full_file = '/Users/lilygrier/Documents/Grad_School/Computational Content Analysis/full_exec_orders_text.csv'

In [68]:
full_exec_orders.to_csv(full_file)