In [1]:
# Data download link: https://www.propublica.org/datastore/dataset/congressional-data-bulk-legislation-bills
# Year: 2020-2021

In [1]:
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

# Parse the XML file
tree = ET.parse('./data/117/bills/sres/sres70/fdsys_billstatus.xml')

# Get the root element
root = tree.getroot()

# Accessing specific elements directly by their tag name
number = root.find('./bill/number').text
update_date = root.find('./bill/updateDate').text
origin_chamber = root.find('./bill/originChamber').text
bill_type = root.find('./bill/type').text

print("Number:", number)
print("Update Date:", update_date)
print("Origin Chamber:", origin_chamber)
print("Type:", bill_type)

# Accessing elements in a list
for item in root.findall('./bill/committees/item'):
    system_code = item.find('systemCode').text
    name = item.find('name').text
    print("Committee System Code:", system_code)
    print("Committee Name:", name)
    
# Accessing elements with attributes
for item in root.findall('./bill/summaries/summary'):
    action_date = item.find('actionDate').text
    action_desc = item.find('actionDesc').text
    text = item.find('text').text
    print("Summary Action Date:", action_date)
    print("Summary Action Description:", action_desc)
    print(action_desc=='Passed Senate')
    
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()
    # print("Summary Text:", text)
    print("Summary Text:", clean_text)

Number: 70
Update Date: 2022-12-29T03:26:17Z
Origin Chamber: Senate
Type: SRES
Committee System Code: ssra00
Committee Name: Rules and Administration Committee
Summary Action Date: 2021-02-23
Summary Action Description: Introduced in Senate
False
Summary Text: 
This resolution authorizes certain Senate committees to make specified expenditures and employ personnel for the 117th Congress, and it establishes a special reserve to be available to the committees on an as-needed basis to meet unpaid obligations.

Summary Action Date: 2021-02-23
Summary Action Description: Reported to Senate
False
Summary Text: 
This resolution authorizes certain Senate committees to make specified expenditures and employ personnel for the 117th Congress, and it establishes a special reserve to be available to the committees on an as-needed basis to meet unpaid obligations.

Summary Action Date: 2021-02-24
Summary Action Description: Passed Senate
True
Summary Text: 
This resolution authorizes certain Senate 

In [2]:
# Accessing elements in a list
for item in root.findall('./bill/titles/item'):
    title_type = item.find('titleType').text
    title = item.find('title').text
    print("Title Type:", title_type)
    print("Title:", title)

Title Type: Official Title as Introduced
Title: An original resolution authorizing expenditures by committees of the Senate for the periods March 1, 2021 through September 30, 2021, October 1, 2021 through September 30, 2022, and October 1, 2022 through February 28, 2023.
Title Type: Display Title
Title: An original resolution authorizing expenditures by committees of the Senate for the periods March 1, 2021 through September 30, 2021, October 1, 2021 through September 30, 2022, and October 1, 2022 through February 28, 2023.


In [3]:
def get_xml_data(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    
    
    number = root.find('./bill/number').text
#     number = 1
    update_date = root.find('./bill/updateDate').text
    origin_chamber = root.find('./bill/originChamber').text
    bill_type = root.find('./bill/type').text
    
    action_dates, action_descs, clean_texts, title_types, titles = [], [], [], [], []
    for item in root.findall('./bill/summaries/summary'):
        action_date = item.find('actionDate').text
        action_desc = item.find('actionDesc').text
        text = item.find('text').text
        
        # print("Summary Action Date:", action_date)
        # print("Summary Action Description:", action_desc)

        soup = BeautifulSoup(text, 'html.parser')
        clean_text = soup.get_text()
        
        action_dates.append(action_date)
        action_descs.append(action_desc)
        clean_texts.append(clean_text.replace('\n', ' '))
        # print("Summary Text:", text)
        # print("Summary Text:", clean_text)
        # if action_desc=='Passed Senate':
    

    for item in root.findall('./bill/titles/item'):
        title_type = item.find('titleType').text
        title = item.find('title').text
        
        title_types.append(title_type)
        titles.append(title)
        # print("Title Type:", title_type)
        # print("Title:", title)
    
    result_title = ''
    for i in range(len(title_types)):
        result_title += title_types[i] + ': ' + titles[i] + ', '
    
    return number, update_date, origin_chamber, bill_type, action_date, action_desc, clean_text.replace('\n', ''), result_title
    

# Now crawl all the data

In [4]:
import os

directory = './data/117/bills'

folders = [name for name in os.listdir(directory) if os.path.isdir(os.path.join(directory, name))]

In [5]:
folders

['hjres', 'sconres', 'hconres', 's', 'sres', 'hres', 'hr', 'sjres']

In [6]:
numbers, update_dates, origin_chambers, bill_types, action_dates, action_descs, clean_texts, result_titles = [], [], [], [], [], [], [], []
for folder_name in folders:
    folder_path = os.path.join(directory, folder_name)
    sub_folders = [name for name in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, name))]
    for folder in sub_folders:
        sub_folder_path = os.path.join(folder_path, folder)
        # print(sub_folder_path)
        for filename in os.listdir(sub_folder_path):
            file_path = os.path.join(sub_folder_path, filename)
            if os.path.isfile(file_path) and '.xml' in file_path:
                # print(file_path)
                try:
                    a, b, c, d, e, f, g, h = get_xml_data(file_path) 
                    numbers.append(a)
                    update_dates.append(b)
                    origin_chambers.append(c)
                    bill_types.append(d)
                    action_dates.append(e)
                    action_descs.append(f)
                    clean_texts.append(g)
                    result_titles.append(h)
                except:
                    continue

In [7]:
import pandas as pd

# Creating a dataframe from a dictionary
data = {
    'Number': numbers,
    'Date': update_dates,
    'Chamber': origin_chambers,
    'Bill Type': bill_types,
    'Action Date': action_dates,
    'Action Decision': action_descs, 
    'Texts': clean_texts,
    'Titles':result_titles
}

df = pd.DataFrame(data)

# To view the DataFrame
# print(df)
df.head(10)

Unnamed: 0,Number,Date,Chamber,Bill Type,Action Date,Action Decision,Texts,Titles
0,58,2022-12-30T11:03:19Z,House,HJRES,2021-09-14,Introduced in House,Ulysses S. Grant Bicentennial Recognition Act ...,Display Title: Ulysses S. Grant Bicentennial R...
1,93,2022-12-24T08:14:15Z,House,HJRES,2022-09-02,Introduced in House,This joint resolution proposes a constitutiona...,Display Title: Proposing an amendment to the C...
2,67,2022-12-30T02:48:55Z,House,HJRES,2021-12-09,Introduced in House,This joint resolution nullifies the rule title...,Display Title: Providing for congressional dis...
3,60,2022-12-30T07:03:57Z,House,HJRES,2021-10-12,Introduced in House,This joint resolution proposes a constitutiona...,Display Title: Proposing an amendment to the C...
4,94,2022-12-29T19:18:36Z,House,HJRES,2022-09-09,Introduced in House,This joint resolution nullifies a Department o...,Display Title: Providing for congressional dis...
5,69,2022-12-30T04:04:45Z,House,HJRES,2022-02-01,Introduced in House,This joint resolution proposes a constitutiona...,Display Title: Proposing an amendment to the C...
6,56,2022-12-30T10:33:29Z,House,HJRES,2021-08-20,Introduced in House,This joint resolution supports the designation...,Display Title: Expressing support for designat...
7,51,2022-12-30T21:30:33Z,House,HJRES,2021-06-14,Introduced in House,This joint resolution proposes a constitutiona...,Official Title as Introduced: Proposing an ame...
8,34,2022-12-31T06:25:14Z,House,HJRES,2021-06-17,Reported to House,This joint resolution nullifies the Oil and Na...,Display Title: Providing for congressional dis...
9,33,2022-12-16T14:06:48Z,House,HJRES,2021-03-23,Introduced in House,This resolution expresses congressional disapp...,Display Title: Providing for congressional dis...


In [8]:
clean_texts[5]

'This joint resolution proposes a constitutional amendment limiting Members of the House of Representatives to eight terms and Senators to three terms. The term limits do not apply to any person serving a term as a Member of Congress on the date the amendment is ratified.'

# pick the data based on TF-IDF and paragraph length

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# List of text documents
text = clean_texts

# Create the transform
vectorizer = TfidfVectorizer()

# Tokenize and build vocab
vectorizer.fit(text)

# Get feature names and their idf values
feature_names = vectorizer.get_feature_names_out()
idf_values = vectorizer.idf_

# Set the threshold
threshold = 3

# Filter the idf_values and get corresponding feature names
filtered_words = [word for word, idf in zip(feature_names, idf_values) if idf > threshold]
low_importance_words = [word for word, idf in zip(feature_names, idf_values) if idf <= threshold]

print(len(filtered_words))
print(filtered_words)

19915


In [10]:
print(len(low_importance_words))
low_importance_words

49


['2021',
 'act',
 'also',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'bill',
 'by',
 'certain',
 'department',
 'establishes',
 'federal',
 'for',
 'from',
 'health',
 'in',
 'including',
 'is',
 'may',
 'must',
 'national',
 'not',
 'of',
 'on',
 'or',
 'other',
 'program',
 'provide',
 'related',
 'requirements',
 'requires',
 'resolution',
 'services',
 'specifically',
 'state',
 'states',
 'such',
 'that',
 'the',
 'this',
 'to',
 'under',
 'united',
 'who',
 'with']

In [11]:
# Titles, length: use idf and tf-idf to exclude the frequent words, number of unique words

In [12]:
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer

from nltk.tokenize import word_tokenize

tokenized_text = [word_tokenize(doc) for doc in clean_texts]

[nltk_data] Downloading package punkt to /Users/zongxiali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
clean_tokenized_text = []
for doc in tokenized_text:
    sub_doc = []
    for token in doc:
        if token not in low_importance_words:
            sub_doc.append(token)
            
    clean_tokenized_text.append(sub_doc)

In [14]:
print(len(tokenized_text))
print(len(clean_tokenized_text))

15281
15281


In [15]:
clean_tokenized_text[0]

['Ulysses',
 'S.',
 'Grant',
 'Bicentennial',
 'Recognition',
 'Act',
 'This',
 'joint',
 'authorizes',
 'President',
 'posthumously',
 'appoint',
 'Ulysses',
 'S.',
 'Grant',
 'grade',
 'General',
 'Armies',
 'United',
 'States',
 ',',
 'effective',
 'April',
 '27',
 ',',
 '2022',
 '.']

In [16]:
counter = 0
text_indices = []
for j in clean_tokenized_text:
    if len(j) > 186:
        text_indices.append(counter)
    counter += 1

In [17]:
clean_texts[text_indices[6]]

"Humanitarian Standards for Individuals in U.S. Customs and Border Protection Custody Act This bill imposes requirements and standards related to the care of aliens in U.S. Customs and Border Protection (CBP) custody. CBP must conduct an initial health screening of each individual in custody to identify those with acute conditions and high-risk vulnerabilities and to provide appropriate healthcare. CBP must conduct the screening within 12 hours of each\xa0individual's arrival at a CBP facility, and within 6 hours for certain priority individuals such as children and individuals with disabilities.  The bill imposes various requirements related to providing such screenings, such as providing interpreters, chaperones, and mental health treatment when necessary. CBP must ensure detainees have access to drinking water, toilets, sanitation facilities, hygiene products, food, and shelter. The bill imposes certain standards relating to such requirements, such as the minimum amount of drinking 

In [18]:
filtered_numbers = [num for i, num in enumerate(numbers) if i in text_indices]
filtered_dates = [date for i, date in enumerate(update_dates) if i in text_indices]
filtered_chambers = [chamber for i, chamber in enumerate(origin_chambers) if i in text_indices]
filtered_bill_type = [bill_type for i, bill_type in enumerate(bill_types) if i in text_indices]
filtered_action_date = [action_date for i, action_date in enumerate(action_dates) if i in text_indices]
filtered_action_desc = [action_desc for i, action_desc in enumerate(action_descs) if i in text_indices]
filtered_clean_text = [clean_text for i, clean_text in enumerate(clean_texts) if i in text_indices]
filtered_result_title = [result_title for i, result_title in enumerate(result_titles) if i in text_indices]

In [28]:
# Creating a dataframe from a dictionary
filtered_data = {
    'number': filtered_numbers,
    'date': filtered_dates,
    'vhamber': filtered_chambers,
    'bill Type': filtered_bill_type,
    'action Date': filtered_action_date,
    'action Decision': filtered_action_desc, 
    'text': filtered_clean_text,
    'label':filtered_result_title
}

df = pd.DataFrame(filtered_data)

# To view the DataFrame
# print(df)
df.head(10)

Unnamed: 0,number,date,vhamber,bill Type,action Date,action Decision,text,label
0,29,2022-12-16T14:07:39Z,House,HJRES,2021-03-08,Introduced in House,War Powers Amendments of 2021 This joint resol...,"Display Title: War Powers Amendments of 2021, ..."
1,5,2022-12-16T14:09:42Z,Senate,SCONRES,2021-02-05,Passed Senate,This concurrent resolution establishes the con...,Display Title: A concurrent resolution setting...
2,14,2022-12-30T11:03:19Z,Senate,SCONRES,2021-08-24,Passed House,This concurrent resolution establishes the con...,Display Title: A concurrent resolution setting...
3,1379,2022-12-31T12:17:36Z,Senate,S,2021-04-27,Introduced in Senate,Combating Sexual Harassment in Science Act Thi...,Display Title: Combating Sexual Harassment in ...
4,3075,2022-12-30T07:48:45Z,Senate,S,2021-10-26,Introduced in Senate,"Department of State, Foreign Operations, and R...","Display Title: Department of State, Foreign Op..."
5,747,2022-12-31T08:01:14Z,Senate,S,2021-03-15,Introduced in Senate,Citizenship for Essential Workers Act This bil...,Display Title: Citizenship for Essential Worke...
6,4096,2022-12-29T23:48:36Z,Senate,S,2022-04-27,Introduced in Senate,Humanitarian Standards for Individuals in U.S....,Display Title: Humanitarian Standards for Indi...
7,2957,2022-12-30T07:33:56Z,Senate,S,2021-10-07,Introduced in Senate,Protecting Data at the Border Act This bill li...,Display Title: Protecting Data at the Border A...
8,120,2022-12-30T03:18:57Z,Senate,S,2022-03-17,Passed Senate,Safe Connections Act of 2022 This bill establi...,"Display Title: Safe Connections Act of 2021, S..."
9,312,2022-12-31T03:58:26Z,Senate,S,2021-06-08,Reported to Senate,COVID-19 Safer Detention Act of 2021 This bill...,Display Title: COVID–19 Safer Detention Act of...


# Convert the data to a json format pandas could use

In [29]:
json_data = {}
for k, v in filtered_data.items():
    json_data[k] = {}
    for i, ele in enumerate(v):
        json_data[k][str(i)] = ele

In [31]:
import json
with open('congressional_bills.json', 'w') as f:
    json.dump(json_data, f)

In [32]:
df = pd.read_json('congressional_bills.json')
df.head(10)

Unnamed: 0,number,date,vhamber,bill Type,action Date,action Decision,text,label
0,29,2022-12-16 14:07:39+00:00,House,HJRES,2021-03-08,Introduced in House,War Powers Amendments of 2021 This joint resol...,"Display Title: War Powers Amendments of 2021, ..."
1,5,2022-12-16 14:09:42+00:00,Senate,SCONRES,2021-02-05,Passed Senate,This concurrent resolution establishes the con...,Display Title: A concurrent resolution setting...
2,14,2022-12-30 11:03:19+00:00,Senate,SCONRES,2021-08-24,Passed House,This concurrent resolution establishes the con...,Display Title: A concurrent resolution setting...
3,1379,2022-12-31 12:17:36+00:00,Senate,S,2021-04-27,Introduced in Senate,Combating Sexual Harassment in Science Act Thi...,Display Title: Combating Sexual Harassment in ...
4,3075,2022-12-30 07:48:45+00:00,Senate,S,2021-10-26,Introduced in Senate,"Department of State, Foreign Operations, and R...","Display Title: Department of State, Foreign Op..."
5,747,2022-12-31 08:01:14+00:00,Senate,S,2021-03-15,Introduced in Senate,Citizenship for Essential Workers Act This bil...,Display Title: Citizenship for Essential Worke...
6,4096,2022-12-29 23:48:36+00:00,Senate,S,2022-04-27,Introduced in Senate,Humanitarian Standards for Individuals in U.S....,Display Title: Humanitarian Standards for Indi...
7,2957,2022-12-30 07:33:56+00:00,Senate,S,2021-10-07,Introduced in Senate,Protecting Data at the Border Act This bill li...,Display Title: Protecting Data at the Border A...
8,120,2022-12-30 03:18:57+00:00,Senate,S,2022-03-17,Passed Senate,Safe Connections Act of 2022 This bill establi...,"Display Title: Safe Connections Act of 2021, S..."
9,312,2022-12-31 03:58:26+00:00,Senate,S,2021-06-08,Reported to Senate,COVID-19 Safer Detention Act of 2021 This bill...,Display Title: COVID–19 Safer Detention Act of...
