**Cell 1**: 
- This cell imports necessary modules (`json` and `os`) and reads a configuration file (`config.json`) to extract an API key for further usage. It constructs the file path dynamically based on the current working directory.

In [1]:
import json
import os

# Assuming your Jupyter notebook is in a directory that contains the config.json in a subdirectory
config_path = os.path.join(os.path.dirname(os.getcwd()), 'config.json')

with open(config_path, 'r') as file:
    config = json.load(file)

api_key = config['LegiScan_api_key']

**Cell 2**:
- This cell imports additional modules (`requests` and `json`) and defines three functions:
  - `search_laws`: Uses the LegiScan API to search for laws based on a query and returns the results.
  - `save_to_file`: Saves data in JSON format to a specified file within the 'data' directory.
  - `process_search_results`: Processes search results to fetch detailed information for each bill by calling another function (`fetch_bill_details`).
  - `fetch_bill_details`: Fetches detailed information for a specific bill using the LegiScan API.
- It iterates through a list of keywords, searches for laws related to each keyword, saves the search results, processes the search results to get detailed bill information, and saves the detailed bill information to separate JSON files.

In [6]:
import requests
import json
import os


def search_laws(api_key, query, state='ALL'):
    """Search for laws based on a query and return the results."""
    url = f"https://api.legiscan.com/?key={api_key}&op=getSearch&state={state}&query={query}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None


def save_to_file(data, filename):
    """Save the given data to a file in JSON format."""
    os.makedirs("data", exist_ok=True)
    with open(os.path.join('data', filename), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def process_search_results(search_results, api_key):
    """Process search results to fetch detailed information for each bill."""
    detailed_bills = []
    if 'searchresult' in search_results:
        # Correct key for bills if it is 'bills'
        for bill in search_results['searchresult'].get('bills', []):
            try:
                bill_id = bill['bill_id']
                bill_detail = fetch_bill_details(api_key, bill_id)
                if bill_detail:
                    detailed_bills.append(bill_detail)
                else:
                    print(f"No details found for bill ID: {bill_id}")
            except KeyError as e:
                # Log to see what keys are missing or incorrect
                print(f"Key error: {e} in bill: {bill}")
    return detailed_bills


def fetch_bill_details(api_key, bill_id):
    """Fetch detailed information for a specific bill."""
    url = f"https://api.legiscan.com/?key={api_key}&op=getBill&id={bill_id}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(
            f"Failed to fetch details for bill ID {bill_id}, Status code: {response.status_code}, Response: {response.text}")
        return None


keywords = [
    "E-Waste", "electronic waste", "e-scrap", "WEEE",
    "sustainability", "environmental stewardship", "green practices",
    "Right to Repair", "repairability",
    "Planned Obsolescence", "product lifespan",
    "Circular Economy", "zero waste"
]

# Process each keyword
for query in keywords:
    print(f"Searching for: {query}")
    search_results = search_laws(api_key, query)
    if search_results:
        filename = f"{query.replace(' ', '_').lower()}_search_results.json"
        save_to_file(search_results, filename)
        detailed_bills = process_search_results(search_results, api_key)
        detailed_filename = f"{query.replace(' ', '_').lower()}_detailed_bills.json"
        save_to_file(detailed_bills, detailed_filename)
        print(
            f"Details of {len(detailed_bills)} bills saved to 'data/{detailed_filename}'.")
    else:
        print(f"Failed to retrieve search results for '{query}'.")

Searching for: E-Waste
Details of 0 bills saved to 'data/e-waste_detailed_bills.json'.
Searching for: electronic waste
Details of 0 bills saved to 'data/electronic_waste_detailed_bills.json'.
Searching for: e-scrap
Details of 0 bills saved to 'data/e-scrap_detailed_bills.json'.
Searching for: WEEE
Details of 0 bills saved to 'data/weee_detailed_bills.json'.
Searching for: sustainability
Details of 0 bills saved to 'data/sustainability_detailed_bills.json'.
Searching for: environmental stewardship
Details of 0 bills saved to 'data/environmental_stewardship_detailed_bills.json'.
Searching for: green practices
Details of 0 bills saved to 'data/green_practices_detailed_bills.json'.
Searching for: Right to Repair
Details of 0 bills saved to 'data/right_to_repair_detailed_bills.json'.
Searching for: repairability
Details of 0 bills saved to 'data/repairability_detailed_bills.json'.
Searching for: Planned Obsolescence
Details of 0 bills saved to 'data/planned_obsolescence_detailed_bills.json'

**Cell 3**:
- This cell imports additional modules (`os`, `json`, `requests`, and `pandas`) and defines three functions:
  - `fetch_bill_details`: Re-defined to fetch detailed information for a specific bill.
  - `extract_bill_ids`: Extracts bill IDs from a given JSON file.
  - `process_files_and_fetch_details`: Processes each file to fetch detailed bill information for each bill ID and saves the information to a CSV file.
- It specifies a list of file paths, processes each file to extract bill IDs, fetches detailed information for each bill, and saves all the detailed information to a CSV file.

In [13]:
import os
import json
import requests
import pandas as pd


def fetch_bill_details(api_key, bill_id):
    """Fetch detailed information for a specific bill."""
    url = f"https://api.legiscan.com/?key={api_key}&op=getBill&id={bill_id}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json().get('bill', {})
    else:
        print(
            f"Failed to fetch details for bill ID {bill_id}, HTTP Status: {response.status_code}")
        return None


def extract_bill_ids(file_path):
    """Extract bill IDs from a given JSON file."""
    with open(file_path, 'r') as file:
        data = json.load(file)
        bill_ids = []
        results = data.get('searchresult', {})
        # Extract all numbered keys which contain bill details
        for key in results:
            if key.isdigit():  # Check if key is a digit, indicating a bill entry
                bill_ids.append(results[key]['bill_id'])
        return bill_ids


def process_files_and_fetch_details(api_key, file_paths):
    """Process each file to fetch detailed bill information for each bill ID and save to a CSV."""
    all_details = []
    for file_path in file_paths:
        bill_ids = extract_bill_ids(file_path)
        for bill_id in bill_ids:
            details = fetch_bill_details(api_key, bill_id)
            if details:
                all_details.append(details)

    # Convert list of dictionaries to a DataFrame
    if all_details:
        df = pd.DataFrame(all_details)
        # Save to CSV
        df.to_csv('data/detailed_bills.csv', index=False)
        print("Saved detailed bills information to 'data/detailed_bills.csv'")
    else:
        print("No details were found to save to CSV.")


file_paths = [
    'data/e-waste_search_results.json',
    'data/electronic_waste_search_results.json',
    'data/e-scrap_search_results.json',
    'data/weee_search_results.json',
    'data/sustainability_search_results.json',
    'data/environmental_stewardship_search_results.json',
    'data/green_practices_search_results.json',
    'data/right_to_repair_search_results.json',
    'data/repairability_search_results.json',
    'data/planned_obsolescence_search_results.json',
    'data/product_lifespan_search_results.json',
    'data/circular_economy_search_results.json',
    'data/zero_waste_search_results.json'
]

process_files_and_fetch_details(api_key, file_paths)

Saved detailed bills information to 'data/detailed_bills.csv'


**Cell 4**:
- This cell performs the following steps:
  - Loads the data from the previously saved CSV file (`detailed_bills.csv`).
  - Ensures the `status_date` column is of datetime type.
  - Sorts the data by `status_date` and removes duplicate entries, keeping the most recent one.
  - Optionally, sorts by `session_id` to ensure only the latest session information is retained.
  - Saves the filtered data to a new CSV file (`updated_detailed_bills.csv`).

In [17]:
import pandas as pd

# Step 1: Load the data
df = pd.read_csv('data/detailed_bills.csv')

# Step 2: Ensure 'status_date' is a datetime type
df['status_date'] = pd.to_datetime(df['status_date'])

# Step 3: Sort by 'status_date' and drop duplicates keeping the last entry which is the most recent
df_sorted = df.sort_values(
    'status_date').drop_duplicates('bill_id', keep='last')

# Optional: If you want to ensure you only have the latest session information
# This assumes 'session_id' or similar logic can determine the latest session
df_sorted = df_sorted.sort_values(
    'session_id').drop_duplicates('bill_id', keep='last')

# Step 4: Save the filtered data to a new CSV file
df_sorted.to_csv('data/updated_detailed_bills.csv', index=False)

print("Updated dataset saved to 'data/updated_detailed_bills.csv'.")

Updated dataset saved to 'data/updated_detailed_bills.csv'.


**Cell 5**:
- This cell performs the following steps:
  - Loads the data from the previously saved CSV file (`updated_detailed_bills.csv`).
  - Specifies the columns to keep in the filtered dataset.
  - Reindexes the DataFrame to ensure it contains only the specified columns.
  - Saves the filtered data to a new CSV file (`refined_detailed_bills.csv`).

In [18]:
import pandas as pd

# Step 1: Load the previously updated dataset
df = pd.read_csv('data/updated_detailed_bills.csv')

# Step 2: Specify the columns to keep
columns_to_keep = [
    'bill_id', 'bill_number', 'title', 'description', 'url', 'state_link',
    'status', 'status_date', 'session_id', 'state_id', 'state', 'body_id',
    'current_body_id', 'sponsors', 'subjects', 'texts', 'votes'
]

# Filter the DataFrame to keep only the specified columns
df_filtered = df[columns_to_keep]

# Handle any missing columns in the original data that are expected to be in columns_to_keep
# This step ensures that if any column is missing from the data, it doesn't cause an error
df_filtered = df_filtered.reindex(columns=columns_to_keep)

# Step 3: Save the filtered data to a new CSV file
df_filtered.to_csv('data/refined_detailed_bills.csv', index=False)

print("Refined dataset saved to 'data/refined_detailed_bills.csv'.")

Refined dataset saved to 'data/refined_detailed_bills.csv'.


**Cell 6**:
- This cell performs a comprehensive exploratory data analysis (EDA) on the refined dataset:
  - Loads the dataset from `refined_detailed_bills.csv`.
  - Displays basic information about the dataset, including the number of entries and column types.
  - Shows data types of each column.
  - Displays the number of non-null entries for each column.
  - Displays the number of missing entries for each column.
  - Shows the number of unique values for each column.
  - Displays a preview of the first few rows of the dataset.
  - Provides a basic statistical summary of numeric columns.

In [19]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data/refined_detailed_bills.csv')

# Display basic information about the dataset
print("Basic Information:")
print(df.info())

# Show data types of each column
print("\nData Types:")
print(df.dtypes)

# Display the number of non-null entries for each column
print("\nNon-Null Count:")
print(df.notnull().sum())

# Display the number of missing entries for each column
print("\nMissing Values Count:")
print(df.isnull().sum())

# Show the number of unique values for each column
print("\nUnique Values Count:")
print(df.nunique())

# Display the first few rows of the dataset to understand its structure
print("\nPreview of Data:")
print(df.head())

# Basic statistical summary of numeric columns
print("\nStatistical Summary of Numeric Columns:")
print(df.describe())

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 444 entries, 0 to 443
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   bill_id          444 non-null    int64 
 1   bill_number      444 non-null    object
 2   title            444 non-null    object
 3   description      444 non-null    object
 4   url              444 non-null    object
 5   state_link       444 non-null    object
 6   status           444 non-null    int64 
 7   status_date      444 non-null    object
 8   session_id       444 non-null    int64 
 9   state_id         444 non-null    int64 
 10  state            444 non-null    object
 11  body_id          444 non-null    int64 
 12  current_body_id  444 non-null    int64 
 13  sponsors         444 non-null    object
 14  subjects         444 non-null    object
 15  texts            444 non-null    object
 16  votes            444 non-null    object
dtypes: int64(6), obj