In [1]:
import pandas as pd
import os
import json
import re
import torch
from sklearn.model_selection import train_test_split as tts
from transformers import pipeline
import requests
from openai import OpenAI
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
citation_directory = 'citations/citations/citations'
citation_files = os.listdir(citation_directory)

In [4]:
citation_files

['HR10445.txt.json',
 'HR1435.txt.json',
 'HR1631.txt.json',
 'HR2.txt.json',
 'HR2407.txt.json',
 'HR2474.txt.json',
 'HR2617.txt.json',
 'HR2670.txt.json',
 'HR2799.txt.json',
 'HR2864.txt.json',
 'HR2872.txt.json',
 'HR2882.txt.json',
 'HR3561.txt.json',
 'HR3684.txt.json',
 'HR3746.txt.json',
 'HR3774.txt.json',
 'HR3881.txt.json',
 'HR3935.txt.json',
 'HR3950.txt.json',
 'HR4365.txt.json',
 'HR4366.txt.json',
 'HR4367.txt.json',
 'HR4368.txt.json',
 'HR4394.txt.json',
 'HR4468.txt.json',
 'HR4510.txt.json',
 'HR4531.txt.json',
 'HR4639.txt.json',
 'HR4664.txt.json',
 'HR4665.txt.json',
 'HR4758.txt.json',
 'HR4763.txt.json',
 'HR4818.txt.json',
 'HR4820.txt.json',
 'HR4821.txt.json',
 'HR485.txt.json',
 'HR4984.txt.json',
 'HR5009.txt.json',
 'HR5376.txt.json',
 'HR5378.txt.json',
 'HR5403.txt.json',
 'HR5768.txt.json',
 'HR5860.txt.json',
 'HR5863.txt.json',
 'HR5893.txt.json',
 'HR5894.txt.json',
 'HR5933.txt.json',
 'HR5961.txt.json',
 'HR6053.txt.json',
 'HR6283.txt.json',
 'H

In [5]:
citation_files = [os.path.join(citation_directory, f) for f in citation_files if f.endswith(".json")]

In [6]:
citation_data = []

In [7]:
for file in citation_files:
    with open(file, 'r', encoding='utf-8') as f:
        try:
            citations = json.load(f)
            bill_id = os.path.basename(file).split(".")[0]  
            for citation in citations:
                citation_data.append({
                    "Citation ID": f"{bill_id}_{citation.get('startPosition')}",
                    "Start Position": citation.get("startPosition", ""),
                    "End Position": citation.get("endPosition", ""),
                    "Citation Text": citation.get("text", ""),
                    "Normalized Citation": citation.get("normCite", ""),
                    "Citation Type": citation.get("citeType", ""),
                    "All Citations": citation.get("allCites", ""),
                    'Pin Cite': citation.get("pinCite", ""),
                    'Page Range': citation.get("pageRange", ""),
                    'Node ID': citation.get("nodeId", ""),
                    "Section": citation.get("sectionAndSubSection", ""),
                    'Is Short Cite': citation.get("isShortCite", ""),
                    "Bill ID": bill_id
                })
        except json.JSONDecodeError:
            print(f"Error reading JSON file: {file}")

In [8]:
citation_df = pd.DataFrame(citation_data)
citation_df.head(15)

Unnamed: 0,Citation ID,Start Position,End Position,Citation Text,Normalized Citation,Citation Type,All Citations,Pin Cite,Page Range,Node ID,Section,Is Short Cite,Bill ID
0,HR10445_2581,2581,2600,PROVISIONS Sec. 201,,,,,,0,,False,HR10445
1,HR10445_2643,2643,2713,Sec. 202. Recycling Infrastructure and Accessi...,,,,,,0,,False,HR10445
2,HR10445_2766,2766,2893,Sec. 204. Reauthorization of Diesel Emissions ...,,,,,,0,,False,HR10445
3,HR10445_3293,3293,3434,TITLE IV—VETERANS Sec. 401. Protecting Regular...,,,,,,0,,False,HR10445
4,HR10445_4039,4039,4047,Sec. 111,,,,,,0,,True,HR10445
5,HR10445_4145,4145,4153,Sec. 112,,,,,,0,,True,HR10445
6,HR10445_4555,4555,4586,TITLE VI—MISCELLANEOUS Sec. 601,,,,,,0,,False,HR10445
7,HR10445_4648,4648,4656,Sec. 602,,,,,,0,,True,HR10445
8,HR10445_5350,5350,5417,TITLE II—YOUTH POISONING PREVENTION Sec. 201. ...,,,,,,0,,False,HR10445
9,HR10445_5490,5490,5563,TITLE III—CONSUMER PRODUCT SAFETY STANDARD FOR...,,,,,,0,,False,HR10445


In [9]:
citation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38791 entries, 0 to 38790
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Citation ID          38791 non-null  object
 1   Start Position       38791 non-null  int64 
 2   End Position         38791 non-null  int64 
 3   Citation Text        38791 non-null  object
 4   Normalized Citation  38791 non-null  object
 5   Citation Type        38791 non-null  object
 6   All Citations        38791 non-null  object
 7   Pin Cite             38791 non-null  object
 8   Page Range           38791 non-null  object
 9   Node ID              38791 non-null  int64 
 10  Section              38791 non-null  object
 11  Is Short Cite        38791 non-null  bool  
 12  Bill ID              38791 non-null  object
dtypes: bool(1), int64(3), object(9)
memory usage: 3.6+ MB


In [9]:
# # So if there is empty call in normalized citation, citition type, all citations, pin cite, page range, node id, section, is short cite, we can replace them with None
# 
# if 'Normalized Citation' in citation_df.columns:
#     citation_df['Normalized Citation'] = citation_df['Normalized Citation'].replace('', 'None').fillna('None')
# if 'Citation Type' in citation_df.columns:
#     citation_df['Citation Type'] = citation_df['Citation Type'].replace('', 'None').fillna('None')
# if 'All Citations' in citation_df.columns:
#     citation_df['All Citations'] = citation_df['All Citations'].replace('', 'None').fillna('None')
# if 'Pin Cite' in citation_df.columns:
#     citation_df['Pin Cite'] = citation_df['Pin Cite'].replace('', 'None').fillna('None')
# if 'Page Range' in citation_df.columns:
#     citation_df['Page Range'] = citation_df['Page Range'].replace('', 'None').fillna('None')
# if 'Node ID' in citation_df.columns:
#     citation_df['Node ID'] = citation_df['Node ID'].replace('', 'None').fillna('None')
# if 'Section' in citation_df.columns:
#     citation_df['Section'] = citation_df['Section'].replace('', 'None').fillna('None')
# if 'Is Short Cite' in citation_df.columns:
#     citation_df['Is Short Cite'] = citation_df['Is Short Cite'].replace('', 'None').fillna('None')
#     

In [10]:
citation_df.head(20)

Unnamed: 0,Citation ID,Start Position,End Position,Citation Text,Normalized Citation,Citation Type,All Citations,Pin Cite,Page Range,Node ID,Section,Is Short Cite,Bill ID
0,HR10445_2581,2581,2600,PROVISIONS Sec. 201,,,,,,0,,False,HR10445
1,HR10445_2643,2643,2713,Sec. 202. Recycling Infrastructure and Accessi...,,,,,,0,,False,HR10445
2,HR10445_2766,2766,2893,Sec. 204. Reauthorization of Diesel Emissions ...,,,,,,0,,False,HR10445
3,HR10445_3293,3293,3434,TITLE IV—VETERANS Sec. 401. Protecting Regular...,,,,,,0,,False,HR10445
4,HR10445_4039,4039,4047,Sec. 111,,,,,,0,,True,HR10445
5,HR10445_4145,4145,4153,Sec. 112,,,,,,0,,True,HR10445
6,HR10445_4555,4555,4586,TITLE VI—MISCELLANEOUS Sec. 601,,,,,,0,,False,HR10445
7,HR10445_4648,4648,4656,Sec. 602,,,,,,0,,True,HR10445
8,HR10445_5350,5350,5417,TITLE II—YOUTH POISONING PREVENTION Sec. 201. ...,,,,,,0,,False,HR10445
9,HR10445_5490,5490,5563,TITLE III—CONSUMER PRODUCT SAFETY STANDARD FOR...,,,,,,0,,False,HR10445


In [11]:
for col in citation_df.select_dtypes(include=['object']).columns:
    citation_df[col] = citation_df[col].astype('category')

In [12]:
bill_directory = 'citations/citations/sample_bills'

In [13]:
bill_files = os.listdir(bill_directory)

In [14]:
bill_files = [os.path.join(bill_directory, f) for f in bill_files if f.endswith(".txt")]

In [15]:
bill_data = []

In [16]:
for file in bill_files:
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
        bill_id = file.split("/")[-1].split(".")[0]  
        
        bill_data.append({
            "Bill ID": bill_id,
            "content": content
        })       

In [17]:
bill_df = pd.DataFrame(bill_data)
bill_df['content'] = bill_df['content'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
bill_df['content'] = bill_df['content'].apply(lambda x: re.sub(r'<.*?>', '', x))
bill_df['Bill ID'] = bill_df['Bill ID'].apply(lambda x: re.sub('sample_bills', '', x))
bill_df['Bill ID'] = bill_df['Bill ID'].apply(lambda x: x.replace('\\', ''))
bill_df.head(15)

Unnamed: 0,Bill ID,content
0,HJRES98,IIA 118th CONGRESS 2d Session H. J. RES. 98 I...
1,HR10445,Further Continuing Appropriations and Disaste...
2,HR1435,Preserving Choice in Vehicle Purchases Act II...
3,HR1631,Protecting and Enhancing Public Access to Cod...
4,HR2,Secure the Border Act of 2023 II Calendar No....
5,HR2407,Nancy Gardner Sewell Medicare Multi-Cancer Ea...
6,HR2474,Strengthening Medicare for Patients and Provi...
7,HR2617,"Consolidated Appropriations Act, 2023 Depart..."
8,HR2670,National Defense Authorization Act for Fisca...
9,HR2799,Expanding Access to Capital Act of 2023 Helpi...


In [18]:
# we can join the two dataframes on the Bill ID column
comb_df = pd.merge(citation_df, bill_df, on='Bill ID', how='inner')

In [19]:
comb_df.head(45)

Unnamed: 0,Citation ID,Start Position,End Position,Citation Text,Normalized Citation,Citation Type,All Citations,Pin Cite,Page Range,Node ID,Section,Is Short Cite,Bill ID,content
0,HR10445_2581,2581,2600,PROVISIONS Sec. 201,,,,,,0,,False,HR10445,Further Continuing Appropriations and Disaste...
1,HR10445_2643,2643,2713,Sec. 202. Recycling Infrastructure and Accessi...,,,,,,0,,False,HR10445,Further Continuing Appropriations and Disaste...
2,HR10445_2766,2766,2893,Sec. 204. Reauthorization of Diesel Emissions ...,,,,,,0,,False,HR10445,Further Continuing Appropriations and Disaste...
3,HR10445_3293,3293,3434,TITLE IV—VETERANS Sec. 401. Protecting Regular...,,,,,,0,,False,HR10445,Further Continuing Appropriations and Disaste...
4,HR10445_4039,4039,4047,Sec. 111,,,,,,0,,True,HR10445,Further Continuing Appropriations and Disaste...
5,HR10445_4145,4145,4153,Sec. 112,,,,,,0,,True,HR10445,Further Continuing Appropriations and Disaste...
6,HR10445_4555,4555,4586,TITLE VI—MISCELLANEOUS Sec. 601,,,,,,0,,False,HR10445,Further Continuing Appropriations and Disaste...
7,HR10445_4648,4648,4656,Sec. 602,,,,,,0,,True,HR10445,Further Continuing Appropriations and Disaste...
8,HR10445_5350,5350,5417,TITLE II—YOUTH POISONING PREVENTION Sec. 201. ...,,,,,,0,,False,HR10445,Further Continuing Appropriations and Disaste...
9,HR10445_5490,5490,5563,TITLE III—CONSUMER PRODUCT SAFETY STANDARD FOR...,,,,,,0,,False,HR10445,Further Continuing Appropriations and Disaste...


In [20]:
context_window = 200

In [21]:
def extract_context_from_position(text, start, end, window=200):
    start_context = max(0, start - window)
    end_context = min(len(text), end + window)
    context = text[start_context:end_context]
    return re.sub(r'\s+', ' ', context).strip()

In [22]:
comb_df['Start Position'] = comb_df['Start Position'].astype(int)
comb_df['End Position'] = comb_df['End Position'].astype(int)

In [23]:
comb_df['Context'] = comb_df.apply(lambda row: extract_context_from_position(row['content'], row['Start Position'], row['End Position'], context_window), axis=1)

In [24]:
comb_df['Context'].head(15)

0     air, replacement, and restoration work on priv...
1     idges impacted by Tropical Storm Helene. TITLE...
2     and composting accountability. Sec. 202. Recyc...
3     n foreign boycotts of Israel. Sec. 305. Licens...
4     Imposition of sanctions. Sec. 102. Definitions...
5     ating to covered national security transaction...
6     g to the Non-SDN Chinese Military-Industrial C...
7     iling deadline for certain pre-existing report...
8     zation. Division D—Commerce matters TITLE I—SE...
9     ing programs. TITLE II—YOUTH POISONING PREVENT...
10    ion of sodium nitrite. TITLE III—CONSUMER PROD...
11    ec. 401. Short title. Sec. 402. List of entiti...
12    nalysis. Sec. 503. Critical supply chain resil...
13    nitions. TITLE VI—DEPLOYING AMERICAN BLOCKCHAI...
14    RICAN BLOCKCHAINS Sec. 601. Short title. Sec. ...
Name: Context, dtype: object

In [25]:
comb_df[['Citation Text', 'Context']].head(10)

Unnamed: 0,Citation Text,Context
0,PROVISIONS Sec. 201,"air, replacement, and restoration work on priv..."
1,Sec. 202. Recycling Infrastructure and Accessi...,idges impacted by Tropical Storm Helene. TITLE...
2,Sec. 204. Reauthorization of Diesel Emissions ...,and composting accountability. Sec. 202. Recyc...
3,TITLE IV—VETERANS Sec. 401. Protecting Regular...,n foreign boycotts of Israel. Sec. 305. Licens...
4,Sec. 111,Imposition of sanctions. Sec. 102. Definitions...
5,Sec. 112,ating to covered national security transaction...
6,TITLE VI—MISCELLANEOUS Sec. 601,g to the Non-SDN Chinese Military-Industrial C...
7,Sec. 602,iling deadline for certain pre-existing report...
8,TITLE II—YOUTH POISONING PREVENTION Sec. 201. ...,zation. Division D—Commerce matters TITLE I—SE...
9,TITLE III—CONSUMER PRODUCT SAFETY STANDARD FOR...,ing programs. TITLE II—YOUTH POISONING PREVENT...


In [26]:
comb_df.shape

(38791, 15)

In [28]:
annotate_df, holdout_df = tts(comb_df, train_size=0.25, random_state=2025)

In [30]:
annotate_df.shape, holdout_df.shape

((9697, 15), (29094, 15))

In [31]:
client = OpenAI(base_url="http://localhost:8051/v1", api_key="lm-studio")

In [76]:
def classify_with_llm_with_explanation(citation, context):
    prompt = f"""
You are a legal expert with 20 years of experience in analyzing legislative documents.

Given a **citation** and its **surrounding context**, determine the legislative intent. Choose the most appropriate label based on whether the citation introduces new authority, modifies existing law, rescinds prior law, or serves another legal function.

Possible labels:
- Authority
- Amending
- Rescinding
- Definition
- Precedent
- Exception

Citation:
\"\"\"{citation}\"\"\"

Context:
\"\"\"{context}\"\"\"

Instructions:
- Consider both the citation language and the surrounding context.
- If a citation reaffirms or continues a law (e.g., reauthorization), label as "Amending".
- If a citation grants new rulemaking power or legal basis, label as "Authority".
- Only use “Amending” if the citation clearly refers to modifying or extending an existing statute (e.g., "Section 230 of the Communications Act is amended to...").
- If the citation appears to introduce new procedures or programs without referencing a specific law being modified, classify as “Authority.”

Answer in this format:
Label: <One of the six labels>  
Reason: <One-sentence explanation of why this label applies>

"""

    try:
        response = client.chat.completions.create(
            model="lmstudio-community/Meta-Llama-3-1-8B-Instruct-GGUF",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=100
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("LLM Error:", e)
        return None

In [77]:
example_1 = """
air, replacement, and restoration work on private roads and bridges impacted by Tropical Storm Helene. TITLE II—RECYCLING, WATER, AND ENVIRONMENT RELATED PROVISIONS Sec. 201. Recycling and composting accountability. Sec. 202. Recycling Infrastructure and Accessibility Program. Sec. 203. Drinking water infrastructure risk and resilience. Sec. 204. Reauthorization of Diesel Emissions Reduction Act. Sec. 205. Nationwid
"""

row = comb_df.iloc[0]
citation = row['Citation Text']
print("Citation Text:", citation)

label = classify_with_llm_with_explanation(example_1, citation)
print("Predicted label:", label)

Citation Text: PROVISIONS Sec. 201
Predicted label: Label: Authority
Reason: The citation introduces new provisions and programs related to recycling, water, and environmental issues without referencing a specific law being modified, indicating that it grants new rulemaking power or legal basis.


In [78]:
example_2 = """
idges impacted by Tropical Storm Helene. TITLE II—RECYCLING, WATER, AND ENVIRONMENT RELATED PROVISIONS Sec. 201. Recycling and composting accountability. Sec. 202. Recycling Infrastructure and Accessibility Program. Sec. 203. Drinking water infrastructure risk and resilience. Sec. 204. Reauthorization of Diesel Emissions Reduction Act. Sec. 205. Nationwide Consumer and Fuel Retailer Choice Act of 2024. TITLE III—FOREIGN AFFAIRS Sec. 301. Global engagement center ext
"""

row = comb_df.iloc[1]
citation = row['Citation Text']
print("Citation Text:", citation)

label = classify_with_llm_with_explanation(example_2, citation)
print("Predicted label:", label)

Citation Text: Sec. 202. Recycling Infrastructure and Accessibility Program. Sec. 203
Predicted label: Label: Authority  
Reason: The citation introduces new authority by establishing the Recycling Infrastructure and Accessibility Program (Sec. 202) without referencing a specific law being modified, indicating that it grants new rulemaking power or legal basis.


In [79]:
example_3 = """
and composting accountability. Sec. 202. Recycling Infrastructure and Accessibility Program. Sec. 203. Drinking water infrastructure risk and resilience. Sec. 204. Reauthorization of Diesel Emissions Reduction Act. Sec. 205. Nationwide Consumer and Fuel Retailer Choice Act of 2024. TITLE III—FOREIGN AFFAIRS Sec. 301. Global engagement center extension. Sec. 302. Haiti Criminal Collusion Transparency Act of 2024. Sec. 303. Extension of special rules for Haiti under Caribbean Basin Economic Recovery Act. Sec. 304. Reports o
"""

row = comb_df.iloc[2]
citation = row['Citation Text']
print("Citation Text:", citation)

label = classify_with_llm_with_explanation(example_3, citation)
print("Predicted label:", label)

Citation Text: Sec. 204. Reauthorization of Diesel Emissions Reduction Act. Sec. 205. Nationwide Consumer and Fuel Retailer Choice Act of 2024
Predicted label: Label: Authority  
Reason: The citations for Sec. 204 and Sec. 205 introduce new programs (Reauthorization of Diesel Emissions Reduction Act and Nationwide Consumer and Fuel Retailer Choice Act of 2024) without referencing a specific law being modified, indicating the introduction of new authority.


In [50]:
comb_df[['Citation Text', 'Context']].head(30)

Unnamed: 0,Citation Text,Context
0,PROVISIONS Sec. 201,"air, replacement, and restoration work on priv..."
1,Sec. 202. Recycling Infrastructure and Accessi...,idges impacted by Tropical Storm Helene. TITLE...
2,Sec. 204. Reauthorization of Diesel Emissions ...,and composting accountability. Sec. 202. Recyc...
3,TITLE IV—VETERANS Sec. 401. Protecting Regular...,n foreign boycotts of Israel. Sec. 305. Licens...
4,Sec. 111,Imposition of sanctions. Sec. 102. Definitions...
5,Sec. 112,ating to covered national security transaction...
6,TITLE VI—MISCELLANEOUS Sec. 601,g to the Non-SDN Chinese Military-Industrial C...
7,Sec. 602,iling deadline for certain pre-existing report...
8,TITLE II—YOUTH POISONING PREVENTION Sec. 201. ...,zation. Division D—Commerce matters TITLE I—SE...
9,TITLE III—CONSUMER PRODUCT SAFETY STANDARD FOR...,ing programs. TITLE II—YOUTH POISONING PREVENT...


In [80]:
example_4 = """
TITLE IX—TAKE IT DOWN ACT Sec. 1001. Short title. Sec. 1002. Criminal prohibition on intentional disclosure of nonconsensual intimate visual depictions. Sec. 1003. Notice and removal of nonconsensual intimate visual depictions. Sec. 1004. Definitions. Sec. 1005. Severability. TITLE X—RURAL BROADBAND PROTECTION ACT OF 2024 Sec. 1101. Short title. Sec. 1102. Vetting process for prospective high-cost univers
"""

row = comb_df.iloc[17]
citation = row['Citation Text']
print("Citation Text:", citation)

label = classify_with_llm_with_explanation(example_4, citation)
print("Predicted label:", label)

Citation Text: Sec. 1003
Predicted label: Label: Authority  
Reason: The citation introduces new procedures and programs related to nonconsensual intimate visual depictions without referencing a specific law being modified, indicating that it grants new rulemaking power or legal basis.


In [81]:
example_5 = """
X—RURAL BROADBAND PROTECTION ACT OF 2024 Sec. 1101. Short title. Sec. 1102. Vetting process for prospective high-cost universal service fund applicants. TITLE XI—AMERICAN MUSIC TOURISM Sec. 1201. Short title. Sec. 1202. Responsibilities of the Assistant Secretary of Commerce for Travel and Tourism. TITLE XII—INFORMING CONSUMERS ABOUT SMART DEVICES Sec. 1301. Short title. Sec. 1302. Required disclosure of a camera or recording capability in certain internet-con
"""

row = comb_df.iloc[19]
citation = row['Citation Text']
print("Citation Text:", citation)

label = classify_with_llm_with_explanation(example_5, citation)
print("Predicted label:", label)

Citation Text: TITLE XI—AMERICAN MUSIC TOURISM Sec. 1201. Short title. Sec. 1202
Predicted label: Based on the provided citation and surrounding context, I would label the citation as:

Label: Authority
Reason: The citation introduces new authority by establishing a "vetting process for prospective high-cost universal service fund applicants" without referencing a specific law being modified, indicating that it grants new rulemaking power or legal basis.

The surrounding context of TITLE XI—AMERICAN MUSIC TOURISM and TITLE XII—INFORMING CONSUMERS ABOUT SMART DEVICES suggests that these titles are separate from the main body


In [83]:
example_6 = """
n foreign boycotts of Israel. Sec. 305. Licensing transparency. Sec. 306. Ten-year statute of limitations for export control and anti-boycott violations. TITLE IV—VETERANS Sec. 401. Protecting Regular Order for Veterans Act of 2024. Sec. 402. Improving Veterans’ Experience Act of 2024. Sec. 403. Naming the Department of Veterans Affairs community-based outpatient clinic in Plano, Texas, as the U.S. Congressman Sam Johnson Memorial VA Clinic. Sec. 404. Eddie Bernice Johnson VA Medical Center. TITLE V—COMPREHENSIVE OUTBOUND INVESTMENT NA
"""

row = comb_df.iloc[3]
citation = row['Citation Text']
print("Citation Text:", citation)

label = classify_with_llm_with_explanation(example_6, citation)
print("Predicted label:", label)

Citation Text: TITLE IV—VETERANS Sec. 401. Protecting Regular Order for Veterans Act of 2024. Sec. 402. Improving Veterans’ Experience Act of 2024. Sec. 403
Predicted label: Label: Authority
Reason: The citation introduces new legislation and programs for veterans, such as the Protecting Regular Order for Veterans Act of 2024 and Improving Veterans’ Experience Act of 2024, without referencing a specific law being modified or rescinded.
