In [27]:
import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split as tts
from concurrent.futures import ThreadPoolExecutor
import os
from tqdm import tqdm
import re
import warnings

In [4]:
warnings.filterwarnings("ignore")

In [5]:
df_citation = pd.read_csv("Cleaned Citations with Context.csv")

In [6]:
df_citation.info

<bound method DataFrame.info of      Unnamed: 0                                               text  \
0           113                                     section 101(9)   
1           120                                        section 101   
2           121                                        section 101   
3           123                                        section 101   
4           124                                        section 101   
...         ...                                                ...   
9490      28206  Section 102(c) of the Nuclear Energy Innovatio...   
9491      28210  Section 203 of the Energy Reorganization Act o...   
9492      28218       Section 181 of the Atomic Energy Act of 1954   
9493      28222       section 103 of the Atomic Energy Act of 1954   
9494      28239                                               2134   

     startPosition endPosition     normCite citeType             altCite  \
0            28956       28970   38 usc 101      US

In [7]:
df_citation.columns

Index(['Unnamed: 0', 'text', 'startPosition', 'endPosition', 'normCite',
       'citeType', 'altCite', 'pinCiteStr', 'pageRangeStr', 'nodeId',
       ...
       'Unnamed: 2122', 'Unnamed: 2123', 'Unnamed: 2124', 'Unnamed: 2125',
       'Unnamed: 2126', 'Unnamed: 2127', 'Unnamed: 2128', 'Unnamed: 2129',
       'Unnamed: 2130', 'Unnamed: 2131'],
      dtype='object', length=2132)

In [8]:
# Remove features that have Unnamed in the feature name
df_citation = df_citation.loc[:, ~df_citation.columns.str.contains('^Unnamed')]

In [9]:
df_citation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9495 entries, 0 to 9494
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   text                  9491 non-null   object
 1   startPosition         9491 non-null   object
 2   endPosition           9491 non-null   object
 3   normCite              9491 non-null   object
 4   citeType              9306 non-null   object
 5   altCite               2007 non-null   object
 6   pinCiteStr            5 non-null      object
 7   pageRangeStr          5 non-null      object
 8   nodeId                9491 non-null   object
 9   section               6730 non-null   object
 10  sectionAndSubSection  6730 non-null   object
 11  isShortCite           9491 non-null   object
 12  chunk_id              6403 non-null   object
 13  context               9491 non-null   object
dtypes: object(14)
memory usage: 1.0+ MB


In [10]:
annotate_df, holdout_df = tts(df_citation, train_size=0.25, random_state=2025)

In [11]:
annotate_df.shape, holdout_df.shape

((2373, 14), (7122, 14))

In [12]:
annotate_df.columns

Index(['text', 'startPosition', 'endPosition', 'normCite', 'citeType',
       'altCite', 'pinCiteStr', 'pageRangeStr', 'nodeId', 'section',
       'sectionAndSubSection', 'isShortCite', 'chunk_id', 'context'],
      dtype='object')

In [13]:
client = OpenAI(base_url="http://localhost:8051/v1", api_key="lm-studio")

In [14]:
# def classify_with_llm_with_explanation(citation, text):
#     prompt = f"""
# You are a legal expert with 20+ years of experience in legislative analysis.
# 
# You are given a **citation** and its **surrounding context** from a U.S. congressional bill. Based on the citation’s legal function in the text, classify it into ONE of the following categories:
# 
# ---
# 
# POSSIBLE LABELS:
# 
# 1. Authority – The citation grants rulemaking power, allocates legal authority, or authorizes actions/programs.
#    - Keywords: "authorized", "shall", "under the authority of", "empowered to", "required to"
#    - Example: “The Secretary shall issue regulations under 42 USC 1983.”
# 
# 2. Amending – The citation changes, updates, or extends an existing statute.
#    - Keywords: "is amended", "insert", "strike", "reauthorized"
#    - Example: “Section 102 of the Fair Housing Act is amended to include…”
# 
# 3. Rescinding – The citation repeals or invalidates part of a law.
#    - Keywords: "repealed", "rescinded", "null and void", "terminated"
#    - Example: “Section 12 of the Act is hereby repealed.”
# 
# 4. Definition – The citation is used to define a term or concept used in the bill.
#    - Keywords: "means", "includes", "as defined in", "for the purposes of this section"
#    - Example: “‘State’ means any of the 50 states, as defined in Section 2.”
# 
# 5. Precedent – The citation is included to guide interpretation or reference prior laws without changing or applying them.
#    - Keywords: "see", "reference", "according to", "based on", "note"
#    - Example: “According to 1 USC 1 note, singular includes plural.”
# 
# 6. Exception – The citation provides an exception or special case to a rule or regulation.
#    - Keywords: "notwithstanding", "except as provided", "does not apply", "exempted"
#    - Example: “Notwithstanding section 4(b), this clause shall apply to…"
# 
# Citation:
# \"\"\"{citation}\"\"\"
# 
# Context:
# \"\"\"{context}\"\"\"
# 
# ---
# 
# INSTRUCTIONS:
# 
# - Read both the **citation text** and the **context** carefully.
# - Choose **only one** of the six labels that best represents the citation’s legislative intent.
# - Do **not infer** beyond the context given.
# - Respond in this format:
# 
# Label: <One of the six labels>  
# Reason: <Brief, one-sentence justification based on the context>
# 
# """
# 
#     try:
#         response = client.chat.completions.create(
#             model="lmstudio-community/Meta-Llama-3-1-8B-Instruct-GGUF",
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.3,
#             max_tokens=100
#         )
#         return response.choices[0].message.content.strip()
#     except Exception as e:
#         print("LLM Error:", e)
#         return None

In [15]:
annotate_df[['text', 'context']].head(5)

Unnamed: 0,text,context
4593,1 USC 1,"Division A—Military Construction, Veterans Aff..."
4066,or direction,"16353(b)). <paragraph display-inline=""no-displ..."
937,42 U.S.C.,Domestic Food Programs Food and Nutrition Serv...
3623,19 USC 2434,"4655)— <clause display-inline=""no-display-inli..."
8571,2 FAM 154,(d) None of the funds appropriated or otherwis...


In [16]:
# Testing the function
# context = annotate_df.iloc[0]['context']
# citation = annotate_df.iloc[0]['text']
# 
# response = classify_with_llm_with_explanation(citation, context)
# print("Response:", response)

In [17]:
# Testing the function
# context = annotate_df.iloc[1]['context']
# citation = annotate_df.iloc[1]['text']
# 
# response = classify_with_llm_with_explanation(citation, context)
# print("Response:", response)

In [18]:
# Testing the function
# context = annotate_df.iloc[2]['context']
# citation = annotate_df.iloc[2]['text']
# 
# response = classify_with_llm_with_explanation(citation, context)
# print("Response:", response)

In [19]:
# Testing the function
# context = annotate_df.iloc[3]['context']
# citation = annotate_df.iloc[3]['text']
# 
# response = classify_with_llm_with_explanation(citation, context)
# print("Response:", response)

In [20]:
# Testing the function
# context = annotate_df.iloc[4]['context']
# citation = annotate_df.iloc[4]['text']
# 
# response = classify_with_llm_with_explanation(citation, context)
# print("Response:", response)

In [21]:
def classify_with_llm(citation, text):
    prompt = f"""

You are a legal expert with 20+ years of experience in legislative analysis.

You are given a **citation** and its **surrounding context** from a U.S. congressional bill. Based on the citation’s legal function in the text, classify it into ONE of the following categories:

---

POSSIBLE LABELS:

1. Authority – The citation grants rulemaking power, allocates legal authority, or authorizes actions/programs.
   - Keywords: "authorized", "shall", "under the authority of", "empowered to", "required to"
   - Example: “The Secretary shall issue regulations under 42 USC 1983.”

2. Amending – The citation changes, updates, or extends an existing statute.
   - Keywords: "is amended", "insert", "strike", "reauthorized"
   - Example: “Section 102 of the Fair Housing Act is amended to include…”

3. Rescinding – The citation repeals or invalidates part of a law.
   - Keywords: "repealed", "rescinded", "null and void", "terminated"
   - Example: “Section 12 of the Act is hereby repealed.”

4. Definition – The citation is used to define a term or concept used in the bill.
   - Keywords: "means", "includes", "as defined in", "for the purposes of this section"
   - Example: “‘State’ means any of the 50 states, as defined in Section 2.”

5. Precedent – The citation is included to guide interpretation or reference prior laws without changing or applying them.
   - Keywords: "see", "reference", "according to", "based on", "note"
   - Example: “According to 1 USC 1 note, singular includes plural.”

6. Exception – The citation provides an exception or special case to a rule or regulation.
   - Keywords: "notwithstanding", "except as provided", "does not apply", "exempted"
   - Example: “Notwithstanding section 4(b), this clause shall apply to…"

Citation:
\"\"\"{citation}\"\"\"

Context:
\"\"\"{text}\"\"\"

---

INSTRUCTIONS:

- Read both the **citation text** and the **context** carefully.
- Choose **only one** of the six labels that best represents the citation’s legislative intent.
- Do **not infer** beyond the context given.
- Respond in this format:

Label: <One of the six labels>  
"""
    try:
        response = client.chat.completions.create(
            model="lmstudio-community/Meta-Llama-3-1-8B-Instruct-GGUF",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=100
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("None:", e)
        return None



In [22]:
max_workers = os.cpu_count()
print("Max Workers:", max_workers)

Max Workers: 16


In [23]:
# Annotate the label to the annotate_df
def get_label(row):
    citation = row['text']
    context = row['context']
    try:
        response = classify_with_llm(citation, context)
        if response and "Label:" in response:
            return response.split("Label:")[-1].strip()
        else:
            return "Error"
    except Exception as e:
        return "Error"

In [24]:
def annotate_labels_parallel(df, max_workers=max_workers):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        labels = list(tqdm(
            executor.map(get_label, [row for _, row in df.iterrows()]),
            total=len(df),
            desc="Annotating with LLM"
        ))
    df["label"] = labels
    return df

In [25]:
annotate_df = annotate_labels_parallel(annotate_df, max_workers=max_workers)

Annotating with LLM: 100%|██████████| 2373/2373 [5:35:26<00:00,  8.48s/it]  


In [26]:
# Check the labels
annotate_df['label'].value_counts()

label
Authority                                                                                                                                                                                                                                                                                                                                                                                                   5
Definition  \n\nExplanation: The citation "20 U.S.C." is used to define a term or concept, specifically the meaning of "institution of higher education", which is referenced from section 101 of the Higher Education Act of 1965. This falls under the category of definition as it uses the keyword "has the meaning given that term" to clarify the definition.                                         3
Definition  \n\nExplanation: The citation "20 U.S.C." is used to define a term or concept, specifically the meaning of "institution of higher education", which is referenced from section 101 of the 

In [28]:
def clean_label(label):
    match = re.search(r"(Authority|Amending|Rescinding|Definition|Precedent|Exception)", label, re.IGNORECASE)
    return match.group(1).capitalize() if match else "Error"

In [29]:
annotate_df['label'] = annotate_df['label'].apply(clean_label)

In [30]:
annotate_df['label'].value_counts()

label
Authority     1801
Amending       304
Definition     225
Error           20
Rescinding      11
Exception        7
Precedent        5
Name: count, dtype: int64

In [31]:
annotate_df.head(5)

Unnamed: 0,text,startPosition,endPosition,normCite,citeType,altCite,pinCiteStr,pageRangeStr,nodeId,section,sectionAndSubSection,isShortCite,chunk_id,context,label
4593,1 USC 1,3479,3486,1 usc 1,USC,,,,0,1 USC 1,1 USC 1,False,0.0,"Division A—Military Construction, Veterans Aff...",Definition
4066,or direction,188589,188601,or dir ection,,,,,0,,,False,9.0,"16353(b)). <paragraph display-inline=""no-displ...",Definition
937,42 U.S.C.,245062,245071,42 usc,USC,,,,0,42 U.S.C.,42 U.S.C.,False,4.0,Domestic Food Programs Food and Nutrition Serv...,Authority
3623,19 USC 2434,110102,110113,19 usc 2434,USC,,,,0,19 USC 2434,19 USC 2434,False,16.0,"4655)— <clause display-inline=""no-display-inli...",Amending
8571,2 FAM 154,343562,343571,[2] 1 fam 154,UK,,,,0,,,False,,(d) None of the funds appropriated or otherwis...,Authority


In [None]:
annotate_df = annotate_df[annotate_df['label'] != 'Error'].reset_index(drop=True)

In [33]:
annotate_df['label'].value_counts()

label
Authority     1801
Amending       304
Definition     225
Rescinding      11
Exception        7
Precedent        5
Name: count, dtype: int64

In [34]:
# Save the annotated dataframe
annotate_df.to_csv("Annotated_Citations.csv", index=False)

In [35]:
# Save the holdout dataframe
holdout_df.to_csv("Holdout_Citations.csv", index=False)