In [1]:
import numpy as np
import pandas as pd
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import random
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import nlpaug.augmenter.word as naw 
import warnings

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence warnings are hidden too.
warnings.filterwarnings("ignore")

In [3]:
# Read the labelled citation table and preview its first rows.
final_labeled_data = pd.read_csv(filepath_or_buffer='Final_Citations_Label.csv')
final_labeled_data.head()

Unnamed: 0,text,startPosition,endPosition,normCite,citeType,altCite,pinCiteStr,pageRangeStr,nodeId,section,sectionAndSubSection,isShortCite,chunk_id,context,original_label
0,1 USC 1,3479,3486,1 usc 1,USC,,,,0,1 USC 1,1 USC 1,False,0.0,"Division A—Military Construction, Veterans Aff...",Definition
1,or direction,188589,188601,or dir ection,,,,,0,,,False,9.0,"16353(b)). <paragraph display-inline=""no-displ...",Definition
2,42 U.S.C.,245062,245071,42 usc,USC,,,,0,42 U.S.C.,42 U.S.C.,False,4.0,Domestic Food Programs Food and Nutrition Serv...,Authority
3,19 USC 2434,110102,110113,19 usc 2434,USC,,,,0,19 USC 2434,19 USC 2434,False,16.0,"4655)— <clause display-inline=""no-display-inli...",Amending
4,2 FAM 154,343562,343571,[2] 1 fam 154,UK,,,,0,,,False,,(d) None of the funds appropriated or otherwis...,Authority


In [4]:
# Number of rows per value of `original_label`.
final_labeled_data["original_label"].value_counts()

original_label
Authority     7295
Amending      1302
Definition     855
Rescinding      11
Exception        7
Precedent        5
Name: count, dtype: int64

In [5]:
# Per-label row counts and the size of the largest class.
class_counts = final_labeled_data.original_label.value_counts()
max_count = class_counts.max()

In [6]:
# Number of extra rows each minority label needs to reach the size of the
# largest class. Derived from `class_counts` / `max_count` so the figures stay
# correct when the input CSV changes (the majority class itself is excluded).
labels_to_generate = {
    label: int(max_count - count)
    for label, count in class_counts.items()
    if count < max_count
}

In [7]:
# One re-indexed sub-frame per label listed in `labels_to_generate`.
label_dfs = {}
for label in labels_to_generate:
    is_label = final_labeled_data["original_label"] == label
    label_dfs[label] = final_labeled_data[is_label].reset_index(drop=True)

In [8]:
# Row count of every per-label sub-frame.
label_context_counts = {label: len(frame) for label, frame in label_dfs.items()}
label_context_counts

{'Amending': 1302,
 'Definition': 855,
 'Rescinding': 11,
 'Exception': 7,
 'Precedent': 5}

In [10]:
# OpenAI-compatible client pointed at an endpoint on localhost:8051.
# NOTE(review): presumably a local LM Studio server, with "lm-studio" as a placeholder key — confirm.
client = OpenAI(base_url="http://localhost:8051/v1", api_key="lm-studio")

In [14]:
def generate_synthetic_context(
    label,
    seed_context,
    model="lmstudio-community/Meta-Llama-3-1-8B-Instruct-GGUF",
    temperature=0.7,
    max_tokens=300,
):
    """Ask the chat model for one new legislative passage of the given label.

    Parameters
    ----------
    label : str
        Target category; it is inserted verbatim into the prompt.
    seed_context : str
        Example passage the model is told to use as inspiration only.
    model : str, optional
        Model identifier forwarded to the chat-completions endpoint.
    temperature : float, optional
        Sampling temperature forwarded to the endpoint.
    max_tokens : int, optional
        Completion length limit forwarded to the endpoint.

    Returns
    -------
    str
        The generated passage with surrounding whitespace removed.

    Raises
    ------
    ValueError
        If the model returns no text (``None`` or only whitespace), so that an
        empty sample never ends up in the dataset.
    """
    prompt = f"""
You are a legislative domain expert with 20+ years of experience drafting, interpreting, and classifying U.S. Congressional bill language.

Your task is to generate a new, original and human-like legislative context that falls under the category "{label}". Use the given seed context as inspiration, but do not copy or reuse any phrases directly.

---

Seed Context:
\"\"\"{seed_context}\"\"\"

Instructions:
- Produce a realistic legislative paragraph or section, written in the style of U.S. Congressional bills.
- The text must belong to the label: "{label}" based on legal function.
- Only return the new legislative context (no explanations).

Label Guidance:
- Amending: Modify existing statute.
- Rescinding: Repeal or nullify.
- Definition: Define terms clearly.
- Precedent: Reference past legal rules.
- Exception: Override or exempt rules.
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    content = response.choices[0].message.content
    # The content field may be None; fail with a clear message instead of an
    # AttributeError on `.strip()`, and reject blank completions as well.
    if content is None or not content.strip():
        raise ValueError(f"Model returned no text for label {label!r}")
    return content.strip()

# One hand-written seed passage per minority label
seed_contexts = {
    "Amending": "Section 102 of the Fair Housing Act is amended by inserting after subsection (c) the following new subsection.",
    "Rescinding": "Section 12 of the 2015 Education Reform Act is hereby repealed in its entirety.",
    "Definition": "'Cybersecurity threat' means any unauthorized access to data or systems as defined in section 302(b).",
    "Precedent": "Pursuant to the ruling in Brown v. Board of Education, all segregated facilities are deemed unconstitutional.",
    "Exception": "Notwithstanding section 4(b), individuals under the age of 18 shall be exempt from the tax penalty."
}

# Request five generated passages for each label; a failed request is
# reported and skipped so the remaining requests still run.
synthetic_data = []
for label, seed in seed_contexts.items():
    print(f"🔧 Generating 5 examples for label: {label}")
    tag = label[:4].capitalize()
    for i in tqdm(range(5)):
        try:
            record = {
                "text": f"Synthetic-{tag}-{i+1:03}",
                "context": generate_synthetic_context(label, seed),
                "original_label": label,
            }
        except Exception as e:
            print(f"Failed for {label}: {e}")
        else:
            synthetic_data.append(record)

🔧 Generating 5 examples for label: Amending


100%|██████████| 5/5 [00:54<00:00, 10.97s/it]


🔧 Generating 5 examples for label: Rescinding


100%|██████████| 5/5 [00:37<00:00,  7.52s/it]


🔧 Generating 5 examples for label: Definition


100%|██████████| 5/5 [00:28<00:00,  5.61s/it]


🔧 Generating 5 examples for label: Precedent


100%|██████████| 5/5 [00:24<00:00,  4.94s/it]


🔧 Generating 5 examples for label: Exception


100%|██████████| 5/5 [00:27<00:00,  5.47s/it]


In [16]:
# Tabulate the generated records and preview the first rows.
synthetic_df = pd.DataFrame(data=synthetic_data)
synthetic_df.head()

Unnamed: 0,text,context,original_label
0,Synthetic-Amen-001,Section 305 of the Public Health Service Act i...,Amending
1,Synthetic-Amen-002,Section 305 of the Clean Air Act is amended by...,Amending
2,Synthetic-Amen-003,Section 307 of the Federal Trade Commission Ac...,Amending
3,Synthetic-Amen-004,Section 305 of the Clean Water Act is modified...,Amending
4,Synthetic-Amen-005,"""Section 305 of the Public Lands Act is amende...",Amending


In [17]:
# Stack the labelled rows and the generated rows under a fresh 0..n-1 index.
combined_df = pd.concat(
    objs=[final_labeled_data, synthetic_df],
    axis=0,
    ignore_index=True,
)

In [18]:
# (rows, columns) of the stacked frame.
combined_df.shape

(9500, 15)

In [19]:
# Model input string: citation text and its context joined by " [SEP] ",
# with missing values replaced by the literal "None".
text_part = combined_df['text'].fillna("None")
context_part = combined_df['context'].fillna("None")
combined_df['combined'] = text_part + " [SEP] " + context_part

In [20]:
# Fit the encoder on the label column, then store the integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(combined_df['original_label'])
combined_df['encoded_label'] = label_encoder.transform(combined_df['original_label'])

In [21]:
# Load tokenizer and encoder from the local checkpoint, switch the encoder
# to evaluation mode and move it to the GPU.
model_path = "legal_model/checkpoint-1475"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
model = model.eval()
model = model.cuda()

Some weights of RobertaModel were not initialized from the model checkpoint at legal_model/checkpoint-1475 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def get_embedding(text):
    """Return a mean-pooled embedding of `text` from the global encoder.

    Parameters
    ----------
    text : str or list of str
        Input passed to the global `tokenizer` (truncated to 512 tokens).

    Returns
    -------
    numpy.ndarray
        Average of the final hidden states over the non-padding tokens, with
        size-1 dimensions squeezed away, so a single string yields a 1-D
        vector.
    """
    # Follow the model's own device instead of assuming CUDA.
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    mask = inputs.get("attention_mask")
    if mask is None:
        pooled = hidden.mean(dim=1)
    else:
        # Weight by the attention mask so padding positions (present when a
        # batch of unequal-length texts is passed) do not dilute the average.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
    return pooled.squeeze().cpu().numpy()

In [24]:
# Embed every combined string one at a time, then stack into a 2-D array.
embedding_rows = []
for txt in tqdm(combined_df["combined"], desc="Embedding full dataset"):
    embedding_rows.append(get_embedding(txt))
embeddings = np.array(embedding_rows)

# Integer class codes aligned row-for-row with `embeddings`.
labels = combined_df["encoded_label"].to_numpy()

Embedding full dataset: 100%|██████████| 9500/9500 [01:45<00:00, 89.80it/s] 


In [25]:
# Oversample every class up to the size of the largest one. The target is
# computed from the data rather than hard-coded, so it cannot drift from the
# actual majority count when the input changes.
class_distribution = combined_df['encoded_label'].value_counts().to_dict()
target_count = max(class_distribution.values())
sampling_strategy = {cls: target_count for cls, count in class_distribution.items() if count < target_count}

In [26]:
# Oversample the classes listed in `sampling_strategy` in embedding space.
smote = SMOTE(
    sampling_strategy=sampling_strategy,
    k_neighbors=5,
    random_state=5934,
)
X_res, y_res = smote.fit_resample(embeddings, labels)

In [27]:
# Rows past the original sample count are the ones SMOTE generated.
X_syn = X_res[len(embeddings):]
y_syn = y_res[len(embeddings):]

# Map each generated vector to its most similar ORIGINAL row of the SAME class.
# Searching across all classes can pair a minority-class vector with the text
# of a row carrying a different label, which puts mislabeled rows in the output.
# Working class by class also keeps each similarity matrix small.
closest_indices = np.empty(len(X_syn), dtype=np.intp)
for cls in np.unique(y_syn):
    syn_rows = np.flatnonzero(y_syn == cls)
    orig_rows = np.flatnonzero(labels == cls)
    class_sim = cosine_similarity(X_syn[syn_rows], embeddings[orig_rows])
    closest_indices[syn_rows] = orig_rows[class_sim.argmax(axis=1)]

In [28]:
# Give every SMOTE-generated label the text/context of its matched original
# row and flag the resulting rows as synthetic.
matched_rows = combined_df.loc[closest_indices, ["text", "context"]]
synthetic_df = pd.DataFrame({
    "text": matched_rows["text"].values,
    "context": matched_rows["context"].values,
    "original_label": label_encoder.inverse_transform(y_syn),
    "synthetic": True
})


In [29]:
# Flag the existing rows as non-synthetic, then append the oversampled rows.
combined_df['synthetic'] = False
final_df = pd.concat(
    objs=[combined_df, synthetic_df],
    axis=0,
    ignore_index=True,
)

In [30]:
# (rows, columns) after appending the oversampled rows.
final_df.shape

(43770, 18)

In [31]:
# Per-column non-null counts and dtypes of the balanced frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43770 entries, 0 to 43769
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   text                  43766 non-null  object 
 1   startPosition         9471 non-null   object 
 2   endPosition           9471 non-null   object 
 3   normCite              9471 non-null   object 
 4   citeType              9286 non-null   object 
 5   altCite               1991 non-null   object 
 6   pinCiteStr            5 non-null      object 
 7   pageRangeStr          5 non-null      object 
 8   nodeId                9471 non-null   object 
 9   section               6729 non-null   object 
 10  sectionAndSubSection  6729 non-null   object 
 11  isShortCite           9471 non-null   object 
 12  chunk_id              6391 non-null   object 
 13  context               43766 non-null  object 
 14  original_label        43770 non-null  object 
 15  combined           

In [32]:
# Rows per label after oversampling.
final_df['original_label'].value_counts()

original_label
Definition    7295
Authority     7295
Amending      7295
Exception     7295
Rescinding    7295
Precedent     7295
Name: count, dtype: int64

In [34]:
# Remove the helper columns before export. A non-inplace drop with
# errors='ignore' lets this cell be re-run without a KeyError once the
# columns are already gone.
# NOTE(review): dropping 'synthetic' means the exported file no longer marks which rows were oversampled.
final_df = final_df.drop(columns=['encoded_label', 'combined', 'synthetic'], errors='ignore')

In [35]:
# First five rows of the frame.
final_df.head(5)

Unnamed: 0,text,startPosition,endPosition,normCite,citeType,altCite,pinCiteStr,pageRangeStr,nodeId,section,sectionAndSubSection,isShortCite,chunk_id,context,original_label
0,1 USC 1,3479,3486,1 usc 1,USC,,,,0,1 USC 1,1 USC 1,False,0.0,"Division A—Military Construction, Veterans Aff...",Definition
1,or direction,188589,188601,or dir ection,,,,,0,,,False,9.0,"16353(b)). <paragraph display-inline=""no-displ...",Definition
2,42 U.S.C.,245062,245071,42 usc,USC,,,,0,42 U.S.C.,42 U.S.C.,False,4.0,Domestic Food Programs Food and Nutrition Serv...,Authority
3,19 USC 2434,110102,110113,19 usc 2434,USC,,,,0,19 USC 2434,19 USC 2434,False,16.0,"4655)— <clause display-inline=""no-display-inli...",Amending
4,2 FAM 154,343562,343571,[2] 1 fam 154,UK,,,,0,,,False,,(d) None of the funds appropriated or otherwis...,Authority


In [36]:
# Last five rows of the frame (the appended oversampled rows sit at the end).
final_df.tail(5)

Unnamed: 0,text,startPosition,endPosition,normCite,citeType,altCite,pinCiteStr,pageRangeStr,nodeId,section,sectionAndSubSection,isShortCite,chunk_id,context,original_label
43765,Synthetic-Resc-003,,,,,,,,,,,,,Section 305 of the 2002 Environmental Protecti...,Rescinding
43766,936,,,,,,,,,,,,,(f) Effective date Except with respect to subp...,Rescinding
43767,40 CFR part 60 subparts CCCC,,,,,,,,,,,,,To support the key role that forests in the Un...,Rescinding
43768,section 27 of the Stevenson-Wydler Technology ...,,,,,,,,,,,,,"The Department of Commerce, the National Aeron...",Rescinding
43769,section 872 of the Homeland Security Act of 2002,,,,,,,,,,,,,"1448). <section display-inline=""no-display-inl...",Rescinding


In [38]:
# Write the balanced dataset to disk without the row index.
final_df.to_csv('Final_Citations_Labels_Smote.csv', index=False)