In [1]:
# Mount Google Drive into the Colab filesystem; the directory paths used by
# the rest of the notebook live under /content/drive.
from google.colab import drive

drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import re
import os
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
import torch


# Punkt sentence-tokenizer data used by sent_tokenize below.
nltk.download("punkt")
nltk.download("punkt_tab")

# transformers pipelines take a device index: 0 = first CUDA GPU, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")
print(torch.cuda.is_available())
# get_device_name raises when no CUDA device is present, so only query it
# when a GPU is actually available (keeps the notebook runnable on CPU).
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

# Pass the selected device explicitly instead of relying on the library's
# version-dependent automatic placement.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Labels the zero-shot classifier chooses between for each citation context.
candidate_labels = ["Precedent", "Authority", "Definition", "Example", "Exception", "Amendment"]


# Input/output folders on the mounted Drive.
base_dir = "/content/drive/MyDrive/Colab Notebooks/"
citations_dir = os.path.join(base_dir, "citations/")
bills_dir = os.path.join(base_dir, "sample_bills/")
output_dir = os.path.join(base_dir, "labeled_citations/")

os.makedirs(output_dir, exist_ok=True)

def extract_sentence_before_and_after_citation(bill_text, start, end):
    """Return the sentence containing a citation plus its neighbouring sentences.

    The text is split with ``sent_tokenize`` and the sentence covering the
    character offset ``start`` is located. The result is that sentence joined
    (with single spaces) to the sentence before it and the sentence after it,
    when those exist.

    Args:
        bill_text: Full text in which the citation occurs.
        start: Character offset into ``bill_text`` where the citation begins.
        end: Character offset where the citation ends. Accepted for interface
            compatibility; only ``start`` is used to pick the sentence.

    Returns:
        A string with up to three consecutive sentences. If ``start`` does not
        fall inside the tokenized text (negative or past the last sentence),
        the first three sentences are returned instead; an empty string is
        returned when the text has no sentences.
    """
    sentences = sent_tokenize(bill_text)

    target_sentence_index = None
    # End offset of the previous sentence; also where the next search begins.
    search_from = 0
    for i, sentence in enumerate(sentences):
        # Look up the sentence's real position rather than assuming exactly
        # one separator character between sentences: leading whitespace or
        # multi-character gaps would otherwise shift every later offset.
        sentence_start = bill_text.find(sentence, search_from)
        if sentence_start == -1:
            # Tokenizer output was not a verbatim substring; fall back to
            # treating the sentence as starting right at the cursor.
            sentence_start = search_from
        sentence_end = sentence_start + len(sentence)

        # Offsets in the whitespace gap before a sentence are attributed to
        # that sentence, so a valid offset is never left unmatched.
        if search_from <= start < sentence_end:
            target_sentence_index = i
            break
        search_from = sentence_end

    if target_sentence_index is None:
        # Offset not found: fall back to the opening of the text.
        context_sentences = sentences[:3]
    else:
        first = max(target_sentence_index - 1, 0)
        context_sentences = sentences[first:target_sentence_index + 2]

    return " ".join(context_sentences)

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace runs are collapsed too, not removed.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed

# Label every citation file: for each citations JSON, load the matching bill
# text, classify the context around each citation, and write the annotated
# citations to output_dir under the same file name.
# Sorted so runs process files in a reproducible order (os.listdir order is arbitrary).
for citation_file in sorted(os.listdir(citations_dir)):
    if not citation_file.endswith(".json"):
        continue

    # Strip only the trailing ".json"; str.replace would also remove a
    # ".json" that happens to occur in the middle of the file name.
    bill_filename = citation_file[: -len(".json")]
    bill_text_path = os.path.join(bills_dir, bill_filename)

    if not os.path.exists(bill_text_path):
        print(f"Bill text file not found for: {bill_filename}")
        continue

    citation_path = os.path.join(citations_dir, citation_file)
    output_path = os.path.join(output_dir, citation_file)

    # NOTE(review): citation offsets are applied to the whitespace-collapsed
    # text; this assumes startPosition/endPosition were computed on text
    # normalised the same way — TODO confirm against the citation extractor.
    with open(bill_text_path, "r", encoding="utf-8") as f:
        bill_text = clean_text(f.read())

    with open(citation_path, "r", encoding="utf-8") as f:
        citations = json.load(f)

    # Gather all contexts first so the classifier is invoked once per file
    # instead of once per citation (the pipeline warns about sequential
    # per-item calls on GPU).
    classifiable_citations = []
    contexts = []
    for citation in citations:
        context = extract_sentence_before_and_after_citation(
            bill_text, citation["startPosition"], citation["endPosition"]
        )
        # The zero-shot pipeline rejects empty sequences; leave such
        # citations unlabeled rather than aborting the whole run.
        if context.strip():
            classifiable_citations.append(citation)
            contexts.append(context)

    if contexts:
        results = classifier(contexts, candidate_labels)
        # Some transformers versions return a bare dict for a single input.
        if isinstance(results, dict):
            results = [results]

        for citation, result in zip(classifiable_citations, results):
            # Labels come back ordered by score; the first is the best match.
            citation["high_confidence"] = result["labels"][0]

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(citations, f, indent=4)

    print(f"Processed and saved: {output_path}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Using device: GPU
True
Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
