In [1]:
# Mount Google Drive at /content/drive; the input and output folders used
# further down are paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import os
import glob
import json
import re
import numpy as np
import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, util

# Device index convention used here: 0 when CUDA is available, -1 for CPU.
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")
print(torch.cuda.is_available())
# Only ask for the GPU name when CUDA is actually present:
# torch.cuda.get_device_name raises on a CPU-only runtime, which would
# abort the cell before any of the definitions below are created.
if device == 0:
    print(torch.cuda.get_device_name(0))

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is not removed; it is reduced to one
    space like any other run.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)

def load_json(filepath):
    """Read the UTF-8 text file at ``filepath`` and return its parsed JSON value."""
    with open(filepath, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_json(data, filepath):
    """Write ``data`` to ``filepath`` as JSON indented by two spaces.

    The file is opened for writing as UTF-8 (replacing any existing content)
    and the encoded document is streamed into it chunk by chunk.
    """
    encoder = json.JSONEncoder(indent=2)
    with open(filepath, mode="w", encoding="utf-8") as sink:
        for chunk in encoder.iterencode(data):
            sink.write(chunk)

# Load the sentence-embedding model by name; it is used both for the label
# prototypes below and for every citation context in assign_label.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One descriptive sentence per candidate label. A context is assigned the
# label whose sentence embedding is most similar to the context's embedding,
# so the wording of these sentences directly influences the labels produced.
# NOTE(review): the "Definition" text begins with a space and the "Example"
# text reads awkwardly; both look unintentional — confirm before editing,
# since changing the text may change the resulting embeddings.
prototypes = {
    "Precedent": "This citation refers to a prior court decision or established case law that sets a legal standard for future rulings.",
    "Authority": "This citation invokes a binding legal source, such as a statute, regulation, or constitutional provision, to support an argument.",
    "Definition": " This citation is used to define or clarify a specific legal term or concept for accurate interpretation.",
    "Example": "This citation to a rule that is used to introduce something chosen as a typical case or is defining the subject by illustrating/describing it.",
    "Exception": "This citation identifies a deviation from the general rule, highlighting a specific exception in the law.",
    "Amendment": "This citation indicates a modification or update to an existing law or regulation."
}

# Encode each prototype sentence once up front (label -> embedding) so that
# assign_label only has to encode the incoming context.
prototype_embeddings = {label: model.encode(text) for label, text in prototypes.items()}

def assign_label(context):
    """Return the prototype label most similar to ``context``.

    The context is whitespace-normalized with ``clean_text``, embedded with
    the module-level ``model``, and compared by cosine similarity against each
    entry of ``prototype_embeddings``. The first label reaching the highest
    similarity wins. ``None`` is returned if no similarity is strictly greater
    than the starting floor of -1.
    """
    query_vec = model.encode(clean_text(context))

    winner, top_sim = None, -1
    for candidate, candidate_vec in prototype_embeddings.items():
        # cos_sim returns a tensor; .item() extracts the Python float.
        score = util.cos_sim(query_vec, candidate_vec).item()
        if score > top_sim:
            winner, top_sim = candidate, score
    return winner

def process_file(json_file, output_folder):
    """Label every record of one JSON file and save the result.

    The file is loaded with ``load_json`` and iterated record by record
    (assumes the top-level JSON value is a list of objects — TODO confirm).
    Each record's ``"context"`` text is passed to ``assign_label`` and the
    returned label is stored under the ``"low_confidence"`` key. Records
    whose context is missing, null, not a string, or blank after stripping
    get the label ``"Unknown"``.

    The labeled data is written to ``output_folder`` under the same base file
    name as ``json_file``; the folder is created if it does not exist.

    Args:
        json_file: Path of the JSON file to label.
        output_folder: Directory that receives the labeled copy.
    """
    data = load_json(json_file)

    for record in data:
        raw_context = record.get("context")
        # dict.get only falls back to its default when the key is absent, so a
        # JSON null (or any non-string value) must be handled explicitly
        # before calling .strip().
        context = raw_context.strip() if isinstance(raw_context, str) else ""
        record["low_confidence"] = assign_label(context) if context else "Unknown"

    # Allow this function to be called on its own, without process_all_files
    # having created the destination first.
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, os.path.basename(json_file))
    save_json(data, output_path)
    print(f"Processed and saved: {output_path}")

def process_all_files(input_folder, output_folder):
    """Run ``process_file`` on each ``*.json`` file directly inside ``input_folder``.

    ``output_folder`` is created first (including parent directories) when it
    does not already exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    pattern = os.path.join(input_folder, "*.json")
    for matched_path in glob.glob(pattern):
        process_file(matched_path, output_folder)

if __name__ == "__main__":
    # Every *.json file in input_folder is labeled and written, under the same
    # file name, into output_folder.
    input_folder = "/content/drive/MyDrive/Colab Notebooks/labeled_citations_context"
    output_folder = "/content/drive/MyDrive/Colab Notebooks/final_labeled_folder"

    process_all_files(input_folder=input_folder, output_folder=output_folder)


Using device: GPU
True
Tesla T4
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR8580.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR3684.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR7980.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR8771.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR812.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR5009.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR4763.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/S1939.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR7891.txt.json
Processed and saved: /content/drive/MyDrive/Colab Notebooks/final_labeled_folder/HR9747.txt.json
