In [1]:
import os
import json
from tqdm import tqdm

In [8]:
# Define paths for input and output directories.
# The dataset root defaults to the original local checkout; set the
# LEGAL_DATASET_ROOT environment variable to run the notebook against a copy
# of the dataset stored elsewhere. Trailing slashes are stripped so the
# derived paths never contain a doubled separator.
dataset_root = os.environ.get(
    "LEGAL_DATASET_ROOT", "/home/corpadm/Downloads/legal_dataset/dataset"
).rstrip("/")
judgement_dir = dataset_root + "/IN-Ext/judgement/"
full_summary_dir = dataset_root + "/IN-Ext/summary/full/"
segment_summary_dir = dataset_root + "/IN-Ext/summary/segment-wise/"
output_dir = dataset_root + "/processed-IN-Ext/"
os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist

In [9]:
def _read_text_with_fallback(path):
    """
    Read a text file as UTF-8, falling back to Latin-1 if decoding fails.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this second read cannot
        # fail with a decode error.
        with open(path, "r", encoding="latin-1") as f:
            return f.read()


def load_full_summaries(judgement_dir, full_summary_dir, author):
    """
    Load full summaries written by a specific author.

    Every ".txt" file in ``judgement_dir`` is paired with the file of the same
    name under ``full_summary_dir/<author>/``. Judgements without a matching
    summary file are skipped.

    Args:
        judgement_dir: Directory containing the judgement ".txt" files.
        full_summary_dir: Directory containing one sub-directory per author.
        author: Name of the author sub-directory to read summaries from.

    Returns:
        A list of dicts with keys "filename", "judgement", "summary" and
        "author", ordered by filename.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sort so that the output
    # order is reproducible across runs and machines.
    for filename in tqdm(sorted(os.listdir(judgement_dir))):
        if not filename.endswith(".txt"):
            continue
        judgement_path = os.path.join(judgement_dir, filename)
        summary_path = os.path.join(full_summary_dir, author, filename)
        if not os.path.exists(summary_path):
            continue
        data.append({
            "filename": filename,
            "judgement": _read_text_with_fallback(judgement_path),
            "summary": _read_text_with_fallback(summary_path),
            "author": author,
        })
    return data
def load_segment_summaries(segment_summary_dir, author):
    """
    Load segment-wise summaries written by a specific author, handling potential encoding issues.

    Filenames are collected from every segment directory under
    ``segment_summary_dir/<author>/`` rather than from "analysis" alone, so a
    case that lacks an analysis file is still loaded with the segments it has.

    Args:
        segment_summary_dir: Directory containing one sub-directory per author.
        author: Name of the author sub-directory to read summaries from.

    Returns:
        A list of dicts with keys "filename", "segments" and "author", ordered
        by filename. "segments" maps a segment name to the text of that
        segment and only contains the segments whose file exists.

    Raises:
        FileNotFoundError: If none of the segment directories exist for the author.
    """
    segments = ["analysis", "argument", "facts", "judgement", "statute"]
    author_dir = os.path.join(segment_summary_dir, author)

    # Gather the ".txt" filenames present in any segment directory.
    filenames = set()
    found_segment_dir = False
    for segment in segments:
        segment_dir = os.path.join(author_dir, segment)
        if not os.path.isdir(segment_dir):
            continue
        found_segment_dir = True
        filenames.update(name for name in os.listdir(segment_dir) if name.endswith(".txt"))
    if not found_segment_dir:
        # Keep a wrong path or author name a loud failure instead of an empty result.
        raise FileNotFoundError("No segment directories found under " + author_dir)

    data = []
    # Sorted so that the output order is reproducible across runs and machines.
    for filename in tqdm(sorted(filenames)):
        segment_text = {}
        for segment in segments:
            segment_path = os.path.join(author_dir, segment, filename)
            if not os.path.exists(segment_path):
                continue
            # Try reading with UTF-8, fallback to Latin-1 if decoding fails
            try:
                with open(segment_path, "r", encoding="utf-8") as f:
                    segment_text[segment] = f.read()
            except UnicodeDecodeError:
                with open(segment_path, "r", encoding="latin-1") as f:
                    segment_text[segment] = f.read()
        data.append({"filename": filename, "segments": segment_text, "author": author})
    return data

In [10]:
# Read the full summaries for authors A1 and A2, then their segment-wise summaries
print("Loading full summaries...")
full_summaries_A1, full_summaries_A2 = [
    load_full_summaries(judgement_dir, full_summary_dir, author_id)
    for author_id in ("A1", "A2")
]

print("Loading segment-wise summaries...")
segment_summaries_A1, segment_summaries_A2 = [
    load_segment_summaries(segment_summary_dir, author_id)
    for author_id in ("A1", "A2")
]

Loading full summaries...


100%|██████████| 50/50 [00:00<00:00, 23597.97it/s]
100%|██████████| 50/50 [00:00<00:00, 16332.96it/s]


Loading segment-wise summaries...


100%|██████████| 50/50 [00:00<00:00, 16987.87it/s]
100%|██████████| 50/50 [00:00<00:00, 15445.22it/s]
