In [4]:
import os
import json
import random
import pandas as pd

# Path where metadata is stored
metadata_dir = os.path.expanduser("~/high_court_dataset/metadata")

# Collect all metadata records: every *.json file under metadata_dir is parsed
# and its top-level value appended to `records`.
records = []
skipped = 0  # number of files that could not be decoded/parsed
for root, _, files in os.walk(metadata_dir):
    for f in files:
        if not f.endswith(".json"):
            continue
        # Explicit encoding so parsing does not depend on the platform default.
        with open(os.path.join(root, f), "r", encoding="utf-8") as infile:
            try:
                records.append(json.load(infile))
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are both ValueError
                # subclasses. Catching only these keeps the "skip broken json"
                # behaviour without swallowing KeyboardInterrupt or real bugs,
                # which the previous bare `except:` did.
                skipped += 1

# Convert to DataFrame for easier handling
df = pd.DataFrame(records)

print("Total judgments in metadata:", len(df))
if skipped:
    # Report silently dropped files so the total above can be interpreted.
    print("Skipped broken json files:", skipped)

# An empty or unexpected metadata directory yields a frame without the column;
# report that instead of failing with a KeyError.
if "court_name" in df.columns:
    print("Courts available:", df["court_name"].unique())
else:
    print("No 'court_name' column found. Available columns:", df.columns.tolist())


Total judgments in metadata: 15609103
Courts available: ['Bombay High Court' 'Calcutta High Court' 'High Court of Meghalaya'
 'High Court of Jammu and Kashmir' 'High Court of Uttarakhand'
 'High Court of Punjab and Haryana' 'High Court of Madhya Pradesh'
 'Madras High Court' 'High Court for State of Telangana'
 'Patna High Court' 'Allahabad High Court' 'High Court of Kerala'
 'High Court of Gujarat' 'High Court of Andhra Pradesh'
 'High Court of Orissa' 'Gauhati High Court' 'High Court of Chhattisgarh'
 'High Court of Himachal Pradesh' 'High Court of Tripura'
 'High Court of Delhi' 'High Court of Jharkhand' 'High Court of Manipur'
 'High Court of Sikkim' 'High Court of Karnataka'
 'High Court of Rajasthan']


In [5]:
# Shuffle into a NEW frame instead of rebinding `df`: reassigning `df` made the
# cell non-idempotent (each re-run shuffled the already shuffled frame, so the
# selected sample changed from run to run despite the fixed seed).
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Target: 52,000 judgments in total, split evenly across courts
target_total = 52000
# dropna(): groupby() below ignores rows without a court name, so a NaN entry
# must not be counted as a court when computing the per-court quota.
courts = df_shuffled["court_name"].dropna().unique()
if len(courts) == 0:
    raise ValueError("No court names found in metadata; nothing to sample.")
per_court = max(1, target_total // len(courts))

# After the shuffle, head(per_court) keeps up to `per_court` random rows for
# each court; courts with fewer rows contribute everything they have.
sampled = df_shuffled.groupby("court_name").head(per_court)
print("Sampled judgments:", len(sampled))

# Save the sampled metadata rows (all columns) for the later download step
sampled.to_csv("sampled_metadata.csv", index=False)


Sampled judgments: 52000


In [2]:
import os
import json
import pandas as pd

# Path where metadata is stored
metadata_dir = os.path.expanduser("~/high_court_dataset/metadata")

# Collect all metadata records: every *.json file under metadata_dir is parsed
# and appended to `records`; files that fail to parse are reported and skipped.
records = []
for root, _, files in os.walk(metadata_dir):
    for f in files:
        if f.endswith(".json"):
            file_path = os.path.join(root, f)
            # Explicit encoding so parsing does not depend on the platform default.
            with open(file_path, "r", encoding="utf-8") as infile:
                try:
                    data = json.load(infile)
                    records.append(data)
                except Exception as e:
                    print(f"‚ö†Ô∏è Skipped broken file: {file_path} ({e})")

# Convert to DataFrame
df = pd.DataFrame(records)

# Print dataset info
print("="*50)
print("‚úÖ Total judgments in metadata:", len(df))
print("="*50)

# Print available columns
print("üìÇ Columns in DataFrame:", df.columns.tolist())
print("="*50)

# Show a sample row
print("üîç Sample record:")
print(df.head(1).to_dict(orient="records"))
print("="*50)

# List the courts. The check must test the same column that is read below:
# the metadata column is 'court_name', so testing for 'court' always fell
# through to the "not found" branch and the courts were never printed.
if "court_name" in df.columns:
    print("‚öñÔ∏è Courts available:", df["court_name"].unique())
else:
    print("‚ö†Ô∏è 'court_name' column not found.")
    print("üëâ Available columns you can try instead:", df.columns.tolist())


‚úÖ Total judgments in metadata: 15609102
üìÇ Columns in DataFrame: ['court_code', 'court_name', 'raw_html', 'pdf_link', 'downloaded']
üîç Sample record:
[{'court_code': '27~1', 'court_name': 'Bombay High Court', 'raw_html': '<button type=\'button\' role=\'link\' class=\'btn btn-link p-0 text-start\' id=\'link_0\' aria-label="INPT/49/1969 of SETH RAMCHAND SHAMDAS .Array[93]. S.D. TALREJA pdf"  class=\'noToken\' href=\'#\' onclick=javascript:open_pdf(\'0\',\'\',\'court/cnrorders/newos/orders/HCBM020000311969_1_2006-12-22.pdf#page=&search=%20\'); ><font size=\'3\'>INPT/49/1969 of SETH RAMCHAND SHAMDAS Vs S.D. TALREJA</button></font><br><strong>Judge : RETIRED JUDGE</strong><br> THE HIGH COURT OF JUDICATURE AT BOMBAY THE HIGH COURT OF JUDICATURE<br><strong class=\'caseDetailsTD\' ><span style=\'color:#212F3D\'> CNR :</span><font color=\'green\'> HCBM020000311969</font><span style=\'color:#212F3D\' > | Date of registration :</span><font color=\'green\'> 21-12-2006</font><span style=\'col

In [None]:
import os
import json
import pandas as pd
import random

# Path where metadata is stored
metadata_dir = os.path.expanduser("~/high_court_dataset/metadata")


def sample_per_group(frame, group_col, per_group, seed=42):
    """Return up to `per_group` randomly chosen rows for each value of `group_col`.

    Groups smaller than `per_group` are returned whole. Rows whose `group_col`
    is missing are ignored (groupby drops NaN keys by default). Every column,
    including `group_col`, is kept and the original row index is preserved.

    Groups are iterated explicitly rather than via `groupby(...).apply(...)`,
    because apply operating on the grouping column is deprecated in recent
    pandas and would drop `group_col` from the result once it is excluded.

    Args:
        frame: DataFrame containing `group_col`.
        group_col: Name of the column to balance on.
        per_group: Maximum number of rows to draw from each group.
        seed: `random_state` passed to every per-group `sample` call.

    Returns:
        DataFrame with the sampled rows; an empty frame with the same columns
        when there is no group to sample from.
    """
    parts = [
        group.sample(min(per_group, len(group)), random_state=seed)
        for _, group in frame.groupby(group_col)
    ]
    if not parts:
        # pd.concat raises on an empty list; return an empty frame instead.
        return frame.iloc[0:0]
    return pd.concat(parts)


# Collect metadata records: every *.json file under metadata_dir is parsed and
# appended to `records`; files that fail to parse are reported and skipped.
records = []
for root, _, files in os.walk(metadata_dir):
    for f in files:
        if f.endswith(".json"):
            file_path = os.path.join(root, f)
            with open(file_path, "r", encoding="utf-8") as infile:
                try:
                    data = json.load(infile)
                    records.append(data)
                except Exception as e:
                    print(f"‚ö†Ô∏è Skipped broken file {file_path}: {e}")

# Convert to DataFrame
df = pd.DataFrame(records)

print("="*60)
print("‚úÖ Total judgments in metadata:", len(df))
print("üìë Available columns:", df.columns.tolist())
print("="*60)

# Show a sample row
print("üîç Sample record:")
print(df.head(1).to_dict(orient="records"))
print("="*60)

# Use court_name instead of court
has_court_name = "court_name" in df.columns
if has_court_name:
    print("‚öñÔ∏è Courts available:", df["court_name"].unique())
else:
    print("‚ö†Ô∏è No 'court_name' column found. Available:", df.columns.tolist())

# ---- Balanced Sampling ----
# Only sample when there is data AND the grouping column exists; previously a
# missing column was reported above but then still dereferenced here.
if len(df) > 0 and has_court_name:
    target_total = 52000
    courts = df["court_name"].dropna().unique()

    if len(courts) == 0:
        # Every court_name is missing: nothing to balance on (and the quota
        # computation below would divide by zero).
        print("‚ö†Ô∏è No 'court_name' column found. Available:", df.columns.tolist())
    else:
        per_court = max(1, target_total // len(courts))
        print(f"üéØ Sampling about {per_court} judgments per court (total ~{target_total})")

        sampled = sample_per_group(df, "court_name", per_court, seed=42)

        print("‚úÖ Final sampled judgments:", len(sampled))

        # Save sampled list to CSV
        out_csv = "sampled_metadata.csv"
        sampled.to_csv(out_csv, index=False)
        print(f"üìÇ Saved sampled metadata list to: {out_csv}")

        # Also save just the pdf_link values (one per line, no header) for easier downloading
        if "pdf_link" in sampled.columns:
            sampled["pdf_link"].to_csv("sampled_paths.txt", index=False, header=False)
            print("üìÇ Saved file paths to: sampled_paths.txt")
        else:
            print("‚ö†Ô∏è No 'pdf_link' field found. Check available columns again.")
