In [None]:

import os
import re
import fitz  # PyMuPDF
import csv

# Folder that holds the source PDFs; the CSV and the error log below are
# written into this same folder.
pdf_dir = r"D:\NLP\Data\1950_data"     # replace with your location
# CSV that receives one row of extracted metadata per successfully read PDF.
output_csv = os.path.join(pdf_dir, "legal_case_metadata_final.csv")
# Text file to which PDF read failures are appended as "<path>: <error>".
log_file = os.path.join(pdf_dir, "error_log.txt")

def clean_text(text):
    """Flatten ``text`` to a single line with single-space separators.

    Newlines become spaces, every run of whitespace is squeezed to one
    space, and leading/trailing whitespace is removed.
    """
    without_newlines = ' '.join(text.split('\n'))
    squeezed = re.compile(r'\s+').sub(' ', without_newlines)
    return squeezed.strip()

def extract_text_from_pdf(pdf_path, max_pages=5):
    """Return the cleaned text of the leading pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Upper bound on how many leading pages are read
            (default 5, the value that used to be hard-coded).

    Returns:
        The text of those pages passed through ``clean_text``, or ``None``
        when opening or reading the file raised. Failures are appended to
        ``log_file`` as ``"<path>: <error>"`` instead of being raised, so a
        single bad PDF does not abort a batch run.
    """
    try:
        with fitz.open(pdf_path) as doc:
            # Load pages by integer index rather than slicing the document:
            # slice support on Document is not present in every PyMuPDF
            # release, and a failure here would be swallowed by the except
            # below. min() keeps short documents from indexing past the end.
            page_total = min(max_pages, len(doc))
            page_texts = [doc[page_number].get_text() for page_number in range(page_total)]
            return clean_text("".join(page_texts))
    except Exception as e:
        # Deliberate best-effort: record the failure and let the caller skip the file.
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(f"{pdf_path}: {e}\n")
        return None

def extract_about_case(text):
    """Pick a one-sentence description of the case out of ``text``.

    The opening phrases below are tried in order (case-insensitively). For
    the first phrase that occurs anywhere in ``text``, the sentence it starts
    is returned: the phrase, 20-500 characters containing no full stop or
    newline, and the closing full stop. When none of the phrases occurs, the
    first 400 characters are returned, followed by "..." if the text was
    longer than 400 characters.
    """
    openers = (
        r"(This (appeal|petition|case|application)[^\.\n]{20,500}\.)",
        r"(The petitioner[^\.\n]{20,500}\.)",
        r"(The present (appeal|case)[^\.\n]{20,500}\.)",
        r"(This criminal revision[^\.\n]{20,500}\.)",
        r"(This writ petition[^\.\n]{20,500}\.)",
    )
    # Lazy generator: patterns are searched one at a time, in list order.
    attempts = (re.search(opener, text, re.IGNORECASE) for opener in openers)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is not None:
        return first_hit.group(1).strip()

    if len(text) <= 400:
        return text
    return text[:400] + "..."

def get_year_from_filename(filename):
    """Extract a four-digit year from ``filename``.

    A standalone four-digit number in the range 1800-2099 (not touching any
    other digit) is preferred, so that longer numbers such as a case number
    in "case_12345_1950.pdf" are not mistaken for the year. If there is no
    such number, the first run of four digits is used, which is what this
    function always did before. The year range is a heuristic.

    Returns:
        The year as a string, or "Unknown" when the name has no four
        consecutive digits.
    """
    standalone_year = re.search(r'(?<!\d)((?:18|19|20)\d{2})(?!\d)', filename)
    if standalone_year:
        return standalone_year.group(1)

    # Fallback keeps the previous result for names without a plausible year.
    any_four_digits = re.search(r'(\d{4})', filename)
    return any_four_digits.group(1) if any_four_digits else "Unknown"

# Ordered (regex, label) rules applied to the lower-cased text. The first rule
# that matches decides the category, so earlier rules take priority.
_CATEGORY_RULES = (
    # \b stops "rape" from firing inside words such as "therapeutic" or "scrape".
    (r"\brape|section 376", "Rape / Sexual Offence"),
    (r"murder", "Murder / Homicide"),
    (r"property", "Property Dispute"),
    (r"contract act", "Contract Law"),
    (r"tax", "Taxation"),  # also covers "income tax"
    (r"custody", "Custody / Family Law"),
    (r"maintenance", "Maintenance / Divorce"),
    (r"constitutional", "Constitutional Law"),
    (r"bail", "Bail / Criminal Procedure"),
    (r"pocso", "Child Sexual Offence (POCSO)"),
    (r"evidence act", "Evidence / Procedure"),
)

# Upper bound on the stored title / bench strings. Both regexes capture "up to
# the next newline"; on text whose newlines were already collapsed that would
# be the rest of the document, so the capture is truncated (heuristic limit).
_MAX_FIELD_CHARS = 300


def extract_case_metadata(text):
    """Derive heuristic metadata fields from the text of a judgment.

    Args:
        text: Judgment text. It may still contain newlines or may already be
            flattened to one line.

    Returns:
        A dict with these string values:
          * "case_title": first line fragment containing "vs"
            (case-insensitive), capped at ``_MAX_FIELD_CHARS``, else "Unknown".
          * "citations": unique "<4 digits> <CAPS> <digits>" matches joined
            with ", " in order of first appearance, else "Unknown".
          * "bench": text following "Bench:" / "Bench-", capped at
            ``_MAX_FIELD_CHARS``, else "Unknown".
          * "about_case": result of ``extract_about_case(text)``.
          * "category": label of the first matching rule in
            ``_CATEGORY_RULES``, else "Other / General".
    """
    metadata = {
        "case_title": "Unknown",
        "citations": "Unknown",
        "bench": "Unknown",
        "about_case": "Unknown",
        "category": "Unknown"
    }

    title_match = re.search(r'([A-Z][^\n]+vs[^\n]+)', text, re.IGNORECASE)
    if title_match:
        metadata["case_title"] = title_match.group(1)[:_MAX_FIELD_CHARS].strip()

    citations = re.findall(r'(\d{4}\s+[A-Z]+\s+\d+)', text)
    if citations:
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the joined string differ from run to run.
        metadata["citations"] = ', '.join(dict.fromkeys(citations))

    bench_match = re.search(r'Bench\s*[:\-]\s*([^\n]+)', text)
    if bench_match:
        metadata["bench"] = bench_match.group(1)[:_MAX_FIELD_CHARS].strip()

    metadata["about_case"] = extract_about_case(text)

    lower = text.lower()
    metadata["category"] = next(
        (label for pattern, label in _CATEGORY_RULES if re.search(pattern, lower)),
        "Other / General",
    )

    return metadata

# Build the metadata CSV: one header row, then one row per PDF in `pdf_dir`
# whose text could be extracted. The file is opened once for the whole run
# instead of being re-opened in append mode for every row.
count = 0
with open(output_csv, "w", newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow([
        'case_name', 'year', 'citations', 'bench',
        'about_case', 'category', 'text'
    ])

    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the CSV reproducible between runs.
    for file in sorted(os.listdir(pdf_dir)):
        if not file.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_dir, file)
        print(f" [{count}] Processing: {file}")

        text = extract_text_from_pdf(pdf_path)
        if not text:
            # Unreadable PDFs (already logged by the extractor) and PDFs
            # with no extractable text are skipped.
            continue

        meta = extract_case_metadata(text)
        case_name = os.path.splitext(file)[0]
        year = get_year_from_filename(file)

        writer.writerow([
            case_name, year, meta['citations'], meta['bench'],
            meta['about_case'], meta['category'], text
        ])
        # Flush per row so finished rows reach the file even if a later PDF
        # interrupts the run, as the per-row append used to guarantee.
        f.flush()
        count += 1

print(f"\nFinished extracting metadata for {count} PDFs.")
print(f"Output saved to: {output_csv}")

In [None]:
import pandas as pd
import joblib

# Load the metadata CSV written by the extraction cell.
# NOTE(review): this literal repeats the location held in `output_csv`
# (pdf_dir + "legal_case_metadata_final.csv"); if one is edited, the other
# must be edited too.
data = pd.read_csv(r"D:\NLP\Data\1950_data\legal_case_metadata_final.csv")    # replace with your location
data.head()

In [None]:
# Judgment texts as a plain Python list; missing values become "".
judgment_texts = data["text"].fillna("").tolist()
# Persist the list under the file name the reload cell reads back.
joblib.dump(judgment_texts, "judgment_texts.pkl")

In [None]:
# Case names as a plain Python list; missing values become "Unknown Case".
case_names = data["case_name"].fillna("Unknown Case").tolist()
# Persist the list under the file name the reload cell reads back.
joblib.dump(case_names, "case_names.pkl")

In [None]:
from sentence_transformers import SentenceTransformer, util
# Load the 'all-MiniLM-L6-v2' sentence-transformer and encode every entry of
# `judgment_texts` with it.
model = SentenceTransformer('all-MiniLM-L6-v2')   
embeddings = model.encode(judgment_texts, show_progress_bar=True)

# Persist the model object and the embeddings with joblib.
# NOTE(review): the whole model object is pickled here and read back with
# joblib.load in the reload cell; presumably the library's own save/load
# would be more portable across versions - TODO confirm before changing,
# since both cells would have to change together.
joblib.dump(model, "model.pkl") 
joblib.dump(embeddings, "embeddings.pkl")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Category classifier: TF-IDF features (5000 most frequent terms) feeding a
# logistic regression.
# `multi_class` is intentionally not passed: scikit-learn deprecated the
# parameter in 1.5 and schedules it for removal. With the default solver,
# targets of three or more classes are fitted as multinomial either way.
# NOTE(review): with exactly two classes, older scikit-learn would now fit a
# one-vs-rest model instead - confirm the data has at least three categories.
modell = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000))
])

# Replace missing texts with "" exactly as is done for `judgment_texts`;
# TfidfVectorizer raises on NaN documents.
modell.fit(data['text'].fillna(""), data['category'])

joblib.dump(modell, "modellog.pkl")

In [None]:
# Reload the artifacts written by the joblib.dump calls in the earlier cells.
# joblib.load unpickles its input, so only load files you created yourself
# or otherwise trust.
judgment_texts = joblib.load("judgment_texts.pkl")
case_names = joblib.load("case_names.pkl")
model = joblib.load("model.pkl")
embeddings = joblib.load("embeddings.pkl")
# Note: the classifier pipeline is loaded under the name `modellog`, not
# `modell` as it is called in the training cell.
modellog = joblib.load("modellog.pkl")