In [None]:
!pip install --upgrade pymupdf  # Installs the PyMuPDF library for handling PDF and document processing.
!pip install pytesseract  # Installs pytesseract for OCR functionality to extract text from images.
!pip install google-api-python-client  # Installs the client library to access Google APIs programmatically.
!pip install google-auth  # Installs Google authentication library for secure access to Google services.
!pip install Pillow  # Installs Pillow, a powerful library for image manipulation and processing.
!pip install summa
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!apt install tesseract-ocr  # Installs Tesseract OCR engine
!apt install libtesseract-dev # Installs Tesseract development file


Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.4
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: summa
  Building wheel for summa (setup.py) ... [?25l[?25hdone
  Create

In [None]:
from google.colab import drive
import os

# Make the Drive contents available under /content/drive
drive.mount('/content/drive')

# Root folder that holds both judgement collections
base_dir = "/content/drive/MyDrive/Legal_Cases_Dataset"

# Split name -> folder of the Civil collection (folder names match the Drive layout)
civil_dirs = {
    split: os.path.join(base_dir, "Civil Supreme Court and High Court Judgements", folder)
    for split, folder in (("train", "Training"), ("val", "Validation"), ("test", "Testing"))
}

# Split name -> folder of the Criminal collection
criminal_dirs = {
    split: os.path.join(base_dir, "Criminal Supreme Court and High Court Judgements", folder)
    for split, folder in (("train", "Training"), ("val", "Validation"), ("test", "Testing"))
}

# Function to list all PDF files in a given directory
def list_pdf_files(directory):
    """Return the full paths of the PDF files located directly inside ``directory``.

    Args:
        directory: Folder to scan. Sub-folders are not descended into.

    Returns:
        list[str]: Paths of the regular files whose name ends with ``.pdf``
        (case-sensitive), in sorted order. An empty list, plus a printed
        message, when ``directory`` is not an existing folder.
    """
    # isdir rather than exists: a path that points at a file must not reach scandir.
    if not os.path.isdir(directory):
        print(f"❌ Directory does not exist: {directory}")
        return []

    with os.scandir(directory) as entries:
        # is_file() filters out folders that merely happen to be named "*.pdf";
        # sorting makes the processing order reproducible between runs.
        return sorted(
            entry.path
            for entry in entries
            if entry.is_file() and entry.name.endswith('.pdf')
        )

# Report how many PDFs every split of both collections contains
for category, paths in (("Civil", civil_dirs), ("Criminal", criminal_dirs)):
    print(f"\n📌 {category} Dataset:")
    for split, path in paths.items():
        pdf_files = list_pdf_files(path)
        print(f"  🔹 {split.capitalize()} Set: {len(pdf_files)} PDFs found")


Mounted at /content/drive

📌 Civil Dataset:
  🔹 Train Set: 595 PDFs found
  🔹 Val Set: 129 PDFs found
  🔹 Test Set: 125 PDFs found

📌 Criminal Dataset:
  🔹 Train Set: 560 PDFs found
  🔹 Val Set: 124 PDFs found
  🔹 Test Set: 116 PDFs found


In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import os

# Tell pytesseract where the Tesseract executable is located.
# Adjust this path when the notebook runs somewhere other than Linux/Colab.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Linux/Colab default
# Example Windows location: r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Function to extract text using the Hybrid Method (PyMuPDF + OCR)
def extract_text_hybrid(pdf_path, ocr_dpi=300):
    """Extract the text of a PDF, using OCR for pages that have no text layer.

    Every page is first read with PyMuPDF's plain-text extraction. When that
    returns nothing but whitespace, the page is rendered to an image and the
    image is passed to Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_dpi: Resolution, in dots per inch, at which pages that need OCR
            are rendered. PyMuPDF's default rendering resolution is far lower
            than the roughly 300 DPI Tesseract recommends, hence the default.

    Returns:
        str: The text of all pages in page order. Each page is preceded by a
        "📄 Page N:" header, or "📄 Page N (OCR Applied):" when OCR was used.
        Leading and trailing whitespace of the whole result is stripped.
    """
    page_chunks = []

    # The context manager closes the document even when a page fails to
    # render or OCR raises, so no file handle is left open.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            extracted_text = page.get_text("text")  # Fast path: embedded text layer
            if extracted_text.strip():
                page_chunks.append(f"\n📄 Page {page_number}:\n" + extracted_text + "\n")
                continue

            # No usable text layer: rasterise the page and run OCR on the image.
            pix = page.get_pixmap(dpi=ocr_dpi)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            ocr_text = pytesseract.image_to_string(img)
            page_chunks.append(f"\n📄 Page {page_number} (OCR Applied):\n" + ocr_text + "\n")

    # Join once at the end instead of growing a string page by page.
    return "".join(page_chunks).strip()

# Function to list all PDF files in a directory
def list_pdf_files(directory):
    """Return the full paths of the PDF files located directly inside ``directory``.

    Args:
        directory: Folder to scan. Sub-folders are not descended into.

    Returns:
        list[str]: Paths of the regular files whose name ends with ``.pdf``
        (case-sensitive), in sorted order. An empty list, plus a printed
        message, when ``directory`` is not an existing folder.
    """
    # isdir rather than exists: a path that points at a file must not reach scandir.
    if not os.path.isdir(directory):
        print(f"❌ Directory does not exist: {directory}")
        return []

    with os.scandir(directory) as entries:
        # is_file() filters out folders that merely happen to be named "*.pdf";
        # sorting makes the extraction order reproducible between runs.
        return sorted(
            entry.path
            for entry in entries
            if entry.is_file() and entry.name.endswith('.pdf')
        )

# Process all PDFs and save extracted text
def process_dataset(dataset_dirs, category):
    """Extract every PDF of a dataset into a text file next to its split folder.

    For each split folder the PDFs are listed with ``list_pdf_files`` and each
    one is converted with ``extract_text_hybrid``. The result is written as
    UTF-8 to ``<split folder>/Processed_Text/<pdf name without extension>.txt``.

    A PDF whose extraction raises is reported and skipped so that a single
    unreadable file does not abort the rest of the batch; the number of
    failures is printed once all splits are done.

    Args:
        dataset_dirs: Mapping of split name to the folder holding that split's PDFs.
        category: Label used only in the progress messages.
    """
    print(f"\n🚀 Processing {category} Dataset...")
    failed = []

    for directory in dataset_dirs.values():
        pdf_files = list_pdf_files(directory)
        if not pdf_files:
            # Nothing to extract. Skipping here also avoids creating a
            # Processed_Text tree underneath a split folder that does not exist.
            continue

        output_dir = os.path.join(directory, "Processed_Text")
        os.makedirs(output_dir, exist_ok=True)  # Create output folder if not exists

        for pdf in pdf_files:
            print(f"📄 Extracting: {pdf}")
            try:
                text = extract_text_hybrid(pdf)
            except Exception as exc:
                print(f"❌ Failed: {pdf} ({exc})")
                failed.append(pdf)
                continue

            # splitext only swaps the final extension; str.replace would also
            # rewrite a ".pdf" that appears in the middle of the file name.
            stem, _ = os.path.splitext(os.path.basename(pdf))
            text_filename = os.path.join(output_dir, stem + ".txt")
            with open(text_filename, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"✅ Extracted: {text_filename}")

    if failed:
        print(f"\n⚠️ {len(failed)} PDF(s) could not be extracted in the {category} dataset.")

# Run extraction for the Civil and Criminal datasets. Each call walks every split
# folder of the given mapping and writes one .txt per PDF into that split's
# "Processed_Text" sub-folder.
process_dataset(civil_dirs, "Civil")
process_dataset(criminal_dirs, "Criminal")



🚀 Processing Civil Dataset...
📄 Extracting: /content/drive/MyDrive/Legal_Cases_Dataset/Civil Supreme Court and High Court Judgements/Training/Landownership - 1.pdf
✅ Extracted: /content/drive/MyDrive/Legal_Cases_Dataset/Civil Supreme Court and High Court Judgements/Training/Processed_Text/Landownership - 1.txt
📄 Extracting: /content/drive/MyDrive/Legal_Cases_Dataset/Civil Supreme Court and High Court Judgements/Training/Landownership - 2.pdf
✅ Extracted: /content/drive/MyDrive/Legal_Cases_Dataset/Civil Supreme Court and High Court Judgements/Training/Processed_Text/Landownership - 2.txt
📄 Extracting: /content/drive/MyDrive/Legal_Cases_Dataset/Civil Supreme Court and High Court Judgements/Training/Landownership - 3.pdf
✅ Extracted: /content/drive/MyDrive/Legal_Cases_Dataset/Civil Supreme Court and High Court Judgements/Training/Processed_Text/Landownership - 3.txt
📄 Extracting: /content/drive/MyDrive/Legal_Cases_Dataset/Civil Supreme Court and High Court Judgements/Training/Landownersh

In [None]:
import os
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Root of the source dataset: the folder that contains
# "Civil Supreme Court and High Court Judgements" and
# "Criminal Supreme Court and High Court Judgements"
base_dir = "/content/drive/MyDrive/Legal_Cases_Dataset"

# Drive folder that receives the extracted text files
destination_folder = "/content/drive/MyDrive/Extracted Dataset from PDF"

# "<Case>_<Split>" -> destination_folder/<Case>/<Split>
structured_folders = {
    f"{case_type}_{split_name}": os.path.join(destination_folder, case_type, split_name)
    for case_type in ("Civil", "Criminal")
    for split_name in ("Train", "Validation", "Testing")
}

# Make sure every destination folder exists before anything is moved
for path in structured_folders.values():
    os.makedirs(path, exist_ok=True)

# 🛠️ Function to move extracted text files into structured folders
def move_files_structured(base_dir, destination_map):
    """
    Moves extracted text files from the base directory's subfolders
    into the structured folders in Google Drive.

    For every category/split pair the ``.txt`` files found in
    ``<base_dir>/<category>/<split>/Processed_Text`` are moved to the folder
    registered in ``destination_map`` under ``"<Civil|Criminal>_<Train|Validation|Testing>"``.
    Missing source folders are reported and skipped. Files whose key is not
    in ``destination_map`` are reported and left in place. A destination
    folder that does not exist yet is created before the first move.

    Args:
        base_dir: The base directory containing the "Civil..." and "Criminal..." folders.
        destination_map: A dictionary mapping category names to destination folders.
    """
    categories = (
        "Civil Supreme Court and High Court Judgements",
        "Criminal Supreme Court and High Court Judgements",
    )
    # Source split folder name -> split suffix used in the destination_map keys.
    split_suffixes = {"Training": "Train", "Validation": "Validation", "Testing": "Testing"}

    for category in categories:
        case_type = category.split()[0]  # "Civil" or "Criminal"
        for split, suffix in split_suffixes.items():
            source_folder = os.path.join(base_dir, category, split, "Processed_Text")

            # Check if source folder exists
            if not os.path.isdir(source_folder):
                print(f"⚠️ Source folder not found: {source_folder}")
                continue

            # Sorted so the files are moved (and logged) in a reproducible order.
            text_files = sorted(
                name for name in os.listdir(source_folder) if name.endswith(".txt")
            )

            # The destination depends only on category and split, so it is
            # resolved once per folder rather than once per file.
            dest_folder = destination_map.get(f"{case_type}_{suffix}")
            if dest_folder is None:
                for file_name in text_files:
                    print(f"⚠️ Skipping {file_name} (unknown category)")
                continue

            if text_files:
                # Do not rely on the caller having created the folder already.
                os.makedirs(dest_folder, exist_ok=True)

            for file_name in text_files:
                source_path = os.path.join(source_folder, file_name)
                destination_path = os.path.join(dest_folder, file_name)
                shutil.move(source_path, destination_path)
                print(f"✅ Moved: {file_name} → {destination_path}")

# Move the extracted .txt files from each "Processed_Text" folder under base_dir
# into the matching folder of structured_folders.
move_files_structured(base_dir, structured_folders)

# NOTE: this message is printed unconditionally; it does not verify that any
# file was actually moved.
print("✅ All extracted files have been saved in a structured format in Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Moved: Landownership - 2.txt → /content/drive/MyDrive/Extracted Dataset from PDF/Civil/Train/Landownership - 2.txt
✅ Moved: Landownership - 3.txt → /content/drive/MyDrive/Extracted Dataset from PDF/Civil/Train/Landownership - 3.txt
✅ Moved: Landownership - 4.txt → /content/drive/MyDrive/Extracted Dataset from PDF/Civil/Train/Landownership - 4.txt
✅ Moved: Landownership - 5.txt → /content/drive/MyDrive/Extracted Dataset from PDF/Civil/Train/Landownership - 5.txt
✅ Moved: Landownership - 1.txt → /content/drive/MyDrive/Extracted Dataset from PDF/Civil/Train/Landownership - 1.txt
✅ Moved: Landownership - 6.txt → /content/drive/MyDrive/Extracted Dataset from PDF/Civil/Train/Landownership - 6.txt
✅ Moved: Landownership - 7.txt → /content/drive/MyDrive/Extracted Dataset from PDF/Civil/Train/Landownership - 7.txt
✅ Moved: Landownership - 8.txt → /content/drive/MyDr

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Drive folder that holds the extracted text files
extracted_folder = "/content/drive/MyDrive/Extracted Dataset from PDF"

# "<Case>_<Split>" -> extracted_folder/<Case>/<Split>
structured_folders = {
    f"{case_type}_{split_name}": os.path.join(extracted_folder, case_type, split_name)
    for case_type in ("Civil", "Criminal")
    for split_name in ("Train", "Validation", "Testing")
}

# Function to check if files exist in a directory
def check_files_in_directory(directory):
    """Count the entries of ``directory`` whose name ends with ``.txt``.

    Args:
        directory: Folder to inspect (only its direct entries are counted).

    Returns:
        int: Number of matching entries, or 0 (after printing a message)
        when ``directory`` does not exist.
    """
    if os.path.exists(directory):
        return sum(1 for entry in os.listdir(directory) if entry.endswith(".txt"))

    print(f"❌ Directory does not exist: {directory}")
    return 0

# Print the number of .txt files found in every structured folder
print("\n📌 Checking Extracted Files in Google Drive:")
for category, path in structured_folders.items():
    file_count = check_files_in_directory(path)
    print(f"  🔹 {' '.join(category.split('_'))}: {file_count} files found")

print("\n✅ Verification Complete!")


Mounted at /content/drive

📌 Checking Extracted Files in Google Drive:
  🔹 Civil Train: 595 files found
  🔹 Civil Validation: 129 files found
  🔹 Civil Testing: 125 files found
  🔹 Criminal Train: 560 files found
  🔹 Criminal Validation: 124 files found
  🔹 Criminal Testing: 116 files found

✅ Verification Complete!
