In [None]:
# This script extracts text from PDF files in a specified directory and saves the text to new files.
# It uses the pdfplumber library to read the PDFs and extract text from each page.

import pdfplumber, os

pdf_dir = '../data/raw_pdfs/'
out_dir = '../data/extracted_texts/'
os.makedirs(out_dir, exist_ok=True)

# Extract the text of every PDF in pdf_dir and write it to out_dir as
# '<original name without .pdf>.txt'.
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for fname in sorted(os.listdir(pdf_dir)):
    # Case-insensitive match so files such as 'REPORT.PDF' are not silently skipped.
    if not fname.lower().endswith('.pdf'):
        continue

    with pdfplumber.open(os.path.join(pdf_dir, fname)) as pdf:
        # Pages for which extract_text() yields nothing (falsy result) are skipped.
        page_texts = [
            page_text
            for page_text in (page.extract_text() for page in pdf.pages)
            if page_text
        ]

    # Each kept page is terminated by a newline; join once instead of
    # growing a string with += inside the loop.
    text = ''.join(page_text + '\n' for page_text in page_texts)

    # Strip only the trailing extension. str.replace('.pdf', '.txt') would also
    # rewrite any '.pdf' appearing earlier in the file name.
    out_path = os.path.join(out_dir, fname[:-len('.pdf')] + '.txt')
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(text)

In [None]:
# Preprocessing the extracted text
# This part cleans up the text by removing extra whitespace and newlines.
import re

# Compiled once; matches any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_text(text: str) -> str:
    """Normalize whitespace and case of extracted text.

    Every run of whitespace -- including newlines -- is collapsed to a single
    space, leading/trailing whitespace is removed, and the result is
    lower-cased. Note that line structure is therefore NOT preserved: the
    output is a single line.

    Args:
        text: Raw text to normalize.

    Returns:
        The normalized, lower-cased, single-line text ('' for blank input).
    """
    # A separate r'\n+' -> '\n' pass is unnecessary: r'\s+' already matches
    # newlines, so the result is the same with this single substitution.
    return _WHITESPACE_RUN.sub(' ', text).strip().lower()

# Example usage
# Clean every raw extraction in out_dir and write it next to the source as
# '<name>.cleaned.txt'.
for fname in sorted(os.listdir(out_dir)):
    # Only raw '.txt' extractions are inputs. Outputs of a previous run
    # ('*.cleaned.txt') live in the same directory and must be skipped,
    # otherwise each re-run would produce '*.cleaned.cleaned.txt', and so on.
    # Files without a '.txt' suffix are skipped as well: with the old
    # str.replace() naming their output path equalled the input path, so the
    # source file was overwritten.
    # (A raw file whose own name ends in '.cleaned.txt' is consequently skipped.)
    if not fname.endswith('.txt') or fname.endswith('.cleaned.txt'):
        continue

    with open(os.path.join(out_dir, fname), 'r', encoding='utf-8') as f:
        raw_text = f.read()
    cleaned = clean_text(raw_text)

    # Swap only the trailing '.txt'; an earlier '.txt' inside the name is kept.
    cleaned_name = fname[:-len('.txt')] + '.cleaned.txt'
    with open(os.path.join(out_dir, cleaned_name), 'w', encoding='utf-8') as f:
        f.write(cleaned)


In [2]:
# This script processes legal documents in various subdomains using spaCy for NLP tasks.
# It tokenizes the text, segments sentences, and performs named entity recognition (NER).

import os
import spacy
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# List of subdomain folders
subdomains = [
    "company_law", "tax_law", "banking_law", "securities_law", "insolvency_law",
    "contract_law", "negotiable_instruments_law", "consumer_law", "ip_law",
    "arbitration_law", "trust_law", "electronic_transactions_law", "foreign_exchange_law"
]

base_dir = "../data/subdomains/"

# Suffix identifying the cleaned input files; outputs reuse the name without it.
cleaned_suffix = ".cleaned.txt"

# For every '*.cleaned.txt' file in each subdomain folder, write three sibling
# files: '<name>.tokens.txt', '<name>.sentences.txt' and '<name>.entities.csv'.
for subdomain in subdomains:
    folder = os.path.join(base_dir, subdomain)

    # A missing folder should not abort the remaining subdomains; report it
    # and move on.
    if not os.path.isdir(folder):
        print(f"Skipped (folder not found): {folder}")
        continue

    # sorted() makes the processing order deterministic (os.listdir order is arbitrary).
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(cleaned_suffix):
            continue

        file_path = os.path.join(folder, fname)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Run NLP pipeline
        # NOTE(review): spaCy rejects texts longer than nlp.max_length; confirm
        # the largest document stays below that limit.
        doc = nlp(text)

        # Tokenization (word tokens, excluding stopwords and punctuation)
        tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]

        # Sentence segmentation
        sentences = [sent.text.strip() for sent in doc.sents]

        # Named Entity Recognition
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Build output paths from the file name with only its trailing suffix
        # removed. str.replace() over the whole path would also rewrite a
        # '.cleaned.txt' occurring in a directory name or mid-filename.
        out_stem = os.path.join(folder, fname[:-len(cleaned_suffix)])

        # Save tokens (space-separated, single line)
        token_out = out_stem + ".tokens.txt"
        with open(token_out, "w", encoding="utf-8") as f:
            f.write(" ".join(tokens))

        # Save sentences (one per line)
        sent_out = out_stem + ".sentences.txt"
        with open(sent_out, "w", encoding="utf-8") as f:
            f.write("\n".join(sentences))

        # Save entities as CSV with columns: entity, label
        ent_out = out_stem + ".entities.csv"
        df = pd.DataFrame(entities, columns=["entity", "label"])
        df.to_csv(ent_out, index=False)

        print(f"Processed: {file_path}")


Processed: ../data/subdomains/company_law\01. Companies Act No. 7 of 2007.cleaned.txt
Processed: ../data/subdomains/tax_law\02. Inland Revenue Act_No_24_2017_E.cleaned.txt
Processed: ../data/subdomains/tax_law\03. Inland Revenue (Amendment) Act No. 2 of 2025.cleaned.txt
Processed: ../data/subdomains/banking_law\04. Banking Act 30_1988.cleaned.txt
Processed: ../data/subdomains/banking_law\05. Banking_Amendment_Act_No_24_of_2024_e.cleaned.txt
Processed: ../data/subdomains/banking_law\06. Banking (Special Provisions) Act, No. 17 of 2023.cleaned.txt
Processed: ../data/subdomains/securities_law\07. Securities and Exchange Commission of Sri Lanka.cleaned.txt
Processed: ../data/subdomains/insolvency_law\08. INSOLVENTS [Cap.103 - Lanka Law.cleaned.txt
Processed: ../data/subdomains/contract_law\09. Sale_of_Goods_Ordinance_No_11_of_1896_of.cleaned.txt
Processed: ../data/subdomains/negotiable_instruments_law\10. Bills of Exchanger Ordinance.cleaned.txt
Processed: ../data/subdomains/negotiable_ins