In [1]:
# This script prepares training data for a legal document classification task.
# For each subdomain folder it pairs every embedding file with its cleaned text
# file and writes the text, embedding path, and label to a train.csv in that folder.

import os
import pandas as pd

# Root folder that contains one directory per subdomain
base_dir = "../data/subdomains/"

# Names of the subdomain directories expected under base_dir
subdomains = [
    "company_law",
    "tax_law",
    "banking_law",
    "securities_law",
    "insolvency_law",
    "contract_law",
    "negotiable_instruments_law",
    "consumer_law",
    "ip_law",
    "arbitration_law",
    "trust_law",
    "electronic_transactions_law",
    "foreign_exchange_law",
]

def prepare_training_data(data_root=None, subdomain_names=None):
    """Write a ``train.csv`` into every subdomain folder.

    For each subdomain, every ``*.embedding.npy`` file in its folder is paired
    with the ``*.cleaned.txt`` file that shares the same stem. One CSV row is
    produced per pair with the columns ``text`` (content of the cleaned text
    file), ``embedding_path`` (path to the ``.npy`` file; the array itself is
    not loaded) and ``label`` (the subdomain name).

    Args:
        data_root: Directory holding the subdomain folders. Defaults to the
            module-level ``base_dir``.
        subdomain_names: Iterable of subdomain folder names to process.
            Defaults to the module-level ``subdomains``.

    Raises:
        FileNotFoundError: If a subdomain folder does not exist.
    """
    root = base_dir if data_root is None else data_root
    names = subdomains if subdomain_names is None else subdomain_names

    embedding_suffix = ".embedding.npy"
    text_suffix = ".cleaned.txt"
    columns = ["text", "embedding_path", "label"]

    for subdomain in names:
        folder = os.path.join(root, subdomain)
        data = []
        # os.listdir returns entries in arbitrary order; sort so that the CSV
        # row order is reproducible across runs and platforms.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(embedding_suffix):
                continue
            # Swap only the trailing suffix; str.replace would also rewrite
            # any earlier occurrence of the suffix inside the file name.
            text_file = fname[: -len(embedding_suffix)] + text_suffix
            text_path = os.path.join(folder, text_file)
            embedding_path = os.path.join(folder, fname)
            try:
                with open(text_path, "r", encoding="utf-8") as f:
                    text = f.read()
            except FileNotFoundError:
                # An embedding without its text cannot form a training row;
                # report it and keep going instead of aborting the whole run.
                print(f"Skipping {fname}: no cleaned text file at {text_path}")
                continue
            data.append({
                "text": text,
                "embedding_path": embedding_path,
                # Label is the subdomain name
                "label": subdomain,
            })
        # Explicit columns keep the header row even when no pairs were found,
        # so the resulting CSV can always be read back.
        df = pd.DataFrame(data, columns=columns)
        csv_path = os.path.join(folder, "train.csv")
        df.to_csv(csv_path, index=False)
        print(f"Training data CSV created for {subdomain}: {csv_path}")

# Run the export: writes one train.csv into each subdomain folder and prints its path.
prepare_training_data()

Training data CSV created for company_law: ../data/subdomains/company_law\train.csv
Training data CSV created for tax_law: ../data/subdomains/tax_law\train.csv
Training data CSV created for banking_law: ../data/subdomains/banking_law\train.csv
Training data CSV created for securities_law: ../data/subdomains/securities_law\train.csv
Training data CSV created for insolvency_law: ../data/subdomains/insolvency_law\train.csv
Training data CSV created for contract_law: ../data/subdomains/contract_law\train.csv
Training data CSV created for negotiable_instruments_law: ../data/subdomains/negotiable_instruments_law\train.csv
Training data CSV created for consumer_law: ../data/subdomains/consumer_law\train.csv
Training data CSV created for ip_law: ../data/subdomains/ip_law\train.csv
Training data CSV created for arbitration_law: ../data/subdomains/arbitration_law\train.csv
Training data CSV created for trust_law: ../data/subdomains/trust_law\train.csv
Training data CSV created for electronic_tra