In [16]:
import os
import glob
import pandas as pd
import torch
from transformers import AutoTokenizer
from tokenizers import BertWordPieceTokenizer, pre_tokenizers, processors

# Directory setup
base_dir = os.path.abspath("")
dataset_dir = os.path.join(base_dir, "datasets", "Predex_Dataset")
output_dir = os.path.join(base_dir, "new_legal_bert_tokenizer")
vocab_file = os.path.join(output_dir, "vocab.txt")

print(f"Current working directory: {base_dir}")
print(f"Dataset directory: {dataset_dir}")
print(f"Tokenizer output directory: {output_dir}")
print(f"Final vocab file will be: {vocab_file}")

# Collect texts from PredEx dataset for validation
def collect_texts(directory: str):
    files = glob.glob(os.path.join(directory, "*.csv"))
    texts = []

    if not files:
        raise FileNotFoundError(f"No CSV files found in {directory}")

    for file in files:
        print(f"Loading file: {file}")
        df = pd.read_csv(file, low_memory=False)
        string_data = df.select_dtypes(include=["object"]).values.flatten().tolist()
        string_data = [str(t).strip() for t in string_data if pd.notna(t) and str(t).strip()]
        texts.extend(string_data)
        print(f"Extracted {len(string_data)} text entries from {os.path.basename(file)}")

    print(f"‚úÖ Total text entries collected: {len(texts)}")
    return texts

# Check available GPUs
excluded_gpus = [0, 6]
available_gpus = [i for i in range(torch.cuda.device_count()) if i not in excluded_gpus]
print(f"Available GPUs for training: {available_gpus}")

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Check for existing vocab.txt
if not os.path.exists(vocab_file):
    print(f"No vocab.txt found at {vocab_file}, loading Legal-BERT base vocab...")
    original_tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
    vocab_files = original_tokenizer.save_vocabulary(output_dir)
    print(f"Saved original Legal-BERT vocab to: {vocab_files[0]}")
else:
    print(f"Found existing vocab.txt at {vocab_file}, using it...")

# Initialize BertWordPieceTokenizer with existing vocab
tokenizer = BertWordPieceTokenizer(
    vocab_file,
    lowercase=True,
    strip_accents=False,
    clean_text=True
)

# Configure tokenizer
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ]
)

# Save complete tokenizer
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))
print(f"‚úÖ Saved tokenizer configuration to {os.path.join(output_dir, 'tokenizer.json')}")

# Save Hugging Face-compatible tokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
hf_tokenizer.vocab_files = {"vocab_file": vocab_file}
hf_tokenizer.save_pretrained(output_dir)
print(f"‚úÖ Saved Hugging Face tokenizer configuration to {output_dir}")
print(f"‚úÖ Vocabulary saved to {vocab_file}")

# Collect texts for validation
texts = collect_texts(dataset_dir)

# Validate tokenizer
print("üîç Validating tokenizer...")
sample_text = texts[0] if texts else "Sample Indian legal judgment text for testing."
hf_tokenizer = AutoTokenizer.from_pretrained(output_dir)
encoding = hf_tokenizer(
    sample_text,
    max_length=512,
    stride=50,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    return_overflowing_tokens=True
)
device = "cuda" if torch.cuda.is_available() and available_gpus else "cpu"
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)
print(f"Sample text tokenized: {len(encoding['input_ids'])} chunks")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention mask shape: {attention_mask.shape}")

print("üéâ Tokenizer saved and validated successfully!")

Current working directory: /home/infodna
Dataset directory: /home/infodna/datasets/Predex_Dataset
Tokenizer output directory: /home/infodna/new_legal_bert_tokenizer
Final vocab file will be: /home/infodna/new_legal_bert_tokenizer/vocab.txt
Available GPUs for training: [1, 2, 3, 4, 5, 7]
Found existing vocab.txt at /home/infodna/new_legal_bert_tokenizer/vocab.txt, using it...
‚úÖ Saved tokenizer configuration to /home/infodna/new_legal_bert_tokenizer/tokenizer.json
‚úÖ Saved Hugging Face tokenizer configuration to /home/infodna/new_legal_bert_tokenizer
‚úÖ Vocabulary saved to /home/infodna/new_legal_bert_tokenizer/vocab.txt
Loading file: /home/infodna/datasets/Predex_Dataset/L-NLProc_PredEx_Instruction-Tuning_Prediction_test.csv
Extracted 4868 text entries from L-NLProc_PredEx_Instruction-Tuning_Prediction_test.csv
Loading file: /home/infodna/datasets/Predex_Dataset/L-NLProc_PredEx_Instruction_sets_train.csv
Extracted 32 text entries from L-NLProc_PredEx_Instruction_sets_train.csv
Loadi

In [21]:
import os
import glob
import random
import torch
from transformers import BertTokenizer, AutoTokenizer
from tokenizers import BertWordPieceTokenizer, pre_tokenizers, processors

# Directory setup
base_dir = os.path.abspath("")
previous_output_dir = os.path.join(base_dir, "new_legal_bert_tokenizer")
dataset_dir = os.path.join(base_dir, "datasets", "high_court_dataset", "txt_new")
output_dir = os.path.join(base_dir, "high_court_legal_bert_tokenizer")
previous_vocab_file = os.path.join(previous_output_dir, "vocab.txt")

print(f"Current working directory: {base_dir}")
print(f"Previous tokenizer directory: {previous_output_dir}")
print(f"Dataset directory: {dataset_dir}")
print(f"Tokenizer output directory: {output_dir}")
print(f"Loading previous vocab file: {previous_vocab_file}")

# Collect text files for training
def collect_text_files(directory: str):
    files = glob.glob(os.path.join(directory, "*.txt"))
    if not files:
        raise FileNotFoundError(f"No TXT files found in {directory}")
    print(f"‚úÖ Found {len(files)} TXT files for training")
    return files

# Collect sample texts for validation (load a few to avoid memory issues)
def collect_sample_texts(directory: str, num_samples: int = 5):
    files = glob.glob(os.path.join(directory, "*.txt"))
    if not files:
        raise FileNotFoundError(f"No TXT files found in {directory}")
    random.shuffle(files)
    texts = []
    for file in files[:num_samples]:
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read().strip()
            if text:
                texts.append(text)
    print(f"‚úÖ Collected {len(texts)} sample texts for validation")
    return texts

# Check available GPUs
excluded_gpus = [0, 6]
available_gpus = [i for i in range(torch.cuda.device_count()) if i not in excluded_gpus]
print(f"Available GPUs: {available_gpus}")
device = f"cuda:{available_gpus[0]}" if available_gpus and torch.cuda.is_available() else "cpu"
print(f"Using device for validation: {device}")

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Load previous vocab
if not os.path.exists(previous_vocab_file):
    raise FileNotFoundError(f"No vocab.txt found at {previous_vocab_file}")

# Initialize BertWordPieceTokenizer with previous vocab
tokenizer = BertWordPieceTokenizer(
    previous_vocab_file,
    lowercase=True,
    strip_accents=False,
    clean_text=True
)

# Get current vocab size
current_vocab_size = tokenizer.get_vocab_size()
new_vocab_size = 32000  # Allow adding new tokens; adjust if needed
print(f"Current vocab size: {current_vocab_size}. Training to new size: {new_vocab_size}")

# Configure special tokens and processors
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Collect files and train tokenizer
text_files = collect_text_files(dataset_dir)
tokenizer.train(
    files=text_files,
    vocab_size=new_vocab_size,
    special_tokens=special_tokens,
    min_frequency=2,
    show_progress=True,
    wordpieces_prefix="##",
    limit_alphabet=1000
)
print("‚úÖ Tokenizer training completed")

# Save the trained tokenizer model (includes new vocab.txt)
saved_files = tokenizer.save_model(output_dir)
print(f"‚úÖ Saved trained tokenizer model to: {saved_files[0]}")  # vocab.txt

# Save complete tokenizer configuration
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))
print(f"‚úÖ Saved tokenizer configuration to {os.path.join(output_dir, 'tokenizer.json')}")

# Load and save as Hugging Face BertTokenizer
hf_tokenizer = BertTokenizer.from_pretrained(output_dir)
hf_tokenizer.save_pretrained(output_dir)
print(f"‚úÖ Saved Hugging Face tokenizer to {output_dir}")

# Validate tokenizer with chunking
print("üîç Validating tokenizer...")
samples = collect_sample_texts(dataset_dir)
sample_text = samples[0] if samples else "Sample High Court legal judgment text for testing."
encoding = hf_tokenizer(
    sample_text,
    max_length=512,
    stride=50,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    return_overflowing_tokens=True
)
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)
print(f"Sample text tokenized into {len(encoding['input_ids'])} chunks")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention mask shape: {attention_mask.shape}")

print("üéâ Tokenizer fine-tuned and validated successfully!")

Current working directory: /home/infodna
Previous tokenizer directory: /home/infodna/new_legal_bert_tokenizer
Dataset directory: /home/infodna/datasets/high_court_dataset/txt_new
Tokenizer output directory: /home/infodna/high_court_legal_bert_tokenizer
Loading previous vocab file: /home/infodna/new_legal_bert_tokenizer/vocab.txt
Available GPUs: [1, 2, 3, 4, 5, 7]
Using device for validation: cuda:1
Current vocab size: 30522. Training to new size: 32000
‚úÖ Found 48590 TXT files for training



‚úÖ Tokenizer training completed
‚úÖ Saved trained tokenizer model to: /home/infodna/high_court_legal_bert_tokenizer/vocab.txt
‚úÖ Saved tokenizer configuration to /home/infodna/high_court_legal_bert_tokenizer/tokenizer.json
‚úÖ Saved Hugging Face tokenizer to /home/infodna/high_court_legal_bert_tokenizer
üîç Validating tokenizer...
‚úÖ Collected 5 sample texts for validation
Sample text tokenized into 1 chunks
Input IDs shape: torch.Size([1, 512])
Attention mask shape: torch.Size([1, 512])
üéâ

In [23]:
import os
import glob
import random
import re
import torch
from transformers import BertTokenizer, AutoTokenizer
from tokenizers import BertWordPieceTokenizer, pre_tokenizers, processors

# Directory setup
base_dir = os.path.abspath("")
previous_output_dir = os.path.join(base_dir, "new_legal_bert_tokenizer")
dataset_dir = os.path.join(base_dir, "datasets", "high_court_dataset", "txt_new")
output_dir = os.path.join(base_dir, "high_court_legal_bert_tokenizer")
previous_vocab_file = os.path.join(previous_output_dir, "vocab.txt")

print(f"Current working directory: {base_dir}")
print(f"Previous tokenizer directory: {previous_output_dir}")
print(f"Dataset directory: {dataset_dir}")
print(f"Tokenizer output directory: {output_dir}")
print(f"Loading previous vocab file: {previous_vocab_file}")

# Function to filter strictly English-only text
def filter_english_text(text: str) -> str:
    # Keep only ASCII letters, digits, and minimal punctuation
    pattern = r'[^a-zA-Z0-9\s.,!?]'  # Exclude all non-English characters
    cleaned_text = re.sub(pattern, '', text)
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text if cleaned_text.strip() else None

# Collect text files and filter for English-only content
def collect_text_files(directory: str):
    files = glob.glob(os.path.join(directory, "*.txt"))
    if not files:
        raise FileNotFoundError(f"No TXT files found in {directory}")
    
    english_texts = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read().strip()
            cleaned_text = filter_english_text(text)
            if cleaned_text:
                temp_file = os.path.join(output_dir, f"temp_{os.path.basename(file)}")
                with open(temp_file, 'w', encoding='utf-8') as temp_f:
                    temp_f.write(cleaned_text)
                english_texts.append(temp_file)
    
    print(f"‚úÖ Found {len(files)} TXT files, filtered to {len(english_texts)} English-only files")
    return english_texts

# Collect sample texts for validation
def collect_sample_texts(directory: str, num_samples: int = 5):
    files = glob.glob(os.path.join(directory, "*.txt"))
    if not files:
        raise FileNotFoundError(f"No TXT files found in {directory}")
    random.shuffle(files)
    texts = []
    for file in files[:num_samples]:
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read().strip()
            cleaned_text = filter_english_text(text)
            if cleaned_text:
                texts.append(cleaned_text)
    print(f"‚úÖ Collected {len(texts)} English-only sample texts for validation")
    return texts

# Check available GPUs
excluded_gpus = [0, 6]
available_gpus = [i for i in range(torch.cuda.device_count()) if i not in excluded_gpus]
print(f"Available GPUs: {available_gpus}")
device = f"cuda:{available_gpus[0]}" if available_gpus and torch.cuda.is_available() else "cpu"
print(f"Using device for validation: {device}")

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Load previous vocab
if not os.path.exists(previous_vocab_file):
    raise FileNotFoundError(f"No vocab.txt found at {previous_vocab_file}")

# Initialize BertWordPieceTokenizer with previous vocab
tokenizer = BertWordPieceTokenizer(
    previous_vocab_file,
    lowercase=True,
    strip_accents=False,
    clean_text=True
)

# Get current vocab size
current_vocab_size = tokenizer.get_vocab_size()
new_vocab_size = current_vocab_size + 500  # Allow limited new English tokens
print(f"Current vocab size: {current_vocab_size}. Training to new size: {new_vocab_size}")

# Configure special tokens and processors
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# Collect and filter files, then train tokenizer
text_files = collect_text_files(dataset_dir)
if not text_files:
    raise ValueError("No English-only text files available for training after filtering")
tokenizer.train(
    files=text_files,
    vocab_size=new_vocab_size,
    special_tokens=special_tokens,
    min_frequency=2,
    show_progress=True,
    wordpieces_prefix="##",
    limit_alphabet=100  # Restrictive to enforce English-only
)
print("‚úÖ Tokenizer training completed")

# Clean up temporary files
for temp_file in text_files:
    os.remove(temp_file)
print("‚úÖ Cleaned up temporary files")

# Save the trained tokenizer model
saved_files = tokenizer.save_model(output_dir)
print(f"‚úÖ Saved trained tokenizer model to: {saved_files[0]}")  # vocab.txt

# Load into Hugging Face BertTokenizer and filter vocabulary
hf_tokenizer = BertTokenizer(vocab_file=os.path.join(output_dir, "vocab.txt"))
vocab = hf_tokenizer.get_vocab()
english_vocab = {k: v for k, v in vocab.items() if re.match(r'^[a-zA-Z0-9\s.,!?]*$', k) or k in special_tokens}
hf_tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})  # Correctly pass as dictionary
hf_tokenizer.add_tokens([k for k in english_vocab.keys() if k not in special_tokens])  # Add English tokens
print(f"‚úÖ Filtered vocab size: {len(hf_tokenizer.get_vocab())} English-only tokens")

# Save the filtered Hugging Face tokenizer
hf_tokenizer.save_pretrained(output_dir)
print(f"‚úÖ Saved Hugging Face tokenizer with English-only vocab to {output_dir}")

# Validate tokenizer with chunking
print("üîç Validating tokenizer...")
samples = collect_sample_texts(dataset_dir)
sample_text = samples[0] if samples else "Sample High Court legal judgment text for testing."
encoding = hf_tokenizer(
    sample_text,
    max_length=512,
    stride=50,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    return_overflowing_tokens=True
)
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)
print(f"Sample text tokenized into {len(encoding['input_ids'])} chunks")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention mask shape: {attention_mask.shape}")

# Print decoded tokens to verify English-only
decoded_tokens = hf_tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
print(f"Sample decoded tokens: {decoded_tokens[:10]}")  # Show first 10 tokens

print("üéâ English-only tokenizer fine-tuned and validated successfully!")

Current working directory: /home/infodna
Previous tokenizer directory: /home/infodna/new_legal_bert_tokenizer
Dataset directory: /home/infodna/datasets/high_court_dataset/txt_new
Tokenizer output directory: /home/infodna/high_court_legal_bert_tokenizer
Loading previous vocab file: /home/infodna/new_legal_bert_tokenizer/vocab.txt
Available GPUs: [1, 2, 3, 4, 5, 7]
Using device for validation: cuda:1
Current vocab size: 30522. Training to new size: 31022
‚úÖ Found 48590 TXT files, filtered to 48590 English-only files



‚úÖ Tokenizer training completed
‚úÖ Cleaned up temporary files
‚úÖ Saved trained tokenizer model to: /home/infodna/high_court_legal_bert_tokenizer/vocab.txt


AttributeError: 'list' object has no attribute 'items'