In [5]:
import os
import glob
import pandas as pd
from tqdm import tqdm
from tokenizers import BertWordPieceTokenizer

# ----------------- Config -----------------
base_dir = os.path.abspath("")

# Existing tokenizer (already fine-tuned on Predex + High Court)
base_tokenizer_dir = os.path.join(base_dir, "custom_tokenizer")

# Stage 2 tokenizer output
output_dir = os.path.join(base_dir, "custom_tokenizer_stage2")
os.makedirs(output_dir, exist_ok=True)

# New datasets (CSV format)
court_datasets = {
    "DistrictCourt": os.path.join(base_dir, "datasets", "district_only_clean1"),
    "TribunalCourt": os.path.join(base_dir, "datasets", "Tribunal_single_2020_2024"),
    "DailyOrders": os.path.join(base_dir, "datasets", "Dailyorder_single_2020_2024"),
}

# ----------------- Function to collect texts -----------------
def collect_texts(directory: str):
    texts = []
    csv_files = glob.glob(os.path.join(directory, "*.csv"))
    for file in tqdm(csv_files, desc=f"CSV -> {os.path.basename(directory)}"):
        try:
            df = pd.read_csv(file, low_memory=False)
            string_data = df.select_dtypes(include=["object"]).values.flatten().tolist()
            string_data = [str(t).strip() for t in string_data if pd.notna(t) and str(t).strip()]
            texts.extend(string_data)
        except Exception as e:
            print(f"‚ö† Error reading {file}: {e}")
    return texts

# ----------------- Collect new texts -----------------
all_texts = []
for court_name, path in court_datasets.items():
    print(f"\nüìÇ Collecting texts for {court_name} ...")
    court_texts = collect_texts(path)
    print(f"‚úÖ {court_name}: {len(court_texts)} texts collected")
    all_texts.extend(court_texts)

print(f"\nüéØ Total new texts collected: {len(all_texts)}")

# Save to training corpus
training_corpus_file = os.path.join(output_dir, "training_texts_stage2.txt")
with open(training_corpus_file, "w", encoding="utf-8") as f:
    for line in all_texts:
        f.write(line + "\n")

print(f"üìÑ Training corpus saved at {training_corpus_file}")

# ----------------- Load existing tokenizer -----------------
print("\nüîë Loading base tokenizer...")
tokenizer = BertWordPieceTokenizer(
    os.path.join(base_tokenizer_dir, "vocab.txt"),
    lowercase=True,
)

# ----------------- Train tokenizer with new data -----------------
VOCAB_SIZE = 48000   # between 45k‚Äì50k
print(f"üöÄ Training tokenizer with vocab size = {VOCAB_SIZE}")

tokenizer.train(
    files=[training_corpus_file],
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

# ----------------- Save new tokenizer -----------------
tokenizer.save_model(output_dir)
print(f"‚úÖ Tokenizer saved at {output_dir}")

# ----------------- Validation sample -----------------
sample_text = "This is a sample Indian legal judgment text for tokenizer validation."
encoded = tokenizer.encode(sample_text)
print("\nüîç Validation sample:")
print("Input:", sample_text)
print("Tokens:", encoded.tokens)



üìÇ Collecting texts for DistrictCourt ...


CSV -> district_only_clean1: 0it [00:00, ?it/s]


‚úÖ DistrictCourt: 0 texts collected

üìÇ Collecting texts for TribunalCourt ...


CSV -> Tribunal_single_2020_2024: 0it [00:00, ?it/s]


‚úÖ TribunalCourt: 0 texts collected

üìÇ Collecting texts for DailyOrders ...


CSV -> Dailyorder_single_2020_2024: 0it [00:00, ?it/s]

‚úÖ DailyOrders: 0 texts collected

üéØ Total new texts collected: 0
üìÑ Training corpus saved at /home/infodna/custom_tokenizer_stage2/training_texts_stage2.txt

üîë Loading base tokenizer...
üöÄ Training tokenizer with vocab size = 48000



‚úÖ Tokenizer saved at /home/infodna/custom_tokenizer_stage2

üîç Validation sample:
Input: This is a sample Indian legal judgment text for tokenizer validation.
Tokens: ['[CLS]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[SEP]']





In [10]:
from tokenizers import BertWordPieceTokenizer
import pandas as pd
import os

# ----------------- Config -----------------
base_dir = os.path.abspath("")
tokenizer_dir = os.path.join(base_dir, "final_tokenizer")  # Folder containing vocab.txt, tokenizer.json, etc.
lora_dataset_path = os.path.join(base_dir, "datasets/dataset_lora1.jsonl")  # JSONL dataset path

max_length = 512
stride = 128

# Load custom tokenizer
print("üîë Loading custom tokenizer...")
vocab_path = os.path.join(tokenizer_dir, "vocab.txt")
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)
print("‚úÖ Loaded tokenizer")

# Load a sample of LoRA dataset
df = pd.read_json(lora_dataset_path, lines=True)
print(f"üìä Dataset has {len(df)} rows. Columns: {df.columns.tolist()}")

text_column = 'input'
label_column = 'output'

# Take first 3 samples
samples = df[[text_column, label_column]].head(3).to_dict(orient='records')

print("\nüöÄ Verifying tokenization on 3 samples...")
for i, sample in enumerate(samples, 1):
    text = sample[text_column]
    label = sample[label_column]

    print(f"\n=== Sample {i} ===")
    print(f"Original Text (first 120 chars): {text[:120]}...")
    encoded = tokenizer.encode(text)

    # IDs and tokens
    ids = encoded.ids
    tokens = encoded.tokens

    print("Token IDs (first 20):", ids[:20])
    print("Tokens (first 20):", tokens[:20])

    # Decode back
    decoded = tokenizer.decode(ids)
    print("Decoded back (first 120 chars):", decoded[:120], "...")

    # UNK check
    unk_id = tokenizer.token_to_id("[UNK]")
    unk_count = sum(1 for _id in ids if _id == unk_id)
    print(f"UNK tokens: {unk_count}")


üîë Loading custom tokenizer...
‚úÖ Loaded tokenizer
üìä Dataset has 18208 rows. Columns: ['instruction', 'input', 'output', 'metadata']

üöÄ Verifying tokenization on 3 samples...

=== Sample 1 ===
Original Text (first 120 chars): Judgment on 18th April, 2008.Bhaskar J.These two first appeals arise out of two cases under Section 18 of the Land Act, ...
Token IDs (first 20): [2, 602, 121, 7421, 2087, 7, 2102, 9, 11112, 31, 9, 597, 805, 758, 908, 2554, 416, 88, 805, 965]
Tokens (first 20): ['[CLS]', 'judgment', 'on', '18th', 'april', ',', '2008', '.', 'bhaskar', 'j', '.', 'these', 'two', 'first', 'appeals', 'arise', 'out', 'of', 'two', 'cases']
Decoded back (first 120 chars): judgment on 18th april, 2008. bhaskar j. these two first appeals arise out of two cases under section 18 of the land act ...
UNK tokens: 0

=== Sample 2 ===
Original Text (first 120 chars): Beaumont, C.J.This is an appeal from a decision of J. sitting in bankruptcy.The facts as found by the learned Judge, whi...