In [2]:
from tokenizers import BertWordPieceTokenizer
import pandas as pd
import os

# ----------------- Config -----------------
base_dir = os.path.abspath("")
tokenizer_dir = os.path.join(base_dir, "final_tokenizer")  # Folder containing vocab.txt, tokenizer.json, etc.
lora_dataset_path = os.path.join(base_dir, "datasets/dataset_lora1.jsonl")  # Specific JSONL file

# Tokenizer parameters
max_length = 512  # Maximum sequence length for BERT
stride = 128      # Stride for overlapping chunks

# Load the custom tokenizer
print("ðŸ”‘ Loading custom tokenizer...")
try:
    # Check for required files
    vocab_path = os.path.join(tokenizer_dir, "vocab.txt")
    json_path = os.path.join(tokenizer_dir, "tokenizer.json")
    if not os.path.exists(vocab_path):
        raise FileNotFoundError(f"vocab.txt not found at {vocab_path}")
    if not os.path.exists(json_path):
        print(f"âš  tokenizer.json not found at {json_path}. Loading with vocab.txt only.")
    # Initialize tokenizer with vocab.txt (basic setup)
    tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)
    print("âœ… Loaded tokenizer using vocab.txt")
except FileNotFoundError as e:
    print(f"âš  {e}. Please ensure final_tokenizer contains vocab.txt.")
except Exception as e:
    print(f"âš  Error loading tokenizer: {e}")
    raise

# Load a sample of the LoRA dataset (JSONL format)
try:
    # Read JSONL file
    df = pd.read_json(lora_dataset_path, lines=True)
    print(f"ðŸ“Š Loaded dataset with {len(df)} rows. Columns: {df.columns.tolist()}")
    
    # Update column names based on dataset schema
    text_column = 'input'  # Use 'input' as the text column
    label_column = 'output'  # Use 'output' as the label column
    if text_column not in df.columns or label_column not in df.columns:
        print(f"âš  Columns {text_column} or {label_column} not found. Available columns: {df.columns.tolist()}")
    else:
        # Take first 3 samples
        samples = df[[text_column, label_column]].head(3).to_dict(orient='records')
        
        # Tokenize the 3 samples with chunking
        print("\nðŸš€ Tokenizing 3 samples with chunking...")
        for i, sample in enumerate(samples, 1):
            text = sample[text_column]
            label = sample[label_column]
            print(f"\nSample {i}:")
            print(f"Label: {label}")
            print(f"Original Text (first 100 chars): {text[:100]}...")

            # Chunk the text
            chunks = []
            tokens = tokenizer.encode(text).tokens
            for start in range(0, len(tokens), max_length - stride):
                end = min(start + max_length, len(tokens))
                chunk_tokens = tokens[start:end]
                if chunk_tokens:  # Ensure non-empty chunk
                    # Encode the chunk directly without manual ID conversion
                    encoded = tokenizer.encode(" ".join(chunk_tokens))
                    chunks.append({
                        "token_ids": encoded.ids,
                        "tokens": encoded.tokens,
                        "start_idx": start,
                        "end_idx": end
                    })
            
            # Print chunked results
            print(f"Number of chunks: {len(chunks)}")
            for j, chunk in enumerate(chunks, 1):
                print(f"Chunk {j} (Indices {chunk['start_idx']} to {chunk['end_idx']}):")
                print(f"Token IDs: {chunk['token_ids']}")
                print(f"Tokens: {chunk['tokens'][:10]}...")  # Show first 10 tokens

except FileNotFoundError:
    print(f"âš  File {lora_dataset_path} not found. Please check the path.")
except Exception as e:
    print(f"âš  Error: {e}")

ðŸ”‘ Loading custom tokenizer...
âœ… Loaded tokenizer using vocab.txt
ðŸ“Š Loaded dataset with 18208 rows. Columns: ['instruction', 'input', 'output', 'metadata']

ðŸš€ Tokenizing 3 samples with chunking...

Sample 1:
Label: Rejected
Original Text (first 100 chars): Judgment on 18th April, 2008.Bhaskar J.These two first appeals arise out of two cases under Section ...
Number of chunks: 5
Chunk 1 (Indices 0 to 512):
Token IDs: [2, 2, 602, 121, 7421, 2087, 7, 2102, 9, 11112, 31, 9, 597, 805, 758, 908, 2554, 416, 88, 805, 965, 178, 194, 674, 88, 85, 740, 189, 7, 7022, 108, 1357, 225, 1195, 436, 85, 1910, 2828, 259, 5733, 3241, 7, 2951, 479, 124, 85, 1212, 1011, 1239, 7, 37978, 7, 93, 33, 9, 22, 9, 198, 728, 9, 2262, 88, 4503, 108, 2113, 88, 4503, 7, 2199, 1357, 965, 9, 85, 552, 88, 3199, 2491, 7, 121, 8032, 4020, 539, 22, 178, 194, 15, 88, 85, 740, 189, 93, 1996, 103, 85, 88, 12, 9, 2142, 5560, 88, 740, 93, 695, 93, 85, 3034, 88, 37978, 522, 31, 9, 33, 9, 110, 9, 5204, 5, 13, 6, 7, 37, 9,

In [5]:
from tokenizers import BertWordPieceTokenizer
import pandas as pd
import os
import json

# ----------------- Config -----------------
base_dir = os.path.abspath("")
tokenizer_dir = os.path.join(base_dir, "final_tokenizer")  # Folder containing vocab.txt, tokenizer.json, etc.
lora_dataset_path = os.path.join(base_dir, "datasets/dataset_lora2.jsonl")  # Specific JSONL file
output_dir = os.path.join(base_dir, "tokenized_output2")  # Directory to save tokenized data
os.makedirs(output_dir, exist_ok=True)

# Tokenizer parameters
max_length = 512  # Maximum sequence length for BERT
stride = 128      # Stride for overlapping chunks

# Load the custom tokenizer
print("ðŸ”‘ Loading custom tokenizer...")
try:
    # Check for required files
    vocab_path = os.path.join(tokenizer_dir, "vocab.txt")
    json_path = os.path.join(tokenizer_dir, "tokenizer.json")
    if not os.path.exists(vocab_path):
        raise FileNotFoundError(f"vocab.txt not found at {vocab_path}")
    if not os.path.exists(json_path):
        print(f"âš  tokenizer.json not found at {json_path}. Loading with vocab.txt only.")
    # Initialize tokenizer with vocab.txt (basic setup)
    tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)
    print("âœ… Loaded tokenizer using vocab.txt")
except FileNotFoundError as e:
    print(f"âš  {e}. Please ensure final_tokenizer contains vocab.txt.")
except Exception as e:
    print(f"âš  Error loading tokenizer: {e}")
    raise

# Load the entire LoRA dataset (JSONL format)
try:
    # Read JSONL file
    df = pd.read_json(lora_dataset_path, lines=True)
    total_samples = len(df)
    print(f"ðŸ“Š Loaded dataset with {total_samples} rows. Columns: {df.columns.tolist()}")
    if total_samples != 18208:
        print(f"âš  Expected 18,208 samples, but found {total_samples}. Proceeding anyway.")
    
    # Update column names based on dataset schema
    text_column = 'input'  # Use 'input' as the text column
    label_column = 'output'  # Use 'output' as the label column
    if text_column not in df.columns or label_column not in df.columns:
        print(f"âš  Columns {text_column} or {label_column} not found. Available columns: {df.columns.tolist()}")
    else:
        # Take all samples
        samples = df[[text_column, label_column]].to_dict(orient='records')
        
        # Tokenize the entire dataset with chunking and save to file
        print("\nðŸš€ Tokenizing entire dataset with chunking and saving to file...")
        all_tokenized_data = []
        for i, sample in enumerate(samples, 1):
            text = sample[text_column]
            label = sample[label_column]
            if i % 1000 == 0 or i == total_samples:  # Progress update every 1000 samples and at the end
                print(f"Processing sample {i} of {total_samples}")

            # Chunk the text
            chunks = []
            tokens = tokenizer.encode(text).tokens
            for start in range(0, len(tokens), max_length - stride):
                end = min(start + max_length, len(tokens))
                chunk_tokens = tokens[start:end]
                if chunk_tokens:
                    encoded = tokenizer.encode(" ".join(chunk_tokens))
                    chunks.append({
                        "token_ids": encoded.ids,
                        "tokens": encoded.tokens,
                        "start_idx": start,
                        "end_idx": end
                    })
            
            # Save chunked results without previewing token_ids
            all_tokenized_data.append({
                "sample_index": i,
                "label": label,
                "num_chunks": len(chunks),
                "chunks": chunks
            })

        # Validate and save
        if len(all_tokenized_data) == total_samples:
            print(f"âœ… Processed exactly {total_samples} samples.")
        else:
            print(f"âš  Processed {len(all_tokenized_data)} samples, expected {total_samples}.")
        
        # Save to a JSON file
        output_file = os.path.join(output_dir, "tokenized_full_dataset.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_tokenized_data, f, ensure_ascii=False, indent=2)
        print(f"ðŸ“„ Saved tokenized data for entire dataset to {output_file}")

except FileNotFoundError:
    print(f"âš  File {lora_dataset_path} not found. Please check the path.")
except Exception as e:
    print(f"âš  Error: {e}")

ðŸ”‘ Loading custom tokenizer...
âœ… Loaded tokenizer using vocab.txt
ðŸ“Š Loaded dataset with 3457 rows. Columns: ['instruction', 'input', 'output', 'metadata']
âš  Expected 18,208 samples, but found 3457. Proceeding anyway.

ðŸš€ Tokenizing entire dataset with chunking and saving to file...
Processing sample 1000 of 3457
Processing sample 2000 of 3457
Processing sample 3000 of 3457
Processing sample 3457 of 3457
âœ… Processed exactly 3457 samples.
ðŸ“„ Saved tokenized data for entire dataset to /home/infodna/tokenized_output2/tokenized_full_dataset.json


In [None]:
import torch
import json
import os
from transformers import BertTokenizer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.multiprocessing import Pool, set_start_method
import numpy as np

# ----------------- Config -----------------
base_dir = os.path.abspath("")
tokenizer_dir = os.path.join(base_dir, "final_tokenizer")  # Folder containing vocab.txt, tokenizer.json, etc.
lora_dataset_path = os.path.join(base_dir, "datasets/dataset_single_lora.jsonl")  # Specific JSONL file
output_dir = os.path.join(base_dir, "tokenized_output")  # Directory to save tokenized data
os.makedirs(output_dir, exist_ok=True)

# Set visible GPUs (skip 0, 1, 5; use 2, 3, 4, 6, 7)
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,6,7"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"ðŸ”‘ Device in use: {device}")  # Verification print statement
num_gpus = torch.cuda.device_count()
print(f"ðŸ”‘ Using {num_gpus} GPUs: {torch.cuda.get_device_name(0)} et al.")

# Tokenizer parameters
max_length = 512  # Maximum sequence length for BERT
stride = 128      # Stride for overlapping chunks

# Load the custom tokenizer (using transformers)
print("ðŸ”‘ Loading custom tokenizer...")
try:
    tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
    print("âœ… Loaded tokenizer using transformers")
except Exception as e:
    print(f"âš  Error loading tokenizer: {e}")
    raise

# Custom Dataset for GPU processing
class LoRADataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return sample['input'], sample['output']

# Load and process dataset
if __name__ == '__main__':
    set_start_method('spawn', force=True)

    # Tokenization function for GPU
    def tokenize_chunk(chunk_samples):
        chunk_data = []
        for i, sample in enumerate(chunk_samples):  # Unpack sample dict directly
            text = sample['input']
            label = sample['output']
            # Move tokenization to GPU
            encoded = tokenizer.encode_plus(
                text,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).to(device)
            
            # Chunking logic
            chunks = []
            input_ids = encoded['input_ids'][0]  # Shape: (max_length,)
            for start in range(0, max_length, max_length - stride):
                end = min(start + max_length, max_length)
                chunk_ids = input_ids[start:end]
                if torch.any(chunk_ids != 0):  # Skip padding
                    chunks.append({
                        "token_ids": chunk_ids.cpu().numpy().tolist(),
                        "start_idx": start,
                        "end_idx": end
                    })
            
            chunk_data.append({
                "sample_index": i + 1,
                "label": label,
                "num_chunks": len(chunks),
                "chunks": chunks
            })
        return chunk_data

    try:
        # Read JSONL file
        valid_data = []
        with open(lora_dataset_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                try:
                    obj = json.loads(line.strip())
                    valid_data.append(obj)
                except json.JSONDecodeError as e:
                    print(f"âš  Line {i} is malformed: {e}. Skipping.")
                    continue
        df = pd.DataFrame(valid_data)
        total_samples = len(df)
        print(f"ðŸ“Š Loaded dataset with {total_samples} rows. Columns: {df.columns.tolist()}")
        if total_samples != 18208:
            print(f"âš  Expected 18,208 samples, but found {total_samples}. Proceeding anyway.")
        
        # Validate columns
        text_column = 'input'
        label_column = 'output'
        if text_column not in df.columns or label_column not in df.columns:
            print(f"âš  Columns {text_column} or {label_column} not found. Available columns: {df.columns.tolist()}")
        else:
            samples = df[[text_column, label_column]].to_dict(orient='records')
            
            # Split samples into chunks for each GPU
            def split_into_chunks(samples, num_chunks):
                chunk_size = len(samples) // num_chunks
                chunks = []
                for i in range(num_chunks):
                    start = i * chunk_size
                    end = start + chunk_size if i < num_chunks - 1 else len(samples)
                    chunks.append(samples[start:end])
                return chunks
            
            # Parallel processing across GPUs
            print("\nðŸš€ Tokenizing entire dataset with chunking and saving to file...")
            sample_chunks = split_into_chunks(samples, num_gpus)
            with Pool(processes=num_gpus) as pool:
                results = pool.map(tokenize_chunk, sample_chunks)  # Pass only chunks
            
            # Combine results
            all_tokenized_data = []
            for chunk_results in results:
                all_tokenized_data.extend(chunk_results)
            
            # Validate and save
            if len(all_tokenized_data) == total_samples:
                print(f"âœ… Processed exactly {total_samples} samples.")
            else:
                print(f"âš  Processed {len(all_tokenized_data)} samples, expected {total_samples}.")
            
            # Save to a JSON file
            output_file = os.path.join(output_dir, "tokenized_full_dataset.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_tokenized_data, f, ensure_ascii=False, indent=2)
            print(f"ðŸ“„ Saved tokenized data for entire dataset to {output_file}")

    except FileNotFoundError:
        print(f"âš  File {lora_dataset_path} not found. Please check the path.")
    except Exception as e:
        print(f"âš  Error: {e}")