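# Preprocesses data for TFIDF-SRT-LegalBERT:
1. Tokenizes documents
2. Removes duplicate tokens (only the first occurrence of each token is kept)
3. Sorts remaining tokens by TF-IDF scores, in decreasing order
4. Adds the [CLS]/[SEP] tokens, then truncates or pads to 512 tokens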

In [1]:
from datasets import load_dataset
# Load the "scotus" configuration of the coastalcph/lex_glue dataset (all splits).
original_dataset = load_dataset(path="coastalcph/lex_glue", name="scotus")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from transformers import AutoTokenizer
import pandas as pd

def _find_label_column(split_data):
    """Return the name of the label column of one split, or None when there is none.

    'labels' is preferred over 'label' when both exist. Splits exposing
    `column_names` (as Hugging Face `Dataset` objects do) are inspected through it;
    otherwise the split is treated as a mapping and its `keys()` are used.
    """
    columns = getattr(split_data, 'column_names', None)
    if columns is None:
        columns = list(split_data.keys()) if hasattr(split_data, 'keys') else []
    for candidate in ('labels', 'label'):
        if candidate in columns:
            return candidate
    return None


def _encode_tfidf_sorted(tokens, token_idf, tokenizer, max_length):
    """Turn one tokenized document into fixed-length (input_ids, attention_mask) lists.

    Tokens are deduplicated (first occurrence kept), ordered by decreasing TF-IDF,
    wrapped in [CLS]/[SEP] and then truncated or padded to `max_length` positions.
    """
    token_counts = Counter(tokens)
    total_tokens = len(tokens)

    # TF is the in-document relative frequency; tokens unseen at fit time get IDF 1.0.
    tfidf_scores = {
        token: (count / total_tokens) * token_idf.get(token, 1.0)
        for token, count in token_counts.items()
    }

    # dict.fromkeys keeps the first occurrence of each token in document order;
    # sorted() is stable, so equal scores stay in that order.
    unique_tokens = list(dict.fromkeys(tokens))
    sorted_tokens = sorted(unique_tokens, key=tfidf_scores.__getitem__, reverse=True)

    input_ids = (
        [tokenizer.cls_token_id]
        + tokenizer.convert_tokens_to_ids(sorted_tokens)
        + [tokenizer.sep_token_id]
    )

    # Computed once per document (not once per token); the default covers empty documents.
    max_score = max(tfidf_scores.values(), default=1.0)
    attention_mask = (
        [1.0]
        + [min(1.0, tfidf_scores[token] / max_score) for token in sorted_tokens]
        + [1.0]
    )

    if len(input_ids) > max_length:
        # Keep the best-scoring tokens and re-close the sequence with [SEP];
        # the [SEP] slot must carry full weight, not the weight of the token it replaces.
        input_ids = input_ids[:max_length - 1] + [tokenizer.sep_token_id]
        attention_mask = attention_mask[:max_length - 1] + [1.0]
    else:
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
        attention_mask = attention_mask + [0.0] * padding_length

    return input_ids, attention_mask


def preprocess_tfidf_srt_legalbert(dataset, tokenizer_name="nlpaueb/legal-bert-base-uncased", max_length=512):
    """
    Preprocesses data for TFIDF-SRT-LegalBERT:
    1. Tokenizes documents
    2. Removes duplicate tokens (first occurrence kept)
    3. Sorts remaining tokens by decreasing TF-IDF score (IDF is fitted on the 'train' split only)
    4. Adds [CLS]/[SEP], then truncates or pads to `max_length` positions

    Args:
        dataset: Mapping of split name -> split (e.g. a HuggingFace DatasetDict). Every
            split needs a 'text' column; a 'labels' or 'label' column is copied when present.
        tokenizer_name: Name of the tokenizer to use
        max_length: Fixed output length, special tokens included (default 512)

    Returns:
        Dict mapping each split name to a pandas DataFrame with 'input_ids',
        'attention_mask' and 'labels' columns ('labels' holds None for splits
        without a label column).

    Raises:
        ValueError: If `max_length` is smaller than 2.
        KeyError: If `dataset` has no 'train' split.

    NOTE(review): 'attention_mask' holds fractional TF-IDF weights in [0, 1] rather than
    a 0/1 mask -- confirm that the downstream model interprets fractional values as intended.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold the [CLS] and [SEP] tokens")
    if 'train' not in dataset.keys():
        raise KeyError("dataset must contain a 'train' split to fit the TF-IDF statistics")

    # Load LegalBERT tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Tokenize every document exactly once; the token lists feed both the IDF fit and the encoding.
    tokenized_texts = {}
    for split in dataset.keys():
        print(f"Tokenizing {split} set...")
        tokenized_texts[split] = [tokenizer.tokenize(text) for text in dataset[split]['text']]

    # Fit IDF on the training split only. lowercase=False keeps vocabulary keys identical to
    # the tokenizer's tokens (e.g. "[UNK]"), so the per-token lookup below cannot miss on case.
    print("Fitting TF-IDF vectorizer on training data...")
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\S+', lowercase=False)
    tfidf_vectorizer.fit([' '.join(tokens) for tokens in tokenized_texts['train']])
    idfs = tfidf_vectorizer.idf_
    token_idf = {token: float(idfs[idx]) for token, idx in tfidf_vectorizer.vocabulary_.items()}

    result_dataset = {}
    for split in dataset.keys():
        print(f"Processing {split} set with TF-IDF sorting and deduplication...")

        split_data = dataset[split]
        documents = tokenized_texts[split]
        label_column = _find_label_column(split_data)
        labels = split_data[label_column] if label_column is not None else [None] * len(documents)

        columns = {'input_ids': [], 'attention_mask': [], 'labels': []}
        for tokens, label in zip(documents, labels):
            input_ids, attention_mask = _encode_tfidf_sorted(tokens, token_idf, tokenizer, max_length)
            columns['input_ids'].append(input_ids)
            columns['attention_mask'].append(attention_mask)
            columns['labels'].append(label)

        # Convert to DataFrame format
        result_dataset[split] = pd.DataFrame(columns)

    return result_dataset

# The preprocessing run itself lives in the next cell, so re-executing this
# definition cell does not repeat the expensive tokenization pass.

In [4]:
# Run the TF-IDF sort/deduplication preprocessing on every split of the loaded dataset.
TF_IDF_SRT = preprocess_tfidf_srt_legalbert(dataset=original_dataset)

Token indices sequence length is longer than the specified maximum sequence length for this model (4330 > 512). Running this sequence through the model will result in indexing errors


Tokenizing train set...
Tokenizing test set...
Tokenizing validation set...
Fitting TF-IDF vectorizer on training data...
Processing train set with TF-IDF sorting and deduplication...


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing test set with TF-IDF sorting and deduplication...
Processing validation set with TF-IDF sorting and deduplication...


In [6]:
# Sanity check: the preprocessing function returns a plain dict keyed by split name.
type(TF_IDF_SRT)

dict

In [17]:
# Copy the original 'label' column of each split into the 'labels' column of the
# corresponding preprocessed frame.
for split_name in ('train', 'test', 'validation'):
    TF_IDF_SRT[split_name]['labels'] = original_dataset[split_name]['label']

In [27]:
# Tally how often each distinct attention mask (compared through its string form)
# occurs among the training examples.
d = {}
for current_mask in map(str, TF_IDF_SRT['train']['attention_mask']):
    d[current_mask] = d.get(current_mask, 0) + 1

In [28]:
# Number of distinct attention masks found in the training split.
len(d)

388

In [29]:
# Preview the first four encoded training documents.
TF_IDF_SRT['train']['input_ids'].head(4)

0    [101, 207, 117, 115, 3710, 13169, 2282, 210, 2...
1    [101, 117, 207, 115, 210, 14733, 12067, 211, 1...
2    [101, 117, 115, 207, 2479, 210, 26432, 4313, 8...
3    [101, 117, 115, 207, 4962, 210, 11955, 211, 48...
Name: input_ids, dtype: object

In [30]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from transformers import AutoTokenizer
import pandas as pd

def _find_label_column(split_data):
    """Return the name of the label column of one split, or None when there is none.

    'labels' is preferred over 'label' when both exist. Splits exposing
    `column_names` (as Hugging Face `Dataset` objects do) are inspected through it;
    otherwise the split is treated as a mapping and its `keys()` are used.
    """
    columns = getattr(split_data, 'column_names', None)
    if columns is None:
        columns = list(split_data.keys()) if hasattr(split_data, 'keys') else []
    for candidate in ('labels', 'label'):
        if candidate in columns:
            return candidate
    return None


def _encode_tfidf_sorted(tokens, token_idf, tokenizer, max_length):
    """Turn one tokenized document into fixed-length (input_ids, attention_mask) lists.

    Tokens are deduplicated (first occurrence kept), ordered by decreasing TF-IDF,
    wrapped in [CLS]/[SEP] and then truncated or padded to `max_length` positions.
    """
    token_counts = Counter(tokens)
    total_tokens = len(tokens)

    # TF is the in-document relative frequency; tokens unseen at fit time get IDF 1.0.
    tfidf_scores = {
        token: (count / total_tokens) * token_idf.get(token, 1.0)
        for token, count in token_counts.items()
    }

    # dict.fromkeys keeps the first occurrence of each token in document order;
    # sorted() is stable, so equal scores stay in that order.
    unique_tokens = list(dict.fromkeys(tokens))
    sorted_tokens = sorted(unique_tokens, key=tfidf_scores.__getitem__, reverse=True)

    input_ids = (
        [tokenizer.cls_token_id]
        + tokenizer.convert_tokens_to_ids(sorted_tokens)
        + [tokenizer.sep_token_id]
    )

    # Computed once per document (not once per token); the default covers empty documents.
    max_score = max(tfidf_scores.values(), default=1.0)
    attention_mask = (
        [1.0]
        + [min(1.0, tfidf_scores[token] / max_score) for token in sorted_tokens]
        + [1.0]
    )

    if len(input_ids) > max_length:
        # Keep the best-scoring tokens and re-close the sequence with [SEP];
        # the [SEP] slot must carry full weight, not the weight of the token it replaces.
        input_ids = input_ids[:max_length - 1] + [tokenizer.sep_token_id]
        attention_mask = attention_mask[:max_length - 1] + [1.0]
    else:
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
        attention_mask = attention_mask + [0.0] * padding_length

    return input_ids, attention_mask


def preprocess_tfidf_srt_legalbert(dataset, tokenizer_name="nlpaueb/legal-bert-base-uncased", max_length=512):
    """
    Preprocesses data for TFIDF-SRT-LegalBERT:
    1. Tokenizes documents
    2. Removes duplicate tokens (first occurrence kept)
    3. Sorts remaining tokens by decreasing TF-IDF score (IDF is fitted on the 'train' split only)
    4. Adds [CLS]/[SEP], then truncates or pads to `max_length` positions

    Args:
        dataset: Mapping of split name -> split (e.g. a HuggingFace DatasetDict). Every
            split needs a 'text' column; a 'labels' or 'label' column is copied when present.
        tokenizer_name: Name of the tokenizer to use
        max_length: Fixed output length, special tokens included (default 512)

    Returns:
        Dict mapping each split name to a pandas DataFrame with 'input_ids',
        'attention_mask' and 'labels' columns ('labels' holds None for splits
        without a label column).

    Raises:
        ValueError: If `max_length` is smaller than 2.
        KeyError: If `dataset` has no 'train' split.

    NOTE(review): 'attention_mask' holds fractional TF-IDF weights in [0, 1] rather than
    a 0/1 mask -- confirm that the downstream model interprets fractional values as intended.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold the [CLS] and [SEP] tokens")
    if 'train' not in dataset.keys():
        raise KeyError("dataset must contain a 'train' split to fit the TF-IDF statistics")

    # Load LegalBERT tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Tokenize every document exactly once; the token lists feed both the IDF fit and the encoding.
    tokenized_texts = {}
    for split in dataset.keys():
        print(f"Tokenizing {split} set...")
        tokenized_texts[split] = [tokenizer.tokenize(text) for text in dataset[split]['text']]

    # Fit IDF on the training split only. lowercase=False keeps vocabulary keys identical to
    # the tokenizer's tokens (e.g. "[UNK]"), so the per-token lookup below cannot miss on case.
    print("Fitting TF-IDF vectorizer on training data...")
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\S+', lowercase=False)
    tfidf_vectorizer.fit([' '.join(tokens) for tokens in tokenized_texts['train']])
    idfs = tfidf_vectorizer.idf_
    token_idf = {token: float(idfs[idx]) for token, idx in tfidf_vectorizer.vocabulary_.items()}

    result_dataset = {}
    for split in dataset.keys():
        print(f"Processing {split} set with TF-IDF sorting and deduplication...")

        split_data = dataset[split]
        documents = tokenized_texts[split]
        label_column = _find_label_column(split_data)
        labels = split_data[label_column] if label_column is not None else [None] * len(documents)

        columns = {'input_ids': [], 'attention_mask': [], 'labels': []}
        for tokens, label in zip(documents, labels):
            input_ids, attention_mask = _encode_tfidf_sorted(tokens, token_idf, tokenizer, max_length)
            columns['input_ids'].append(input_ids)
            columns['attention_mask'].append(attention_mask)
            columns['labels'].append(label)

        # Convert to DataFrame format
        result_dataset[split] = pd.DataFrame(columns)

    return result_dataset

# Build the V2 preprocessed splits from the loaded dataset
# (result: dict of split name -> pandas DataFrame).
TF_IDF_SRT_V2 = preprocess_tfidf_srt_legalbert(dataset=original_dataset)

Token indices sequence length is longer than the specified maximum sequence length for this model (4330 > 512). Running this sequence through the model will result in indexing errors


Tokenizing train set...
Tokenizing test set...
Tokenizing validation set...
Fitting TF-IDF vectorizer on training data...
Processing train set with TF-IDF sorting and deduplication...
Processing test set with TF-IDF sorting and deduplication...
Processing validation set with TF-IDF sorting and deduplication...


In [31]:
from datasets import DatasetDict, Dataset

# TF_IDF_SRT_V2 is a dict of per-split pandas DataFrames at this point; wrap it in a
# DatasetDict, leaving any split that already is a Dataset untouched.
if isinstance(TF_IDF_SRT_V2, dict):
    TF_IDF_SRT_V2 = DatasetDict({
        split_name: frame if isinstance(frame, Dataset) else Dataset.from_pandas(frame)
        for split_name, frame in TF_IDF_SRT_V2.items()
    })


In [32]:
# Publish all preprocessed splits to the Hugging Face Hub dataset repository.
TF_IDF_SRT_V2.push_to_hub(repo_id="victorambrose11/TF_IDF_SRT_V2")


Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 34.50ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:09<00:00,  9.52s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 37.83ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  4.00s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 45.46ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.94s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/victorambrose11/TF_IDF_SRT_V2/commit/2102ee1bf685a1498d5fdbc3535545d99b4b776f', commit_message='Upload dataset', commit_description='', oid='2102ee1bf685a1498d5fdbc3535545d99b4b776f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/victorambrose11/TF_IDF_SRT_V2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='victorambrose11/TF_IDF_SRT_V2'), pr_revision=None, pr_num=None)