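#    Preprocesses data for TFIDF-EMB-LegalBERT:
1. Tokenizes documents (note: the code below then keeps only the first occurrence of each token and sorts the tokens by descending score, so the original word order is not maintained)
2. Calculates TF-IDF scores for all tokens (note: only the IDF component, fitted on the training split, is used; term frequency is ignored)
3. Bucketizes the scores into discrete bins
4. Adds a bucket ID for each token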

In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer
from datasets import DatasetDict
import pandas as pd

def tfidf_score_to_bucket(score, num_buckets=32, min_val=0.0, max_val=10.0):
    """Map a continuous score onto an integer bucket index.

    Scores below ``min_val`` land in bucket 0 and scores at or above
    ``max_val`` land in the last bucket (``num_buckets - 1``). Any other
    score is rescaled linearly by ``(score - min_val) / (max_val - min_val)``,
    multiplied by ``num_buckets - 1`` and truncated to an int.
    """
    top_bucket = num_buckets - 1

    # The lower-bound check deliberately runs first, as in the range checks
    # callers rely on; the two guards also make the division below safe.
    if score < min_val:
        return 0
    if score >= max_val:
        return top_bucket

    span = max_val - min_val
    fraction = (score - min_val) / span
    return int(fraction * top_bucket)

def preprocess_tfidf_srt_emb(
    dataset: DatasetDict,
    tokenizer_name="nlpaueb/legal-bert-base-uncased",
    max_length=512,
    num_buckets=32
):
    """Encode every split as IDF-sorted token IDs plus per-token score buckets.

    An IDF table is fitted on the tokenized ``train`` split. Each document in
    every split is then tokenized, reduced to its distinct tokens (first
    occurrence wins), ordered by descending IDF, truncated so that CLS and SEP
    fit within ``max_length``, and padded to exactly ``max_length`` entries.
    Term frequency is not used: a token's score is its IDF alone, and tokens
    absent from the fitted vocabulary score 0.0.

    Args:
        dataset: Splits to process. Must contain a ``train`` split and a
            ``text`` column; a ``label`` column is copied through when
            present, otherwise every label is ``None``.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        max_length: Fixed length of every output sequence, including the
            CLS and SEP tokens.
        num_buckets: Number of buckets handed to ``tfidf_score_to_bucket``.

    Returns:
        A dict mapping each split name to a ``pandas.DataFrame`` with the
        columns ``input_ids``, ``attention_mask``, ``tfidf_bucket_ids`` and
        ``label``.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """
    # A negative slice bound / negative padding length would silently produce
    # rows of the wrong length, so reject the configuration up front.
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to leave room for the CLS and SEP tokens")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Step 1: Fit TF-IDF on training set (tokenized)
    print("Tokenizing training set for TF-IDF fitting...")
    tokenized_train = [" ".join(tokenizer.tokenize(text)) for text in dataset['train']['text']]
    # lowercase=False keeps the vocabulary keys identical to the tokenizer's
    # tokens. The default lowercasing would store "[UNK]" as "[unk]" (and fold
    # case for cased tokenizers), so those tokens would miss the lookup below
    # and be scored 0.0.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\S+', lowercase=False)
    tfidf_vectorizer.fit(tokenized_train)

    idf_dict = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

    # Loop-invariant values.
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id
    max_content_tokens = max_length - 2  # room left once CLS and SEP are added

    processed_data = {}
    for split, split_data in dataset.items():
        print(f"Processing split: {split}")
        input_ids_list = []
        attention_mask_list = []
        tfidf_buckets_list = []
        labels = split_data['label'] if 'label' in split_data.features else [None] * len(split_data)

        for text in split_data['text']:
            # Deduplicate (keep first occurrence); dicts preserve insertion order.
            unique_tokens = list(dict.fromkeys(tokenizer.tokenize(text)))

            # Score tokens by IDF only (TF not used)
            token_scores = {t: idf_dict.get(t, 0.0) for t in unique_tokens}
            # sorted() is stable, so equally scored tokens keep document order.
            sorted_tokens = sorted(unique_tokens, key=token_scores.__getitem__, reverse=True)

            # Truncate to fit CLS/SEP
            sorted_tokens = sorted_tokens[:max_content_tokens]
            tokens_final = [cls_token] + sorted_tokens + [sep_token]

            # Convert to IDs
            input_ids = tokenizer.convert_tokens_to_ids(tokens_final)
            attention_mask = [1] * len(input_ids)

            # Compute bucket IDs; CLS/SEP bucket=0
            bucket_ids = (
                [0]
                + [tfidf_score_to_bucket(token_scores[t], num_buckets) for t in sorted_tokens]
                + [0]
            )

            # Pad if needed (padding positions also get bucket 0 and mask 0)
            padding_length = max_length - len(input_ids)
            input_ids += [pad_token_id] * padding_length
            attention_mask += [0] * padding_length
            bucket_ids += [0] * padding_length

            input_ids_list.append(input_ids)
            attention_mask_list.append(attention_mask)
            tfidf_buckets_list.append(bucket_ids)

        processed_data[split] = pd.DataFrame({
            "input_ids": input_ids_list,
            "attention_mask": attention_mask_list,
            "tfidf_bucket_ids": tfidf_buckets_list,
            "label": labels
        })

    return processed_data

In [14]:
from datasets import load_dataset

# Load the "scotus" configuration of coastalcph/lex_glue and preprocess all of
# its splits with the default tokenizer, sequence length and bucket count.
dataset = load_dataset(path="coastalcph/lex_glue", name="scotus")
TF_IDF_EMB = preprocess_tfidf_srt_emb(dataset=dataset)

Token indices sequence length is longer than the specified maximum sequence length for this model (4330 > 512). Running this sequence through the model will result in indexing errors


Tokenizing training set for TF-IDF fitting...
Processing split: train
Processing split: test
Processing split: validation


In [15]:
from datasets import DatasetDict, Dataset

# TF_IDF_EMB maps split names to pandas DataFrames at this point. Wrap each
# DataFrame in a `Dataset` so the result can be handled as a DatasetDict;
# values that are already `Dataset` objects are passed through unchanged.
if isinstance(TF_IDF_EMB, dict):
    TF_IDF_EMB = DatasetDict({
        split_name: split_data if isinstance(split_data, Dataset) else Dataset.from_pandas(split_data)
        for split_name, split_data in TF_IDF_EMB.items()
    })

In [16]:
# Upload every split of the converted DatasetDict to the Hub repository below.
TF_IDF_EMB.push_to_hub(repo_id="victorambrose11/TF_IDF_EMB")


ConnectionError: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: df1771a3-381b-4314-8516-ce8f25d84b67)')