# Deduplicate and Sort the words based on TF-IDF

In [1]:
from datasets import load_dataset

# Load the "scotus" configuration of the "coastalcph/lex_glue" dataset.
dataset = load_dataset("coastalcph/lex_glue", "scotus")
# Bare expression so the notebook displays the splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
})

In [2]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import OrderedDict


# Learn the vocabulary and IDF weights from the training split ONLY, so the
# validation and test documents do not influence the scores.
train_texts = dataset["train"]["text"]
vectorizer = TfidfVectorizer(lowercase=True).fit(train_texts)

# Lookup table: vocabulary term -> IDF weight
idf_dict = {
    term: weight
    for term, weight in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)
}

def deduplicate_and_sort(text, idf_scores=None):
    """
    Drop repeated tokens from ``text`` and order the survivors by IDF score.

    The text is split on whitespace. Tokens are compared case-insensitively and
    the first spelling encountered is the one kept. The unique tokens are then
    sorted by descending inverse document frequency (IDF). The sort is stable,
    so tokens with equal scores keep their order of first appearance.

    Only the IDF weight is used for ranking; no per-document term frequency is
    computed here. Tokens missing from the mapping score 0 and sort last.

    Args:
        text: Document to process.
        idf_scores: Optional mapping from lowercase token to IDF score. When
            omitted, the module-level ``idf_dict`` is looked up at call time.

    Returns:
        The reordered unique tokens joined by single spaces.

    NOTE(review): tokens are looked up exactly as split (lowercased), so a
    token with attached punctuation such as "court," only gets a score if that
    exact key exists. TfidfVectorizer's default token pattern presumably never
    produces such keys -- confirm whether scoring them 0 is intended.
    """
    if idf_scores is None:
        # Fall back to the table fitted earlier in the notebook.
        idf_scores = idf_dict

    # Dicts preserve insertion order; setdefault keeps the first spelling seen
    # for each lowercase form and ignores later repeats.
    first_spelling = {}
    for token in text.split():
        first_spelling.setdefault(token.lower(), token)

    # Rank the lowercase keys, then map each back to its original spelling.
    ranked_keys = sorted(
        first_spelling,
        key=lambda key: idf_scores.get(key, 0),
        reverse=True,
    )
    return " ".join(first_spelling[key] for key in ranked_keys)

def process_batch(example):
    """
    Run ``deduplicate_and_sort`` over every entry of a batch's "text" column.

    ``example`` is indexed with the key "text" and the value is iterated, one
    document per element. The result is a dict holding the processed documents,
    in the same order, under the same "text" key.
    """
    processed = list(map(deduplicate_and_sort, example["text"]))
    return {"text": processed}

# Run the batch transform over every split (train, validation, test if available)
new_dataset = dataset.map(process_batch, batched=True)

# (Optional) Show the first processed document of each split
print(
    "Sample processed text from train split:",
    new_dataset["train"][0]["text"],
    sep="\n",
)

# Validation and test are only previewed when the split exists.
if "validation" in new_dataset:
    print(
        "\nSample processed text from validation split:",
        new_dataset["validation"][0]["text"],
        sep="\n",
    )

if "test" in new_dataset:
    print(
        "\nSample processed text from test split:",
        new_dataset["test"][0]["text"],
        sep="\n",
    )

Sample processed text from train split:
Cimarron offerers Valvoline servitudes Enid racks arouses 19a mastery CHAMPLIN connects refinery Uncle charts tanks crosses converts maps understands subjection enclosed connect Sam tank crude tariffs indiscriminately departures unitary calculating pricing pipeline deliveries finished vitally inventory Originating continually carriage valuation differential generality disclaimed destination supplementary contemplation classify flows compiled remedied transmission wells gathering aforesaid purview stations Harry marketing premature commodity intrastate pipe lying producers REFINING convey departed fields partly transporting technically petroleum subordinate commodities treats owns exceeds falling transported artificial storage 1906 expenditures offend meaningless Valley copies Rock belonging schedule distributing truck hypothetical petitioned hire conventional practicable terminal Naturally road delegated manufacturing master expand greatest freig

In [3]:
def compute_lengt(new_dataset, split="train"):
    """
    Print the average and maximum document length of one dataset split.

    Length is ``len()`` of each row's "text" value, which for string documents
    is a character count, not a token count.

    Args:
        new_dataset: Mapping from split name to an iterable of rows, where
            each row exposes its document under the "text" key.
        split: Name of the split to measure. Defaults to "train".

    Returns:
        None. The statistics are printed.
    """
    # Read every row exactly once instead of re-indexing it per comparison.
    lengths = [len(row["text"]) for row in new_dataset[split]]

    if not lengths:
        # An empty split would otherwise raise ZeroDivisionError below.
        print(f"The '{split}' split is empty; no length statistics to report")
        return

    average = round(sum(lengths) / len(lengths))
    highest = max(lengths)
    # Double-quoted f-strings: reusing the outer quote character inside the
    # braces is a SyntaxError before Python 3.12.
    print(
        f"The average length of documents in the '{split}' split is {average} characters\n"
        f"The lengthiest document in the split contains {highest} characters"
    )

In [4]:
# Character-length statistics of the training split before deduplication and sorting.
compute_lengt(dataset)

The average length of documents in training dataset is 35723
The lengthiest document in the dataset contains 562772 number of tokens


In [5]:
# Same statistics after deduplication and sorting, for comparison with the cell above.
compute_lengt(new_dataset)

The average length of documents in training dataset is 12137
The lengthiest document in the dataset contains 96425 number of tokens


In [6]:
# Upload the processed dataset to the named Hub repository.
new_dataset.push_to_hub("victorambrose11/scotus_deduplicate_sort")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/victorambrose11/scotus_deduplicate_sort/commit/77d196d8cd6c8115e1d5f0ca3503186733de05b3', commit_message='Upload dataset', commit_description='', oid='77d196d8cd6c8115e1d5f0ca3503186733de05b3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/victorambrose11/scotus_deduplicate_sort', endpoint='https://huggingface.co', repo_type='dataset', repo_id='victorambrose11/scotus_deduplicate_sort'), pr_revision=None, pr_num=None)

# Normalize, Deduplicate and Sort the words based on TF-IDF

In [7]:
# Load "victorambrose11/normalized_scotus" (presumably a normalized variant of
# the SCOTUS data, judging by the repository name -- TODO confirm).
# NOTE: this rebinds `dataset`, so every later cell that reads `dataset` sees
# this corpus instead of the one loaded at the top of the notebook.
dataset = load_dataset("victorambrose11/normalized_scotus")
# Bare expression so the notebook displays the splits and their row counts.
dataset

README.md:   0%|          | 0.00/894 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/95.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/40.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/39.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1400
    })
})

In [8]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import OrderedDict


# Learn the vocabulary and IDF weights from the training split ONLY, so the
# validation and test documents do not influence the scores.
train_texts = dataset["train"]["text"]
vectorizer = TfidfVectorizer(lowercase=True).fit(train_texts)

# Lookup table: vocabulary term -> IDF weight
idf_dict = {
    term: weight
    for term, weight in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)
}

def deduplicate_and_sort(text, idf_scores=None):
    """
    Drop repeated tokens from ``text`` and order the survivors by IDF score.

    The text is split on whitespace. Tokens are compared case-insensitively and
    the first spelling encountered is the one kept. The unique tokens are then
    sorted by descending inverse document frequency (IDF). The sort is stable,
    so tokens with equal scores keep their order of first appearance.

    Only the IDF weight is used for ranking; no per-document term frequency is
    computed here. Tokens missing from the mapping score 0 and sort last.

    Args:
        text: Document to process.
        idf_scores: Optional mapping from lowercase token to IDF score. When
            omitted, the module-level ``idf_dict`` is looked up at call time.

    Returns:
        The reordered unique tokens joined by single spaces.

    NOTE(review): tokens are looked up exactly as split (lowercased), so a
    token with attached punctuation such as "court," only gets a score if that
    exact key exists. TfidfVectorizer's default token pattern presumably never
    produces such keys -- confirm whether scoring them 0 is intended.
    """
    if idf_scores is None:
        # Fall back to the table fitted earlier in the notebook.
        idf_scores = idf_dict

    # Dicts preserve insertion order; setdefault keeps the first spelling seen
    # for each lowercase form and ignores later repeats.
    first_spelling = {}
    for token in text.split():
        first_spelling.setdefault(token.lower(), token)

    # Rank the lowercase keys, then map each back to its original spelling.
    ranked_keys = sorted(
        first_spelling,
        key=lambda key: idf_scores.get(key, 0),
        reverse=True,
    )
    return " ".join(first_spelling[key] for key in ranked_keys)

def process_batch(example):
    """
    Run ``deduplicate_and_sort`` over every entry of a batch's "text" column.

    ``example`` is indexed with the key "text" and the value is iterated, one
    document per element. The result is a dict holding the processed documents,
    in the same order, under the same "text" key.
    """
    processed = list(map(deduplicate_and_sort, example["text"]))
    return {"text": processed}

# Run the batch transform over every split (train, validation, test if available)
new_dataset = dataset.map(process_batch, batched=True)

# (Optional) Show the first processed document of each split
print(
    "Sample processed text from train split:",
    new_dataset["train"][0]["text"],
    sep="\n",
)

# Validation and test are only previewed when the split exists.
if "validation" in new_dataset:
    print(
        "\nSample processed text from validation split:",
        new_dataset["validation"][0]["text"],
        sep="\n",
    )

if "test" in new_dataset:
    print(
        "\nSample processed text from test split:",
        new_dataset["test"][0]["text"],
        sep="\n",
    )

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Sample processed text from train split:
Cimarron offerers Valvoline servitudes Enid racks arouses 19a mastery CHAMPLIN connects refinery Uncle charts tanks crosses converts maps understands subjection enclosed connect Sam tank crude tariffs indiscriminately departures unitary calculating pricing pipeline deliveries finished vitally inventory Originating continually carriage valuation differential generality disclaimed destination supplementary contemplation classify flows compiled remedied transmission wells gathering aforesaid purview stations Harry marketing premature commodity intrastate pipe lying producers REFINING convey departed fields partly transporting technically petroleum subordinate commodities treats owns exceeds falling transported artificial storage 1906 expenditures offend meaningless Valley copies Rock belonging schedule distributing truck hypothetical petitioned hire conventional practicable terminal Naturally road delegated manufacturing master expand greatest freig

In [9]:
# Character-length statistics of the training split of the dataset loaded in this section, before processing.
compute_lengt(dataset)

The average length of documents in training dataset is 37956
The lengthiest document in the dataset contains 584365 number of tokens


In [10]:
# Same statistics after deduplication and sorting, for comparison with the cell above.
compute_lengt(new_dataset)

The average length of documents in training dataset is 12241
The lengthiest document in the dataset contains 96529 number of tokens


In [11]:
# Upload the processed dataset to the named Hub repository.
new_dataset.push_to_hub("victorambrose11/scotus_normalize_deduplicate_sort")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/victorambrose11/scotus_normalize_deduplicate_sort/commit/83cabf8713b99e3d39dd94ab72bf6b0d74ecabc3', commit_message='Upload dataset', commit_description='', oid='83cabf8713b99e3d39dd94ab72bf6b0d74ecabc3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/victorambrose11/scotus_normalize_deduplicate_sort', endpoint='https://huggingface.co', repo_type='dataset', repo_id='victorambrose11/scotus_normalize_deduplicate_sort'), pr_revision=None, pr_num=None)