# UKPLab/liar

https://huggingface.co/datasets/UKPLab/liar


In [1]:
from datasets import DatasetDict, load_dataset

# Load the LIAR dataset from the Hugging Face Hub; the result is a DatasetDict
# keyed by split name.
dataset = load_dataset("UKPLab/liar")

print(dataset)
# Report the size of every split the dataset actually contains instead of
# hard-coding the split names, so this cell keeps working if the splits change.
for split_name, split in dataset.items():
    print(f'{split_name} size: {len(split)}')

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.
Generating train split: 100%|██████████| 10269/10269 [00:00<00:00, 34775.67 examples/s]
Generating validation split: 100%|██████████| 1284/1284 [00:00<00:00, 184346.08 examples/s]
Generating test split: 100%|██████████| 1283/1283 [00:00<00:00, 195925.58 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label_text', 'labels', 'context'],
        num_rows: 10269
    })
    validation: Dataset({
        features: ['text', 'label_text', 'labels', 'context'],
        num_rows: 1284
    })
    test: Dataset({
        features: ['text', 'label_text', 'labels', 'context'],
        num_rows: 1283
    })
})
train size: 10269
validation size: 1284
test size: 1283





# Add BERT Embeddings

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer and encoder are loaded from the same checkpoint so that the
# vocabulary ids produced by the former match what the latter expects.
bert_tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-uncased",
    clean_up_tokenization_spaces=True,
)
bert_model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
bert_model = bert_model.to(device)

In [3]:
import torch
import numpy as np

def generate_bert_embeddings(text, max_length=512, stride=256):
    """Embed ``text`` as the mean of per-window [CLS] vectors.

    The text is tokenized into word-piece ids *without* special tokens, cut
    into overlapping windows, and each window is wrapped as
    ``[CLS] ... [SEP]`` before being passed through ``bert_model``. The hidden
    state at position 0 (the [CLS] position) of every window is collected and
    the vectors are averaged.

    Wrapping each window individually matters: slicing an already-wrapped
    sequence leaves every window after the first without a leading [CLS], so
    position 0 would hold an arbitrary word piece rather than a [CLS] summary.

    Relies on the module-level ``bert_tokenizer``, ``bert_model`` and
    ``device``.

    Args:
        text: The string to embed.
        max_length: Maximum number of tokens per window, *including* the
            [CLS] and [SEP] tokens. Must be at least 3.
        stride: Step, in content tokens, between the starts of consecutive
            windows. Must be at least 1. A stride larger than
            ``max_length - 2`` leaves gaps between windows.

    Returns:
        A 1-D ``numpy.ndarray`` holding the averaged [CLS] embedding
        (length 768 for the bert-base checkpoint loaded in this notebook).

    Raises:
        ValueError: If ``max_length`` is below 3 or ``stride`` is below 1.
    """
    # Two slots per window are reserved for [CLS] and [SEP].
    window = max_length - 2
    if window < 1:
        raise ValueError("max_length must be at least 3 to fit [CLS], one token and [SEP]")
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    # Tokenize the full text without truncation and without special tokens;
    # the special tokens are added per window below.
    content_ids = bert_tokenizer(text, add_special_tokens=False, truncation=False)['input_ids']
    cls_id = bert_tokenizer.cls_token_id
    sep_id = bert_tokenizer.sep_token_id

    chunks = []
    # max(..., 1) guarantees one "[CLS] [SEP]" window even for empty text.
    for start in range(0, max(len(content_ids), 1), stride):
        chunks.append([cls_id] + content_ids[start:start + window] + [sep_id])

        # This window already reaches the end of the text; further windows
        # would only repeat its tail.
        if start + window >= len(content_ids):
            break

    cls_embeddings = []

    with torch.no_grad():
        for chunk in chunks:
            # Each window is its own batch of one, so no padding is required
            # and the attention mask is all ones.
            input_ids = torch.tensor([chunk], dtype=torch.long, device=device)
            attention_mask = torch.ones_like(input_ids)
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # (1, hidden_size)
            cls_embeddings.append(cls_embedding.cpu().numpy())

    # Average the [CLS] vectors of all windows into a single vector.
    return np.mean(np.vstack(cls_embeddings), axis=0)

In [4]:
# Generate BERT embeddings
def add_bert_embeddings(batch):
    """Add a ``bert_embeddings`` entry to a batch of examples.

    Written as a batched ``map`` callback: ``batch['text']`` is iterated as a
    sequence of texts, each one is embedded with ``generate_bert_embeddings``,
    and the resulting list is stored under ``batch['bert_embeddings']``.

    Args:
        batch: Mapping of column name to a list of values; must contain
            a ``'text'`` column.

    Returns:
        The same ``batch`` object with the ``'bert_embeddings'`` column set.
    """
    embeddings = []
    for statement in batch['text']:
        embeddings.append(generate_bert_embeddings(statement))

    batch['bert_embeddings'] = embeddings
    return batch

# Embed every split of the DatasetDict; batched=True hands the callback
# whole batches (column name -> list of values) rather than single rows.
dataset = dataset.map(
    function=add_bert_embeddings,
    batched=True,
)

Map: 100%|██████████| 10269/10269 [02:15<00:00, 75.79 examples/s]
Map: 100%|██████████| 1284/1284 [00:16<00:00, 77.68 examples/s]
Map: 100%|██████████| 1283/1283 [00:16<00:00, 77.82 examples/s]


# Reformat the dataset

In [5]:
# Drop the columns that are not needed downstream ('label_text', 'context')
# and rename 'labels' to 'label', in a single chained expression.
dataset = (
    dataset
    .remove_columns(['label_text', 'context'])
    .rename_column('labels', 'label')
)

# Upload to Hugging Face Hub

In [6]:
# Publish all splits of the processed dataset to the Hugging Face Hub
dataset.push_to_hub(repo_id='Blueeeeee/Liar_Embeddings')

Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 15.76ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.14s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 14.10ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 13.00ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Blueeeeee/Liar_Embeddings/commit/4b83c1539257c61e9a0b0630311da6ce372fab3f', commit_message='Upload dataset', commit_description='', oid='4b83c1539257c61e9a0b0630311da6ce372fab3f', pr_url=None, pr_revision=None, pr_num=None)