# LittleFish-Coder/Fake_News_KDD2020

https://huggingface.co/datasets/LittleFish-Coder/Fake_News_KDD2020


In [2]:
from datasets import DatasetDict, load_dataset

# Pull the dataset from the Hugging Face Hub
dataset = load_dataset("LittleFish-Coder/Fake_News_KDD2020")

print(dataset)
# Report how many examples each split contains
for split_name in ("train", "test"):
    print(f'{split_name} size: {len(dataset[split_name])}')

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings'],
        num_rows: 4487
    })
    test: Dataset({
        features: ['text', 'label', 'bert_embeddings', 'roberta_embeddings'],
        num_rows: 499
    })
})
train size: 4487
test size: 499


# Add BERT Embeddings

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer and model are loaded from the same checkpoint
bert_checkpoint = "google-bert/bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint, clean_up_tokenization_spaces=True)
bert_model = AutoModel.from_pretrained(bert_checkpoint)
bert_model = bert_model.to(device)

In [4]:
import torch
import numpy as np

def generate_bert_embeddings(text, max_length=512, stride=256):
    """Embed ``text`` with BERT by averaging the [CLS] vector of overlapping windows.

    The text is tokenized once *without* special tokens, cut into overlapping
    windows, and every window is individually wrapped as ``[CLS] ... [SEP]``
    before being fed to the model. This guarantees that position 0 of each
    window really is a [CLS] token. Slicing an already-wrapped sequence would
    leave only the first window starting with [CLS] and make position 0 of
    every later window an arbitrary word-piece.

    Args:
        text: The string to embed. An empty string yields the embedding of a
            bare ``[CLS] [SEP]`` pair.
        max_length: Maximum number of tokens per window fed to the model,
            *including* the [CLS] and [SEP] tokens.
        stride: Number of content tokens between the starts of consecutive
            windows. Values smaller than ``max_length - 2`` produce
            overlapping windows; larger values skip tokens between windows.

    Returns:
        A 1-D ``numpy.ndarray`` holding the mean of the per-window [CLS]
        hidden states (one value per hidden unit of ``bert_model``).

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens or
            ``stride`` is not positive.
    """
    # Two slots of every window are reserved for [CLS] and [SEP].
    window = max_length - 2
    if window <= 0:
        raise ValueError("max_length must be at least 3 to leave room for [CLS] and [SEP]")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    # Plain word-piece ids only; the special tokens are added per window below.
    token_ids = bert_tokenizer(text, add_special_tokens=False, truncation=False)['input_ids']
    cls_id = bert_tokenizer.cls_token_id
    sep_id = bert_tokenizer.sep_token_id

    # Always emit at least one window so that empty text still gets an embedding.
    chunks = []
    start = 0
    while True:
        chunks.append([cls_id] + list(token_ids[start:start + window]) + [sep_id])
        if start + window >= len(token_ids):
            break
        start += stride

    cls_vectors = []
    with torch.no_grad():
        for chunk in chunks:
            # Each window is a batch of one, so no padding is required and
            # every position is a real token (attention mask of all ones).
            input_ids = torch.tensor([chunk], dtype=torch.long, device=device)
            attention_mask = torch.ones_like(input_ids)
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # (1, hidden_size)
            cls_vectors.append(cls_embedding.cpu().numpy())

    # Average the per-window [CLS] embeddings into a single vector.
    return np.mean(np.vstack(cls_vectors), axis=0)

In [6]:
# Compute a BERT embedding for every example in a batch
def add_bert_embeddings(batch):
    """Store one BERT embedding per entry of ``batch['text']`` under 'bert_embeddings'.

    The batch is updated in place and returned. Each text is embedded
    independently by ``generate_bert_embeddings``.
    """
    texts = batch['text']
    batch['bert_embeddings'] = list(map(generate_bert_embeddings, texts))
    return batch

# Run the embedding step over every split of the dataset
dataset = dataset.map(add_bert_embeddings, batched=True)

Map: 100%|██████████| 4487/4487 [03:22<00:00, 22.11 examples/s]
Map: 100%|██████████| 499/499 [00:23<00:00, 20.95 examples/s]


# Reformat the dataset

In [7]:
# Drop the RoBERTa embedding column; only the BERT embeddings are kept.
# The removal is guarded so that re-running this cell after the column is
# already gone is a no-op instead of raising.
stale_column = 'roberta_embeddings'
if any(stale_column in names for names in dataset.column_names.values()):
    dataset = dataset.remove_columns([stale_column])

# Upload to HuggingFace Hub

In [8]:
# Publish every split of the processed dataset to the Hugging Face Hub
hub_repo_id = 'Blueeeeee/FND_KDD2020_Embeddings'
dataset.push_to_hub(hub_repo_id)

Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00,  9.94ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.26s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  9.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Blueeeeee/FND_KDD2020_Embeddings/commit/1a6fda7c7068279820b4f2e3a531f1ae4fec57e2', commit_message='Upload dataset', commit_description='', oid='1a6fda7c7068279820b4f2e3a531f1ae4fec57e2', pr_url=None, pr_revision=None, pr_num=None)