# Ahren09/MMSoc_PolitiFact

https://huggingface.co/datasets/Ahren09/MMSoc_PolitiFact

In [None]:
from datasets import DatasetDict, load_dataset

# Fetch the MMSoc PolitiFact dataset from the Hugging Face Hub
# (it is downloaded on first use).
dataset = load_dataset("Ahren09/MMSoc_PolitiFact")

# Show the dataset structure followed by the number of examples per split.
print(
    dataset,
    f'train size: {len(dataset["train"])}',
    f'test size: {len(dataset["test"])}',
    sep='\n',
)

# Add BERT Embeddings

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer and encoder are loaded from the same BERT checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-uncased",
    clean_up_tokenization_spaces=True,
)
bert_model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
bert_model = bert_model.to(device)

In [None]:
import torch
import numpy as np

def generate_bert_embeddings(text, max_length=512, stride=256):
    """Embed a text of arbitrary length with BERT using a sliding window.

    The text is tokenized WITHOUT special tokens and cut into overlapping
    windows of ``max_length - 2`` tokens. Each window is then framed as
    ``[CLS] ... [SEP]`` and padded to ``max_length``, so position 0 of every
    window really is the [CLS] token. (Slicing an already-framed sequence
    would leave [CLS] only in the first window.) The [CLS] vectors of all
    windows are averaged into one document vector.

    Args:
        text: The string to embed. An empty string yields the embedding of
            a bare ``[CLS] [SEP]`` sequence.
        max_length: Length of each model input, including the two special
            tokens. Must be greater than 2.
        stride: Number of tokens the window start advances each step. Must
            be positive; values larger than ``max_length - 2`` leave
            tokens uncovered between windows.

    Returns:
        A 1-D numpy array holding the mean [CLS] embedding over all windows
        (length = the model's hidden size).

    Raises:
        ValueError: If ``max_length <= 2`` or ``stride <= 0``.
    """
    if max_length <= 2 or stride <= 0:
        raise ValueError("max_length must be > 2 and stride must be > 0")

    # Plain word-piece ids; special tokens are added per window below.
    token_ids = bert_tokenizer(text, add_special_tokens=False, truncation=False)['input_ids']
    cls_id = bert_tokenizer.cls_token_id
    sep_id = bert_tokenizer.sep_token_id
    pad_id = bert_tokenizer.pad_token_id

    # Two slots of every window are reserved for [CLS] and [SEP].
    window = max_length - 2

    all_embeddings = []

    with torch.no_grad():
        # max(..., 1) guarantees one (empty) window for empty input, so the
        # averaging step below never receives an empty list.
        for start in range(0, max(len(token_ids), 1), stride):
            chunk_ids = [cls_id] + token_ids[start:start + window] + [sep_id]
            chunk_mask = [1] * len(chunk_ids)

            # Right-pad to a fixed length; padded positions are masked out.
            padding_len = max_length - len(chunk_ids)
            chunk_ids = chunk_ids + [pad_id] * padding_len
            chunk_mask = chunk_mask + [0] * padding_len

            inputs = {
                "input_ids": torch.tensor([chunk_ids], dtype=torch.long).to(device),
                "attention_mask": torch.tensor([chunk_mask], dtype=torch.long).to(device),
            }
            outputs = bert_model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # (1, hidden)
            all_embeddings.append(cls_embedding.cpu().numpy())

            # This window already reached the end of the text.
            if start + window >= len(token_ids):
                break

    # Average the per-window [CLS] vectors into a single document vector.
    return np.mean(np.vstack(all_embeddings), axis=0)

In [None]:
# Generate BERT embeddings
def add_bert_embeddings(batch):
    """Add a 'bert_embeddings' entry to a batch of examples.

    Args:
        batch: Mapping of column name to a list of values; its 'text'
            entry is iterated and each item is passed to
            ``generate_bert_embeddings``.

    Returns:
        The same ``batch`` object, with 'bert_embeddings' set to the list
        of per-text results in the order of ``batch['text']``.
    """
    embeddings = []
    for text in batch['text']:
        embeddings.append(generate_bert_embeddings(text))
    batch['bert_embeddings'] = embeddings
    return batch

# Compute the BERT embedding column for every split, one batch at a time.
dataset = dataset.map(function=add_bert_embeddings, batched=True)

Map: 100%|██████████| 381/381 [00:30<00:00, 12.46 examples/s]
Map: 100%|██████████| 102/102 [00:08<00:00, 12.14 examples/s]


# Add RoBERTa Embeddings

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer and encoder are loaded from the same RoBERTa checkpoint.
roberta_tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    clean_up_tokenization_spaces=True,
)
roberta_model = AutoModel.from_pretrained("roberta-base")
roberta_model = roberta_model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Sliding-window RoBERTa embeddings
def generate_roberta_embeddings(text, max_length=512, stride=256):
    """Embed a text of arbitrary length with RoBERTa using a sliding window.

    The text is encoded WITHOUT special tokens and cut into overlapping
    windows of ``max_length - 2`` tokens. Each window is framed exactly
    once as ``<s> ... </s>`` and padded to ``max_length``. (Encoding with
    special tokens and then letting the tokenizer add them again per window
    would duplicate them and force real tokens to be truncated.) The
    first-position (``<s>``) vectors of all windows are averaged into one
    document vector.

    Args:
        text: The string to embed. An empty string yields the embedding of
            a bare ``<s> </s>`` sequence.
        max_length: Length of each model input, including the two special
            tokens. Must be greater than 2.
        stride: Number of tokens the window start advances each step. Must
            be positive; values larger than ``max_length - 2`` leave
            tokens uncovered between windows.

    Returns:
        A 1-D numpy array holding the mean first-token embedding over all
        windows (length = the model's hidden size).

    Raises:
        ValueError: If ``max_length <= 2`` or ``stride <= 0``.
    """
    if max_length <= 2 or stride <= 0:
        raise ValueError("max_length must be > 2 and stride must be > 0")

    # Convert the text to token ids (no truncation, no special tokens yet).
    input_ids = roberta_tokenizer.encode(text, add_special_tokens=False)
    bos_id = roberta_tokenizer.cls_token_id
    eos_id = roberta_tokenizer.sep_token_id
    pad_id = roberta_tokenizer.pad_token_id

    # Two slots of every window are reserved for <s> and </s>.
    window = max_length - 2

    all_embeddings = []

    # max(..., 1) guarantees one (empty) window for empty input, so the
    # averaging step below never receives an empty list.
    for i in range(0, max(len(input_ids), 1), stride):
        chunk = [bos_id] + input_ids[i:i + window] + [eos_id]
        mask = [1] * len(chunk)

        # Right-pad to a fixed length; padded positions are masked out.
        padding_len = max_length - len(chunk)
        chunk = chunk + [pad_id] * padding_len
        mask = mask + [0] * padding_len

        # Add the batch dimension and move the tensors to the GPU or CPU.
        inputs = {
            "input_ids": torch.tensor([chunk], dtype=torch.long).to(device),
            "attention_mask": torch.tensor([mask], dtype=torch.long).to(device),
        }

        with torch.no_grad():
            outputs = roberta_model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :]  # first-token (<s>) vector
            all_embeddings.append(cls_embedding.cpu().numpy())

        # This window already reached the end of the text.
        if i + window >= len(input_ids):
            break

    # Average the per-window vectors to represent the whole document.
    final_embedding = np.mean(np.vstack(all_embeddings), axis=0)
    return final_embedding

In [None]:
# Generate RoBERTa embeddings
def add_roberta_embeddings(batch):
    """Add a 'roberta_embeddings' entry to a batch of examples.

    Args:
        batch: Mapping of column name to a list of values; its 'text'
            entry is iterated and each item is passed to
            ``generate_roberta_embeddings``.

    Returns:
        The same ``batch`` object, with 'roberta_embeddings' set to the
        list of per-text results in the order of ``batch['text']``.
    """
    embeddings = []
    for text in batch['text']:
        embeddings.append(generate_roberta_embeddings(text))
    batch['roberta_embeddings'] = embeddings
    return batch

# Compute the RoBERTa embedding column for every split, one batch at a time.
dataset = dataset.map(function=add_roberta_embeddings, batched=True)

Map: 100%|██████████| 381/381 [00:35<00:00, 10.58 examples/s]
Map: 100%|██████████| 102/102 [00:09<00:00, 10.65 examples/s]


# Reformat the dataset

In [17]:
# Drop the columns that are not needed in the published dataset: 'image' and 'split'.
dataset = dataset.remove_columns(column_names=['image', 'split'])

# Upload to HuggingFace Hub

In [18]:
# Publish all splits of the processed dataset to the Hugging Face Hub.
dataset.push_to_hub(repo_id='Blueeeeee/PolitiFact_Embeddings')

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  5.52ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.44s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 18.01ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.88s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Blueeeeee/PolitiFact_Embeddings/commit/40a8868c6bf076f85671125d4a1bddbc8d4775b0', commit_message='Upload dataset', commit_description='', oid='40a8868c6bf076f85671125d4a1bddbc8d4775b0', pr_url=None, pr_revision=None, pr_num=None)