# Get dataset
HuggingFace provides a lot of datasets. Let's choose wikipedia.

In [2]:
import transformers, torch, datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset, Dataset

In [4]:
dataset_name = "wikimedia/wikipedia"
config_name = "20231101.en"

# Load the dataset in streaming mode, this avoids downloading the whole file
print(f"Loading dataset in streaming mode: {dataset_name} with config {config_name}")
streaming_dataset = load_dataset(dataset_name, config_name, split="train", streaming=True)

# Collect articles until we have roughly 1GB data
num_articles_to_collect = 150000
subset_data = []

print(f"Collecting approximately {num_articles_to_collect} articles...")
for i, article in enumerate(streaming_dataset):
    if i >= num_articles_to_collect:
        break
    subset_data.append(article)

print(f"Finished collecting {len(subset_data)} articles.")

# Convert the collected list of dictionaries into an in-memory Hugging Face Dataset
raw_dataset = Dataset.from_list(subset_data)

# You can now see the structure and size of your in-memory dataset
print("\nIn-memory dataset loaded successfully.")
print(raw_dataset)
print(f"Total number of articles in the dataset: {len(raw_dataset)}")

Loading dataset in streaming mode: wikimedia/wikipedia with config 20231101.en
Collecting approximately 150000 articles...
Finished collecting 150000 articles.

In-memory dataset loaded successfully.
Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 150000
})
Total number of articles in the dataset: 150000


## Make sure we have 1GB of data

In [5]:
import sys

total_bytes = 0
for article in raw_dataset:
    text_content = article['text']
    total_bytes += sys.getsizeof(text_content)

# Convert bytes to megabytes (MB) and gigabytes (GB)
total_mb = total_bytes / (1024 * 1024)
total_gb = total_bytes / (1024 * 1024 * 1024)

print(f"Total in-memory size of text: {total_bytes} bytes")
print(f"Total size: {total_mb:.2f} MB")
print(f"Total size: {total_gb:.2f} GB")

Total in-memory size of text: 1227035932 bytes
Total size: 1170.19 MB
Total size: 1.14 GB


## Preprocess data

In [6]:
# Use the `unique` method to remove duplicates based on the 'text' column.
# This method works well on smaller, in-memory datasets.
print(f"Original number of documents: {len(raw_dataset)}")
list_of_unique_documents = raw_dataset.unique(column='text')
print(f"Number of documents after removing duplicates: {len(list_of_unique_documents)}")

Original number of documents: 150000
Number of documents after removing duplicates: 149941


In [7]:
# The unique method on a column returns a list of values (strings in this case),
# not a list of dictionaries. Convert the list of strings back to a list of
# dictionaries with a 'text' key for Dataset.from_list.
list_of_unique_documents_as_dicts = [{'text': text} for text in list_of_unique_documents]
unique_dataset = Dataset.from_list(list_of_unique_documents_as_dicts)
print(type(unique_dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [8]:
import re
from datasets import Dataset

def clean_and_normalize(examples):
    cleaned_texts = []

    for text in examples['text']:
        # Step 1: Optional - Remove HTML tags and markdown
        text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
        text = re.sub(r'\[\[[^\]]+\]\]', '', text) # Remove wiki-internal links

        # Step 2: Lowercase the entire text
        text = text.lower()

        # Step 3: Strip irrelevant symbols and normalize
        text = re.sub(r'[^a-z0-9\s.,?!;:\'-]', '', text)

        # Step 4: Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        cleaned_texts.append(text)

    examples['text'] = cleaned_texts
    return examples

def filter_short_documents(examples):
    return [len(text.split()) >= 50 for text in examples['text']]

# Apply the cleaning and normalization function using map
# batched=True processes multiple examples at once for speed
normalized_dataset = raw_dataset.map(clean_and_normalize, batched=True)

# Apply the filtering function to remove short documents
filtered_dataset = normalized_dataset.filter(filter_short_documents, batched=True)

print("Number of documents before filtering:", len(normalized_dataset))
print("Number of documents after filtering:", len(filtered_dataset))

print("First example after normalization:")
print(normalized_dataset[0]['text'])

Map: 100%|██████████| 150000/150000 [00:22<00:00, 6522.96 examples/s]
Filter: 100%|██████████| 150000/150000 [00:03<00:00, 47194.79 examples/s]

Number of documents before filtering: 150000
Number of documents after filtering: 142323
First example after normalization:
anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. as a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement libertarian socialism. humans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. with the rise of organised hierarchical bodies, scepticism toward authority also rose. although traces of anarchist ideas are found all throughout history, modern anarchism emerged from 




# Tokenization

In [9]:
# Load the tokenizer

from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
# Tokenize the dataset and handle long sequences

def tokenize_function(examples):
    tokenized_chunks = tokenizer(
        examples['text'],
        truncation=True,
        return_overflowing_tokens=True,
        max_length=512,  # Set your desired chunk size (e.g., 512 for BERT)
        stride=128  # Use a stride to create overlapping chunks and preserve context
    )
    return tokenized_chunks

# Apply the chunking function to the dataset
chunked_dataset = filtered_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=filtered_dataset.column_names # Remove the old columns to prevent errors
)

Map: 100%|██████████| 142323/142323 [01:35<00:00, 1493.92 examples/s]


In [11]:
print(f"Original number of examples: {len(filtered_dataset)}")
print(f"Number of examples after chunking: {len(chunked_dataset)}")

Original number of examples: 142323
Number of examples after chunking: 400654


# Custom Data Loader

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class WikipediaDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Extract input_ids and attention_mask from the dataset
        item = self.dataset[idx]
        # Convert lists to PyTorch tensors
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long)
        }

# Create an instance of the custom dataset
# Pass the tokenizer to the dataset
pytorch_dataset = WikipediaDataset(chunked_dataset, tokenizer)


# Define a custom collate function to handle padding
def custom_collate_fn(batch):
    # Use the tokenizer's padding function
    padded_batch = tokenizer.pad(batch, return_tensors='pt')
    return padded_batch


# Create a DataLoader for batching and shuffling
batch_size = 16  # Define your desired batch size
# Pass the custom collate function to the DataLoader
dataloader = DataLoader(pytorch_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)


print(f"Created PyTorch Dataset with {len(pytorch_dataset)} examples.")
print(f"Created PyTorch DataLoader with batch size {batch_size}.")

# Example of iterating through the dataloader
print("\nExample batch from DataLoader:")
for batch in dataloader:
    print("Input IDs shape:", batch['input_ids'].shape)
    print("Attention Mask shape:", batch['attention_mask'].shape)
    break # Just show one batch example

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Created PyTorch Dataset with 400654 examples.
Created PyTorch DataLoader with batch size 16.

Example batch from DataLoader:
Input IDs shape: torch.Size([16, 512])
Attention Mask shape: torch.Size([16, 512])


In [13]:
import torch

# Collect a few sample batches
num_samples = 5  # Number of batches to sample
sampled_batches = []

print(f"Collecting {num_samples} sample batches...")
for i, batch in enumerate(dataloader):
    if i >= num_samples:
        break
    sampled_batches.append(batch)

print(f"Collected {len(sampled_batches)} sample batches.")

# Save the sampled batches to a .pt file
save_path = "sample_processed_data.pt"
torch.save(sampled_batches, save_path)

print(f"Sampled batches saved to {save_path}")

Collecting 5 sample batches...
Collected 5 sample batches.
Sampled batches saved to sample_processed_data.pt


# Check file

In [14]:
import torch

# Load the saved file
sampled_batches = torch.load("sample_processed_data.pt", weights_only=False)

print(f"Loaded {len(sampled_batches)} batches from file.")

# Check structure of the first batch
first_batch = sampled_batches[0]
print("Type of batch:", type(first_batch))
print("Batch content:", first_batch)

Loaded 5 batches from file.
Type of batch: <class 'transformers.tokenization_utils_base.BatchEncoding'>
Batch content: {'input_ids': tensor([[  101,  5234,  1010,  ...,     0,     0,     0],
        [  101,  2220,  1997,  ...,     0,     0,     0],
        [  101, 16962, 22117,  ...,     0,     0,     0],
        ...,
        [  101, 12794, 27830,  ...,  8569,  2102,   102],
        [  101,  2203, 11261,  ...,     0,     0,     0],
        [  101, 10744,  1997,  ...,  2216,  1997,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}


In [15]:
chunked_dataset.save_to_disk("./processed_dataset")

Saving the dataset (3/3 shards): 100%|██████████| 400654/400654 [00:00<00:00, 702179.61 examples/s]
