# Task 9.3 Solutions: Dataset Processing

This notebook contains solutions to the exercises in the Dataset Processing notebook.

In [None]:
# Setup
import os

import numpy as np
import torch
from datasets import (
    ClassLabel,
    Dataset,
    DatasetDict,
    Features,
    Value,
    concatenate_datasets,
    load_dataset,
)
from transformers import AutoTokenizer

# Reaching this line means every import in this cell succeeded.
print("Setup complete!")

## Exercise Solution: Process the AG News Dataset

Build a complete processing pipeline for the AG News dataset:
1. Load `ag_news` dataset
2. Create train/validation split (90/10)
3. Tokenize with `distilbert-base-uncased`
4. Add a column for text length (word count)
5. Filter out very short texts (< 10 words)
6. Save the processed dataset

In [None]:
# Step 1: Load AG News dataset
print("Step 1: Loading AG News dataset...")
ag_news = load_dataset("ag_news")

# Inspect the split layout and the column schema of the training split.
train_split = ag_news['train']
print(f"Dataset structure: {ag_news}")
print(f"\nFeatures: {train_split.features}")

# Integer class ids and the topic each one stands for.
print("\nLabel mapping:")
for class_id, topic in enumerate(("World", "Sports", "Business", "Sci/Tech")):
    print(f"  {class_id} = {topic}")

# Peek at the first training example.
print(f"\nSample: {train_split[0]}")

In [None]:
# Step 2: Create train/validation split (90/10)
print("Step 2: Creating train/validation split...")

# Hold out 10% of the original training split for validation. Stratifying on
# 'label' keeps the class proportions the same in both parts.
train_val_split = ag_news['train'].train_test_split(
    test_size=0.1, seed=42, stratify_by_column='label'
)

# Assemble the three splits; the dataset's own test split is reused as-is.
split_parts = {
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': ag_news['test'],
}
dataset = DatasetDict(split_parts)

# Report the size of each split with thousands separators.
for display_name, split_key in (
    ("Train", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{display_name}: {len(dataset[split_key]):,}")

In [None]:
# Step 3: Tokenize with distilbert-base-uncased
print("Step 3: Tokenizing...")

# Load the tokenizer that belongs to the DistilBERT checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    """Run the notebook's DistilBERT tokenizer over the batch's 'text' column.

    Sequences are truncated and padded to a fixed length of 128 tokens
    (AG News texts are relatively short).
    """
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(examples['text'], **encoding_options)

# Tokenize every split in batches of 1,000 examples using 4 worker processes.
map_options = dict(batched=True, batch_size=1000, num_proc=4, desc="Tokenizing")
tokenized_dataset = dataset.map(tokenize_function, **map_options)

train_columns = tokenized_dataset['train'].column_names
print(f"Columns after tokenization: {train_columns}")

In [None]:
# Step 4: Add a column for text length
print("Step 4: Adding text length column...")

def add_word_count(examples):
    """Attach a 'word_count' column: whitespace-separated words per text.

    The incoming batch is modified in place and the same object is returned.
    """
    counts = []
    for text in examples['text']:
        counts.append(len(text.split()))
    examples['word_count'] = counts
    return examples

# Compute word counts for every split, 1,000 examples per batch.
tokenized_dataset = tokenized_dataset.map(
    add_word_count, batched=True, batch_size=1000, desc="Adding word counts"
)

# Summarize the word-count distribution of the training split.
word_counts = tokenized_dataset['train']['word_count']
shortest, longest, average = min(word_counts), max(word_counts), np.mean(word_counts)
print("Word count stats:")
print(f"  Min: {shortest}")
print(f"  Max: {longest}")
print(f"  Mean: {average:.1f}")

In [None]:
# Step 5: Filter out very short texts (< 10 words)
print("Step 5: Filtering short texts...")

# Report the training-split size before any rows are dropped.
rows_before = len(tokenized_dataset['train'])
print(f"Before filtering: {rows_before:,}")

def is_long_enough(example):
    """Return True when the example's 'word_count' is at least 10."""
    minimum_words = 10
    return example['word_count'] >= minimum_words

# Apply the length filter to every split of the dataset.
filtered_dataset = tokenized_dataset.filter(is_long_enough, desc="Filtering short texts")

# Compare training-split sizes before and after filtering.
rows_kept = len(filtered_dataset['train'])
rows_dropped = len(tokenized_dataset['train']) - rows_kept
print(f"After filtering: {rows_kept:,}")
print(f"Removed: {rows_dropped:,} short texts")

In [None]:
# Step 6: Prepare for training and save
print("Step 6: Finalizing and saving...")

# Rename 'label' to 'labels' (the name Trainer expects), then drop the raw
# text and the helper word count, which are not needed once tokens exist.
final_dataset = (
    filtered_dataset
    .rename_column('label', 'labels')
    .remove_columns(['text', 'word_count'])
)

print(f"Final columns: {final_dataset['train'].column_names}")

# Expose the model inputs and labels in PyTorch format.
tensor_columns = ['input_ids', 'attention_mask', 'labels']
final_dataset.set_format('torch', columns=tensor_columns)

# Write all splits to a local directory.
save_path = "./processed_ag_news"
final_dataset.save_to_disk(save_path)
print(f"Saved to {save_path}")

In [None]:
# Verify the saved dataset
from datasets import load_from_disk

# Reload from the directory written in Step 6 and show its structure.
loaded = load_from_disk(save_path)
print("Loaded dataset:")
print(loaded)

# Inspect the first training example of the reloaded dataset.
sample = loaded['train'][0]
sample_input_ids, sample_labels = sample['input_ids'], sample['labels']
print("\nSample:")
print(f"  input_ids shape: {sample_input_ids.shape}")
print(f"  labels: {sample_labels}")

## Challenge Solution: Multi-Dataset Pipeline

Create a processing pipeline that:
1. Loads IMDB, Yelp, and Amazon reviews
2. Standardizes them to the same format
3. Combines into one dataset
4. Creates balanced splits
5. Tokenizes with consistent settings

In [None]:
# Multi-Dataset Pipeline
print("MULTI-DATASET PIPELINE")
print("=" * 50)

# Step 1: Load datasets (using small subsets for demo)
print("\n1. Loading datasets...")

SUBSET_SIZE = 5000  # number of examples taken from each source for the demo

# IMDB - binary sentiment (0=neg, 1=pos)
# The IMDB train split is ordered by label (negative reviews first), so a plain
# "train[:5000]" slice would contain a single class. Shuffle with a fixed seed
# before taking the subset so both classes are represented.
imdb = (
    load_dataset("imdb", split="train")
    .shuffle(seed=42)
    .select(range(SUBSET_SIZE))
)
print(f"   IMDB: {len(imdb)} samples")

# Yelp - 5-star ratings stored as labels 0-4; converted to binary in Step 2
yelp = load_dataset("yelp_review_full", split=f"train[:{SUBSET_SIZE}]")
print(f"   Yelp: {len(yelp)} samples")

# Amazon - polarity, labels are already binary (0=negative, 1=positive)
amazon = load_dataset("amazon_polarity", split=f"train[:{SUBSET_SIZE}]")
print(f"   Amazon: {len(amazon)} samples")

In [None]:
# Step 2: Standardize format - all to {'text': str, 'label': int (0 or 1)}
print("\n2. Standardizing formats...")

def standardize_imdb(examples):
    """Pass IMDB text and labels through unchanged and tag each row as 'imdb'."""
    texts = examples['text']
    source_tags = ['imdb' for _ in texts]
    return {'text': texts, 'label': examples['label'], 'source': source_tags}

def standardize_yelp(examples):
    """Collapse Yelp rating labels to binary sentiment and tag rows as 'yelp'.

    Labels below 3 become 0 (negative); labels of 3 or more become 1
    (positive). NOTE(review): this assumes the dataset's 0-indexed labels
    (0 = 1 star ... 4 = 5 stars), so 1-3 stars map to negative and 4-5 stars
    to positive -- confirm against the dataset card.
    """
    texts = examples['text']
    binary_labels = [int(label >= 3) for label in examples['label']]
    return {
        'text': texts,
        'label': binary_labels,
        'source': ['yelp' for _ in texts],
    }

def standardize_amazon(examples):
    """Map an Amazon polarity batch onto the shared text/label/source schema.

    The `amazon_polarity` dataset on the Hugging Face Hub already encodes
    labels as 0 (negative) / 1 (positive), so they are passed through
    unchanged. Subtracting 1 (which the raw 1/2 CSV encoding would need)
    would turn them into -1/0 and corrupt the labels.

    Args:
        examples: Batch dict with 'content' (review body) and 'label' lists.

    Returns:
        Dict with 'text', 'label' and 'source' lists of equal length.
    """
    # Amazon has 'content' not 'text'
    contents = examples['content']
    return {
        'text': contents,
        'label': list(examples['label']),
        'source': ['amazon'] * len(contents)
    }

# Apply standardization
# Each source declares its own ClassLabel names for 'label' (IMDB, Yelp and
# Amazon all differ), and map() may carry a source's feature type over to a
# same-named output column. Declare one explicit output schema instead, because:
#   - concatenate_datasets needs the features of all inputs to line up, and
#   - stratify_by_column (used when splitting) needs 'label' to be a ClassLabel.
STANDARD_FEATURES = Features({
    'text': Value('string'),
    'label': ClassLabel(names=['negative', 'positive']),
    'source': Value('string'),
})

imdb_std = imdb.map(
    standardize_imdb,
    batched=True,
    remove_columns=imdb.column_names,
    features=STANDARD_FEATURES,
)
yelp_std = yelp.map(
    standardize_yelp,
    batched=True,
    remove_columns=yelp.column_names,
    features=STANDARD_FEATURES,
)
amazon_std = amazon.map(
    standardize_amazon,
    batched=True,
    remove_columns=amazon.column_names,
    features=STANDARD_FEATURES,
)

print(f"   IMDB columns: {imdb_std.column_names}")
print(f"   Yelp columns: {yelp_std.column_names}")
print(f"   Amazon columns: {amazon_std.column_names}")

In [None]:
# Step 3: Combine into one dataset
print("\n3. Combining datasets...")

# Stack the three standardized datasets row-wise.
standardized_parts = [imdb_std, yelp_std, amazon_std]
combined = concatenate_datasets(standardized_parts)
print(f"   Combined size: {len(combined)}")

# Count how many rows each source contributed.
import collections
source_dist = collections.Counter()
source_dist.update(combined['source'])
print(f"   Source distribution: {dict(source_dist)}")

In [None]:
# Step 4: Create balanced splits
print("\n4. Creating balanced splits...")

# Shuffle into a new variable so that re-running this cell does not reshuffle
# an already-shuffled `combined` and silently change the splits.
combined_shuffled = combined.shuffle(seed=42)

# 80% train / 20% held out, keeping the label ratio the same in both parts.
splits = combined_shuffled.train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column='label'  # Maintain label balance
)

# Halve the held-out part into validation and test. Stratify here as well;
# otherwise these two splits are not guaranteed to keep the label ratio.
val_test = splits['test'].train_test_split(
    test_size=0.5,
    seed=42,
    stratify_by_column='label'
)

multi_dataset = DatasetDict({
    'train': splits['train'],
    'validation': val_test['train'],
    'test': val_test['test']
})

print(f"   Train: {len(multi_dataset['train'])}")
print(f"   Validation: {len(multi_dataset['validation'])}")
print(f"   Test: {len(multi_dataset['test'])}")

In [None]:
# Step 5: Tokenize with consistent settings
print("\n5. Tokenizing...")

# One DistilBERT tokenizer is used for the rows of every source.
tokenizer_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_multi(examples):
    """Tokenize the batch's 'text' column, truncating and padding to 256 tokens."""
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length', max_length=256)

# Tokenize every split (dropping the raw 'text' column), then rename 'label'
# to 'labels' for Trainer.
tokenized_multi = multi_dataset.map(
    tokenize_multi,
    batched=True,
    batch_size=1000,
    remove_columns=['text'],
    desc="Tokenizing",
).rename_column('label', 'labels')

final_columns = tokenized_multi['train'].column_names
print(f"   Final columns: {final_columns}")
print("\nMulti-dataset pipeline complete!")

In [None]:
# Cleanup
import shutil

# Delete the directory written in Step 6, if it is still present.
processed_dir = "./processed_ag_news"
if os.path.exists(processed_dir):
    shutil.rmtree(processed_dir)
    print("Cleaned up processed_ag_news directory")

print("Cleanup complete!")

## Summary

In this solution notebook, we demonstrated:

1. **AG News Processing Pipeline**:
   - Loaded and explored dataset
   - Created stratified train/val/test splits
   - Tokenized with DistilBERT
   - Added custom columns (word count)
   - Filtered by criteria
   - Saved processed dataset

2. **Multi-Dataset Pipeline**:
   - Combined IMDB, Yelp, and Amazon reviews
   - Standardized different formats
   - Created balanced splits
   - Applied consistent tokenization

Key learnings:
- Use `stratify_by_column` for balanced splits
- Standardize formats before combining datasets
- Use `concatenate_datasets` to merge datasets
- Remove unnecessary columns to save memory