In [1]:
# Import necessary libraries
# load_dataset is used both to fetch the dataset by name and to re-read the saved JSONL file
from datasets import load_dataset
# os is used to build the output file paths and to create the output directory
import os
# NOTE(review): random is not referenced in any cell visible here -- confirm before removing
import random

print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Experiment settings -- adjust these values to change what gets prepared
DATASET_NAME = 'Anthropic/hh-rlhf'   # dataset identifier handed to load_dataset
NUM_SAMPLES_TRAIN = 5000             # rows kept from the train split
NUM_SAMPLES_TEST = 1000              # rows kept from the test split
OUTPUT_DIR = 'data'                  # directory the JSONL files are written to

# Echo the active settings so the captured output records what was run
config_summary = [
    "Configuration:",
    f"Dataset: {DATASET_NAME}",
    f"Training samples: {NUM_SAMPLES_TRAIN}",
    f"Test samples: {NUM_SAMPLES_TEST}",
    f"Output directory: {OUTPUT_DIR}",
]
print("\n".join(config_summary))


Configuration:
Dataset: Anthropic/hh-rlhf
Training samples: 5000
Test samples: 1000
Output directory: data


In [3]:
# Load the raw dataset from Hugging Face Hub
print("Loading dataset from Hugging Face Hub...")
try:
    # Only the load itself is guarded, so that problems in the display code
    # below are not misreported as download failures.
    raw_dataset = load_dataset(DATASET_NAME)
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    print("Please check your internet connection and try again.")
    # Re-raise: every later cell needs raw_dataset, so continuing would only
    # produce a confusing NameError (or silently reuse a stale value left in
    # the kernel by an earlier run).
    raise

print("✅ Dataset loaded successfully!")

print("\nDataset structure:")
print(raw_dataset)

print("\nDataset info:")
for split, split_data in raw_dataset.items():
    print(f"{split}: {len(split_data)} examples")

# Show an example to understand the data format
print("\nExample from training set:")
example = raw_dataset['train'][0]
print(f"Keys: {list(example.keys())}")
print(f"\nChosen text (first 200 chars): {example['chosen'][:200]}...")
print(f"\nRejected text (first 200 chars): {example['rejected'][:200]}...")


Loading dataset from Hugging Face Hub...
✅ Dataset loaded successfully!

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 160800
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 8552
    })
})

Dataset info:
train: 160800 examples
test: 8552 examples

Example from training set:
Keys: ['chosen', 'rejected']

Chosen text (first 200 chars): 

Human: What are some cuss words in english?

Assistant: Here’s an incomplete list.

Ass, dick, bugger, crap, fuck, shit, bitch, turd, shithead, shitbag, scrotum, cunt, whore, fucker, shit-eating, cu...

Rejected text (first 200 chars): 

Human: What are some cuss words in english?

Assistant: Here’s an incomplete list.

Ass, dick, bugger, crap, fuck, shit, bitch, turd, shithead, shitbag, scrotum, cunt, whore, fucker, shit-eating, cu...


In [4]:
# Create smaller, manageable subsets for experiments
print("Creating sampled datasets...")

# Never ask for more rows than a split holds: select() with out-of-range
# indices raises IndexError, which would happen as soon as a configured
# sample size is raised above the split size.
num_train = min(NUM_SAMPLES_TRAIN, len(raw_dataset['train']))
num_test = min(NUM_SAMPLES_TEST, len(raw_dataset['test']))

if num_train < NUM_SAMPLES_TRAIN:
    print(f"⚠️ Requested {NUM_SAMPLES_TRAIN} training samples but only {num_train} are available; using {num_train}.")
if num_test < NUM_SAMPLES_TEST:
    print(f"⚠️ Requested {NUM_SAMPLES_TEST} test samples but only {num_test} are available; using {num_test}.")

# Shuffle with a fixed seed so the same rows are drawn on every run, then
# keep the first N rows of the shuffled order.
train_sampled = raw_dataset['train'].shuffle(seed=42).select(range(num_train))
test_sampled = raw_dataset['test'].shuffle(seed=42).select(range(num_test))

print(f"\nSampled dataset sizes:")
print(f"Training set: {len(train_sampled)} examples")
print(f"Test set: {len(test_sampled)} examples")

print("\nSampling complete!")


Creating sampled datasets...

Sampled dataset sizes:
Training set: 5000 examples
Test set: 1000 examples

Sampling complete!


In [5]:
# Define preprocessing logic
def preprocess_example(example):
    """
    Split one preference pair into a shared prompt and two responses.

    Each of the 'chosen' and 'rejected' transcripts is cut at its last
    Assistant turn: everything before the marker becomes the prompt and
    everything after it becomes the response. Both parts are stripped of
    surrounding whitespace.

    Args:
        example: Dict with 'chosen' and 'rejected' fields, each a transcript
            string (or None, which is treated as an empty transcript).

    Returns:
        Dict with 'prompt', 'chosen', and 'rejected' fields. 'prompt' is the
        prompt extracted from the chosen transcript, or the one from the
        rejected transcript when the chosen one yields an empty prompt.

    Raises:
        KeyError: If 'chosen' or 'rejected' is missing from example.
        AttributeError, TypeError: If a field is neither a str nor None.
    """

    def extract_prompt_and_response(text):
        """Return (prompt, response) split at the last Assistant marker."""
        if text is None:
            # A missing transcript has neither a prompt nor a response.
            return "", ""

        # Prefer the full turn separator; fall back to a bare "Assistant:"
        # for transcripts that lack the leading blank line.
        for assistant_marker in ("\n\nAssistant:", "Assistant:"):
            # rpartition splits at the LAST occurrence; `found` is empty
            # when the marker does not occur at all.
            prompt, found, response = text.rpartition(assistant_marker)
            if found:
                return prompt.strip(), response.strip()

        # If no Assistant marker found, treat whole text as response
        return "", text.strip()

    # Extract prompt and responses
    prompt_chosen, chosen_response = extract_prompt_and_response(example['chosen'])
    prompt_rejected, rejected_response = extract_prompt_and_response(example['rejected'])

    # Use the chosen prompt (they should be the same); fall back to the
    # rejected one only when the chosen transcript produced no prompt.
    prompt = prompt_chosen or prompt_rejected

    return {
        'prompt': prompt,
        'chosen': chosen_response,
        'rejected': rejected_response
    }

# Try the function on the first sampled training example and show the result
print("Testing preprocessing function...")
test_example = train_sampled[0]
processed = preprocess_example(test_example)

print(f"\nOriginal chosen text (first 150 chars): {test_example['chosen'][:150]}...")
print(f"\nProcessed:")
# Preview the first 100 characters of each extracted field
for label, field in (("Prompt", 'prompt'), ("Chosen", 'chosen'), ("Rejected", 'rejected')):
    print(f"{label} (first 100 chars): {processed[field][:100]}...")

print("\nPreprocessing function ready!")


Testing preprocessing function...

Original chosen text (first 150 chars): 

Human: Why did cells originally combine together to create life?

Assistant: Because their simple components -- chemicals -- interacted in particula...

Processed:
Prompt (first 100 chars): Human: Why did cells originally combine together to create life?...
Chosen (first 100 chars): Because their simple components -- chemicals -- interacted in particular ways.  And because of chemi...
Rejected (first 100 chars): Cells combine because they benefit from cooperation, since they can have less competition for resour...

Preprocessing function ready!


In [6]:
# Apply preprocessing and save data
print("Applying preprocessing to datasets...")

# Run the per-example preprocessing over the train subset, then the test subset
train_processed, test_processed = (
    subset.map(preprocess_example) for subset in (train_sampled, test_sampled)
)

print("Preprocessing complete!")

# Destination file for each processed split
train_output_path = os.path.join(OUTPUT_DIR, 'train_prefs.jsonl')
test_output_path = os.path.join(OUTPUT_DIR, 'test_prefs.jsonl')

# Create the output directory if it does not exist yet
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Write each processed split to its destination, train first
print(f"\nSaving datasets...")
for processed_split, destination in (
    (train_processed, train_output_path),
    (test_processed, test_output_path),
):
    processed_split.to_json(destination)

# Report where the files went and how many examples each one holds
print(f"\n✅ Data preparation complete!")
print(f"Training data saved to: {train_output_path}")
print(f"Test data saved to: {test_output_path}")
print(f"\nDataset sizes:")
print(f"Training: {len(train_processed)} examples")
print(f"Test: {len(test_processed)} examples")


Applying preprocessing to datasets...
Preprocessing complete!

Saving datasets...


Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]


✅ Data preparation complete!
Training data saved to: data\train_prefs.jsonl
Test data saved to: data\test_prefs.jsonl

Dataset sizes:
Training: 5000 examples
Test: 1000 examples


In [7]:
# Verify saved data
print("Verifying saved data...")

# Load the saved training data back
train_output_path = os.path.join(OUTPUT_DIR, 'train_prefs.jsonl')
loaded_dataset = load_dataset('json', data_files=train_output_path)['train']

print(f"\nLoaded dataset size: {len(loaded_dataset)}")

# Fail with a clear message instead of an IndexError from loaded_dataset[0]
if len(loaded_dataset) == 0:
    raise ValueError(f"No examples found in {train_output_path}")

print(f"\nFirst example from loaded dataset:")
first_example = loaded_dataset[0]

print(f"\nKeys: {list(first_example.keys())}")
print(f"\nPrompt: {first_example['prompt'][:200]}...")
print(f"\nChosen response: {first_example['chosen'][:200]}...")
print(f"\nRejected response: {first_example['rejected'][:200]}...")

# Verify the format is correct
required_keys = {'prompt', 'chosen', 'rejected'}
actual_keys = set(first_example.keys())
missing_keys = required_keys - actual_keys

if missing_keys:
    print(f"\n❌ Missing keys: {missing_keys}")
    # Stop here: a failed check must not be followed by the success banner,
    # and a Run All should halt rather than report a broken dataset as ready.
    raise ValueError(f"Saved dataset is missing required keys: {sorted(missing_keys)}")

# Only reached when every required key is present
print("\n✅ Data format verification passed!")
print("✅ Your dataset is ready for RLHF training!")

print(f"\n🎉 Data preparation and verification complete!")


Verifying saved data...


Generating train split: 0 examples [00:00, ? examples/s]


Loaded dataset size: 5000

First example from loaded dataset:

Keys: ['chosen', 'rejected', 'prompt']

Prompt: Human: Why did cells originally combine together to create life?...

Chosen response: Because their simple components -- chemicals -- interacted in particular ways.  And because of chemical processes involving acids and bases, certain kinds of chemicals can begin to self-organize into ...

Rejected response: Cells combine because they benefit from cooperation, since they can have less competition for resources by working together....

✅ Data format verification passed!
✅ Your dataset is ready for RLHF training!

🎉 Data preparation and verification complete!
