# Create HuggingFace Dataset from Element Highlights

This notebook processes element-highlights snapshots into a HuggingFace dataset with image-caption pairs.

## 1. Install Libraries and Import Dependencies

In [9]:
# Install required packages
!pip install pandas numpy matplotlib tqdm datasets huggingface_hub pillow

import os
import glob
import json
import shutil
import hashlib
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from datasets import Dataset, Image as DSImage, DatasetDict, Features, Value
from huggingface_hub import login, HfApi, create_repo



## 2. Configuration

In [10]:
# Configuration
HF_USERNAME = "Slyracoon23"  # Your HuggingFace username
DATASET_NAME = "rrvideo-element-highlights"  # Name for your dataset
ELEMENT_HIGHLIGHTS_DIR = "../element-highlights"  # Path to element-highlights directory
PROCESSED_DATA_DIR = "processed_data"  # Directory to store processed data

# Create directories
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
os.makedirs(os.path.join(PROCESSED_DATA_DIR, "images"), exist_ok=True)

## 3. Login to HuggingFace

In [11]:
from huggingface_hub import login, whoami
try:

    # Try to get user info to check if already logged in
    user_info = whoami()
    print(f"Already logged in as: {user_info['name']}")
except Exception:
    print("Not logged in. Please login...")
    # Use the simpler login() function
    try:
        login()
        print("Login successful!")
    except Exception as e:
        print(f"Error during login: {e}")
        print("Please make sure you're logged in before proceeding.")

# Create repository if it doesn't exist
try:
    create_repo(
        repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
        repo_type="dataset",
        exist_ok=True,
        private=False
    )
    print(f"Repository {HF_USERNAME}/{DATASET_NAME} is ready")
except Exception as e:
    print(f"Error creating repository: {e}")


Already logged in as: Slyracoon23
Repository Slyracoon23/rrvideo-element-highlights is ready


## 4. Helper Functions

In [25]:
# Set up OpenRouter API key
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    print("Warning: OPENROUTER_API_KEY not found in environment variables")

def parse_snapshot_name(snapshot_dir):
    """Extract snapshot ID and timestamp from directory name"""
    base_name = os.path.basename(snapshot_dir)
    parts = base_name.split('_')
    if len(parts) >= 3:
        try:
            snapshot_id = int(parts[1])
            timestamp = int(parts[2])
            return {
                "snapshot_id": snapshot_id,
                "timestamp": timestamp,
                "snapshot_name": base_name
            }
        except ValueError:
            print(f"Could not parse snapshot name: {base_name}")
    return None

def generate_image_hash(img_path):
    """Generate a hash for the image to prevent duplicates"""
    try:
        with open(img_path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()
    except Exception as e:
        print(f"Error generating hash for {img_path}: {e}")
        return None

def extract_element_metadata(img_path):
    """Extract basic metadata from the image"""
    try:
        img = Image.open(img_path)
        width, height = img.size
        return {
            "width": width,
            "height": height,
            "aspect_ratio": width / height if height > 0 else 0
        }
    except Exception as e:
        print(f"Error extracting metadata from {img_path}: {e}")
        return {}

def generate_caption(element_data, model="openai/gpt-4o"):
    """Generate a caption for the element using OpenRouter AI API"""
    import requests
    import json
    import base64
    import os
    
    # Get image path and read the image
    img_path = element_data.get("image_path")
    if not img_path or not os.path.exists(img_path):
        # Fallback to basic caption if image not available
        return generate_basic_caption(element_data)
    
    try:
        # Read and encode the image
        with open(img_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')
        
        # Create the API request
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "HTTP-Referer": os.environ.get('YOUR_SITE_URL', 'http://localhost'),
                "X-Title": os.environ.get('YOUR_SITE_NAME', 'UI Element Caption Generator'),
                "Content-Type": "application/json",
            },
            data=json.dumps({
                "model": model,  # Now configurable via parameter
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                # "text": "Describe this UI element within the red box in detail. Include its purpose, appearance, and possible functionality."
                                "text": "Create a comprehensive caption for the UI element within the red box. Describe its structural components, text content, visual appearance, and overall functionality within the interface."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ]
            })
        )
        
        # Parse the response
        result = response.json()
        if 'choices' in result and len(result['choices']) > 0:
            caption = result['choices'][0]['message']['content']
            return caption
        else:
            print(f"Error generating caption: No valid response from API")
            return generate_basic_caption(element_data)
            
    except Exception as e:
        print(f"Error generating caption with OpenRouter: {e}")
        return generate_basic_caption(element_data)

def generate_basic_caption(element_data):
    """Generate a basic caption as fallback"""
    element_id = element_data.get("element_id", "unknown")
    snapshot_id = element_data.get("snapshot_id", "unknown")
    metadata = element_data.get("metadata", {})
    
    # Basic caption
    caption = f"UI element {element_id} from snapshot {snapshot_id}"
    
    # Add dimensions if available
    if "width" in metadata and "height" in metadata:
        caption += f", dimensions {metadata['width']}x{metadata['height']} pixels"
    
    return caption


Generated caption for google/gemini-2.0-flash-001:
Here's a detailed description of the UI element within the red box:

**Purpose:**

This UI element represents one option for the user to choose as their workspace type. It's a "Starter Workspace" pre-configured for basic features and individual use. It is part of the Quickstart process which aims to simplify user setup of a workspace.

**Appearance:**

*   **Shape:** It appears to be structured as a Rounded rectangular tile.
*   **Border:** Red border around the rectangular tile.
*   **Content:**
    *   An icon, an airplane icon on the top left.
    *   A heading, "Starter Workspace", which indicates the type of workspace that will be configured.
    *   A right arrow
    *   Checkboxes: There are 3 checkboxes: Basic feautures, Ideal for individuals, Quick setup
    *   Bullet-pointed list: A list of three features: Basic features, Ideal for individuals, Quick setup.
*   **Visual Hierarchy:** The heading "Starter Workspace" is likely 

## Testing generate Caption

In [22]:
test_image_path="/Users/earlpotters/Documents/decipher/rrvideo/notebooks/processed_data/images/03a0a0b53d5ed3bffe2c6de7f6dee3a3.png"
# Create element_data dictionary for testing
test_element_data = {
    "image_path": test_image_path,
    "element_id": "test_boardwalk",
    "snapshot_id": 1,
    "metadata": {"width": 2560, "height": 1440, "aspect_ratio": 2560 / 1440}
}

# Test generate_caption using the google/gemini-2.0-flash-001 model
caption = generate_caption(test_element_data, model="google/gemini-2.0-flash-001")
print("Generated caption for google/gemini-2.0-flash-001:")
print(caption)

Generated caption for google/gemini-2.0-flash-001:
UI element test_boardwalk from snapshot 1, dimensions 2560x1440 pixels


## 5. Process Snapshots

In [26]:
# Define a global constant for the models to use for captioning
MODELS = [
    "google/gemini-2.0-flash-001",
    "qwen/qwen-vl-plus",
    "amazon/nova-lite-v1",
    "meta-llama/llama-3.2-11b-vision-instruct",
    "openai/gpt-4o-mini"
]

def process_snapshot(snapshot_dir):
    """Process a single snapshot directory and return image-caption pairs"""
    snapshot_info = parse_snapshot_name(snapshot_dir)
    if not snapshot_info:
        return []
    
    # Get all element images in the directory (excluding originals)
    element_images = [f for f in glob.glob(os.path.join(snapshot_dir, "element_*.png")) 
                      if not f.endswith("original.png")]
    
    data = []
    
    for img_path in element_images:
        # Get element ID from filename
        element_id = os.path.basename(img_path).replace("element_", "").replace(".png", "")
        
        # Generate a hash for the image
        img_hash = generate_image_hash(img_path)
        if not img_hash:
            continue
        
        # Copy image to processed directory with hash name
        new_img_path = os.path.join(PROCESSED_DATA_DIR, "images", f"{img_hash}.png")
        shutil.copy(img_path, new_img_path)
        
        # Extract metadata
        metadata = extract_element_metadata(img_path)
        
        # Prepare data entry
        element_data = {
            "image_path": new_img_path,
            "original_path": img_path,
            "image_hash": img_hash,
            "snapshot_id": snapshot_info["snapshot_id"],
            "timestamp": snapshot_info["timestamp"],
            "element_id": element_id,
            "snapshot_name": snapshot_info["snapshot_name"],
            "metadata": metadata,
            "captions": {}  # Dictionary to store captions from different models
        }
        
        # Generate captions using each model defined in MODELS
        for model in MODELS:
            caption = generate_caption(element_data, model=model)
            element_data["captions"][model] = caption
        
        data.append(element_data)
    
    return data

def process_all_snapshots():
    """Process all snapshot directories and compile dataset"""
    # Get all snapshot directories
    snapshot_dirs = glob.glob(os.path.join(ELEMENT_HIGHLIGHTS_DIR, "snapshot_*"))
    print(f"Found {len(snapshot_dirs)} snapshot directories")
    
    print("Using the following models for captioning:")
    for model in MODELS:
        print(f"- {model}")
    
    all_data = []
    
    for snapshot_dir in tqdm(snapshot_dirs, desc="Processing snapshots"):
        snapshot_data = process_snapshot(snapshot_dir)
        all_data.extend(snapshot_data)
    
    print(f"Processed {len(all_data)} elements across {len(snapshot_dirs)} snapshots")
    
    # Convert to DataFrame for easier manipulation
    df = pd.DataFrame(all_data)
    
    # Save intermediate CSV for inspection
    df.to_csv(os.path.join(PROCESSED_DATA_DIR, "element_data.csv"), index=False)
    
    return df

# Process all snapshots
df = process_all_snapshots()

Found 2 snapshot directories
Using the following models for captioning:
- google/gemini-2.0-flash-001
- qwen/qwen-vl-plus
- amazon/nova-lite-v1
- meta-llama/llama-3.2-11b-vision-instruct
- openai/gpt-4o-mini


Processing snapshots:   0%|          | 0/2 [00:00<?, ?it/s]

Processed 61 elements across 2 snapshots


## 6. Analyze and Clean Data

In [27]:
# Display basic statistics
print("Dataset Statistics:")
print(f"Total number of elements: {len(df)}")
print(f"Number of unique snapshots: {df['snapshot_id'].nunique()}")
print(f"Number of unique image hashes: {df['image_hash'].nunique()}")

# Extract width and height from metadata for easier access
df['width'] = df['metadata'].apply(lambda x: x.get('width', 0))
df['height'] = df['metadata'].apply(lambda x: x.get('height', 0))
df['aspect_ratio'] = df['metadata'].apply(lambda x: x.get('aspect_ratio', 0))

# Check for duplicate images
duplicates = df[df.duplicated(subset=['image_hash'], keep='first')]
print(f"Found {len(duplicates)} duplicate images (based on hash)")

# Remove duplicates if any
if len(duplicates) > 0:
    df = df.drop_duplicates(subset=['image_hash'], keep='first')
    print(f"After removing duplicates: {len(df)} elements")

# Display the first row of the DataFrame to understand its structure
print("\nFirst row of the DataFrame:")
print(df.iloc[0].to_dict())

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)

# Check for invalid dimensions (zero or negative width/height)
invalid_dimensions = df[(df['width'] <= 0) | (df['height'] <= 0)]
print(f"\nFound {len(invalid_dimensions)} elements with invalid dimensions")

# Remove elements with invalid dimensions if any
if len(invalid_dimensions) > 0:
    df = df[(df['width'] > 0) & (df['height'] > 0)]
    print(f"After removing invalid dimensions: {len(df)} elements")

# Display summary statistics for numerical columns
print("\nSummary statistics for dimensions:")
print(df[['width', 'height', 'aspect_ratio']].describe())
df

Dataset Statistics:
Total number of elements: 61
Number of unique snapshots: 2
Number of unique image hashes: 61
Found 0 duplicate images (based on hash)

First row of the DataFrame:
{'image_path': 'processed_data/images/dc49746d182ca196fefe2a1d4ef7fe99.png', 'original_path': '../element-highlights/snapshot_0_1739844581278/element_2.png', 'image_hash': 'dc49746d182ca196fefe2a1d4ef7fe99', 'snapshot_id': 0, 'timestamp': 1739844581278, 'element_id': '2', 'snapshot_name': 'snapshot_0_1739844581278', 'metadata': {'width': 1280, 'height': 720, 'aspect_ratio': 1.7777777777777777}, 'captions': {'google/gemini-2.0-flash-001': 'Here\'s a description of the UI element within the red box:\n\n**Purpose:**\n\n*   This element represents the HTTP status code "404," which is a standard error message indicating that the server cannot find the requested resource. It serves to immediately signify a page not found scenario to the user.\n\n**Appearance:**\n\n*   The numbers "404" are displayed in a bold, s

Unnamed: 0,image_path,original_path,image_hash,snapshot_id,timestamp,element_id,snapshot_name,metadata,captions,width,height,aspect_ratio
0,processed_data/images/dc49746d182ca196fefe2a1d...,../element-highlights/snapshot_0_1739844581278...,dc49746d182ca196fefe2a1d4ef7fe99,0,1739844581278,2,snapshot_0_1739844581278,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a desc...,1280,720,1.777778
1,processed_data/images/55a0f8fc35aae4e05e865e95...,../element-highlights/snapshot_0_1739844581278...,55a0f8fc35aae4e05e865e959bc641f9,0,1739844581278,3,snapshot_0_1739844581278,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a deta...,1280,720,1.777778
2,processed_data/images/aef811ad4061f445231bb9f2...,../element-highlights/snapshot_0_1739844581278...,aef811ad4061f445231bb9f28a802c03,0,1739844581278,1,snapshot_0_1739844581278,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a deta...,1280,720,1.777778
3,processed_data/images/80021c881e81e4fbb33a0193...,../element-highlights/snapshot_0_1739844581278...,80021c881e81e4fbb33a01936cbebf2b,0,1739844581278,4,snapshot_0_1739844581278,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a brea...,1280,720,1.777778
4,processed_data/images/bb25d50e6e7a940d8b6697c4...,../element-highlights/snapshot_0_1739844581278...,bb25d50e6e7a940d8b6697c4603f20d8,0,1739844581278,5,snapshot_0_1739844581278,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here is a des...,1280,720,1.777778
...,...,...,...,...,...,...,...,...,...,...,...,...
56,processed_data/images/45af163c8448f0f03cc4c6e8...,../element-highlights/snapshot_445_17398455496...,45af163c8448f0f03cc4c6e8ddb49e70,445,1739845549649,68,snapshot_445_1739845549649,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a desc...,1280,720,1.777778
57,processed_data/images/b3ef84c076c8dcc64143d275...,../element-highlights/snapshot_445_17398455496...,b3ef84c076c8dcc64143d275f4ce1a6c,445,1739845549649,40,snapshot_445_1739845549649,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a deta...,1280,720,1.777778
58,processed_data/images/1353475dba79843fb73b5c45...,../element-highlights/snapshot_445_17398455496...,1353475dba79843fb73b5c45c2cf093f,445,1739845549649,54,snapshot_445_1739845549649,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a deta...,1280,720,1.777778
59,processed_data/images/2ac181b3ecf130bb601e5864...,../element-highlights/snapshot_445_17398455496...,2ac181b3ecf130bb601e5864c9f04b29,445,1739845549649,55,snapshot_445_1739845549649,"{'width': 1280, 'height': 720, 'aspect_ratio':...",{'google/gemini-2.0-flash-001': 'Here's a deta...,1280,720,1.777778


## 7. Create and Upload HuggingFace Dataset

In [28]:
def create_and_upload_dataset(df):
    """Create HuggingFace dataset and upload to Hub"""
    import json
    from datasets import Dataset, DatasetDict, Features, Value, Image as DSImage
    from PIL import Image

    # Convert metadata dict to string to avoid serialization issues
    df['metadata_str'] = df['metadata'].apply(json.dumps)
    
    # Extract width, height, and aspect_ratio from the metadata dictionary into separate columns
    df['width'] = df['metadata'].apply(lambda m: m.get("width") if isinstance(m, dict) else None)
    df['height'] = df['metadata'].apply(lambda m: m.get("height") if isinstance(m, dict) else None)
    df['aspect_ratio'] = df['metadata'].apply(lambda m: m.get("aspect_ratio") if isinstance(m, dict) else None)
    
    # Reset the index to avoid including it in the dataset
    df_reset = df.reset_index(drop=True)
    dataset = Dataset.from_pandas(df_reset)
    
    # Define features with updated "captions" field (a dict of captions per model)
    features = Features({
        "image": DSImage(),
        "captions": {
            "google/gemini-2.0-flash-001": Value("string"),
            "qwen/qwen-vl-plus": Value("string"),
            "amazon/nova-lite-v1": Value("string"),
            "meta-llama/llama-3.2-11b-vision-instruct": Value("string"),
            "openai/gpt-4o-mini": Value("string")
        },
        "snapshot_id": Value("int64"),
        "timestamp": Value("int64"),
        "element_id": Value("string"),
        "image_hash": Value("string"),
        "snapshot_name": Value("string"),
        "metadata_str": Value("string"),
        "width": Value("int64"),
        "height": Value("int64"),
        "aspect_ratio": Value("float32")
    })
    
    # Function to convert image path to an actual image and include all fields
    def process_example(example):
        image = Image.open(example["image_path"])
        return {
            "image": image,
            "captions": example["captions"],
            "snapshot_id": example["snapshot_id"],
            "timestamp": example["timestamp"],
            "element_id": example["element_id"],
            "image_hash": example["image_hash"],
            "snapshot_name": example["snapshot_name"],
            "metadata_str": example["metadata_str"],
            "width": example["width"],
            "height": example["height"],
            "aspect_ratio": example["aspect_ratio"]
        }
    
    # Apply the transformation; remove columns that are no longer needed
    dataset = dataset.map(
        process_example,
        remove_columns=["image_path", "original_path", "metadata"],
        features=features
    )
    
    # Split into train/validation/test sets (80/10/10 split)
    splits = dataset.train_test_split(test_size=0.2, seed=42)
    test_valid = splits["test"].train_test_split(test_size=0.5, seed=42)
    
    # Create a DatasetDict with the splits
    dataset_dict = DatasetDict({
        "train": splits["train"],
        "validation": test_valid["train"],
        "test": test_valid["test"]
    })
    
    # Print split information
    print("Dataset splits:")
    for split, subset in dataset_dict.items():
        print(f"  {split}: {len(subset)} examples")
    
    # Push to the HuggingFace Hub
    dataset_dict.push_to_hub(
        f"{HF_USERNAME}/{DATASET_NAME}",
        private=False
    )
    
    print(f"Dataset uploaded to https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")
    return dataset_dict

# Create and upload dataset
print("Creating and uploading dataset...")
dataset_dict = create_and_upload_dataset(df)

Creating and uploading dataset...


Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Dataset splits:
  train: 48 examples
  validation: 6 examples
  test: 7 examples


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Dataset uploaded to https://huggingface.co/datasets/Slyracoon23/rrvideo-element-highlights


## 8. Create Dataset Card

In [29]:
def create_dataset_card():
    """Create a dataset card with information about the dataset"""
    card = f"""
---
language:
- en
license: cc-by-4.0
task_categories:
- image-classification
- image-to-text
---

# Dataset Card for {DATASET_NAME}

## Dataset Description

This dataset contains UI element images extracted from web snapshots, along with captions describing each element.

### Dataset Summary

The dataset contains {len(df)} unique UI elements from {df['snapshot_id'].nunique()} different snapshots. Each element is associated with a caption describing its basic properties.

### Dataset Statistics

- Total elements: {len(df)}
- Unique snapshots: {df['snapshot_id'].nunique()}
- Average element width: {df['width'].mean():.2f} pixels
- Average element height: {df['height'].mean():.2f} pixels
- Average aspect ratio: {df['aspect_ratio'].mean():.2f}

## Dataset Structure

### Data Fields

- **image**: The UI element image
- **caption**: A description of the element using several models
- **snapshot_id**: The snapshot identifier
- **timestamp**: When the snapshot was taken
- **element_id**: Identifier for the specific element
- **width**: The width of the element in pixels
- **height**: The height of the element in pixels
- **aspect_ratio**: The aspect ratio of the element

### Data Splits

- Train: {len(dataset_dict['train'])} examples (80%)
- Validation: {len(dataset_dict['validation'])} examples (10%)
- Test: {len(dataset_dict['test'])} examples (10%)
"""
    
    # Write dataset card to file
    readme_path = os.path.join(PROCESSED_DATA_DIR, "README.md")
    with open(readme_path, "w") as f:
        f.write(card)
    
    # Upload to HuggingFace Hub
    api = HfApi()
    api.upload_file(
        path_or_fileobj=readme_path,
        path_in_repo="README.md",
        repo_id=f"{HF_USERNAME}/{DATASET_NAME}",
        repo_type="dataset"
    )
    
    print(f"Dataset card uploaded to https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")

# Create and upload dataset card
create_dataset_card()

Dataset card uploaded to https://huggingface.co/datasets/Slyracoon23/rrvideo-element-highlights


## 9. Visualize Samples

In [None]:
def visualize_samples(dataset, num_samples=5):
    """Visualize random samples from the dataset"""
    if len(dataset) < num_samples:
        num_samples = len(dataset)
    
    indices = np.random.randint(0, len(dataset), num_samples)
    
    fig, axes = plt.subplots(num_samples, 1, figsize=(10, num_samples * 3))
    if num_samples == 1:
        axes = [axes]
    
    for i, idx in enumerate(indices):
        sample = dataset[int(idx)]
        image = sample["image"]
        caption = sample["caption"]
        
        axes[i].imshow(image)
        axes[i].set_title(f"Caption: {caption}", fontsize=10)
        axes[i].axis("off")
    
    plt.tight_layout()
    plt.savefig(os.path.join(PROCESSED_DATA_DIR, "sample_visualizations.png"))
    plt.show()

# Visualize samples from the training set
print("Visualizing samples from the training set...")
visualize_samples(dataset_dict["train"])

print("\nProcess complete! Your dataset is now available at:")
print(f"https://huggingface.co/datasets/{HF_USERNAME}/{DATASET_NAME}")