In [None]:
# Save canonical dataset reference
# Writes the task configurations and per-dataset metadata to a single JSON
# file in the notebook output directory.
reference_path = output_dir / 'canonical_datasets.json'

reference_data = {
    # Derived from the metadata so the stored count cannot drift from the
    # actual registry contents.
    'total_canonical_datasets': len(dataset_metadata),
    'configurations': dataset_configs,
    'metadata': dataset_metadata
}

# Explicit encoding keeps the file identical across platforms whose default
# locale encoding is not UTF-8.
with open(reference_path, 'w', encoding='utf-8') as f:
    json.dump(reference_data, f, indent=2)

print(f"Canonical dataset reference saved to: {reference_path}")
print("\nData exploration completed successfully! ✓")

## Key Insights

### Dataset Composition
- **10 Total Canonical Datasets**
  - 5 Sarcasm Detection datasets with varying multimodal support
  - 3 Paraphrasing datasets (text-only)
  - 2 Fact Verification datasets (text-based claim-evidence pairs)

### Modality Coverage
- **Text**: All 10 datasets (100%)
- **Image**: 2 datasets (MMSD2, SarcNet)
- **Audio**: 1 dataset (MUStARD)
- **Video**: 1 dataset (MUStARD)

### Size Variation
- Small: MUStARD (690 samples)
- Medium: LIAR (12.8k), SarcNet (3.3k), MRPC (5.8k)
- Large: Sarcasm Headlines (28.6k), MMSD2 (24.6k)
- Very Large: Quora (400k), FEVER (185k), ParaNMT (5M capped at 100k), SARC (1.3M capped at 50k)

### Research-Grade Capping
- ParaNMT-5M: Capped at 100k to prevent task domination
- SARC: Capped at 50k for balanced training
- All other datasets: Full or standard splits used

In [None]:
# Create summary table: one row per dataset, ordered by dataset name.
summary_data = [
    {
        'Dataset': dataset,
        'Task': metadata['task'],
        'Modalities': ', '.join(metadata['modalities']),
        'Approx Size': metadata['size']
    }
    for dataset, metadata in sorted(dataset_metadata.items())
]

df_summary = pd.DataFrame(summary_data)

print("\nCanonical Dataset Summary:")
print("="*100)
print(df_summary.to_string(index=False))
print("="*100)

# Summary statistics
# Per-task counts are tallied from the metadata (in first-seen order) rather
# than hard-coded, so they always agree with the registry.
task_count = {}
for metadata in dataset_metadata.values():
    task_count[metadata['task']] = task_count.get(metadata['task'], 0) + 1

print(f"\nTotal Datasets: {len(dataset_metadata)}")
for task, count in task_count.items():
    # 'sarcasm_detection' -> 'Sarcasm Detection'
    print(f"{task.replace('_', ' ').title()}: {count} datasets")

# Modality coverage: number of datasets that provide each modality.
modality_count = {}
for dataset, metadata in dataset_metadata.items():
    for mod in metadata['modalities']:
        modality_count[mod] = modality_count.get(mod, 0) + 1

print("\nModality Coverage:")
for mod, count in sorted(modality_count.items()):
    print(f"  {mod}: {count} datasets")

## Dataset Characteristics

In [None]:
# Canonical dataset registry.
# `dataset_configs` groups dataset names by task; `num_classes` is present
# only for the classification tasks.
dataset_configs = dict(
    sarcasm_detection=dict(
        datasets=['sarc', 'mmsd2', 'mustard', 'sarcnet', 'sarcasm_headlines'],
        task_type='classification',
        num_classes=2,
    ),
    paraphrasing=dict(
        datasets=['paranmt', 'mrpc', 'quora'],
        task_type='generation',
    ),
    fact_verification=dict(
        datasets=['fever', 'liar'],
        task_type='classification',
        num_classes=3,
    ),
)

# Per-dataset metadata, written as (name, modalities, approximate size, task)
# rows and expanded into the dict-of-dicts the rest of the notebook reads.
_metadata_rows = [
    ('sarc', ['text'], '1.3M (capped at 50k)', 'sarcasm_detection'),
    ('mmsd2', ['text', 'image'], '24.6k', 'sarcasm_detection'),
    ('mustard', ['text', 'audio', 'video'], '690', 'sarcasm_detection'),
    ('sarcnet', ['text', 'image'], '3.3k', 'sarcasm_detection'),
    ('sarcasm_headlines', ['text'], '28.6k', 'sarcasm_detection'),
    ('paranmt', ['text'], '5M (capped at 100k)', 'paraphrasing'),
    ('mrpc', ['text'], '5.8k', 'paraphrasing'),
    ('quora', ['text'], '400k', 'paraphrasing'),
    ('fever', ['text'], '185k', 'fact_verification'),
    ('liar', ['text'], '12.8k', 'fact_verification'),
]
dataset_metadata = {
    name: {'modalities': modalities, 'size': size, 'task': task_name}
    for name, modalities, size, task_name in _metadata_rows
}

# Print a banner followed by one section per task.
banner = "=" * 80
print(f"\n{banner}")
print("CANONICAL DATASET CONFIGURATION")
print(banner)
for task_key, task_cfg in dataset_configs.items():
    section = [
        f"\n{task_key.replace('_', ' ').title()}:",
        f"  Datasets: {', '.join(task_cfg['datasets'])}",
        f"  Task Type: {task_cfg['task_type']}",
    ]
    # Only classification tasks declare a class count.
    if 'num_classes' in task_cfg:
        section.append(f"  Classes: {task_cfg['num_classes']}")
    print("\n".join(section))

# Total number of dataset names listed across all tasks.
total_datasets = sum(len(cfg['datasets']) for cfg in dataset_configs.values())
print(f"\nTotal Canonical Datasets: {total_datasets}")

## Dataset Configuration

In [None]:
# Setup and imports
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
# Suppress all warnings so they do not clutter cell output.
# NOTE(review): this blanket filter also hides warnings that may matter;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Add project root to path
# If the working directory is named `notebooks`, its parent is taken as the
# project root; otherwise the working directory itself is.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir
# Only insert when the root is not already first, so re-running this cell
# does not stack duplicate entries at the front of sys.path.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Create output directory (no error if it already exists)
output_dir = project_root / 'outputs' / 'notebooks'
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Output directory: {output_dir}")

# FactCheck-MM Data Exploration

## Overview
Comprehensive exploration of FactCheck-MM datasets including statistics, distributions, and modality analysis.

## Canonical Datasets (10 Total)

**Sarcasm Detection (5 datasets):**
- sarc
- mmsd2
- mustard
- sarcnet
- sarcasm_headlines

**Paraphrasing (3 datasets):**
- paranmt
- mrpc
- quora

**Fact Verification (2 datasets):**
- fever
- liar