In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# FactCheck-MM Data Exploration\n",
    "\n",
    "## Overview\n",
    "This notebook provides comprehensive exploration of all FactCheck-MM datasets including:\n",
    "- Dataset statistics and distributions\n",
    "- Multimodal content analysis\n",
    "- Data quality assessment\n",
    "- Preprocessing recommendations\n",
    "\n",
    "## Method\n",
    "We analyze 12 datasets across three tasks:\n",
    "- **Sarcasm Detection**: SARC, MMSD2, MUStARD, UR-FUNNY, SarcNet, Headlines\n",
    "- **Paraphrasing**: ParaNMT-5M, MRPC, Quora\n",
    "- **Fact Verification**: FEVER, LIAR, New Headlines\n",
    "\n",
    "Each dataset is examined for modality availability, label distribution, and content characteristics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup and imports\n",
    "import sys\n",
    "import os\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import json\n",
    "import warnings\n",
    "\n",
    "# NOTE: this silences every warning for the whole session, including\n",
    "# deprecation notices raised by pandas / matplotlib.\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Add project root to path so the `shared` package can be imported whether the\n",
    "# kernel starts in the repository root or in its notebooks/ sub-directory.\n",
    "current_dir = Path.cwd()\n",
    "project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir\n",
    "if str(project_root) not in sys.path:  # keep re-runs of this cell idempotent\n",
    "    sys.path.insert(0, str(project_root))\n",
    "\n",
    "# Import project utilities\n",
    "from shared.datasets.unified_loader import UnifiedDatasetLoader\n",
    "from shared.utils.visualization import create_class_distribution_plot, create_text_length_histogram\n",
    "from shared.utils.metrics import calculate_dataset_statistics\n",
    "\n",
    "# Set style. matplotlib 3.6 renamed the bundled seaborn style sheets to\n",
    "# 'seaborn-v0_8*'; fall back to the legacy name on older installations.\n",
    "style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'\n",
    "plt.style.use(style_name)\n",
    "sns.set_palette(\"husl\")\n",
    "\n",
    "# Create output directory (all figures / tables of this notebook are written here)\n",
    "output_dir = project_root / 'outputs' / 'notebooks'\n",
    "output_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "print(f\"Project root: {project_root}\")\n",
    "print(f\"Output directory: {output_dir}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Task registry: which datasets belong to each task, plus task-level metadata.\n",
    "# 'num_classes' is only present for the classification tasks.\n",
    "dataset_configs = {\n",
    "    'sarcasm_detection': {\n",
    "        'datasets': ['sarc', 'mmsd2', 'mustard', 'ur_funny', 'sarcnet', 'sarcasm_headlines'],\n",
    "        'task_type': 'classification',\n",
    "        'num_classes': 2,\n",
    "        'modalities': ['text', 'audio', 'image', 'video']\n",
    "    },\n",
    "    'paraphrasing': {\n",
    "        'datasets': ['paranmt', 'mrpc', 'quora'],\n",
    "        'task_type': 'generation',\n",
    "        'modalities': ['text']\n",
    "    },\n",
    "    'fact_verification': {\n",
    "        'datasets': ['fever', 'liar', 'new_headlines'],\n",
    "        'task_type': 'classification',\n",
    "        'num_classes': 3,\n",
    "        'modalities': ['text']\n",
    "    }\n",
    "}\n",
    "\n",
    "# Loader rooted at <project_root>/data\n",
    "data_loader = UnifiedDatasetLoader(project_root / 'data')\n",
    "\n",
    "print(\"FactCheck-MM Dataset Configuration:\")\n",
    "print(\"=\" * 50)\n",
    "for task, config in dataset_configs.items():\n",
    "    # Collect the report for one task, then emit it in a single print call\n",
    "    report_lines = [\n",
    "        f\"\\n{task.replace('_', ' ').title()}:\",\n",
    "        f\"  Datasets: {', '.join(config['datasets'])}\",\n",
    "        f\"  Task Type: {config['task_type']}\",\n",
    "        f\"  Modalities: {', '.join(config['modalities'])}\",\n",
    "    ]\n",
    "    if 'num_classes' in config:\n",
    "        report_lines.append(f\"  Classes: {config['num_classes']}\")\n",
    "    print(\"\\n\".join(report_lines))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load and analyze each dataset\n",
    "#\n",
    "# NOTE: every split is requested with a sample_size cap, so the *_size and\n",
    "# total_size figures describe what was loaded here, which may be smaller than\n",
    "# the dataset on disk.\n",
    "TRAIN_SAMPLE_SIZE = 1000\n",
    "EVAL_SAMPLE_SIZE = 300       # used for both the 'val' and 'test' splits\n",
    "DETAIL_SAMPLE_LIMIT = 100    # leading train items inspected for text length / labels\n",
    "SUMMARY_COLUMNS = [\n",
    "    'dataset', 'task', 'train_size', 'val_size', 'test_size', 'total_size',\n",
    "    'modalities_available', 'avg_text_length', 'label_distribution'\n",
    "]\n",
    "\n",
    "\n",
    "def split_size(split_data):\n",
    "    \"\"\"Return len(split_data), or 0 when the loader returned None.\"\"\"\n",
    "    return len(split_data) if split_data is not None else 0\n",
    "\n",
    "\n",
    "def detect_modalities(sample):\n",
    "    \"\"\"List the modalities carried by one sample record (a dict-like object).\n",
    "\n",
    "    Text counts only when the 'text' field is non-empty. Image, audio and video\n",
    "    count when '<modality>_path' or '<modality>' holds a non-None value, so a\n",
    "    placeholder field set to None is not reported as available.\n",
    "    \"\"\"\n",
    "    found = []\n",
    "    if sample.get('text'):\n",
    "        found.append('text')\n",
    "    for modality in ('image', 'audio', 'video'):\n",
    "        if any(sample.get(key) is not None for key in (f'{modality}_path', modality)):\n",
    "            found.append(modality)\n",
    "    return found\n",
    "\n",
    "\n",
    "def summarize_train_items(items):\n",
    "    \"\"\"Return (avg_text_length, label_distribution) for the given train items.\n",
    "\n",
    "    avg_text_length is the mean whitespace-separated word count; items whose\n",
    "    text is missing or None contribute 0 words. label_distribution maps\n",
    "    str(label) -> count; items without a label are left out instead of being\n",
    "    counted as class 0.\n",
    "    \"\"\"\n",
    "    text_lengths = [len(str(item.get('text') or '').split()) for item in items]\n",
    "    avg_text_length = float(np.mean(text_lengths)) if text_lengths else 0\n",
    "\n",
    "    labels = [item['label'] for item in items if item.get('label') is not None]\n",
    "    label_distribution = {}\n",
    "    if labels:\n",
    "        unique, counts = np.unique(labels, return_counts=True)\n",
    "        label_distribution = dict(zip(unique.astype(str).tolist(), counts.tolist()))\n",
    "    return avg_text_length, label_distribution\n",
    "\n",
    "\n",
    "dataset_stats = {}\n",
    "all_datasets_info = []\n",
    "\n",
    "for task, config in dataset_configs.items():\n",
    "    print(f\"\\nAnalyzing {task.replace('_', ' ').title()} datasets...\")\n",
    "    \n",
    "    task_stats = {}\n",
    "    \n",
    "    for dataset_name in config['datasets']:\n",
    "        try:\n",
    "            # Load dataset splits\n",
    "            train_data = data_loader.load_dataset(dataset_name, 'train', sample_size=TRAIN_SAMPLE_SIZE)\n",
    "            val_data = data_loader.load_dataset(dataset_name, 'val', sample_size=EVAL_SAMPLE_SIZE)\n",
    "            test_data = data_loader.load_dataset(dataset_name, 'test', sample_size=EVAL_SAMPLE_SIZE)\n",
    "            \n",
    "            # Calculate statistics\n",
    "            stats = {\n",
    "                'dataset': dataset_name,\n",
    "                'task': task,\n",
    "                'train_size': split_size(train_data),\n",
    "                'val_size': split_size(val_data),\n",
    "                'test_size': split_size(test_data),\n",
    "                'total_size': 0,\n",
    "                'modalities_available': [],\n",
    "                'avg_text_length': 0,\n",
    "                'label_distribution': {}\n",
    "            }\n",
    "            \n",
    "            # Analyze train split in detail; modalities are judged from the first record only\n",
    "            if stats['train_size'] > 0:\n",
    "                stats['modalities_available'] = detect_modalities(train_data[0])\n",
    "                stats['avg_text_length'], stats['label_distribution'] = summarize_train_items(\n",
    "                    train_data[:DETAIL_SAMPLE_LIMIT]\n",
    "                )\n",
    "            \n",
    "            stats['total_size'] = stats['train_size'] + stats['val_size'] + stats['test_size']\n",
    "            task_stats[dataset_name] = stats\n",
    "            all_datasets_info.append(stats)\n",
    "            \n",
    "            print(f\"  ✓ {dataset_name}: {stats['total_size']:,} samples, {stats['modalities_available']}\")\n",
    "            \n",
    "        except Exception as e:\n",
    "            # Best effort: report the failure and carry on with the remaining datasets\n",
    "            print(f\"  ✗ {dataset_name}: Error - {e}\")\n",
    "            \n",
    "    dataset_stats[task] = task_stats\n",
    "\n",
    "# Create summary DataFrame. Passing the column list keeps the expected schema\n",
    "# even when no dataset could be loaded, so later cells do not fail with KeyError.\n",
    "df_summary = pd.DataFrame(all_datasets_info, columns=SUMMARY_COLUMNS)\n",
    "print(f\"\\nLoaded {len(df_summary)} datasets successfully\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the display table in one chain: flatten the modality list into a\n",
    "# comma-separated string and round the average text length to one decimal.\n",
    "summary_display = (\n",
    "    df_summary[['dataset', 'task', 'total_size', 'modalities_available', 'avg_text_length']]\n",
    "    .assign(\n",
    "        modalities=lambda frame: frame['modalities_available'].apply(', '.join),\n",
    "        avg_text_length=lambda frame: frame['avg_text_length'].round(1),\n",
    "    )\n",
    "    .drop(columns='modalities_available')\n",
    ")\n",
    "\n",
    "print(\"Dataset Summary:\")\n",
    "print(\"=\" * 80)\n",
    "print(summary_display.to_string(index=False))\n",
    "\n",
    "# Persist the same table next to the figures\n",
    "summary_display.to_csv(output_dir / 'dataset_summary.csv', index=False)\n",
    "print(f\"\\nSummary saved to: {output_dir / 'dataset_summary.csv'}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset Size Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Four-panel overview: sizes per dataset, share per task, modality coverage,\n",
    "# and the spread of average text lengths.\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
    "ax_sizes, ax_tasks, ax_modalities, ax_lengths = axes.ravel()\n",
    "\n",
    "# --- Panel 1: overall dataset sizes ---\n",
    "dataset_names = df_summary['dataset'].tolist()\n",
    "dataset_sizes = df_summary['total_size'].tolist()\n",
    "positions = range(len(dataset_names))\n",
    "bar_colors = plt.cm.Set3(np.linspace(0, 1, len(dataset_names)))\n",
    "\n",
    "size_bars = ax_sizes.bar(positions, dataset_sizes, color=bar_colors)\n",
    "ax_sizes.set_title('Dataset Sizes', fontsize=14, fontweight='bold')\n",
    "ax_sizes.set_xlabel('Datasets')\n",
    "ax_sizes.set_ylabel('Number of Samples')\n",
    "ax_sizes.set_xticks(positions)\n",
    "ax_sizes.set_xticklabels(dataset_names, rotation=45, ha='right')\n",
    "\n",
    "# Value label slightly above each bar (offset = 1% of the tallest bar)\n",
    "for rect, n_samples in zip(size_bars, dataset_sizes):\n",
    "    label_x = rect.get_x() + rect.get_width()/2.\n",
    "    label_y = rect.get_height() + max(dataset_sizes)*0.01\n",
    "    ax_sizes.text(label_x, label_y, f'{int(n_samples):,}',\n",
    "                  ha='center', va='bottom', fontsize=8)\n",
    "\n",
    "# --- Panel 2: share of samples per task ---\n",
    "samples_per_task = df_summary.groupby('task')['total_size'].sum()\n",
    "ax_tasks.pie(\n",
    "    samples_per_task.values,\n",
    "    labels=samples_per_task.index,\n",
    "    autopct='%1.1f%%',\n",
    "    colors=['#FF9999', '#66B2FF', '#99FF99'],\n",
    "    startangle=90,\n",
    ")\n",
    "ax_tasks.set_title('Total Samples by Task', fontsize=14, fontweight='bold')\n",
    "\n",
    "# --- Panel 3: number of datasets offering each modality ---\n",
    "# modality_counts is read again by the final export cell.\n",
    "modality_counts = {}\n",
    "for available in df_summary['modalities_available']:\n",
    "    for modality in available:\n",
    "        modality_counts[modality] = modality_counts.get(modality, 0) + 1\n",
    "\n",
    "modality_names = list(modality_counts)\n",
    "datasets_per_modality = list(modality_counts.values())\n",
    "modality_bars = ax_modalities.bar(modality_names, datasets_per_modality,\n",
    "                                  color=['#FFB366', '#66FFB2', '#B366FF', '#FF66B3'])\n",
    "ax_modalities.set_title('Modality Availability Across Datasets', fontsize=14, fontweight='bold')\n",
    "ax_modalities.set_xlabel('Modalities')\n",
    "ax_modalities.set_ylabel('Number of Datasets')\n",
    "\n",
    "for rect, n_datasets in zip(modality_bars, datasets_per_modality):\n",
    "    ax_modalities.text(rect.get_x() + rect.get_width()/2., rect.get_height() + 0.05,\n",
    "                       str(n_datasets), ha='center', va='bottom', fontsize=10, fontweight='bold')\n",
    "\n",
    "# --- Panel 4: distribution of per-dataset average text length ---\n",
    "avg_lengths = df_summary['avg_text_length'].dropna()\n",
    "overall_mean = avg_lengths.mean()\n",
    "ax_lengths.hist(avg_lengths, bins=10, color='lightblue', alpha=0.7, edgecolor='black')\n",
    "ax_lengths.set_title('Average Text Length Distribution', fontsize=14, fontweight='bold')\n",
    "ax_lengths.set_xlabel('Average Words per Sample')\n",
    "ax_lengths.set_ylabel('Number of Datasets')\n",
    "ax_lengths.axvline(overall_mean, color='red', linestyle='--',\n",
    "                   label=f'Mean: {overall_mean:.1f} words')\n",
    "ax_lengths.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(output_dir / 'dataset_overview.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()\n",
    "\n",
    "print(f\"Dataset overview visualization saved to: {output_dir / 'dataset_overview.png'}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample Data Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display sample data from each task\n",
    "PREVIEW_SAMPLES = 3           # records printed per task\n",
    "TEXT_PREVIEW_CHARS = 200      # truncation limit for 'text' and 'paraphrase'\n",
    "EVIDENCE_PREVIEW_CHARS = 150  # truncation limit for 'evidence'\n",
    "\n",
    "\n",
    "def preview(value, limit):\n",
    "    \"\"\"Return str(value) cut to `limit` characters, with '...' appended when truncated.\n",
    "\n",
    "    Converting with str() first means non-string fields (for example a list)\n",
    "    are printed instead of raising on concatenation.\n",
    "    \"\"\"\n",
    "    text = str(value)\n",
    "    return text[:limit] + \"...\" if len(text) > limit else text\n",
    "\n",
    "\n",
    "def load_first_available(dataset_names, sample_size=5):\n",
    "    \"\"\"Return (name, data) for the first dataset whose train split loads non-empty.\n",
    "\n",
    "    A dataset that fails to load is reported and skipped. Returns (None, None)\n",
    "    when none of the datasets yields any records.\n",
    "    \"\"\"\n",
    "    for name in dataset_names:\n",
    "        try:\n",
    "            data = data_loader.load_dataset(name, 'train', sample_size=sample_size)\n",
    "        except Exception as e:\n",
    "            print(f\"  (skipping {name}: {e})\")\n",
    "            continue\n",
    "        # Explicit None/len checks: a bare truth test is ambiguous for array-like containers\n",
    "        if data is not None and len(data) > 0:\n",
    "            return name, data\n",
    "    return None, None\n",
    "\n",
    "\n",
    "print(\"Sample Data from Each Task:\")\n",
    "print(\"=\" * 60)\n",
    "\n",
    "for task, config in dataset_configs.items():\n",
    "    print(f\"\\n{task.replace('_', ' ').title()}:\")\n",
    "    print(\"-\" * 40)\n",
    "    \n",
    "    # Get first available dataset for this task\n",
    "    sample_dataset, sample_data = load_first_available(config['datasets'])\n",
    "    \n",
    "    if sample_dataset is None:\n",
    "        print(f\"No sample data available for {task}\")\n",
    "        continue\n",
    "    \n",
    "    print(f\"Dataset: {sample_dataset}\")\n",
    "    print(f\"Sample size: {len(sample_data)}\")\n",
    "    \n",
    "    # Show first few samples; fields that are absent or None are skipped\n",
    "    for i, sample in enumerate(sample_data[:PREVIEW_SAMPLES]):\n",
    "        print(f\"\\nSample {i+1}:\")\n",
    "        if sample.get('text') is not None:\n",
    "            print(f\"  Text: {preview(sample['text'], TEXT_PREVIEW_CHARS)}\")\n",
    "        if 'label' in sample:\n",
    "            print(f\"  Label: {sample['label']}\")\n",
    "        if sample.get('paraphrase') is not None:\n",
    "            print(f\"  Paraphrase: {preview(sample['paraphrase'], TEXT_PREVIEW_CHARS)}\")\n",
    "        if sample.get('evidence') is not None:\n",
    "            print(f\"  Evidence: {preview(sample['evidence'], EVIDENCE_PREVIEW_CHARS)}\")\n",
    "        \n",
    "        # Show available modalities (a '<mod>_path' or '<mod>' field holding a non-None value)\n",
    "        modalities = [\n",
    "            mod for mod in ['text', 'image', 'audio', 'video']\n",
    "            if any(sample.get(key) is not None for key in (f'{mod}_path', mod))\n",
    "        ]\n",
    "        if modalities:\n",
    "            print(f\"  Available modalities: {', '.join(modalities)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Label Distribution Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One stacked bar chart per classification task, showing how the inspected\n",
    "# train labels are distributed within each dataset.\n",
    "classification_tasks = ['sarcasm_detection', 'fact_verification']\n",
    "\n",
    "# squeeze=False always returns a 2-D grid of axes, so a single-task list\n",
    "# needs no special-casing; the first (only) column is what we draw on.\n",
    "fig, axes_grid = plt.subplots(len(classification_tasks), 1, figsize=(12, 8), squeeze=False)\n",
    "axes = axes_grid[:, 0]\n",
    "\n",
    "for ax, task in zip(axes, classification_tasks):\n",
    "    task_data = df_summary[df_summary['task'] == task]\n",
    "    \n",
    "    if len(task_data) == 0:\n",
    "        ax.text(0.5, 0.5, f'No data available for {task}', \n",
    "               ha='center', va='center', transform=ax.transAxes)\n",
    "        continue\n",
    "    \n",
    "    datasets = task_data['dataset'].tolist()\n",
    "    \n",
    "    # dataset name -> {label: count}, keeping only non-empty distributions\n",
    "    label_data = {\n",
    "        row['dataset']: row['label_distribution']\n",
    "        for _, row in task_data.iterrows()\n",
    "        if row['label_distribution']\n",
    "    }\n",
    "    \n",
    "    if not label_data:\n",
    "        ax.text(0.5, 0.5, f'No label distribution data for {task}', \n",
    "               ha='center', va='center', transform=ax.transAxes)\n",
    "        continue\n",
    "    \n",
    "    all_labels = sorted({label for distribution in label_data.values() for label in distribution})\n",
    "    colors = plt.cm.Set3(np.linspace(0, 1, len(all_labels)))\n",
    "    bottom = np.zeros(len(datasets))  # running stack height per dataset\n",
    "    \n",
    "    for label_idx, label in enumerate(all_labels):\n",
    "        # Datasets without a distribution, or without this label, contribute 0\n",
    "        values = [label_data.get(dataset, {}).get(label, 0) for dataset in datasets]\n",
    "        ax.bar(datasets, values, bottom=bottom, label=f'Label {label}', \n",
    "               color=colors[label_idx], alpha=0.8)\n",
    "        bottom += values\n",
    "    \n",
    "    ax.set_title(f'{task.replace(\"_\", \" \").title()} - Label Distribution', \n",
    "                fontsize=14, fontweight='bold')\n",
    "    ax.set_xlabel('Datasets')\n",
    "    ax.set_ylabel('Number of Samples')\n",
    "    ax.legend()\n",
    "    ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(output_dir / 'label_distributions.png', dpi=300, bbox_inches='tight')\n",
    "plt.show()\n",
    "\n",
    "print(f\"Label distribution visualization saved to: {output_dir / 'label_distributions.png'}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text Analysis and Word Clouds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text analysis and word cloud generation\n",
    "from collections import Counter\n",
    "import re\n",
    "\n",
    "# wordcloud is an optional dependency. It is imported in its own try block so\n",
    "# that an ImportError raised anywhere else is not misreported as\n",
    "# \"wordcloud is missing\".\n",
    "try:\n",
    "    from wordcloud import WordCloud\n",
    "except ImportError:\n",
    "    WordCloud = None\n",
    "\n",
    "DATASETS_PER_TASK = 2     # only the first N datasets of each task feed the cloud\n",
    "TEXTS_PER_DATASET = 200   # sample_size requested from the loader\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Lower-case `text` after removing URLs, @mentions, #hashtags and every\n",
    "    character that is not an ASCII letter or whitespace.\n",
    "\n",
    "    Non-string input yields an empty string.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        return \"\"\n",
    "    text = re.sub(r'http\\S+', '', text)\n",
    "    text = re.sub(r'@\\w+', '', text)\n",
    "    text = re.sub(r'#\\w+', '', text)\n",
    "    text = re.sub(r'[^a-zA-Z\\s]', '', text)\n",
    "    return text.lower().strip()\n",
    "\n",
    "\n",
    "def collect_task_texts(configs):\n",
    "    \"\"\"Return {task: cleaned texts joined by spaces} for every task that yielded text.\n",
    "\n",
    "    A dataset that fails to load or iterate is reported and skipped, so one\n",
    "    broken dataset does not prevent the remaining clouds from being drawn.\n",
    "    \"\"\"\n",
    "    collected = {}\n",
    "    for task, config in configs.items():\n",
    "        all_texts = []\n",
    "        for dataset_name in config['datasets'][:DATASETS_PER_TASK]:\n",
    "            try:\n",
    "                sample_data = data_loader.load_dataset(dataset_name, 'train', sample_size=TEXTS_PER_DATASET)\n",
    "                if sample_data is None:\n",
    "                    continue\n",
    "                for item in sample_data:\n",
    "                    # clean_text returns \"\" for missing / non-string text\n",
    "                    cleaned_text = clean_text(item.get('text'))\n",
    "                    if cleaned_text:\n",
    "                        all_texts.append(cleaned_text)\n",
    "            except Exception as e:\n",
    "                print(f\"  Skipping {dataset_name}: {e}\")\n",
    "                continue\n",
    "        if all_texts:\n",
    "            collected[task] = ' '.join(all_texts)\n",
    "    return collected\n",
    "\n",
    "\n",
    "if WordCloud is None:\n",
    "    print(\"WordCloud library not available. Install with: pip install wordcloud\")\n",
    "else:\n",
    "    try:\n",
    "        # Collect text data from different tasks\n",
    "        task_texts = collect_task_texts(dataset_configs)\n",
    "        \n",
    "        # Generate word clouds, one panel per task\n",
    "        if task_texts:\n",
    "            # squeeze=False keeps `axes` 2-D even when there is a single task\n",
    "            fig, axes = plt.subplots(1, len(task_texts), figsize=(5*len(task_texts), 5), squeeze=False)\n",
    "            \n",
    "            for ax, (task, text) in zip(axes[0], task_texts.items()):\n",
    "                wordcloud = WordCloud(width=400, height=300, \n",
    "                                     background_color='white',\n",
    "                                     max_words=100,\n",
    "                                     colormap='viridis').generate(text)\n",
    "                \n",
    "                ax.imshow(wordcloud, interpolation='bilinear')\n",
    "                ax.set_title(f'{task.replace(\"_\", \" \").title()}\\nWord Cloud', \n",
    "                             fontsize=12, fontweight='bold')\n",
    "                ax.axis('off')\n",
    "            \n",
    "            plt.tight_layout()\n",
    "            plt.savefig(output_dir / 'word_clouds.png', dpi=300, bbox_inches='tight')\n",
    "            plt.show()\n",
    "            \n",
    "            print(f\"Word clouds saved to: {output_dir / 'word_clouds.png'}\")\n",
    "        else:\n",
    "            print(\"No text data available for word cloud generation\")\n",
    "    except Exception as e:\n",
    "        # Word clouds are a nice-to-have: report the problem and let the notebook continue\n",
    "        print(f\"Error generating word clouds: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results\n",
    "\n",
    "Based on the data exploration, we have identified:\n",
    "\n",
    "### Dataset Characteristics\n",
    "- **Total Datasets**: 12 datasets across 3 tasks\n",
    "- **Multimodal Coverage**: Text is universal, audio/image/video available in select datasets\n",
    "- **Size Variation**: Significant variation in dataset sizes (from hundreds to millions of samples)\n",
    "- **Text Complexity**: Average text length varies by task type and domain\n",
    "\n",
    "### Task Distribution\n",
    "- **Sarcasm Detection**: 6 datasets with varying multimodal support\n",
    "- **Paraphrasing**: 3 large text-only datasets\n",
    "- **Fact Verification**: 3 datasets with claim-evidence pairs\n",
    "\n",
    "### Modality Analysis\n",
    "- Text modality: Present in all datasets\n",
    "- Audio modality: Available in video-based sarcasm datasets\n",
    "- Image modality: Present in social media and multimodal datasets\n",
    "- Video modality: Limited to specific sarcasm detection datasets\n",
    "\n",
    "## Insights\n",
    "\n",
    "1. **Data Imbalance**: Some datasets show significant class imbalance requiring careful handling\n",
    "2. **Multimodal Opportunities**: Video-based datasets offer rich multimodal learning opportunities\n",
    "3. **Text Diversity**: Different domains (social media, news, academic) provide diverse text patterns\n",
    "4. **Preprocessing Needs**: Text cleaning, standardization, and modality alignment required\n",
    "5. **Evaluation Considerations**: Varying dataset sizes suggest weighted evaluation strategies\n",
    "\n",
    "### Recommendations\n",
    "- Implement stratified sampling for imbalanced datasets\n",
    "- Use data augmentation for smaller datasets\n",
    "- Standardize text preprocessing across all datasets\n",
    "- Develop modality-specific preprocessing pipelines\n",
    "- Consider dataset-specific evaluation metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save comprehensive dataset analysis\n",
    "\n",
    "\n",
    "def to_jsonable(value):\n",
    "    \"\"\"Fallback for json.dump: unwrap numpy scalars and arrays into plain Python\n",
    "    values so numbers stay numbers; anything else is written as str(value).\n",
    "    \"\"\"\n",
    "    if isinstance(value, np.generic):\n",
    "        return value.item()\n",
    "    if isinstance(value, np.ndarray):\n",
    "        return value.tolist()\n",
    "    return str(value)\n",
    "\n",
    "\n",
    "# Modality coverage is recomputed from df_summary so this cell does not depend\n",
    "# on a variable left behind by the plotting cell.\n",
    "modality_coverage = {}\n",
    "for available in df_summary['modalities_available']:\n",
    "    for modality in available:\n",
    "        modality_coverage[modality] = modality_coverage.get(modality, 0) + 1\n",
    "\n",
    "analysis_summary = {\n",
    "    'dataset_statistics': dataset_stats,\n",
    "    'total_datasets': len(df_summary),\n",
    "    'total_samples': int(df_summary['total_size'].sum()),\n",
    "    'task_distribution': df_summary.groupby('task')['dataset'].count().to_dict(),\n",
    "    'modality_coverage': modality_coverage,\n",
    "    'average_text_length_by_task': df_summary.groupby('task')['avg_text_length'].mean().to_dict(),\n",
    "    'recommendations': [\n",
    "        \"Implement stratified sampling for imbalanced datasets\",\n",
    "        \"Use data augmentation for smaller datasets\", \n",
    "        \"Standardize text preprocessing across all datasets\",\n",
    "        \"Develop modality-specific preprocessing pipelines\",\n",
    "        \"Consider dataset-specific evaluation metrics\"\n",
    "    ]\n",
    "}\n",
    "\n",
    "# Save analysis (explicit encoding so the file does not depend on the platform default)\n",
    "with open(output_dir / 'dataset_analysis_summary.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(analysis_summary, f, indent=2, default=to_jsonable)\n",
    "\n",
    "print(f\"\\nComplete dataset analysis saved to: {output_dir / 'dataset_analysis_summary.json'}\")\n",
    "print(f\"All visualizations saved to: {output_dir}\")\n",
    "print(\"\\nData exploration completed successfully! ✓\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
