# Data Processing and Exploration

This notebook provides comprehensive data loading, preprocessing, and exploration capabilities for the Criteria Evidence Agent project.
It includes interactive data analysis, preprocessing pipelines, and data validation tools.

## Setup and Imports

Import all necessary libraries for data processing and exploration.

In [None]:
import os
import sys
import json
import warnings
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import re
from tqdm.auto import tqdm
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# NLP and ML libraries
import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# Suppress warnings
# NOTE(review): this filter hides every warning category for the rest of the
# session, including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Add src to path
# The entry is a relative path, so it is resolved against the working directory
# the kernel was started in.
if 'src' not in sys.path:
    sys.path.append('src')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Confirm that the import cell ran to the end and record the library versions.
print("‚úÖ All imports successful!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## Load Configuration and Dependencies

Load configuration management and data modules.

In [None]:
# Load configuration management
%run 01_Configuration_Management.ipynb

# Import project modules
from src.data.dataset import DataModule, EvidenceDataset
from src.data.preprocessing import preprocess_text, extract_evidence_spans

print("‚úÖ Configuration and data modules loaded!")

## Data Configuration and Loading

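Configure the data paths and split parameters. Clicking **Load Data** only builds and stores the configuration (and warns about missing files); the dataset itself is read in the next section.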

In [None]:
def create_data_config_selector():
    """Build the widget form that captures the data configuration.

    The form collects the groundtruth and posts file paths, the maximum
    sequence length, the validation/test split sizes and a random seed.
    Pressing the button builds a ``DataConfig`` from the current widget
    values, stores it in the module-level ``current_data_config`` variable,
    echoes the settings and warns about any configured path that does not
    exist. The form itself never opens a data file.

    Returns:
        widgets.VBox: the assembled form, ready to be passed to ``display``.
    """
    # --- file locations -------------------------------------------------
    groundtruth_path = widgets.Text(
        description='Groundtruth:',
        value="./Data/groundtruth/redsm5_ground_truth.json",
        style={'description_width': 'initial'},
    )
    posts_path = widgets.Text(
        description='Posts CSV:',
        value="./Data/redsm5/redsm5_posts.csv",
        style={'description_width': 'initial'},
    )

    # --- tokenisation / split parameters --------------------------------
    max_length = widgets.Dropdown(
        description='Max Length:',
        options=[128, 256, 384, 512],
        value=256,
    )
    val_size = widgets.FloatSlider(
        description='Val Size:', value=0.15, min=0.1, max=0.3, step=0.05
    )
    test_size = widgets.FloatSlider(
        description='Test Size:', value=0.15, min=0.1, max=0.3, step=0.05
    )
    seed = widgets.IntText(description='Random Seed:', value=42)

    load_button = widgets.Button(description='Load Data', button_style='success')
    output = widgets.Output()

    def _store_configuration(_button):
        """Button callback: build, publish and report the configuration."""
        global current_data_config

        with output:
            output.clear_output()

            try:
                data_config = DataConfig(
                    groundtruth_path=groundtruth_path.value,
                    posts_path=posts_path.value,
                    max_length=max_length.value,
                    val_size=val_size.value,
                    test_size=test_size.value,
                    seed=seed.value,
                )

                # Publish the configuration so later cells can find it.
                current_data_config = data_config

                print("‚úÖ Data configuration created!")
                print(f"   Groundtruth: {data_config.groundtruth_path}")
                print(f"   Posts: {data_config.posts_path}")
                print(f"   Max length: {data_config.max_length}")
                print(f"   Val/Test split: {data_config.val_size}/{data_config.test_size}")

                # Warn (without failing) about files that are not on disk.
                for kind, location in (
                    ("Groundtruth", data_config.groundtruth_path),
                    ("Posts", data_config.posts_path),
                ):
                    if not Path(location).exists():
                        print(f"‚ö†Ô∏è  {kind} file not found: {location}")

            except Exception as e:
                print(f"‚ùå Error creating data configuration: {e}")

    load_button.on_click(_store_configuration)

    form_rows = [
        widgets.HTML("<h3>Data Configuration</h3>"),
        groundtruth_path,
        posts_path,
        widgets.HBox([max_length, val_size]),
        widgets.HBox([test_size, seed]),
        load_button,
        output,
    ]
    return widgets.VBox(form_rows)

# Display data configuration selector
# The global `current_data_config` read by later cells is only defined once the
# form's button has been clicked; displaying the form does not create it.
data_config_selector = create_data_config_selector()
display(data_config_selector)

## Data Loading and Initial Exploration

Load the dataset and perform initial exploration.

In [None]:
def load_and_explore_data():
    """Read the configured groundtruth JSON and posts CSV and print an overview.

    The file locations come from the module-level ``current_data_config``
    (set by the configuration selector). After loading, the function prints
    the entry counts, the posts frame shape/columns, per-column missing-value
    counts, the first rows of the posts frame, and a label frequency table
    built from every groundtruth entry that has a ``'labels'`` key.

    Returns:
        tuple: ``(posts_df, groundtruth_data)`` on success, or ``(None, None)``
        when no configuration has been stored or when loading/inspection
        raises (the traceback is printed in that case).
    """
    if 'current_data_config' not in globals():
        print("‚ùå Please configure and load data first using the selector above.")
        return None, None

    try:
        print("üìä Loading dataset...")

        # Decode explicitly as UTF-8: JSON is UTF-8 by specification, while the
        # platform default encoding would corrupt or reject non-ASCII content
        # on some systems.
        with open(current_data_config.groundtruth_path, 'r', encoding='utf-8') as f:
            groundtruth_data = json.load(f)

        # Load posts data
        posts_df = pd.read_csv(current_data_config.posts_path)

        print(f"‚úÖ Data loaded successfully!")
        print(f"   Groundtruth entries: {len(groundtruth_data)}")
        print(f"   Posts entries: {len(posts_df)}")

        # Basic data exploration
        print(f"\nüìã Posts DataFrame Info:")
        print(f"   Shape: {posts_df.shape}")
        print(f"   Columns: {list(posts_df.columns)}")

        # Report only the columns that actually contain missing values.
        missing_values = posts_df.isnull().sum()
        if missing_values.sum() > 0:
            print(f"\n‚ö†Ô∏è  Missing values found:")
            for col, count in missing_values[missing_values > 0].items():
                print(f"   {col}: {count} ({count/len(posts_df)*100:.1f}%)")
        else:
            print(f"\n‚úÖ No missing values found")

        # Display sample data
        print(f"\nüìÑ Sample posts:")
        display(posts_df.head())

        # Analyze groundtruth structure.
        # NOTE(review): the code below treats the JSON document as a mapping
        # whose values are themselves mappings — confirm against the data file.
        print(f"\nüéØ Groundtruth Analysis:")
        if groundtruth_data:
            # Peek at the first entry without materialising every value.
            sample_entry = next(iter(groundtruth_data.values()))
            print(f"   Sample entry keys: {list(sample_entry.keys())}")

            # Flatten the labels of every entry that carries a 'labels' key.
            all_labels = [
                label
                for entry in groundtruth_data.values()
                if 'labels' in entry
                for label in entry['labels']
            ]

            label_counts = Counter(all_labels)
            print(f"   Total label instances: {len(all_labels)}")
            print(f"   Unique labels: {len(label_counts)}")
            print(f"   Label distribution:")
            for label, count in label_counts.most_common():
                print(f"     {label}: {count}")

        return posts_df, groundtruth_data

    except Exception as e:
        # Best-effort notebook behaviour: report the failure and let callers
        # detect it through the (None, None) result.
        print(f"‚ùå Error loading data: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# Load and explore data
# Both names are None when no configuration has been stored yet or when loading
# failed; the analysis cells below check for that before doing any work.
posts_df, groundtruth_data = load_and_explore_data()

## Text Analysis and Statistics

Analyze text characteristics and statistics.

In [None]:
def analyze_text_statistics(posts_df: pd.DataFrame):
    """Summarise and plot the length characteristics of the post texts.

    The text column is the first of ``text``, ``content``, ``post_text`` or
    ``message`` present in ``posts_df``. Character, word and (approximate)
    sentence counts are computed on the non-null texts, printed, and drawn as
    a 2x2 figure of histograms and box plots. A sample of up to 1000 texts is
    then tokenised with the ``roberta-base`` tokenizer to estimate how many
    texts would be truncated at several maximum lengths; a failure in that
    step is reported and does not affect the return value.

    Args:
        posts_df: posts frame, or ``None`` when nothing has been loaded.

    Returns:
        dict with the ``char_counts``, ``word_counts`` and ``sentence_counts``
        Series, or ``None`` when ``posts_df`` is ``None`` or has no text column.
    """
    if posts_df is None:
        print("‚ùå No data loaded. Please load data first.")
        return

    print("üìä Text Statistics Analysis")
    print("=" * 40)

    # Pick the first known text column name that the frame provides.
    known_names = ('text', 'content', 'post_text', 'message')
    text_column = next((name for name in known_names if name in posts_df.columns), None)
    if text_column is None:
        print("‚ùå No text column found in the dataset")
        return

    print(f"Using text column: '{text_column}'")

    texts = posts_df[text_column].dropna()
    char_counts = texts.str.len()
    word_counts = texts.str.split().str.len()
    # Approximation: runs of terminal punctuation, plus one for the tail.
    sentence_counts = texts.str.count(r'[.!?]+') + 1

    print(f"\nüìà Text Length Statistics:")
    print(f"   Total texts: {len(texts)}")
    print(f"   Character count - Mean: {char_counts.mean():.1f}, Median: {char_counts.median():.1f}")
    print(f"   Character count - Min: {char_counts.min()}, Max: {char_counts.max()}")
    print(f"   Word count - Mean: {word_counts.mean():.1f}, Median: {word_counts.median():.1f}")
    print(f"   Word count - Min: {word_counts.min()}, Max: {word_counts.max()}")
    print(f"   Sentence count - Mean: {sentence_counts.mean():.1f}, Median: {sentence_counts.median():.1f}")

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Text Statistics Analysis', fontsize=16)

    # Top row: one histogram per measure, with mean and median markers.
    histogram_panels = (
        (axes[0, 0], char_counts, 'skyblue', 'Character Count'),
        (axes[0, 1], word_counts, 'lightcoral', 'Word Count'),
    )
    for panel, counts, colour, measure in histogram_panels:
        panel.hist(counts, bins=50, alpha=0.7, color=colour)
        panel.axvline(counts.mean(), color='red', linestyle='--', label=f'Mean: {counts.mean():.1f}')
        panel.axvline(counts.median(), color='green', linestyle='--', label=f'Median: {counts.median():.1f}')
        panel.set_xlabel(measure)
        panel.set_ylabel('Frequency')
        panel.set_title(f'{measure} Distribution')
        panel.legend()

    # Bottom row: one box plot per measure.
    box_panels = (
        (axes[1, 0], char_counts, 'Characters', 'Character Count Box Plot'),
        (axes[1, 1], word_counts, 'Words', 'Word Count Box Plot'),
    )
    for panel, counts, tick_label, title in box_panels:
        panel.boxplot([counts], labels=[tick_label])
        panel.set_title(title)
        panel.set_ylabel('Count')

    plt.tight_layout()
    plt.show()

    print(f"\nüî§ Tokenization Analysis:")

    try:
        tokenizer = AutoTokenizer.from_pretrained('roberta-base')

        # Tokenise a bounded, reproducible sample rather than every text.
        sample_texts = texts.sample(min(1000, len(texts)), random_state=42)
        token_counts = np.array([
            len(tokenizer.encode(text, add_special_tokens=True))
            for text in tqdm(sample_texts, desc="Tokenizing")
        ])

        print(f"   Token count - Mean: {token_counts.mean():.1f}, Median: {np.median(token_counts):.1f}")
        print(f"   Token count - Min: {token_counts.min()}, Max: {token_counts.max()}")

        # Share of sampled texts longer than each candidate max length.
        for max_len in [128, 256, 384, 512]:
            truncated = (token_counts > max_len).sum()
            truncation_rate = truncated / len(token_counts) * 100
            print(f"   Max length {max_len}: {truncation_rate:.1f}% would be truncated")

    except Exception as e:
        print(f"   ‚ö†Ô∏è  Tokenization analysis failed: {e}")

    return dict(
        char_counts=char_counts,
        word_counts=word_counts,
        sentence_counts=sentence_counts,
    )

# Analyze text statistics
# `text_stats` is only assigned when posts were loaded; it is still None when
# the frame has none of the recognised text columns.
if posts_df is not None:
    text_stats = analyze_text_statistics(posts_df)
else:
    print("‚ö†Ô∏è  Load data first to perform text analysis")

## Label Analysis and Distribution

Analyze label distribution and multi-label characteristics.

In [None]:
def _label_cooccurrence(label_combinations, unique_labels):
    """Return the row-normalised label co-occurrence matrix.

    Entry ``[i, j]`` is the number of posts carrying both label ``i`` and
    label ``j`` divided by the number of posts carrying label ``i``; the
    diagonal is fixed at 1.0.
    """
    index_of = {label: position for position, label in enumerate(unique_labels)}
    matrix = np.zeros((len(unique_labels), len(unique_labels)))

    for labels in label_combinations:
        # A label repeated inside one post still counts that post only once.
        present = [index_of[label] for label in set(labels)]
        if present:
            matrix[np.ix_(present, present)] += 1

    # Divide every row by its own diagonal entry (posts carrying that label).
    posts_per_label = matrix.diagonal().copy()
    has_posts = posts_per_label > 0
    matrix[has_posts] /= posts_per_label[has_posts, np.newaxis]
    np.fill_diagonal(matrix, 1.0)
    return matrix


def analyze_label_distribution(groundtruth_data: Dict):
    """Print and plot label frequencies, labels-per-post and co-occurrence.

    Only entries that contain a ``'labels'`` key are considered. The function
    prints overall label statistics, per-label frequencies (relative to the
    number of labelled posts), the labels-per-post distribution and a
    per-label positive/negative ratio, and draws a 2x2 figure (frequency bar
    chart, pie chart, labels-per-post bar chart and, for at most 15 distinct
    labels, a normalised co-occurrence heatmap).

    Args:
        groundtruth_data: mapping of post id to an entry dict, or ``None``.

    Returns:
        dict with ``label_counts`` (Counter), ``unique_labels`` (list),
        ``label_combinations`` (list of label lists) and
        ``label_count_distribution`` (Counter), or ``None`` when there is no
        groundtruth data or no label at all to analyse.
    """
    if groundtruth_data is None:
        print("‚ùå No groundtruth data loaded. Please load data first.")
        return

    print("üè∑Ô∏è  Label Distribution Analysis")
    print("=" * 40)

    # Collect the label list of every entry that has one.
    label_combinations = [
        entry['labels'] for entry in groundtruth_data.values() if 'labels' in entry
    ]
    all_labels = [label for labels in label_combinations for label in labels]

    # Without a single label the averages below would divide by zero and the
    # plots would have nothing to unpack, so stop with a clear message instead.
    if not all_labels:
        print("‚ùå No labels found in groundtruth data.")
        return

    # Label frequency analysis
    label_counts = Counter(all_labels)
    unique_labels = list(label_counts.keys())
    total_posts = len(label_combinations)

    print(f"\nüìä Label Statistics:")
    print(f"   Total posts with labels: {total_posts}")
    print(f"   Unique labels: {len(unique_labels)}")
    print(f"   Total label instances: {len(all_labels)}")
    print(f"   Average labels per post: {len(all_labels) / total_posts:.2f}")

    # Label distribution
    print(f"\nüè∑Ô∏è  Label Frequency:")
    for label, count in label_counts.most_common():
        percentage = count / total_posts * 100
        print(f"   {label}: {count} ({percentage:.1f}%)")

    # Multi-label analysis
    label_count_distribution = Counter(len(labels) for labels in label_combinations)

    print(f"\nüìà Labels per Post Distribution:")
    for num_labels, count in sorted(label_count_distribution.items()):
        percentage = count / total_posts * 100
        print(f"   {num_labels} labels: {count} posts ({percentage:.1f}%)")

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Label Distribution Analysis', fontsize=16)

    # Label frequency bar plot
    labels, counts = zip(*label_counts.most_common())
    axes[0, 0].bar(range(len(labels)), counts, color='skyblue')
    axes[0, 0].set_xlabel('Labels')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Label Frequency Distribution')
    axes[0, 0].set_xticks(range(len(labels)))
    axes[0, 0].set_xticklabels(labels, rotation=45, ha='right')

    # Label percentage pie chart
    axes[0, 1].pie(counts, labels=labels, autopct='%1.1f%%', startangle=90)
    axes[0, 1].set_title('Label Distribution (Percentage)')

    # Labels per post distribution
    num_labels_list, post_counts = zip(*sorted(label_count_distribution.items()))
    axes[1, 0].bar(num_labels_list, post_counts, color='lightcoral')
    axes[1, 0].set_xlabel('Number of Labels per Post')
    axes[1, 0].set_ylabel('Number of Posts')
    axes[1, 0].set_title('Distribution of Labels per Post')

    # Label co-occurrence heatmap
    if len(unique_labels) <= 15:  # Only for manageable number of labels
        cooccurrence_matrix = _label_cooccurrence(label_combinations, unique_labels)

        im = axes[1, 1].imshow(cooccurrence_matrix, cmap='Blues', aspect='auto')
        axes[1, 1].set_xticks(range(len(unique_labels)))
        axes[1, 1].set_yticks(range(len(unique_labels)))
        axes[1, 1].set_xticklabels(unique_labels, rotation=45, ha='right')
        axes[1, 1].set_yticklabels(unique_labels)
        axes[1, 1].set_title('Label Co-occurrence (Normalized)')
        plt.colorbar(im, ax=axes[1, 1])
    else:
        axes[1, 1].text(0.5, 0.5, 'Too many labels\nfor co-occurrence\nvisualization',
                        ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Label Co-occurrence (Skipped)')

    plt.tight_layout()
    plt.show()

    # Class imbalance analysis
    print(f"\n‚öñÔ∏è  Class Imbalance Analysis:")
    for label, count in label_counts.most_common():
        positive_ratio = count / total_posts
        negative_ratio = 1 - positive_ratio
        imbalance_ratio = negative_ratio / positive_ratio if positive_ratio > 0 else float('inf')
        print(f"   {label}: {positive_ratio:.3f} positive, {negative_ratio:.3f} negative (ratio: {imbalance_ratio:.1f}:1)")

    return {
        'label_counts': label_counts,
        'unique_labels': unique_labels,
        'label_combinations': label_combinations,
        'label_count_distribution': label_count_distribution
    }

# Analyze label distribution
# `label_analysis` is only assigned when groundtruth data was loaded.
if groundtruth_data is not None:
    label_analysis = analyze_label_distribution(groundtruth_data)
else:
    print("‚ö†Ô∏è  Load data first to perform label analysis")

## Data Preprocessing Pipeline

Create and test the data preprocessing pipeline.

In [None]:
def create_preprocessing_pipeline():
    """Build the data module and datasets and smoke-test one sample.

    Uses the module-level ``current_data_config`` together with a default
    ``ModelConfig`` to create a ``DataModule``, asks it for the
    train/validation/test splits, wraps the train and validation splits in
    ``EvidenceDataset`` objects, and prints the keys/shapes of the first
    training sample plus the first 200 characters of its decoded text.

    Returns:
        dict with ``data_module``, ``train_dataset``, ``val_dataset``,
        ``train_data``, ``val_data`` and ``test_data``; or ``None`` when no
        configuration exists or any step raises (the traceback is printed).
    """
    if 'current_data_config' not in globals():
        print("‚ùå Please configure data first.")
        return None

    print("üîß Creating Data Preprocessing Pipeline")
    print("=" * 40)

    def _wrap_split(split):
        """Wrap one data split in an EvidenceDataset using the shared settings."""
        return EvidenceDataset(
            split,
            data_module.tokenizer,
            current_data_config.max_length,
            current_data_config.multi_label_fields,
        )

    try:
        # The data module needs a model configuration next to the data one.
        model_config = ModelConfig()
        data_module = DataModule(current_data_config, model_config)

        print("‚úÖ Data module created successfully!")
        print(f"   Tokenizer: {data_module.tokenizer.__class__.__name__}")
        print(f"   Max length: {current_data_config.max_length}")

        print("\nüìä Loading and splitting data...")
        train_data, val_data, test_data = data_module.get_data_splits()

        for split_name, split in (
            ("Train", train_data),
            ("Validation", val_data),
            ("Test", test_data),
        ):
            print(f"   {split_name} samples: {len(split)}")

        print("\nüîÑ Creating datasets...")
        train_dataset = _wrap_split(train_data)
        val_dataset = _wrap_split(val_data)

        print(f"   Train dataset: {len(train_dataset)} samples")
        print(f"   Val dataset: {len(val_dataset)} samples")

        # Pull one item through the dataset to confirm the pipeline end to end.
        print("\nüß™ Testing preprocessing with sample...")
        sample = train_dataset[0]
        print(f"   Sample keys: {list(sample.keys())}")
        print(f"   Input IDs shape: {sample['input_ids'].shape}")
        print(f"   Attention mask shape: {sample['attention_mask'].shape}")

        if 'labels' in sample:
            print(f"   Labels shape: {sample['labels'].shape}")
            print(f"   Labels: {sample['labels']}")

        # Round-trip the ids back to text as a tokenisation sanity check.
        decoded_text = data_module.tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
        print("\nüìù Sample decoded text (first 200 chars):")
        print(f"   {decoded_text[:200]}...")

        return dict(
            data_module=data_module,
            train_dataset=train_dataset,
            val_dataset=val_dataset,
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
        )

    except Exception as e:
        print(f"‚ùå Error creating preprocessing pipeline: {e}")
        import traceback
        traceback.print_exc()
        return None

# Create preprocessing pipeline
# `preprocessing_result` is None when no configuration has been stored or when
# any step of the pipeline raised.
preprocessing_result = create_preprocessing_pipeline()

## Data Quality Assessment

Assess data quality and identify potential issues.

In [None]:
def _safe_percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def assess_data_quality(posts_df: pd.DataFrame, groundtruth_data: Dict):
    """Run basic quality checks on the posts and their groundtruth coverage.

    The text column is the first of ``text``, ``content``, ``post_text`` or
    ``message`` found in ``posts_df``; the ID column is the first of
    ``post_id``, ``id``, ``ID`` or ``Post_ID``. The function prints counts of
    missing, empty, very short (<10 chars), very long (>1000 chars) and
    duplicated texts, the overlap between post IDs and groundtruth keys (when
    an ID column exists), the share of texts containing URLs, mentions,
    hashtags, many special characters or many non-ASCII characters, and a
    list of recommendations derived from those numbers.

    All percentages are computed with a zero-safe helper, so an empty posts
    frame or an empty groundtruth mapping is reported as 0.0% instead of
    raising ``ZeroDivisionError``.

    Args:
        posts_df: posts frame, or ``None``.
        groundtruth_data: mapping keyed by post id (compared as strings), or
            ``None``.

    Returns:
        dict with the counts ``missing_text``, ``empty_text``, ``short_text``,
        ``long_text``, ``duplicate_text``, ``texts_with_urls``,
        ``texts_with_mentions`` and ``texts_with_hashtags``; or ``None`` when
        data is missing or no text column is found.
    """
    if posts_df is None or groundtruth_data is None:
        print("‚ùå No data loaded. Please load data first.")
        return

    print("üîç Data Quality Assessment")
    print("=" * 40)

    # Pick the first recognised column name for text and for IDs.
    text_column = next(
        (col for col in ('text', 'content', 'post_text', 'message') if col in posts_df.columns),
        None,
    )
    id_column = next(
        (col for col in ('post_id', 'id', 'ID', 'Post_ID') if col in posts_df.columns),
        None,
    )

    if text_column is None:
        print("‚ùå No text column found")
        return

    print(f"Using text column: '{text_column}'")
    print(f"Using ID column: '{id_column}'" if id_column else "No ID column found")

    total_posts = len(posts_df)
    text_series = posts_df[text_column]

    # Basic quality checks
    print(f"\nüìä Basic Quality Checks:")

    missing_text = text_series.isnull().sum()
    print(f"   Missing text values: {missing_text} ({_safe_percentage(missing_text, total_posts):.1f}%)")

    empty_text = (text_series.str.strip() == '').sum()
    print(f"   Empty text values: {empty_text} ({_safe_percentage(empty_text, total_posts):.1f}%)")

    text_lengths = text_series.str.len()

    # Very short texts (< 10 characters)
    short_text = (text_lengths < 10).sum()
    print(f"   Very short texts (<10 chars): {short_text} ({_safe_percentage(short_text, total_posts):.1f}%)")

    # Very long texts (> 1000 characters)
    long_text = (text_lengths > 1000).sum()
    print(f"   Very long texts (>1000 chars): {long_text} ({_safe_percentage(long_text, total_posts):.1f}%)")

    duplicate_text = text_series.duplicated().sum()
    print(f"   Duplicate texts: {duplicate_text} ({_safe_percentage(duplicate_text, total_posts):.1f}%)")

    # Coverage analysis
    print(f"\nüéØ Coverage Analysis:")

    if id_column:
        # IDs are compared as strings because JSON object keys are strings.
        # NOTE(review): a numeric ID column containing NaN is read as float
        # and would stringify as e.g. '1.0' — confirm the CSV has no such gaps.
        posts_ids = set(posts_df[id_column].astype(str))
        groundtruth_ids = set(groundtruth_data.keys())

        intersection = posts_ids & groundtruth_ids
        posts_only = posts_ids - groundtruth_ids
        groundtruth_only = groundtruth_ids - posts_ids

        print(f"   Posts with groundtruth: {len(intersection)} ({_safe_percentage(len(intersection), len(posts_ids)):.1f}%)")
        print(f"   Posts without groundtruth: {len(posts_only)} ({_safe_percentage(len(posts_only), len(posts_ids)):.1f}%)")
        print(f"   Groundtruth without posts: {len(groundtruth_only)} ({_safe_percentage(len(groundtruth_only), len(groundtruth_ids)):.1f}%)")
    else:
        print(f"   Cannot perform coverage analysis without ID column")

    # Text quality patterns
    print(f"\nüìù Text Quality Patterns:")

    texts = text_series.dropna()
    total_texts = len(texts)

    # URLs
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    texts_with_urls = texts.str.contains(url_pattern, regex=True).sum()
    print(f"   Texts with URLs: {texts_with_urls} ({_safe_percentage(texts_with_urls, total_texts):.1f}%)")

    # Mentions (@username)
    mention_pattern = r'@\w+'
    texts_with_mentions = texts.str.contains(mention_pattern, regex=True).sum()
    print(f"   Texts with mentions: {texts_with_mentions} ({_safe_percentage(texts_with_mentions, total_texts):.1f}%)")

    # Hashtags
    hashtag_pattern = r'#\w+'
    texts_with_hashtags = texts.str.contains(hashtag_pattern, regex=True).sum()
    print(f"   Texts with hashtags: {texts_with_hashtags} ({_safe_percentage(texts_with_hashtags, total_texts):.1f}%)")

    # Special characters ratio (empty strings give NaN and never exceed the threshold)
    special_char_ratios = texts.str.count(r'[^\w\s]') / texts.str.len()
    high_special_chars = (special_char_ratios > 0.3).sum()
    print(f"   Texts with high special char ratio (>30%): {high_special_chars} ({_safe_percentage(high_special_chars, total_texts):.1f}%)")

    # Language detection (simple heuristic based on the non-ASCII share)
    non_ascii_chars = texts.str.count(r'[^\x00-\x7F]') / texts.str.len()
    likely_non_english = (non_ascii_chars > 0.1).sum()
    print(f"   Likely non-English texts (>10% non-ASCII): {likely_non_english} ({_safe_percentage(likely_non_english, total_texts):.1f}%)")

    # Recommendations
    print(f"\nüí° Recommendations:")

    if missing_text > 0:
        print(f"   ‚Ä¢ Consider removing {missing_text} posts with missing text")

    if empty_text > 0:
        print(f"   ‚Ä¢ Consider removing {empty_text} posts with empty text")

    if short_text > total_texts * 0.05:  # More than 5%
        print(f"   ‚Ä¢ High number of very short texts - consider minimum length filtering")

    if duplicate_text > 0:
        print(f"   ‚Ä¢ Consider deduplication to remove {duplicate_text} duplicate texts")

    if texts_with_urls > total_texts * 0.1:  # More than 10%
        print(f"   ‚Ä¢ Consider URL preprocessing/removal for {texts_with_urls} texts")

    if likely_non_english > total_texts * 0.05:  # More than 5%
        print(f"   ‚Ä¢ Consider language filtering for {likely_non_english} likely non-English texts")

    return {
        'missing_text': missing_text,
        'empty_text': empty_text,
        'short_text': short_text,
        'long_text': long_text,
        'duplicate_text': duplicate_text,
        'texts_with_urls': texts_with_urls,
        'texts_with_mentions': texts_with_mentions,
        'texts_with_hashtags': texts_with_hashtags
    }

# Assess data quality
# `quality_assessment` is only assigned when both the posts and the groundtruth
# were loaded.
if posts_df is not None and groundtruth_data is not None:
    quality_assessment = assess_data_quality(posts_df, groundtruth_data)
else:
    print("‚ö†Ô∏è  Load data first to perform quality assessment")

## Interactive Data Explorer

Interactive widget-based data exploration tool.

In [None]:
def create_interactive_data_explorer():
    """Create an interactive dashboard for browsing posts and their labels.

    Reads the module-level ``posts_df`` and ``groundtruth_data``. The
    dashboard offers a sample-size slider, an optional label filter (posts
    carrying at least one of the selected labels) and a free-text search.
    Pressing the button prints a reproducible sample (``random_state=42``) of
    the filtered posts with ID, length, labels and the first 300 characters.

    The search box is matched as a literal, case-insensitive substring, so
    characters such as ``+``, ``(`` or ``?`` are searched for verbatim rather
    than interpreted as a regular expression. Posts whose text is missing are
    shown with an empty text instead of aborting the listing.

    Returns:
        widgets.VBox with the controls and the output area, or ``None`` when
        data is not loaded or no text column is found.
    """
    if posts_df is None or groundtruth_data is None:
        print("‚ùå No data loaded. Please load data first.")
        return

    # Pick the first recognised column name for text and for IDs.
    text_column = next(
        (col for col in ('text', 'content', 'post_text', 'message') if col in posts_df.columns),
        None,
    )
    id_column = next(
        (col for col in ('post_id', 'id', 'ID', 'Post_ID') if col in posts_df.columns),
        None,
    )

    if text_column is None:
        print("‚ùå No text column found")
        return

    # Create widgets
    sample_size = widgets.IntSlider(
        value=10,
        min=1,
        max=100,
        description='Sample Size:'
    )

    filter_by_labels = widgets.Checkbox(
        value=False,
        description='Filter by Labels'
    )

    # Sorted so the option list has the same order on every run; labels are
    # joined as strings further down, so they are comparable.
    unique_labels = sorted({
        label
        for entry in groundtruth_data.values()
        if 'labels' in entry
        for label in entry['labels']
    })

    label_filter = widgets.SelectMultiple(
        options=unique_labels,
        value=[],
        description='Labels:',
        disabled=True
    )

    search_text = widgets.Text(
        value='',
        description='Search:',
        placeholder='Enter search terms...'
    )

    explore_button = widgets.Button(
        description='Explore Data',
        button_style='info'
    )

    output = widgets.Output()

    def on_filter_change(change):
        # The label list is only editable while the checkbox is ticked.
        label_filter.disabled = not change['new']

    filter_by_labels.observe(on_filter_change, names='value')

    def on_explore_clicked(b):
        with output:
            output.clear_output()

            try:
                # Start with all posts
                filtered_df = posts_df.copy()

                # Apply search filter as a literal substring match; regex=False
                # keeps user input like "c++" or "(" from being parsed as a pattern.
                search_terms = search_text.value.strip().lower()
                if search_terms:
                    matches = filtered_df[text_column].str.lower().str.contains(
                        search_terms, na=False, regex=False
                    )
                    filtered_df = filtered_df[matches]

                # Apply label filter
                if filter_by_labels.value and label_filter.value:
                    if id_column:
                        selected_labels = set(label_filter.value)
                        # Keep posts that carry at least one selected label.
                        matching_ids = [
                            post_id
                            for post_id, entry in groundtruth_data.items()
                            if 'labels' in entry and selected_labels.intersection(entry['labels'])
                        ]
                        filtered_df = filtered_df[filtered_df[id_column].astype(str).isin(matching_ids)]
                    else:
                        # Posts cannot be matched to groundtruth entries without an ID.
                        print("‚ö†Ô∏è  Label filter ignored: no ID column found in posts data")

                print(f"üìä Filtered Results: {len(filtered_df)} posts")

                if len(filtered_df) == 0:
                    print("No posts match the current filters.")
                    return

                # Sample data
                sample_df = filtered_df.sample(min(sample_size.value, len(filtered_df)), random_state=42)

                print(f"\nüìÑ Sample of {len(sample_df)} posts:")
                print("=" * 50)

                for idx, (_, row) in enumerate(sample_df.iterrows(), 1):
                    post_id = row[id_column] if id_column else f"Post_{idx}"
                    raw_text = row[text_column]
                    # Missing text arrives as NaN, which has no len(); show it as empty.
                    text = "" if pd.isna(raw_text) else str(raw_text)

                    print(f"\n{idx}. Post ID: {post_id}")
                    print(f"   Length: {len(text)} characters")

                    # Show labels if available
                    entry = groundtruth_data.get(str(post_id))
                    if entry is not None and 'labels' in entry:
                        print(f"   Labels: {', '.join(entry['labels'])}")

                    # Show text (truncated)
                    display_text = text[:300] + "..." if len(text) > 300 else text
                    print(f"   Text: {display_text}")
                    print("-" * 50)

            except Exception as e:
                print(f"‚ùå Error exploring data: {e}")

    explore_button.on_click(on_explore_clicked)

    # Layout
    controls = widgets.VBox([
        widgets.HTML("<h3>Interactive Data Explorer</h3>"),
        widgets.HBox([sample_size, filter_by_labels]),
        label_filter,
        search_text,
        explore_button
    ])

    return widgets.VBox([controls, output])

# Create and display interactive explorer
# The explorer factory returns None when no text column is found, hence the
# extra truthiness check before displaying it.
if posts_df is not None and groundtruth_data is not None:
    print("\nüîç Interactive Data Explorer:")
    data_explorer = create_interactive_data_explorer()
    if data_explorer:
        display(data_explorer)
else:
    print("‚ö†Ô∏è  Load data first to use the interactive explorer")

## Data Summary and Export

Generate a summary report of the loaded dataset. The cell below only prints and returns summary statistics; it does not write any export files.

In [None]:
def generate_data_summary_report():
    """Print a compact summary of the loaded posts and groundtruth labels.

    Reads the module-level ``posts_df`` and ``groundtruth_data``. The report
    covers the number of posts, the number of groundtruth entries and their
    ratio, mean/median character and word counts of the non-null texts, and
    the number of distinct labels and label instances.

    The coverage ratio is reported as 0.0% for an empty posts frame instead
    of raising ``ZeroDivisionError``.

    Returns:
        dict with ``total_posts``, ``posts_with_groundtruth`` (the number of
        groundtruth entries), ``unique_labels`` and ``label_counts``
        (Counter); or ``None`` when data is not loaded or no text column is
        found.
    """
    if posts_df is None or groundtruth_data is None:
        print("‚ùå No data loaded. Please load data first.")
        return

    print("üìã Data Summary Report")
    print("=" * 50)

    # Pick the first recognised text column name.
    text_column = next(
        (col for col in ('text', 'content', 'post_text', 'message') if col in posts_df.columns),
        None,
    )

    if text_column is None:
        print("‚ùå No text column found")
        return

    total_posts = len(posts_df)
    groundtruth_entries = len(groundtruth_data)
    # NOTE(review): this is entries-per-post, not an ID-level overlap, so it
    # can exceed 100% if the groundtruth holds IDs absent from the posts.
    coverage = groundtruth_entries / total_posts * 100 if total_posts else 0.0

    # Basic statistics
    print(f"\nüìä Dataset Overview:")
    print(f"   Total posts: {total_posts:,}")
    print(f"   Posts with groundtruth: {groundtruth_entries:,}")
    print(f"   Coverage: {coverage:.1f}%")

    # Text statistics
    texts = posts_df[text_column].dropna()
    char_counts = texts.str.len()
    word_counts = texts.str.split().str.len()

    print(f"\nüìù Text Statistics:")
    print(f"   Character count - Mean: {char_counts.mean():.1f}, Median: {char_counts.median():.1f}")
    print(f"   Word count - Mean: {word_counts.mean():.1f}, Median: {word_counts.median():.1f}")

    # Label statistics over every entry that has a 'labels' key.
    all_labels = [
        label
        for entry in groundtruth_data.values()
        if 'labels' in entry
        for label in entry['labels']
    ]
    label_counts = Counter(all_labels)

    print(f"\nüè∑Ô∏è  Label Statistics:")
    print(f"   Unique labels: {len(label_counts)}")
    print(f"   Total label instances: {len(all_labels):,}")

    print(f"\n‚úÖ Data summary report complete!")

    return {
        'total_posts': total_posts,
        'posts_with_groundtruth': groundtruth_entries,
        'unique_labels': len(label_counts),
        'label_counts': label_counts
    }

# Generate summary report
# `summary_report` is only assigned when both the posts and the groundtruth
# were loaded.
if posts_df is not None and groundtruth_data is not None:
    summary_report = generate_data_summary_report()
else:
    print("‚ö†Ô∏è  Load data first to generate summary report")

# Closing banner: printed unconditionally, whether or not data was loaded.
print("\n‚úÖ Data Processing and Exploration notebook complete!")
print("\nThis notebook provides:")
print("‚Ä¢ Interactive data configuration and loading")
print("‚Ä¢ Comprehensive text and label analysis")
print("‚Ä¢ Data quality assessment and recommendations")
print("‚Ä¢ Interactive data exploration tools")
print("‚Ä¢ Data preprocessing pipeline testing")