In [None]:
!pip install --upgrade fsspec datasets pandas

In [2]:
import pandas as pd
from datasets import load_dataset
import sys

In [16]:
print("Loading GLUE MRPC dataset...")
# Load the "mrpc" configuration of the "glue" dataset; later cells read
# `raw_datasets`.
raw_datasets = load_dataset(path="glue", name="mrpc")

Loading GLUE MRPC dataset...


In [17]:
# Confirm the load and show the dataset summary. The empty final argument
# reproduces the blank separator line.
print(
    "✅ Dataset loaded successfully!",
    f"Dataset structure: {raw_datasets}",
    "",
    sep="\n",
)

✅ Dataset loaded successfully!
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})



In [4]:
# Report how many examples each split holds.
print("Available splits:")
for split_name in raw_datasets:
    split_data = raw_datasets[split_name]
    print(f"  - {split_name}: {len(split_data)} examples")
print()

Available splits:
  - train: 3668 examples
  - validation: 408 examples
  - test: 1725 examples



In [5]:
# Keep a handle on the training split; later cells read `train_data`.
train_data = raw_datasets['train']
print(
    "Training data info:",
    f"  - Number of examples: {len(train_data)}",
    f"  - Features: {train_data.features}",
    "",
    sep="\n",
)

Training data info:
  - Number of examples: 3668
  - Features: {'sentence1': Value('string'), 'sentence2': Value('string'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), 'idx': Value('int32')}



In [11]:
# Show the first few training examples. The heading is built from the same
# count that drives the loop, so the two can no longer disagree (the heading
# used to say 3 while 5 examples were shown).
num_preview = min(5, len(train_data))
print(f"First {num_preview} examples from training set:")
for i in range(num_preview):
    example = train_data[i]
    print(f"\nExample {i+1}:")
    for key, value in example.items():
        print(f"  {key}: {value}")

First 3 examples from training set:

Example 1:
  sentence1: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
  sentence2: Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
  label: 1
  idx: 0

Example 2:
  sentence1: Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .
  sentence2: Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .
  label: 0
  idx: 1

Example 3:
  sentence1: They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .
  sentence2: On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .
  label: 1
  idx: 2

Example 4:
  sentence1: Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .
  sentence2: 

In [7]:
 # Convert to pandas for easier exploration
# Banner, then build a DataFrame from the training split (`train_df` is reused
# by later cells).
print(f"\n{'=' * 50}")
print("Converting to pandas DataFrame for easier exploration...")
train_df = pd.DataFrame(train_data)

# Shape and column names.
print(
    f"\nDataFrame shape: {train_df.shape}",
    f"Columns: {list(train_df.columns)}",
    sep="\n",
)

# Summary statistics as reported by DataFrame.describe().
print("\nDataset statistics:", train_df.describe(), sep="\n")

# Class balance; guarded in case the frame has no 'label' column.
if 'label' in train_df.columns:
    print("\nLabel distribution:", train_df['label'].value_counts(), sep="\n")


Converting to pandas DataFrame for easier exploration...

DataFrame shape: (3668, 4)
Columns: ['sentence1', 'sentence2', 'label', 'idx']

Dataset statistics:
             label          idx
count  3668.000000  3668.000000
mean      0.674482  2039.858233
std       0.468632  1176.050149
min       0.000000     0.000000
25%       0.000000  1022.750000
50%       1.000000  2039.500000
75%       1.000000  3054.250000
max       1.000000  4075.000000

Label distribution:
label
1    2474
0    1194
Name: count, dtype: int64


In [8]:
# Peek at the top of the frame.
print("\nSample data (first 5 rows):", train_df.head(5), sep="\n")


Sample data (first 5 rows):
                                           sentence1  \
0  Amrozi accused his brother , whom he called " ...   
1  Yucaipa owned Dominick 's before selling the c...   
2  They had published an advertisement on the Int...   
3  Around 0335 GMT , Tab shares were up 19 cents ...   
4  The stock rose $ 2.11 , or about 11 percent , ...   

                                           sentence2  label  idx  
0  Referring to him as only " the witness " , Amr...      1    0  
1  Yucaipa bought Dominick 's in 1995 for $ 693 m...      0    1  
2  On June 10 , the ship 's owners had published ...      1    2  
3  Tab shares jumped 20 cents , or 4.6 % , to set...      0    3  
4  PG & E Corp. shares jumped $ 1.63 or 8 percent...      1    4  


In [9]:
# Show the first few training examples.
# The loop is bounded by the split that is actually indexed (`train_data`);
# it used to be bounded by the length of the *test* split. The heading is
# derived from the same count, so it matches what is printed.
num_preview = min(5, len(train_data))
print(f"First {num_preview} examples from training set:")
for i in range(num_preview):
    example = train_data[i]
    print(f"\nExample {i+1}:")
    for key, value in example.items():
        print(f"  {key}: {value}")

First 3 examples from training set:

Example 1:
  sentence1: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
  sentence2: Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
  label: 1
  idx: 0

Example 2:
  sentence1: Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .
  sentence2: Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .
  label: 0
  idx: 1

Example 3:
  sentence1: They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .
  sentence2: On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .
  label: 1
  idx: 2

Example 4:
  sentence1: Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .
  sentence2: 

In [10]:
# Install required packages
# Run this in your terminal or notebook:
# pip install datasets pandas

import pandas as pd
from datasets import load_dataset
import sys

def load_and_explore_dataset():
    """Load GLUE MRPC and print a guided tour of its contents.

    The tour covers the per-split sizes, the training-split features, up to
    three training examples, and a pandas view of the training split (shape,
    columns, ``describe()``, label counts and ``head()``).

    Returns:
        tuple: ``(raw_datasets, train_df)`` on success. If any step raises,
        the error and a list of troubleshooting hints are printed and
        ``(None, None)`` is returned instead.
    """
    try:
        print("Loading GLUE MRPC dataset...")
        glue_mrpc = load_dataset("glue", "mrpc")

        print("✅ Dataset loaded successfully!")
        print(f"Dataset structure: {glue_mrpc}")
        print()

        # Per-split example counts.
        print("Available splits:")
        for split_name in glue_mrpc:
            print(f"  - {split_name}: {len(glue_mrpc[split_name])} examples")
        print()

        # Training split overview.
        train_split = glue_mrpc['train']
        print("Training data info:")
        print(f"  - Number of examples: {len(train_split)}")
        print(f"  - Features: {train_split.features}")
        print()

        # At most three examples, numbered from 1 for display.
        print("First 3 examples from training set:")
        preview_count = min(3, len(train_split))
        for position in range(preview_count):
            record = train_split[position]
            print(f"\nExample {position + 1}:")
            for field, content in record.items():
                print(f"  {field}: {content}")

        # Tabular view of the training split.
        print("\n" + "=" * 50)
        print("Converting to pandas DataFrame for easier exploration...")
        train_frame = pd.DataFrame(train_split)

        print(f"\nDataFrame shape: {train_frame.shape}")
        print(f"Columns: {list(train_frame.columns)}")

        print("\nDataset statistics:")
        print(train_frame.describe())

        # Label counts are only shown when the column is present.
        if 'label' in train_frame.columns:
            print("\nLabel distribution:")
            print(train_frame['label'].value_counts())

        print("\nSample data (first 5 rows):")
        print(train_frame.head())

        return glue_mrpc, train_frame

    except Exception as err:
        print(f"❌ Error loading dataset: {err}")
        print(f"Error type: {type(err).__name__}")

        # Common things to check when the load fails.
        print("\n🔧 Troubleshooting steps:")
        hints = (
            "1. Make sure you have internet connection",
            "2. Try updating the datasets library: pip install --upgrade datasets",
            "3. Check if the dataset name is correct",
            "4. Try a different dataset first, like: load_dataset('imdb')",
        )
        for hint in hints:
            print(hint)

        return None, None

def try_alternative_datasets():
    """Attempt a short list of fallback datasets, stopping at the first success.

    Candidates are tried in order: ``imdb``, ``squad`` and ``wikitext``
    (configuration ``wikitext-2-raw-v1``). For the first one that loads, its
    structure and the first example of its first split are printed.

    Returns:
        The first dataset that loaded, or ``None`` if every attempt raised.
    """
    candidates = (
        ("imdb", None),  # Movie reviews
        ("squad", None),  # Question answering
        ("wikitext", "wikitext-2-raw-v1"),  # Text corpus
    )

    print("Trying alternative datasets...")

    for dataset_name, config in candidates:
        try:
            print(f"\nTrying {dataset_name}...")
            # Pass a configuration name only when one is specified.
            load_args = (dataset_name, config) if config else (dataset_name,)
            loaded = load_dataset(*load_args)

            print(f"✅ {dataset_name} loaded successfully!")
            print(f"Structure: {loaded}")

            # First example of whichever split is listed first.
            split_names = list(loaded.keys())
            sample = loaded[split_names[0]][0]
            print(f"First example: {sample}")

            return loaded

        except Exception as err:
            # A failure is reported and the next candidate is tried.
            print(f"❌ Failed to load {dataset_name}: {err}")

    return None

if __name__ == "__main__":
    print("Hugging Face Dataset Loader")
    print("=" * 40)

    # Try to load the original dataset
    raw_datasets, train_df = load_and_explore_dataset()

    # If that fails, try alternatives. `alternative_dataset` is always defined
    # so that later code can test it safely.
    alternative_dataset = None
    if raw_datasets is None:
        print("\n" + "="*50)
        # No banner is printed here: try_alternative_datasets() prints its own
        # "Trying alternative datasets..." line, which used to appear twice.
        alternative_dataset = try_alternative_datasets()

    # Only claim success when something was actually loaded.
    if raw_datasets is None and alternative_dataset is None:
        print("\n❌ No dataset could be loaded.")
    else:
        print("\n🎉 Dataset exploration complete!")

    # The hints below refer to `raw_datasets`, so they are only shown when
    # that variable holds a dataset.
    if raw_datasets is not None:
        print("\nNext steps you can try:")
        print("- Access specific examples: raw_datasets['train'][0]")
        print("- Convert to pandas: pd.DataFrame(raw_datasets['train'])")
        print("- Explore features: raw_datasets['train'].features")
        print("- Filter data: raw_datasets['train'].filter(lambda x: x['label'] == 1)")

Hugging Face Dataset Loader
Loading GLUE MRPC dataset...
✅ Dataset loaded successfully!
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

Available splits:
  - train: 3668 examples
  - validation: 408 examples
  - test: 1725 examples

Training data info:
  - Number of examples: 3668
  - Features: {'sentence1': Value('string'), 'sentence2': Value('string'), 'label': ClassLabel(names=['not_equivalent', 'equivalent']), 'idx': Value('int32')}

First 3 examples from training set:

Example 1:
  sentence1: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
  sentence2: Referring to him as only " the witness " , Amrozi

# Removing `metadata.widgets` for GitHub

In [15]:
# Fix for Jupyter Notebook Widget Metadata Error
# "the 'state' key is missing from 'metadata.widgets'"

import json
import os
import shutil
from pathlib import Path

def fix_notebook_widgets(notebook_path):
    """
    Repair a notebook's widget metadata in place.

    A backup copy (``<name>_backup<ext>``) is written next to the notebook
    first. The notebook-level ``metadata.widgets`` entry is then normalised to

        {"application/vnd.jupyter.widget-state+json":
            {"state": {...}, "version_major": 2, "version_minor": 0}}

    and every cell-level ``metadata.widgets`` entry is deleted. If anything
    fails after the backup was taken, the error is printed and the backup is
    copied back over the notebook.

    Args:
        notebook_path (str): Path to the .ipynb file

    Raises:
        OSError: If the backup copy cannot be created (for example, the
            notebook does not exist). This happens before the guarded section.
    """
    widget_mime = "application/vnd.jupyter.widget-state+json"
    version_keys = ("version_major", "version_minor")

    # Make a backup first. Only the final extension is touched: the previous
    # str.replace('.ipynb', ...) also rewrote directory names such as
    # '.ipynb_checkpoints' and produced the *same* path when the extension
    # was missing.
    root, ext = os.path.splitext(notebook_path)
    backup_path = f"{root}_backup{ext}"
    shutil.copy2(notebook_path, backup_path)
    print(f"✅ Backup created: {backup_path}")

    try:
        # Read the notebook
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)

        print(f"📖 Loaded notebook: {notebook_path}")

        # Check if widgets metadata exists
        if 'metadata' in notebook and 'widgets' in notebook['metadata']:
            print("🔍 Found widget metadata, checking structure...")

            widgets = notebook['metadata']['widgets']
            entry = widgets.get(widget_mime) if isinstance(widgets, dict) else None

            if isinstance(entry, dict):
                # The MIME entry exists, so repair it in place and leave any
                # sibling keys of `widgets` untouched.
                if isinstance(entry.get('state'), dict):
                    print("🔧 Ensuring proper widget structure...")
                else:
                    print("🔧 Adding missing 'state' key...")
                    # Keys stored directly under the MIME entry are assumed to
                    # be widget models saved without the 'state' wrapper, so
                    # they are nested under 'state' rather than discarded.
                    loose_keys = [
                        key for key in entry
                        if key != 'state' and key not in version_keys
                    ]
                    entry['state'] = {key: entry.pop(key) for key in loose_keys}
                entry.setdefault('version_major', 2)
                entry.setdefault('version_minor', 0)
            else:
                if isinstance(widgets, dict):
                    # A dict without the MIME entry: wrap whatever top-level
                    # 'state' it carries into the expected structure.
                    print("🔧 Ensuring proper widget structure...")
                    state = widgets.get('state', {})
                else:
                    print("🔧 Fixing malformed widgets structure...")
                    state = {}
                notebook['metadata']['widgets'] = {
                    widget_mime: {
                        "state": state,
                        "version_major": 2,
                        "version_minor": 0
                    }
                }

        # Clean up cell metadata widgets as well
        if 'cells' in notebook:
            for cell in notebook['cells']:
                if 'metadata' in cell and 'widgets' in cell['metadata']:
                    print("🔧 Cleaning cell widget metadata...")
                    # Remove problematic cell-level widget metadata
                    del cell['metadata']['widgets']

        # Write the fixed notebook
        with open(notebook_path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f, indent=2)

        print("✅ Notebook fixed successfully!")
        print(f"📁 Original saved as: {backup_path}")

    except Exception as e:
        print(f"❌ Error fixing notebook: {e}")
        # Restore backup if something went wrong
        shutil.copy2(backup_path, notebook_path)
        print("🔄 Backup restored due to error")

def remove_all_widgets(notebook_path):
    """
    Alternative solution: Remove all widget metadata completely

    A backup copy (``<name>_backup_no_widgets<ext>``) is written next to the
    notebook first. The ``widgets`` key is then deleted from the notebook-level
    metadata and from every cell's metadata, and the notebook is rewritten in
    place. If anything fails after the backup was taken, the error is printed
    and the backup is copied back over the notebook.

    Args:
        notebook_path (str): Path to the .ipynb file

    Raises:
        OSError: If the backup copy cannot be created (for example, the
            notebook does not exist). This happens before the guarded section.
    """

    # Make a backup first. Only the final extension is touched: the previous
    # str.replace('.ipynb', ...) also rewrote directory names such as
    # '.ipynb_checkpoints' and produced the *same* path when the extension
    # was missing.
    root, ext = os.path.splitext(notebook_path)
    backup_path = f"{root}_backup_no_widgets{ext}"
    shutil.copy2(notebook_path, backup_path)
    print(f"✅ Backup created: {backup_path}")

    try:
        # Read the notebook
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)

        print(f"📖 Loaded notebook: {notebook_path}")

        # Remove widgets from main metadata
        if 'metadata' in notebook and 'widgets' in notebook['metadata']:
            print("🗑️ Removing main widget metadata...")
            del notebook['metadata']['widgets']

        # Remove widgets from cell metadata
        if 'cells' in notebook:
            for cell in notebook['cells']:
                if 'metadata' in cell and 'widgets' in cell['metadata']:
                    print("🗑️ Removing cell widget metadata...")
                    del cell['metadata']['widgets']

        # Write the cleaned notebook
        with open(notebook_path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f, indent=2)

        print("✅ All widgets removed successfully!")
        print(f"📁 Original saved as: {backup_path}")

    except Exception as e:
        print(f"❌ Error removing widgets: {e}")
        # Restore backup if something went wrong
        shutil.copy2(backup_path, notebook_path)
        print("🔄 Backup restored due to error")

def batch_fix_notebooks(directory_path):
    """
    Fix all notebooks in a directory

    Every ``*.ipynb`` file directly inside the directory (no recursion) is
    passed to ``fix_notebook_widgets`` in sorted order. Files whose stem ends
    in ``_backup`` or ``_backup_no_widgets`` are skipped: those are the names
    this module gives to its own backup copies, and processing them again
    would create backups of backups on every run.

    A failure on one notebook is reported and does not stop the batch.

    Args:
        directory_path (str): Path to directory containing .ipynb files
    """

    directory = Path(directory_path)
    backup_stem_suffixes = ('_backup', '_backup_no_widgets')

    # Sorted so the processing order does not depend on the file system.
    notebook_files = []
    for candidate in sorted(directory.glob('*.ipynb')):
        if candidate.stem.endswith(backup_stem_suffixes):
            print(f"⏭️ Skipping backup copy: {candidate.name}")
        else:
            notebook_files.append(candidate)

    if not notebook_files:
        print("❌ No notebook files found in directory")
        return

    print(f"🔍 Found {len(notebook_files)} notebook files")

    for notebook_file in notebook_files:
        print(f"\n{'='*50}")
        print(f"Processing: {notebook_file.name}")
        print(f"{'='*50}")

        try:
            fix_notebook_widgets(str(notebook_file))
        except Exception as e:
            print(f"❌ Failed to fix {notebook_file.name}: {e}")

# ============================================
# Command-line style usage instructions
# ============================================

def print_usage_instructions():
    """Print the usage guide for the widget-metadata helpers.

    The guide lists the three helper calls, a manual alternative, prevention
    tips and GitHub-specific advice. It is written to stdout one line at a
    time; nothing is returned.
    """
    rule = "=" * 50
    blank = ""

    guide = (
        "🚀 HOW TO USE THIS FIX:",
        rule,
        blank,
        "METHOD 1: Fix a single notebook",
        "fix_notebook_widgets('your_notebook.ipynb')",
        blank,
        "METHOD 2: Remove all widgets (safer)",
        "remove_all_widgets('your_notebook.ipynb')",
        blank,
        "METHOD 3: Fix all notebooks in a directory",
        "batch_fix_notebooks('/path/to/your/notebooks')",
        blank,
        "🔧 MANUAL ALTERNATIVE:",
        "1. Open your notebook in Jupyter",
        "2. Go to Kernel -> Restart & Clear Output",
        "3. Save the notebook",
        "4. This removes all widget states",
        blank,
        "💡 PREVENTION TIPS:",
        "- Always clear output before committing: Cell -> All Output -> Clear",
        "- Use 'Restart & Clear Output' before saving",
        "- Add .ipynb_checkpoints/ to .gitignore",
        blank,
        "🐙 GITHUB SPECIFIC:",
        "- GitHub renders notebooks better without widget metadata",
        "- Consider using nbstripout: pip install nbstripout",
        "- Add git filter: nbstripout --install",
    )

    for line in guide:
        print(line)

# ============================================
# Quick fix function for immediate use
# ============================================

def quick_fix(notebook_path):
    """Strip widget metadata from one notebook, with a path check up front.

    If the path does not exist, a message is printed and nothing else
    happens. Otherwise the work is delegated to ``remove_all_widgets``; any
    exception it raises is reported instead of propagated.

    Args:
        notebook_path (str): Path to the .ipynb file
    """
    print(f"🚀 Quick fixing: {notebook_path}")

    target_exists = os.path.exists(notebook_path)
    if target_exists:
        # Removing the metadata outright is the approach the usage guide
        # labels "safer".
        try:
            remove_all_widgets(notebook_path)
            print("✅ Quick fix completed!")
        except Exception as err:
            print(f"❌ Quick fix failed: {err}")
            print("Try the manual method or check the file path")
    else:
        print(f"❌ File not found: {notebook_path}")

# ============================================
# Example usage
# ============================================

if __name__ == "__main__":
    # Placeholder name used in the printed examples - replace with your
    # actual notebook path.
    example_notebook = "your_notebook.ipynb"

    print("Jupyter Notebook Widget Metadata Fixer")
    print("=" * 50)

    # Full usage guide first, then the two shortest ways to apply the fix.
    print_usage_instructions()

    print("\n🎯 TO FIX YOUR NOTEBOOK:")
    print(f"quick_fix('{example_notebook}')")
    print("# OR")
    print(f"remove_all_widgets('{example_notebook}')")

    # If you want to run it directly, uncomment the line below:
    # quick_fix("your_notebook.ipynb")

Jupyter Notebook Widget Metadata Fixer
🚀 HOW TO USE THIS FIX:

METHOD 1: Fix a single notebook
fix_notebook_widgets('your_notebook.ipynb')

METHOD 2: Remove all widgets (safer)
remove_all_widgets('your_notebook.ipynb')

METHOD 3: Fix all notebooks in a directory
batch_fix_notebooks('/path/to/your/notebooks')

🔧 MANUAL ALTERNATIVE:
1. Open your notebook in Jupyter
2. Go to Kernel -> Restart & Clear Output
3. Save the notebook
4. This removes all widget states

💡 PREVENTION TIPS:
- Always clear output before committing: Cell -> All Output -> Clear
- Use 'Restart & Clear Output' before saving
- Add .ipynb_checkpoints/ to .gitignore

🐙 GITHUB SPECIFIC:
- GitHub renders notebooks better without widget metadata
- Consider using nbstripout: pip install nbstripout
- Add git filter: nbstripout --install

🎯 TO FIX YOUR NOTEBOOK:
quick_fix('your_notebook.ipynb')
# OR
remove_all_widgets('your_notebook.ipynb')
