Data Set Downloading

In [2]:
pip install datasets==3.6.0

Note: you may need to restart the kernel to use updated packages.


ERROR: Ignored the following versions that require a different python version: 3.2.0 Requires-Python >=3.9.0; 3.3.0 Requires-Python >=3.9.0; 3.3.1 Requires-Python >=3.9.0; 3.3.2 Requires-Python >=3.9.0; 3.4.0 Requires-Python >=3.9.0; 3.4.1 Requires-Python >=3.9.0; 3.5.0 Requires-Python >=3.9.0; 3.5.1 Requires-Python >=3.9.0; 3.6.0 Requires-Python >=3.9.0; 4.0.0 Requires-Python >=3.9.0
ERROR: Could not find a version that satisfies the requirement datasets==3.6.0 (from versions: 0.0.9, 1.0.0, 1.0.1, 1.0.2, 1.1.0, 1.1.1, 1.1.2, 1.1.3, 1.2.0, 1.2.1, 1.3.0, 1.4.0, 1.4.1, 1.5.0, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.8.0, 1.9.0, 1.10.0, 1.10.1, 1.10.2, 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 1.13.2, 1.13.3, 1.14.0, 1.15.0, 1.15.1, 1.16.0, 1.16.1, 1.17.0, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 2.0.0, 2.1.0, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.3.2, 2.4.0, 2.5.0, 2.5.1, 2.5.2, 2.6.0, 2.6.1, 2.6.2, 2.7.0, 2.7.1, 2.8.0, 2.9.0, 2.10.0, 2.10.1, 2.11.0, 2.12.0, 2.13.0, 2.13.1, 2.13.2, 2.14.0, 2.14.1, 2.14

In [3]:
from huggingface_hub import login
import os

# The Hub access token is read from the `Token_ID` environment variable so the
# secret never has to be written into the notebook.
token = os.getenv("Token_ID")
if token is None:
    # Without a token, huggingface_hub's login() asks for one interactively
    # instead of failing; say so, otherwise a missing variable goes unnoticed.
    print("⚠️ Environment variable 'Token_ID' is not set; "
          "login() will ask for a HuggingFace token interactively.")
login(token)

In [4]:
# Simple HuggingFace Dataset Downloader
# Downloads datasets directly from HuggingFace Hub and saves them locally

import os
import json
from pathlib import Path
from typing import List, Dict, Any
from tqdm import tqdm
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import gc

class HuggingFaceDataDownloader:
    """Stream text datasets from the HuggingFace Hub into local JSONL files.

    Each dataset is opened with ``streaming=True``, every example is reduced
    to a single text string, and the kept texts are appended to
    ``<save_dir>/<name>_<split>.jsonl`` as ``{"text": ...}`` records.
    ``create_training_dataset`` later tokenizes every ``*.jsonl`` file in
    ``save_dir`` and stores the token-id lists with ``torch.save``.

    Attributes:
        save_dir: Directory (created on construction) that receives all files.
        target_size_gb / target_size_bytes: Overall download budget.
        tokenizer: The "gpt2" tokenizer, with the EOS token used for padding.
        downloaded_size: Running total of UTF-8 bytes of all kept texts,
            accumulated across every ``download_dataset`` call.
        stats: Per-dataset and total statistics, written to
            ``download_stats.json`` by ``create_training_dataset``.
    """

    # Texts shorter than this many characters are ignored, both while
    # downloading and while building the training set.
    MIN_TEXT_CHARS = 50

    # Number of kept texts buffered in memory before they are appended to disk.
    SAVE_BATCH_SIZE = 1000

    # Original selection of datasets. It is kept for reference only:
    # download_popular_datasets() iterates ALTERNATIVE_DATASETS.
    PRIMARY_DATASETS = [
        # ----------------------
        # Text datasets
        # ----------------------
        {"name": "bookcorpusopen", "config": None, "max_size": 3.0},  # Legal replacement for BookCorpus
        {"name": "wikitext", "config": "wikitext-103-raw-v1", "max_size": 2.0},  # Wikipedia text
        {"name": "oscar", "config": "unshuffled_deduplicated_en", "max_size": 4.0},  # Web-crawled English text
        {"name": "openwebtext", "config": None, "max_size": 4.0},

        # ----------------------
        # Q&A / Instruction datasets
        # ----------------------
        {"name": "trivia_qa", "config": None, "max_size": 2.0},  # Fact-based Q&A
        {"name": "natural_questions", "config": "simplified_open", "max_size": 2.0},  # Google NQ, question-answer pairs
        {"name": "hotpot_qa", "config": "distractor", "max_size": 2.0},  # Multi-hop Q&A dataset
        {"name": "wiki_qa", "config": None, "max_size": 1.0},  # Smaller Q&A dataset for guaranteed download

        # ----------------------
        # News / Summarization datasets
        # ----------------------
        {"name": "cnn_dailymail", "config": "3.0.0", "max_size": 3.0},  # Long news summarization
        {"name": "xsum", "config": None, "max_size": 1.0},  # Short news summaries

        # ----------------------
        # Code datasets
        # ----------------------
        {"name": "code_search_net", "config": "python", "max_size": 2.0},  # Python code + docstrings
        {"name": "codeparrot/github-code-clean", "config": None, "max_size": 3.0},
        {"name": "codeparrot-small", "config": None, "max_size": 2.0},  # Cleaned GitHub Python code

        # ----------------------
        # Educational / Instruction datasets
        # ----------------------
        {"name": "stackexchange", "config": "stackoverflow", "max_size": 2.0},  # StackOverflow Q&A for instruction-style fine-tuning
        {"name": "pubmed_qa", "config": None, "max_size": 1.5},  # Scientific question-answering
        {"name": "scientific_papers", "config": "pubmed", "max_size": 2.0},
    ]

    # Datasets actually fetched by download_popular_datasets(), in this order.
    ALTERNATIVE_DATASETS = [
        {"name": "c4", "config": "en", "max_size": 5.0},                         # Very high quality web text
        {"name": "squad", "config": None, "max_size": 1.0},                      # Q&A
        {"name": "amazon_polarity", "config": None, "max_size": 1.0},            # Reviews
        {"name": "EleutherAI/pile", "config": "all", "max_size": 5.0},  # Diverse high-quality text
        {"name": "tiiuae/falcon-refinedweb", "config": None, "max_size": 4.0},  # High-quality web text
        {"name": "togethercomputer/RedPajama-Data-1T", "config": "default", "max_size": 5.0},  # LLaMA-style data
    ]

    def __init__(self, save_directory: str = "./downloaded_datasets", target_size_gb: float = 20.0):
        """Create the save directory, load the tokenizer and reset counters.

        Args:
            save_directory: Directory for JSONL files, the tokenized dataset
                and the stats file. Created if it does not exist.
            target_size_gb: Overall budget, in GiB of UTF-8 text, after which
                downloading stops.
        """
        self.save_dir = Path(save_directory)
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.target_size_gb = target_size_gb
        self.target_size_bytes = target_size_gb * 1024**3
        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
        # Reuse EOS as the padding token so a pad token is always defined.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.downloaded_size = 0
        self.stats = {
            'datasets': {},
            'total_size_gb': 0,
            'total_texts': 0
        }

        print("📥 HuggingFace Dataset Downloader")
        print(f"🎯 Target: {target_size_gb} GB")
        print(f"📁 Save to: {self.save_dir}")

    def download_dataset(self, dataset_name: str, config: str = None,
                        split: str = "train", max_size_gb: float = 5.0) -> bool:
        """Stream one dataset and append its usable texts to a JSONL file.

        Streaming stops when the dataset is exhausted, when this dataset has
        contributed ``max_size_gb`` of text, or when the overall target is
        reached. Records are *appended* to the output file, so running the
        same download twice duplicates its contents.

        Args:
            dataset_name: Hub dataset identifier.
            config: Optional dataset configuration name.
            split: Split to stream.
            max_size_gb: Cap, in GiB of UTF-8 text, for this dataset.

        Returns:
            True when streaming finished without an exception, False
            otherwise. Exceptions are reported with ``print`` and not
            re-raised. Texts gathered before a failure are still written.
        """
        print(f"\n📚 Downloading {dataset_name}")
        if config:
            print(f"   Config: {config}")
        print(f"   Split: {split}")
        print(f"   Max size: {max_size_gb} GB")

        try:
            # Load dataset lazily; examples are fetched while iterating.
            if config:
                dataset = load_dataset(dataset_name, config, split=split, streaming=True)
            else:
                dataset = load_dataset(dataset_name, split=split, streaming=True)

            # Build a filesystem-friendly file name from the dataset id.
            safe_name = dataset_name.replace("/", "_").replace("-", "_")
            if config:
                safe_name += f"_{config}"
            output_file = self.save_dir / f"{safe_name}_{split}.jsonl"

            max_bytes = max_size_gb * 1024**3
            pending = []        # kept texts not yet written to disk
            current_size = 0    # UTF-8 bytes kept from this dataset
            count = 0           # examples seen, including skipped ones
            kept = 0            # examples whose text was kept

            try:
                for example in tqdm(dataset, desc=f"Downloading {dataset_name}"):
                    count += 1

                    text = self._extract_text(example)
                    if not text or len(text) < self.MIN_TEXT_CHARS:
                        continue

                    text_bytes = len(text.encode('utf-8'))
                    pending.append(text)
                    kept += 1
                    current_size += text_bytes
                    # Count towards the overall budget straight away so the
                    # text that trips the per-dataset cap is included too.
                    self.downloaded_size += text_bytes

                    dataset_full = current_size >= max_bytes
                    if len(pending) >= self.SAVE_BATCH_SIZE or dataset_full:
                        # Hand the batch over before writing so a failed write
                        # is not retried (and partly duplicated) below.
                        batch, pending = pending, []
                        self._save_batch(batch, output_file)

                    if dataset_full:
                        print(f"   ⚠️ Reached {max_size_gb}GB limit for {dataset_name}")
                        break

                    if self.downloaded_size >= self.target_size_bytes:
                        print(f"   🎯 Reached overall target of {self.target_size_gb}GB!")
                        break
            finally:
                # Write whatever is still buffered, including when the stream
                # raised or the run was interrupted, so the bytes counted in
                # downloaded_size are actually on disk.
                if pending:
                    self._save_batch(pending, output_file)

            # Update stats
            size_gb = current_size / 1024**3
            self.stats['datasets'][dataset_name] = {
                'size_gb': size_gb,
                'items': count,
                'texts_saved': kept,
                'file': str(output_file)
            }

            print(f"   ✅ Downloaded {size_gb:.2f} GB from {dataset_name}")
            return True

        except Exception as e:
            print(f"   ❌ Failed to download {dataset_name}: {e}")
            return False

    def _extract_text(self, example: Dict[str, Any]) -> str:
        """Reduce one dataset example to a single text string.

        The first non-empty common text field wins. Otherwise a
        context/question pair or a code/docstring pair is formatted, and as a
        last resort the whole example is stringified.

        Args:
            example: One record yielded by the dataset.

        Returns:
            The extracted text; may be empty.
        """
        # Common text field names, checked in this order.
        text_fields = ['text', 'content', 'article', 'body', 'passage', 'document']

        for field in text_fields:
            if field in example and example[field]:
                return str(example[field]).strip()

        # For Q&A datasets
        if 'context' in example and 'question' in example:
            context = example.get('context', '')
            question = example.get('question', '')
            return f"Context: {context}\n\nQuestion: {question}"

        # For code datasets
        if 'func_code_string' in example:
            # `or ''` keeps the return type a string when the field is None.
            code = example.get('func_code_string') or ''
            doc = example.get('func_documentation_string', '')
            return f"# {doc}\n{code}" if doc else code

        # Fallback: convert whole example to string
        return str(example)

    def _save_batch(self, texts: List[str], output_file: Path):
        """Append ``texts`` to ``output_file``, one ``{"text": ...}`` JSON object per line."""
        with open(output_file, 'a', encoding='utf-8') as f:
            for text in texts:
                f.write(json.dumps({'text': text}) + '\n')

    def download_popular_datasets(self) -> Dict[str, bool]:
        """Download the datasets listed in ``ALTERNATIVE_DATASETS``.

        Datasets are tried in list order until the overall target size has
        been reached. ``PRIMARY_DATASETS`` is not iterated here.

        Returns:
            Mapping of dataset name to the success flag returned by
            ``download_dataset`` for every dataset that was attempted.
        """
        results = {}

        for dataset_info in self.ALTERNATIVE_DATASETS:
            if self.downloaded_size >= self.target_size_bytes:
                print(f"\n🎯 Reached target size of {self.target_size_gb}GB!")
                break

            name = dataset_info["name"]
            config = dataset_info["config"]
            max_size = dataset_info["max_size"]

            success = self.download_dataset(name, config, max_size_gb=max_size)
            results[name] = success

            # Memory cleanup
            gc.collect()

        return results

    def create_training_dataset(self, output_file: str = "training_dataset.pt") -> str:
        """Tokenize every downloaded JSONL file into one ``torch.save`` file.

        All ``*.jsonl`` files in ``save_dir`` are read in sorted order, so the
        result does not depend on directory listing order. All texts are held
        in memory at once. Each text is truncated to 1024 tokens. Final
        statistics are written to ``download_stats.json`` in ``save_dir``.

        Args:
            output_file: File name, relative to ``save_dir``, of the tokenized
                dataset.

        Returns:
            Path of the written tokenized dataset, as a string.
        """
        print(f"\n🔧 Creating combined training dataset...")

        all_texts = []
        skipped_lines = 0

        # Read all downloaded files
        for jsonl_file in sorted(self.save_dir.glob("*.jsonl")):
            print(f"   📖 Reading {jsonl_file.name}...")

            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for line in tqdm(f, desc=f"Loading {jsonl_file.name}"):
                    if not line.strip():
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        # Only malformed JSON is skipped; other errors and
                        # interrupts are allowed to propagate.
                        skipped_lines += 1
                        continue

                    text = data.get('text', '') if isinstance(data, dict) else ''
                    # Same minimum length as the download filter, so every
                    # text that was saved is also used here.
                    if isinstance(text, str) and len(text) >= self.MIN_TEXT_CHARS:
                        all_texts.append(text)

        if skipped_lines:
            print(f"   ⚠️ Skipped {skipped_lines:,} malformed lines")
        print(f"   📊 Total texts: {len(all_texts):,}")

        # Tokenize
        print(f"   🔤 Tokenizing...")
        tokenized_texts = []
        batch_size = 1000

        for i in tqdm(range(0, len(all_texts), batch_size), desc="Tokenizing"):
            batch = all_texts[i:i+batch_size]
            try:
                tokens = self.tokenizer(
                    batch,
                    truncation=True,
                    max_length=1024,
                    padding=False,
                    return_attention_mask=False
                )['input_ids']
                tokenized_texts.extend(tokens)
            except Exception as e:
                # A failing batch is reported and left out of the output.
                print(f"      ⚠️ Tokenization error: {e}")

            # Memory cleanup
            if i % 5000 == 0:
                gc.collect()

        # Save tokenized dataset
        output_path = self.save_dir / output_file
        torch.save(tokenized_texts, output_path)

        # Update final stats
        self.stats['total_size_gb'] = self.downloaded_size / 1024**3
        self.stats['total_texts'] = len(all_texts)
        self.stats['total_tokens'] = sum(len(tokens) for tokens in tokenized_texts)

        # Save stats
        with open(self.save_dir / "download_stats.json", 'w', encoding='utf-8') as f:
            json.dump(self.stats, f, indent=2)

        print(f"\n✅ Training dataset created!")
        print(f"   📁 File: {output_path}")
        print(f"   📊 Size: {self.stats['total_size_gb']:.2f} GB")
        print(f"   📝 Texts: {self.stats['total_texts']:,}")
        print(f"   🔤 Tokens: {self.stats['total_tokens']:,}")

        return str(output_path)

# Quick usage function
def download_training_data(target_gb: float = 20.0, save_dir: str = "./training_data"):
    """Download the dataset collection and build the tokenized training file.

    Args:
        target_gb: Overall download budget handed to the downloader.
        save_dir: Directory that receives every generated file.

    Returns:
        Path of the combined training dataset, as returned by
        ``create_training_dataset``.
    """
    fetcher = HuggingFaceDataDownloader(save_directory=save_dir, target_size_gb=target_gb)

    print("🚀 Starting HuggingFace dataset downloads...")
    outcomes = fetcher.download_popular_datasets()

    # One line per attempted dataset, marked with its success flag.
    print(f"\n📊 Download Results:")
    for name, ok in outcomes.items():
        print(f"   {'✅' if ok else '❌'} {name}")

    # Merge and tokenize everything that was written to disk.
    return fetcher.create_training_dataset()

# Confirmation banner shown once the cell has defined the downloader.
print(
    "📥 HuggingFace Dataset Downloader Ready!",
    "💡 Usage: download_training_data(target_gb=20.0)",
    "🔄 This will download popular datasets directly from HuggingFace Hub",
    sep="\n",
)

📥 HuggingFace Dataset Downloader Ready!
💡 Usage: download_training_data(target_gb=20.0)
🔄 This will download popular datasets directly from HuggingFace Hub


In [5]:
# Fetch up to 20 GB of text into ./my_training_data and keep the path of the
# tokenized training file that is produced.
training_file = download_training_data(
    target_gb=20.0,
    save_dir="./my_training_data",
)

RobertaTokenizerFast has an issue when working on mask language modeling where it introduces an extra encoded space before the mask token.See https://github.com/huggingface/transformers/pull/2778 for more information.


📥 HuggingFace Dataset Downloader
🎯 Target: 20.0 GB
📁 Save to: my_training_data
🚀 Starting HuggingFace dataset downloads...

📚 Downloading c4
   Config: en
   Split: train
   Max size: 5.0 GB
   ❌ Failed to download c4: The repository for c4 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/c4.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.

📚 Downloading squad
   Split: train
   Max size: 1.0 GB
   ❌ Failed to download squad: module 'transformers' has no attribute 'PreTrainedTokenizerBase'

📚 Downloading amazon_polarity
   Split: train
   Max size: 1.0 GB


README.md: 0.00B [00:00, ?B/s]

   ❌ Failed to download amazon_polarity: module 'transformers' has no attribute 'PreTrainedTokenizerBase'

📚 Downloading EleutherAI/pile
   Config: all
   Split: train
   Max size: 5.0 GB


README.md: 0.00B [00:00, ?B/s]

pile.py: 0.00B [00:00, ?B/s]

   ❌ Failed to download EleutherAI/pile: The repository for EleutherAI/pile contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/EleutherAI/pile.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.

📚 Downloading tiiuae/falcon-refinedweb
   Split: train
   Max size: 4.0 GB


README.md: 0.00B [00:00, ?B/s]

KeyboardInterrupt: 