Data Set Downloading

In [1]:
pip install datasets

Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Using cached requests-2.32.4-py3-none-any.whl (64 kB)
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.27.1
    Uninstalling requests-2.27.1:
      Successfully uninstalled requests-2.27.1
Successfully installed requests-2.32.4
Note: you may need to restart the kernel to use updated packages.


In [8]:
from huggingface_hub import login
import os
token = os.getenv("Token_ID")
login(token)

In [None]:
from datasets import load_dataset
import os
from huggingface_hub import login


def collect_dataset(dataset_name, config, split, max_size_gb, output_file, text_key='text', title_key=None):
    print(f"\n{'='*50}")
    print(f"Collecting {max_size_gb}GB from {dataset_name}")
    print(f"{'='*50}")

    ds = load_dataset(dataset_name, config, split=split, streaming=True, trust_remote_code=True,timeout = 60)
    total_size = 0
    max_size = max_size_gb * 1024 * 1024 * 1024
    texts = []
    item_count = 0

    for item in ds:
        if isinstance(text_key, list):
            text = '\n'.join([str(item.get(key, '')) for key in text_key])
        else:
            text = item.get(text_key, '')
        if not text or len(text.strip()) < 50:
            continue
        text_size = len(text.encode('utf-8'))
        if total_size + text_size > max_size:
            print(f"Reached size limit. Stopping at {total_size/1024/1024/1024:.3f}GB")
            break
        if title_key and title_key in item:
            formatted_text = f"=== {item[title_key]} ===\n{text}\n\n"
        else:
            formatted_text = f"{text}\n\n"
        texts.append(formatted_text)
        total_size += len(formatted_text.encode('utf-8'))
        item_count += 1
        if item_count % 1000 == 0:
            current_gb = total_size / (1024 * 1024 * 1024)
            preview = (item.get(title_key, '') or text)[:50].replace('\n', ' ')
            print(f"Items: {item_count:,} | Size: {current_gb:.3f}GB | Preview: {preview}...")
    print(f"Saving {len(texts):,} items to {output_file}...")
    with open(output_file, 'w', encoding='utf-8', errors='replace') as f:
        f.writelines(texts)
    final_size = total_size / (1024 * 1024 * 1024)
    print(f"✅ Saved {item_count:,} items ({final_size:.3f}GB) to {output_file}")
    return {
        'dataset': dataset_name,
        'items': item_count,
        'size_gb': final_size,
        'file': output_file
    }


# collect_all.py
import json
from datetime import datetime


def main():
    print("🚀 Starting dataset collection for LLM training")
    print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    results = []

    datasets = [
        {
            "dataset_name": "bigcode/the-stack",
            "config": "default",
            "split": "train",
            "max_size_gb": 5.0,
            "output_file": "code_python_5gb.txt",
            "text_key": "content"
        },
        {
            "dataset_name": "pg19",
            "config": None,
            "split": "train",
            "max_size_gb": 3.0,
            "output_file": "literature_3gb.txt",
            "text_key": "text"
        },
        {
            "dataset_name": "wikimedia/wikipedia",
            "config": "20231101.en",
            "split": "train",
            "max_size_gb": 3.0,
            "output_file": "wikipedia_3gb.txt",
            "text_key": "text",
            "title_key": "title"
        },
        {
            "dataset_name": "daily_dialog", 
            "config": None,
            "split": "train",
            "max_size_gb": 2.0,
            "output_file": "conversations_2gb.txt",
            "text_key": "dialog"
        },
        {
            "dataset_name": "bigcode/the-stack",
            "config": "default",
            "split": "train",
            "max_size_gb": 2.0,
            "output_file": "code_javascript_2gb.txt",
            "text_key": "content"
        }
    ]

    for ds in datasets:
        try:
            result = collect_dataset(**ds)
            results.append(result)
        except Exception as e:
            print(f"❌ Failed to collect {ds['dataset_name']}: {e}")

    print(f"\n{'='*60}")
    print("📊 COLLECTION SUMMARY")
    print(f"{'='*60}")
    total_size = sum(r['size_gb'] for r in results)
    total_items = sum(r['items'] for r in results)
    for result in results:
        print(f"{result['dataset']:30} | {result['items']:8,} items | {result['size_gb']:6.2f}GB | {result['file']}")
    print(f"{'='*60}")
    print(f"{'TOTAL':30} | {total_items:8,} items | {total_size:6.2f}GB")
    print(f"\nFinished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    with open('dataset_info.json', 'w') as f:
        json.dump({
            'collection_date': datetime.now().isoformat(),
            'total_size_gb': total_size,
            'total_items': total_items,
            'datasets': results
        }, f, indent=2)
    print("📝 Metadata saved to dataset_info.json")
    print("\n🎉 Dataset collection complete!")

if __name__ == "__main__":
    main()

🚀 Starting dataset collection for LLM training
Started at: 2025-08-19 10:01:51

Collecting 5.0GB from bigcode/the-stack


Resolving data files:   0%|          | 0/6824 [00:00<?, ?it/s]

❌ Failed to collect bigcode/the-stack: BuilderConfig ParquetConfig(name='default', version=0.0.0, data_dir=None, data_files={NamedSplit('train'): ['hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/abap/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/actionscript/train-00000-of-00002.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/actionscript/train-00001-of-00002.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/ada/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/agda/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/ags-script/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/alloy/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the

Resolving data files:   0%|          | 0/6824 [00:00<?, ?it/s]

❌ Failed to collect bigcode/the-stack: BuilderConfig ParquetConfig(name='default', version=0.0.0, data_dir=None, data_files={NamedSplit('train'): ['hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/abap/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/actionscript/train-00000-of-00002.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/actionscript/train-00001-of-00002.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/ada/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/agda/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/ags-script/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the-stack@349a71353fd5868fb90b593ef09e311379da498a/data/alloy/train-00000-of-00001.parquet', 'hf://datasets/bigcode/the