In [None]:
# import pandas as pd

In [None]:
import os
import requests
from datasets import Dataset
from PyPDF2 import PdfReader
from io import BytesIO
import time

# Download locations on nvlpubs.nist.gov, keyed by PDF filename.
# Insertion order is the order in which the files are tried.
DIRECT_URLS = {
    "NIST.CSWP.29.pdf":
        "https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf",
    "NIST.SP.800-53r5.pdf":
        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53r5.pdf",
    "NIST.SP.800-61r2.pdf":
        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-61r2.pdf",
    "NIST.SP.800-171r1.pdf":
        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-171r1.pdf",
    "NIST.SP.800-82r2.pdf":
        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-82r2.pdf",
    "NIST.SP.800-63-3.pdf":
        "https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-63-3.pdf",
}

# Filenames of the documents we want: exactly the keys above, in the
# same order, as a plain list.
TARGET_FILES = list(DIRECT_URLS)

def extract_pdf_text(file_url, max_retries=3):
    """Download a PDF and return the text of its pages as one string.

    Download and parsing are attempted together up to ``max_retries``
    times. Between failed attempts the function sleeps 1, 2, 4, ...
    seconds (exponential backoff). Progress and failures are reported
    with ``print``; no exception is propagated to the caller.

    Args:
        file_url: URL of the PDF, fetched with an HTTP GET (30 s timeout).
        max_retries: Maximum number of download-and-parse attempts.

    Returns:
        The text of every page that yielded text, joined with single
        spaces and stripped of surrounding whitespace, or ``None`` when
        every attempt failed or ``max_retries`` is not positive.
    """
    for attempt in range(max_retries):
        try:
            print(f"Downloading {file_url}...")
            response = requests.get(file_url, timeout=30)
            response.raise_for_status()

            print("Extracting text...")
            with BytesIO(response.content) as data:
                reader = PdfReader(data)
                # Gather the pieces and join once at the end rather than
                # growing a string with ``+=`` for every page.
                page_texts = []
                # Number pages from 1 so the progress line states how many
                # pages have really been processed (a 0-based index would
                # announce "10 pages" only after the 11th page).
                for page_number, page in enumerate(reader.pages, start=1):
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)
                    if page_number % 10 == 0:
                        print(f"  Processed {page_number} pages...")

                return " ".join(page_texts).strip()
        except Exception as e:
            # Deliberately broad: this is a best-effort boundary where both
            # download errors and PDF parsing errors lead to a retry.
            print(f"  Attempt {attempt+1} failed: {str(e)}")
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"  Waiting {wait_time} seconds before retrying...")
                time.sleep(wait_time)
            else:
                print(f"  Failed to extract text after {max_retries} attempts.")
                return None

    # Only reached when max_retries <= 0, i.e. no attempt was made.
    return None

def get_nist_documents(min_documents=3, min_text_length=1000):
    """Collect the target NIST PDFs from a GitHub mirror and direct URLs.

    The GitHub contents API is queried first and every listed file whose
    name is in ``TARGET_FILES`` is downloaded and converted to text. If
    that pass yields fewer than ``min_documents`` documents, the entries
    of ``DIRECT_URLS`` are tried as well, skipping filenames that were
    already collected.

    Args:
        min_documents: The direct-URL pass runs only when the GitHub pass
            collected fewer documents than this.
        min_text_length: An extraction is kept only when its text is
            longer than this many characters (basic sanity check).

    Returns:
        A ``(documents, metadata)`` tuple of two parallel lists:
        ``documents`` holds the extracted texts and ``metadata`` holds one
        dict per text with the keys ``filename``, ``source`` (``"github"``
        or ``"direct"``), ``url`` and ``length``.
    """
    documents = []
    metadata = []
    collected = set()  # filenames already stored, for O(1) duplicate checks

    def _store_if_usable(filename, url, source):
        """Extract one PDF and record it when enough text came out."""
        text = extract_pdf_text(url)
        if text and len(text) > min_text_length:  # Basic validation
            documents.append(text)
            metadata.append({
                "filename": filename,
                "source": source,
                "url": url,
                "length": len(text)
            })
            collected.add(filename)
            print(f"✓ Successfully extracted {filename} ({len(text)} chars)")
        else:
            print(f"✗ Failed to extract usable text from {filename}")

    # GitHub API URL
    api_url = "https://api.github.com/repos/fractional-ciso/NIST-Cybersecurity-Documents/contents/"

    try:
        # Try GitHub repository first
        print("Accessing GitHub repository...")
        response = requests.get(api_url, timeout=10)
        response.raise_for_status()
        files = response.json()

        for file in files:
            if file['name'] in TARGET_FILES:
                print(f"\nProcessing target file: {file['name']}")
                _store_if_usable(file['name'], file['download_url'], "github")

    except Exception as e:
        # Best effort: any failure here (request error, bad status,
        # unexpected listing shape) ends the GitHub pass, but documents
        # collected so far are kept and the direct URLs can still be used.
        print(f"Error accessing GitHub repository: {str(e)}")

    # Use direct URLs as fallback or supplement
    if len(documents) < min_documents:
        print("\nUsing direct NIST URLs for additional documents...")
        for filename, url in DIRECT_URLS.items():
            if filename in collected:  # Don't duplicate
                continue
            print(f"\nProcessing: {filename}")
            _store_if_usable(filename, url, "direct")

    return documents, metadata

# Run the collection, then persist and summarise whatever was extracted
documents, metadata = get_nist_documents()

if not documents:
    print("Error: No documents could be extracted.")
else:
    # One dataset row per document: its text plus its metadata record
    dataset = Dataset.from_dict({'text': documents, 'metadata': metadata})

    # Write the dataset to a local directory
    dataset.save_to_disk("nist_cybersecurity_dataset")

    # Summary of what was collected
    print(f"\nSuccessfully extracted {len(dataset)} documents")
    print("\nDocument Details:")
    for number, meta in enumerate(metadata, start=1):
        print(f"  {number}. {meta['filename']} - {meta['length']} chars")

    # Preview: first 500 characters of the first document on one line
    print("\nSample content from first document:")
    preview = documents[0][:500].replace('\n', ' ')
    print(preview + "...")

Accessing GitHub repository...

Processing target file: NIST.SP.800-63-3.pdf
Downloading https://raw.githubusercontent.com/fractional-ciso/NIST-Cybersecurity-Documents/master/NIST.SP.800-63-3.pdf...
Extracting text...
  Processed 10 pages...
  Processed 20 pages...
  Processed 30 pages...
  Processed 40 pages...
  Processed 50 pages...
  Processed 60 pages...
  Processed 70 pages...
✓ Successfully extracted NIST.SP.800-63-3.pdf (147851 chars)

Using direct NIST URLs for additional documents...

Processing: NIST.CSWP.29.pdf
Downloading https://nvlpubs.nist.gov/nistpubs/CSWP/NIST.CSWP.29.pdf...
Extracting text...
  Processed 10 pages...
  Processed 20 pages...
  Processed 30 pages...
✓ Successfully extracted NIST.CSWP.29.pdf (71033 chars)

Processing: NIST.SP.800-53r5.pdf
Downloading https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-53r5.pdf...
Extracting text...
  Processed 10 pages...
  Processed 20 pages...
  Processed 30 pages...
  Processed 40 pages...
  Processed 50

Saving the dataset (0/1 shards):   0%|          | 0/6 [00:00<?, ? examples/s]


Successfully extracted 6 documents

Document Details:
  1. NIST.SP.800-63-3.pdf - 147851 chars
  2. NIST.CSWP.29.pdf - 71033 chars
  3. NIST.SP.800-53r5.pdf - 1681747 chars
  4. NIST.SP.800-61r2.pdf - 235065 chars
  5. NIST.SP.800-171r1.pdf - 361280 chars
  6. NIST.SP.800-82r2.pdf - 704473 chars

Sample content from first document:
NIST Special Publication 800-63-3  Digital Identity Guidelines  Paul A. Grassi Michael E. Garcia James L. Fenton        This publication is available free of charge from: https://doi.org/10.6028/NIST.SP.800-63-3      NIST Special Publication 800-63-3 Digital Identity Guidelines   Paul A. Grassi Michael E. Garcia Applied Cybersecurity Division Information Technology Laboratory James L. Fenton Altmode Networks Los Altos, Calif.        This publication is available free of charge from: https://doi....
