# S2ORC Dataset Analysis Notebook

This notebook provides tools for analyzing the processed Parquet files from the ETL pipeline.

## Setup

Make sure you're using the `nvidia_impact_env` conda environment for GPU acceleration.


In [4]:
# Cell 1: Imports & Setup
import os
import glob
import json
import re
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
import cudf
import cupy as cp

# Report whether PyTorch can see a CUDA device and, if it can, which one.
gpu_ready = torch.cuda.is_available()
print(f"GPU Available: {gpu_ready}")
if gpu_ready:
    device_name = torch.cuda.get_device_name(0)
    print(f"Device: {device_name}")

# Notebook-wide settings: the folder the Parquet chunks are read from, the
# folder results are written to, and the sentence-transformers checkpoint.
INPUT_DIR = "processed_parquet"
OUTPUT_DIR = "evaluation_results"
MODEL_NAME = 'all-MiniLM-L6-v2'  # Fast, effective for semantic classification
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no-op when the folder already exists

  import pynvml  # type: ignore[import]
  from .autonotebook import tqdm as notebook_tqdm


GPU Available: True
Device: NVIDIA GB10


NVIDIA GB10 with CUDA capability sm_121 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_89 sm_90 compute_90.
If you want to use the NVIDIA GB10 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [5]:
# Cell 2: Define Extraction Logic

# --- 1. HARD SIGNALS (Regex Patterns) ---
# Keyword patterns grouped by the evaluation field they feed.
# The framework, compute and repo_type patterns are written in lowercase
# because they are searched against lowercased paper text; the clinical_trial
# and patent patterns are applied to the original-case text.
# Plural forms ("GPUs", "TPUs", "clusters", ...) are matched explicitly with an
# optional trailing "s": the closing \b would otherwise reject them.
REGEX_PATTERNS = {
    "frameworks": {
        "PyTorch": r"\b(pytorch|torch)\b",
        "TensorFlow": r"\b(tensorflow|keras)\b",
        "JAX": r"\b(jax|flax)\b",
        "Scikit-Learn": r"\b(scikit-learn|sklearn)\b",
        "HuggingFace": r"\b(huggingface|transformers)\b",
        "FastAI": r"\b(fastai)\b"
    },
    "compute": {
        "NVIDIA GPU": r"\b(nvidia|gpus?|cuda|a100|v100|h100|rtx|geforce)\b",
        "TPU": r"\b(tpus?|tensor processing units?)\b",
        "HPC/Cluster": r"\b(hpc|supercomputers?|clusters?|slurm)\b"
    },
    "repo_type": {
        "GitHub": r"github\.com",
        "GitLab": r"gitlab\.com",
        "BitBucket": r"bitbucket\.org",
        "Zenodo": r"zenodo",
        "FigShare": r"figshare"
    },
    "identifiers": {
        "clinical_trial": r"\b(NCT\d{8}|ISRCTN\d{8})\b",
        "patent": r"\b(US\d{7,}|WO\d{4}\d{6})\b",
        "grant": r"\b(NSF|NIH|ERC|grant number)\b"
    }
}

# --- 2. SEMANTIC ANCHORS (Vector Descriptions) ---
# One descriptive phrase per label, grouped by category. Each phrase is later
# embedded, and a paper receives the label whose phrase embedding is closest
# to the paper's own embedding.
ANCHOR_TEXTS = dict(
    adoption_level=dict(
        core="novel machine learning algorithm model architecture loss function theoretical deep learning",
        substantial="deep learning implementation neural network training fine-tuning transformer architecture",
        moderate="applied machine learning random forest support vector machine classification regression analysis",
        minimal="statistical analysis t-test correlation linear regression pca clustering",
        none="qualitative analysis literature review theoretical derivation manual annotation",
    ),
    impact_scope=dict(
        transformative="paradigm shift breakthrough discovery universal application fundamental change",
        broad="wide application cross-domain utility standard benchmark widespread adoption",
        moderate="improvement baseline optimization specific domain incremental advance",
        narrow="case study preliminary result specific implementation single dataset",
    ),
    replication_feasibility=dict(
        straightforward="source code available public dataset detailed methodology reproducible",
        moderate="methodology described parameters listed some code available",
        difficult="complex pipeline custom hardware missing parameters proprietary data",
        not_feasible="proprietary code closed data insufficient detail",
    ),
)

In [6]:
# Cell 3: Initialize Model
# Use the GPU when PyTorch reports one; otherwise fall back to CPU so the
# notebook still runs (more slowly) instead of failing at model load.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(MODEL_NAME, device=DEVICE)

# Pre-compute one unit-length embedding per anchor phrase, laid out as
# {category: {label: vector}}. Because the anchors are normalised, their dot
# product with a normalised paper embedding is the cosine similarity.
anchor_vectors = {
    category: {
        label: model.encode(text, normalize_embeddings=True)
        for label, text in options.items()
    }
    for category, options in ANCHOR_TEXTS.items()
}

print("✅ Model loaded and anchors vectorised.")

✅ Model loaded and anchors vectorised.


In [7]:
# Cell 4: Core Extraction Function (CORRECTED)
def extract_paper_evaluation(row, embedding, vector_anchors, regex_patterns=None):
    """Build the evaluation record for a single paper.

    Keyword and regex signals are scanned over the paper's full text, while
    the three semantic labels come from comparing a pre-computed embedding
    against anchor vectors.

    Args:
        row: Mapping-like record (a pandas Series or a plain dict) providing
            'paper_id', 'primary_field', 'year' and, optionally, 'text'.
        embedding: 1-D vector for the paper, computed by the caller (the
            processing loop embeds only a leading slice of each text).
        vector_anchors: ``{category: {label: anchor_vector}}`` containing the
            categories 'adoption_level', 'impact_scope' and
            'replication_feasibility'.
        regex_patterns: Optional pattern table with the same layout as the
            module-level ``REGEX_PATTERNS``, which is used when this is None.

    Returns:
        A nested dict whose leaves are str, bool, list or None, so it can be
        passed straight to ``json.dumps``.

    Raises:
        KeyError: if a required row field, anchor category or pattern group
            is missing.
    """
    patterns = REGEX_PATTERNS if regex_patterns is None else regex_patterns

    # --- 1. PREPARE TEXT ---
    # A missing value (None or float NaN) is treated as empty text rather than
    # being stringified to "None"/"nan" and scanned as if it were content.
    raw_text = row.get('text', '')
    text_missing = raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text)
    full_text = "" if text_missing else str(raw_text)  # keep the whole document
    full_text_lower = full_text.lower()

    # --- 2. VECTOR MATH (Using Pre-Computed Embedding) ---
    def get_best_match(anchors_dict, paper_vec):
        """Return the label whose anchor has the largest dot product with paper_vec.

        Falls back to "none" when there are no anchors or no score compares
        greater than -inf (e.g. the embedding contains NaN).
        """
        best_score = float("-inf")
        best_label = "none"
        for label, anchor_vec in anchors_dict.items():
            score = np.dot(paper_vec, anchor_vec)
            if score > best_score:
                best_score, best_label = score, label
        return best_label

    # --- 3. REGEX EXTRACTIONS (Scanning FULL TEXT) ---
    # The entire string is scanned so that appendices and reference sections,
    # where repository links often appear, are covered too.
    def matched_names(group):
        """List the names in a pattern group that occur in the lowercased text."""
        return [
            name
            for name, pattern in patterns[group].items()
            if re.search(pattern, full_text_lower)
        ]

    found_frameworks = matched_names('frameworks')
    found_compute = matched_names('compute')
    found_repos = matched_names('repo_type')

    # Identifiers are matched against the original-case text. De-duplicate and
    # sort so the emitted lists are identical from one run to the next
    # (plain set iteration order for strings varies between interpreter runs).
    found_trials = sorted(set(re.findall(patterns['identifiers']['clinical_trial'], full_text)))
    found_patents = sorted(set(re.findall(patterns['identifiers']['patent'], full_text)))

    # --- 4. SEMANTIC CLASSIFICATIONS ---
    adoption_level = get_best_match(vector_anchors['adoption_level'], embedding)
    impact_scope = get_best_match(vector_anchors['impact_scope'], embedding)
    repl_feasibility = get_best_match(vector_anchors['replication_feasibility'], embedding)

    # --- 5. CONSTRUCT DICTIONARY ---
    evaluation = {
        "paper_id": str(row['paper_id']),
        "field": str(row['primary_field']),
        "publication_date": str(row['year']),

        "ml_adoption": {
            "ml_frameworks_mentioned": found_frameworks,
            "specific_models_architectures": [],
            "compute_resources_mentioned": found_compute,
            "datasets_referenced": [],
            # Separate copy so the two fields never share one list object.
            "ml_libraries_tools": list(found_frameworks),
            "ml_adoption_level": adoption_level,
            "ml_application_domain": str(row['primary_field']),
            "ml_method_primary": None,
            "integration_with_traditional_methods": False
        },
        "reproducibility": {
            "code_availability_mentioned": len(found_repos) > 0 or "code available" in full_text_lower,
            "code_repository_type": found_repos[0] if found_repos else None,
            "data_availability_mentioned": "data available" in full_text_lower or "zenodo" in full_text_lower,
            "data_sharing_statement": None,
            # Heuristic: Detailed methodology usually means longer text
            "methodology_detail_level": "detailed" if len(full_text) > 25000 else "moderate",
            "hyperparameters_specified": "hyperparameter" in full_text_lower,
            "computational_environment_described": len(found_compute) > 0,
            "preprocessing_steps_detailed": "preprocessing" in full_text_lower,
            "statistical_methods_described": "statistical" in full_text_lower or "p-value" in full_text_lower,
            "replication_feasibility": repl_feasibility
        },
        "research_outcomes": {
            "mentions_clinical_trials": len(found_trials) > 0,
            "clinical_trial_identifiers": found_trials,
            "mentions_patents": len(found_patents) > 0,
            "patent_numbers": found_patents,
            # Only the first 1000 characters are checked for these two flags.
            "mentions_corrections": "correction" in full_text_lower[:1000],
            "mentions_retractions": "retraction" in full_text_lower[:1000],
            "validation_type": [],
            "real_world_application_mentioned": impact_scope in ["transformative", "broad"],
            "commercialization_mentioned": "commercial" in full_text_lower,
            "regulatory_approval_mentioned": "fda" in full_text_lower or "approved" in full_text_lower
        },
        "impact_indicators": {
            "claims_novelty": "novel" in full_text_lower or "first time" in full_text_lower,
            "claims_improvement_over_existing": "outperform" in full_text_lower or "better than" in full_text_lower,
            "quantitative_improvements_mentioned": "%" in full_text_lower or "accuracy" in full_text_lower,
            "comparison_to_baseline": "baseline" in full_text_lower,
            "mentions_policy_implications": "policy" in full_text_lower,
            "mentions_clinical_guidelines": "guideline" in full_text_lower,
            "mentions_media_coverage": False,
            "collaboration_indicators": [],
            "funding_sources_mentioned": [],
            "potential_impact_scope": impact_scope
        },
        "overall_assessment": f"Paper in {row['primary_field']} ({row['year']}) with {adoption_level} ML adoption.",
        "confidence_score": "medium",
        "notes": "Generated by NVIDIA Impact Engine"
    }

    return evaluation

In [None]:
# Cell 5: Run Processing Loop
# For every Parquet chunk: embed a leading slice of each paper, build one
# evaluation dict per row, and write the results as a JSONL file named after
# the chunk. Chunks whose JSONL file already exists are skipped.
EMBED_CHARS = 1000  # leading characters of each paper passed to the encoder

parquet_files = sorted(glob.glob(f"{INPUT_DIR}/*.parquet"))
print(f"Found {len(parquet_files)} files to process.")

for i, file_path in enumerate(parquet_files):
    chunk_name = os.path.splitext(os.path.basename(file_path))[0]
    output_path = os.path.join(OUTPUT_DIR, f"{chunk_name}.jsonl")
    if os.path.exists(output_path):
        continue  # Skip if done

    # Results are written here first and renamed only once complete, so an
    # interrupted run never leaves a partial .jsonl that would be skipped as
    # "done" on the next run.
    tmp_path = output_path + ".tmp"

    try:
        # Pandas (not cuDF) because each row is turned into a nested dict.
        df = pd.read_parquet(file_path)

        if len(df) == 0:
            continue

        # Vectorize the text batch; normalised to match the anchor vectors.
        texts = df['text'].fillna("").astype(str).str.slice(0, EMBED_CHARS).tolist()
        embeddings = model.encode(
            texts,
            batch_size=64,
            show_progress_bar=False,
            normalize_embeddings=True,
        )

        # Pair rows with embeddings by position: the DataFrame index labels
        # are not guaranteed to be 0..n-1, so they must not be used to index
        # into the embeddings array.
        results = [
            extract_paper_evaluation(row, embeddings[pos], anchor_vectors)
            for pos, (_, row) in enumerate(df.iterrows())
        ]

        # Save JSONL (one JSON object per line).
        with open(tmp_path, 'w', encoding='utf-8') as f:
            for item in results:
                f.write(json.dumps(item) + "\n")
        os.replace(tmp_path, output_path)

        print(f"✅ [{i+1}/{len(parquet_files)}] Saved {len(results)} evals to {os.path.basename(output_path)}")

        # Cleanup
        del df, texts, embeddings, results
        gc.collect()

    except Exception as e:
        # Best effort: report the failing chunk and carry on with the next one.
        print(f"❌ Error on {file_path}: {e}")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

Found 2396 files to process.
✅ [1/2396] Saved 2 evals to chunk_test_01250.jsonl
✅ [2/2396] Saved 1 evals to chunk_test_01260.jsonl
✅ [3/2396] Saved 1 evals to chunk_test_01264.jsonl
✅ [4/2396] Saved 1 evals to chunk_test_01265.jsonl
✅ [5/2396] Saved 2 evals to chunk_test_01266.jsonl
✅ [6/2396] Saved 1 evals to chunk_test_01268.jsonl
✅ [7/2396] Saved 1 evals to chunk_test_01272.jsonl
✅ [8/2396] Saved 1 evals to chunk_test_01280.jsonl
✅ [9/2396] Saved 1 evals to chunk_test_01281.jsonl
✅ [10/2396] Saved 1 evals to chunk_test_01282.jsonl
✅ [11/2396] Saved 1 evals to chunk_test_01283.jsonl
✅ [12/2396] Saved 2 evals to chunk_test_01286.jsonl
✅ [13/2396] Saved 6 evals to chunk_test_01289.jsonl
✅ [14/2396] Saved 7 evals to chunk_test_01290.jsonl
✅ [15/2396] Saved 8 evals to chunk_test_01291.jsonl
✅ [16/2396] Saved 13 evals to chunk_test_01292.jsonl
✅ [17/2396] Saved 19 evals to chunk_test_01293.jsonl
✅ [18/2396] Saved 22 evals to chunk_test_01294.jsonl
✅ [19/2396] Saved 28 evals to chunk_test_