# S2ORC Dataset Analysis Notebook

This notebook provides tools for analyzing the processed Parquet files from the ETL pipeline.

## Setup

Use the `nvidia_impact_env` conda environment to enable GPU acceleration through cuDF. If cuDF cannot be imported, the notebook falls back to pandas on the CPU.


In [None]:
# Standard imports
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path

# Probe for cuDF: a successful import switches the notebook to the GPU path,
# an ImportError makes it fall back to pandas on the CPU.
try:
    import cudf
except ImportError:
    USE_GPU = False
    print("‚ö†Ô∏è  GPU acceleration not available, using Pandas (CPU)")
else:
    USE_GPU = True
    print("‚úÖ GPU acceleration available (cuDF)")

# Notebook-wide pandas display settings (None = no limit on shown columns)
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_colwidth': 100,
    'display.max_rows': 20,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)


## Load Data

Load all Parquet files, or only the files that belong to a specific split (`train`, `test`, or `val`).


In [None]:
# Configuration
# Directory that holds the processed Parquet chunks (relative to the working directory)
PARQUET_DIR = "processed_parquet"

# Check what files we have, one list per split.
# glob returns matches in arbitrary order, so sort each list to keep
# index-based access (e.g. train_files[0]) reproducible across runs.
train_files = sorted(glob.glob(f"{PARQUET_DIR}/chunk_train_*.parquet"))
test_files = sorted(glob.glob(f"{PARQUET_DIR}/chunk_test_*.parquet"))
val_files = sorted(glob.glob(f"{PARQUET_DIR}/chunk_val_*.parquet"))

print(f"üìä Parquet Files Found:")
print(f"   Train: {len(train_files):,} files")
print(f"   Test:  {len(test_files):,} files")
print(f"   Val:   {len(val_files):,} files")
print(f"   Total: {len(train_files) + len(test_files) + len(val_files):,} files")


In [None]:
# Load a sample to inspect structure
if not train_files:
    # Fail with an actionable message instead of a bare IndexError on train_files[0]
    raise FileNotFoundError(
        f"No train Parquet files found under '{PARQUET_DIR}' "
        "(expected files named chunk_train_*.parquet)"
    )

sample_path = train_files[0]
# cuDF reader when the GPU path is active, pandas reader otherwise
if USE_GPU:
    sample_df = cudf.read_parquet(sample_path)
else:
    sample_df = pd.read_parquet(sample_path)

print(f"üìã Sample file shape: {sample_df.shape}")
print(f"\nüìù Columns:")
print(list(sample_df.columns))
print(f"\nüîç First few rows:")
sample_df.head()


In [None]:
# Load all data (or specific split)
def load_all_data(use_gpu=USE_GPU, split=None, parquet_dir=None):
    """Load Parquet chunks into a single DataFrame.

    Args:
        use_gpu: When true, all files are read with one ``cudf.read_parquet``
            call; otherwise they are read one by one with pandas and
            concatenated. Defaults to the value ``USE_GPU`` had when this
            function was defined.
        split: Optional split name. When given, only files matching
            ``chunk_{split}_*.parquet`` are loaded; a falsy value loads every
            ``*.parquet`` file in the directory.
        parquet_dir: Directory to search. Defaults to ``PARQUET_DIR``.

    Returns:
        The result of ``cudf.read_parquet`` on the GPU path, or a pandas
        DataFrame with a fresh 0..n-1 index on the CPU path.

    Raises:
        ValueError: If no Parquet file matches the search pattern.
    """
    base_dir = PARQUET_DIR if parquet_dir is None else parquet_dir
    if split:
        pattern = f"{base_dir}/chunk_{split}_*.parquet"
    else:
        pattern = f"{base_dir}/*.parquet"
    # glob returns matches in arbitrary order; sort so the file order
    # (and therefore the concatenation order) is the same on every run.
    files = sorted(glob.glob(pattern))

    print(f"üìÇ Loading {len(files):,} files...")

    if not files:
        # Without this guard pd.concat([]) fails with an opaque
        # "No objects to concatenate" error.
        raise ValueError(f"No Parquet files match '{pattern}'")

    if use_gpu:
        df = cudf.read_parquet(files)
    else:
        # Load file by file to avoid memory issues
        dfs = []
        for i, f in enumerate(files):
            # Progress line every 100 files
            if i % 100 == 0:
                print(f"   Loading file {i+1}/{len(files)}...")
            dfs.append(pd.read_parquet(f))
        df = pd.concat(dfs, ignore_index=True)

    print(f"‚úÖ Loaded {len(df):,} records")
    return df

# Uncomment to load:
# df_all = load_all_data()
# df_train = load_all_data(split='train')
# df_test = load_all_data(split='test')
# df_val = load_all_data(split='val')


## Data Exploration

Explore the dataset structure and statistics.


In [None]:
# Basic statistics
def explore_data(df, name="Dataset"):
    """Print a summary of a DataFrame to stdout.

    Always prints the record count and the ``df.info()`` report. The
    following sections are printed only when the corresponding column exists:

    * ``year``: minimum and maximum value
    * ``primary_field``: the 10 most frequent values
    * ``text_length``: ``describe()`` statistics

    Args:
        df: DataFrame to summarize.
        name: Label shown in the banner.

    Returns:
        None. All output goes to stdout.
    """
    print(f"\n{'='*60}")
    print(f"üìä {name} Statistics")
    print(f"{'='*60}")
    print(f"Total records: {len(df):,}")
    print(f"\nColumn info:")
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()

    if 'year' in df.columns:
        print(f"\nüìÖ Year range: {df['year'].min()} - {df['year'].max()}")

    if 'primary_field' in df.columns:
        print(f"\nüî¨ Top 10 Fields of Study:")
        print(df['primary_field'].value_counts().head(10))

    if 'text_length' in df.columns:
        print(f"\nüìù Text Length Statistics:")
        print(df['text_length'].describe())

# Example usage:
# explore_data(sample_df, "Sample File")
