# Dryad Dataset Downloader

This notebook downloads datasets from the Dryad API and saves them to a parquet file using Polars for efficient processing.

In [1]:
import polars as pl
import requests
import json
import time
from typing import List, Dict, Any
from pathlib import Path

In [2]:
# Configuration
BASE_URL = "https://datadryad.org/api/v2/datasets"
OUTPUT_FILE = "scripts/output/gold/dryad_datasets.parquet"
BATCH_SIZE = 100  # Number of records per API request
DELAY_BETWEEN_REQUESTS = 0.5  # Seconds to wait between requests to be respectful

In [3]:
def make_dryad_request(url: str, params: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    Make a request to the Dryad API with error handling and rate limiting.
    """
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making request to {url}: {e}")
        return None

In [4]:
# Now that we've seen the actual structure, let's update our flattening function
# Based on exploration: extract ALL available fields from the API response

def flatten_dataset_actual(dataset: Dict[str, Any]) -> Dict[str, Any]:
    """
    Flatten dataset structure based on ACTUAL API response format.
    Extracts ALL available fields from the dataset.
    """
    flattened = {}
    
    # Basic fields (direct extraction)
    flattened['id'] = dataset.get('id')
    flattened['identifier'] = dataset.get('identifier')  # DOI-like identifier
    flattened['title'] = dataset.get('title')
    flattened['abstract'] = dataset.get('abstract')
    
    # Storage and size info
    flattened['storage_size'] = dataset.get('storageSize')
    
    # Publication info
    flattened['related_publication_issn'] = dataset.get('relatedPublicationISSN')

    # Usage Notes
    flattened['usageNotes'] = dataset.get('usageNotes')
    
    # Dates
    flattened['publication_date'] = dataset.get('publicationDate')
    flattened['last_modification_date'] = dataset.get('lastModificationDate')
    
    # Version and status info
    flattened['version_number'] = dataset.get('versionNumber')
    flattened['version_status'] = dataset.get('versionStatus')
    flattened['curation_status'] = dataset.get('curationStatus')
    flattened['version_changes'] = dataset.get('versionChanges')
    
    # Access and sharing
    flattened['visibility'] = dataset.get('visibility')
    flattened['sharing_link'] = dataset.get('sharingLink')
    flattened['license'] = dataset.get('license')
    
    # Links (extract useful link information)
    links = dataset.get('_links', {})
    if links:
        # Extract key links as separate fields
        flattened['self_link'] = links.get('self', {}).get('href') if links.get('self') else None
        flattened['download_link'] = links.get('stash:download', {}).get('href') if links.get('stash:download') else None
        flattened['files_link'] = links.get('stash:files', {}).get('href') if links.get('stash:files') else None
        flattened['versions_link'] = links.get('stash:versions', {}).get('href') if links.get('stash:versions') else None
        # Store all links as JSON string for reference
        flattened['all_links'] = json.dumps(links) if links else None
    else:
        flattened['self_link'] = None
        flattened['download_link'] = None
        flattened['files_link'] = None
        flattened['versions_link'] = None
        flattened['all_links'] = None
    
    # Authors - comprehensive extraction
    authors = dataset.get('authors', [])
    if authors:
        author_names = []
        author_emails = []
        author_affiliations = []
        author_ror_ids = []
        for author in authors:
            # Full name
            name_parts = []
            if author.get('firstName'):
                name_parts.append(author.get('firstName'))
            if author.get('lastName'):
                name_parts.append(author.get('lastName'))
            if name_parts:
                author_names.append(' '.join(name_parts))
            
            # Email
            if author.get('email'):
                author_emails.append(author.get('email'))
            
            # Affiliation
            if author.get('affiliation'):
                author_affiliations.append(author.get('affiliation'))
            
            # ROR IDs
            if author.get('affiliationROR'):
                author_ror_ids.append(author.get('affiliationROR'))
        
        flattened['authors'] = '; '.join(author_names) if author_names else None
        flattened['author_emails'] = '; '.join([email for email in author_emails if email]) if author_emails else None
        flattened['author_affiliations'] = '; '.join(set(author_affiliations)) if author_affiliations else None
        flattened['author_ror_ids'] = '; '.join(set(author_ror_ids)) if author_ror_ids else None
        flattened['author_count'] = len(authors)
        # Store full author data as JSON for reference
        flattened['authors_full'] = json.dumps(authors) if authors else None
    else:
        flattened['authors'] = None
        flattened['author_emails'] = None
        flattened['author_affiliations'] = None
        flattened['author_ror_ids'] = None
        flattened['author_count'] = 0
        flattened['authors_full'] = None
    
    # Keywords 
    keywords = dataset.get('keywords', [])
    flattened['keywords'] = '; '.join(keywords) if keywords else None
    flattened['keyword_count'] = len(keywords) if keywords else 0
    
    # Locations - comprehensive extraction
    locations = dataset.get('locations', [])
    if locations:
        location_info = []
        location_places = []
        location_coords = []
        for location in locations:
            if isinstance(location, dict):
                # Extract place names
                place = location.get('place', '')
                if place:
                    location_places.append(place)
                    location_info.append(place)
                
                # Extract coordinates
                if location.get('point'):
                    coords = location.get('point', {})
                    lat = coords.get('latitude')
                    lon = coords.get('longitude')
                    if lat and lon:
                        coord_str = f"({lat}, {lon})"
                        location_coords.append(coord_str)
                        location_info.append(coord_str)
            elif isinstance(location, str):
                location_info.append(location)
        
        flattened['locations'] = '; '.join(location_info) if location_info else None
        flattened['location_places'] = '; '.join(location_places) if location_places else None
        flattened['location_coordinates'] = '; '.join(location_coords) if location_coords else None
        flattened['location_count'] = len(locations)
        # Store full location data as JSON
        flattened['locations_full'] = json.dumps(locations) if locations else None
    else:
        flattened['locations'] = None
        flattened['location_places'] = None
        flattened['location_coordinates'] = None
        flattened['location_count'] = 0
        flattened['locations_full'] = None
    
    # Funders - comprehensive extraction
    funders = dataset.get('funders', [])
    if funders:
        funder_names = []
        funder_awards = []
        funder_info = []
        for funder in funders:
            if isinstance(funder, dict):
                funder_name = funder.get('funderName', '')
                award_number = funder.get('awardNumber', '')
                
                if funder_name:
                    funder_names.append(funder_name)
                
                if award_number:
                    funder_awards.append(award_number)
                
                if funder_name and award_number:
                    funder_info.append(f"{funder_name} ({award_number})")
                elif funder_name:
                    funder_info.append(funder_name)
            elif isinstance(funder, str):
                funder_info.append(funder)
                funder_names.append(funder)
        
        flattened['funders'] = '; '.join(funder_info) if funder_info else None
        flattened['funder_names'] = '; '.join(set(funder_names)) if funder_names else None
        flattened['funder_awards'] = '; '.join(funder_awards) if funder_awards else None
        flattened['funder_count'] = len(funders)
        # Store full funder data as JSON
        flattened['funders_full'] = json.dumps(funders) if funders else None
    else:
        flattened['funders'] = None
        flattened['funder_names'] = None
        flattened['funder_awards'] = None
        flattened['funder_count'] = 0
        flattened['funders_full'] = None
    
    # Related works - comprehensive extraction
    related_works = dataset.get('relatedWorks', [])
    if related_works:
        related_info = []
        relation_types = []
        identifiers = []
        for work in related_works:
            if isinstance(work, dict):
                work_type = work.get('relationshipType', '')
                identifier = work.get('identifier', '')
                
                if work_type:
                    relation_types.append(work_type)
                
                if identifier:
                    identifiers.append(identifier)
                
                if work_type and identifier:
                    related_info.append(f"{work_type}: {identifier}")
                elif identifier:
                    related_info.append(identifier)
        
        flattened['related_works'] = '; '.join(related_info) if related_info else None
        flattened['relation_types'] = '; '.join(set(relation_types)) if relation_types else None
        flattened['related_identifiers'] = '; '.join(identifiers) if identifiers else None
        flattened['related_works_count'] = len(related_works)
        # Store full related works data as JSON
        flattened['related_works_full'] = json.dumps(related_works) if related_works else None
    else:
        flattened['related_works'] = None
        flattened['relation_types'] = None
        flattened['related_identifiers'] = None
        flattened['related_works_count'] = 0
        flattened['related_works_full'] = None
    
    # Metrics - comprehensive extraction
    metrics = dataset.get('metrics', {})
    if metrics:
        flattened['downloads'] = metrics.get('downloads')
        flattened['views'] = metrics.get('views')
        # Extract any other metrics that might be available
        for key, value in metrics.items():
            if key not in ['downloads', 'views']:
                flattened[f'metric_{key}'] = value
        # Store full metrics as JSON
        flattened['metrics_full'] = json.dumps(metrics) if metrics else None
    else:
        flattened['downloads'] = None
        flattened['views'] = None
        flattened['metrics_full'] = None
    
    return flattened

In [5]:
def download_dryad_datasets() -> pl.DataFrame:
    """
    Download all datasets from Dryad API using the refined flattening function.
    Returns a Polars DataFrame with comprehensive field extraction.
    Fault-tolerant with checkpoint saving and resume capability.
    """
    # Checkpoint file to save progress
    checkpoint_file = "scripts/output/checkpoint_dryad_progress.json"
    checkpoint_data_file = "scripts/output/checkpoint_dryad_data.parquet"
    
    # Try to load existing progress
    all_datasets = []
    start_page = 1
    total_datasets = 0
    
    try:
        if Path(checkpoint_file).exists():
            print("📂 Found existing checkpoint, resuming download...")
            with open(checkpoint_file, 'r') as f:
                checkpoint = json.load(f)
                start_page = checkpoint.get('next_page', 1)
                total_datasets = checkpoint.get('total_processed', 0)
            
            if Path(checkpoint_data_file).exists():
                existing_df = pl.read_parquet(checkpoint_data_file)
                all_datasets = existing_df.to_dicts()
                print(f"🔄 Resuming from page {start_page}, already have {total_datasets:,} datasets")
            else:
                print(f"🔄 Resuming from page {start_page} (no previous data found)")
    except Exception as e:
        print(f"⚠️  Could not load checkpoint: {e}, starting fresh")
        start_page = 1
        total_datasets = 0
        all_datasets = []
    
    print(f"🚀 Starting download from Dryad API: {BASE_URL}")
    print(f"📊 Batch size: {BATCH_SIZE} | Rate limit: {DELAY_BETWEEN_REQUESTS}s between requests")
    
    page = start_page
    last_checkpoint_size = len(all_datasets)
    
    try:
        while True:
            print(f"\n📥 Fetching page {page}...")
            
            # Parameters for pagination
            params = {
                'page': page,
                'per_page': BATCH_SIZE
            }
            
            # Make API request with retries
            retry_count = 0
            max_retries = 3
            response_data = None
            
            while retry_count < max_retries and response_data is None:
                response_data = make_dryad_request(BASE_URL, params)
                if response_data is None:
                    retry_count += 1
                    if retry_count < max_retries:
                        wait_time = 2 ** retry_count  # Exponential backoff
                        print(f"⏳ Retry {retry_count}/{max_retries} in {wait_time}s...")
                        time.sleep(wait_time)
            
            if response_data is None:
                print(f"❌ Failed to fetch page {page} after {max_retries} retries")
                break
            
            # Extract datasets from response
            datasets = response_data.get('_embedded', {}).get('stash:datasets', [])
            
            if not datasets:
                print(f"✅ No more datasets found on page {page}. Download complete!")
                break
            
            # Process and flatten each dataset using the REFINED function
            page_processed = 0
            page_errors = 0
            for i, dataset in enumerate(datasets):
                try:
                    flattened_dataset = flatten_dataset_actual(dataset)
                    all_datasets.append(flattened_dataset)
                    page_processed += 1
                except Exception as e:
                    page_errors += 1
                    print(f"⚠️  Error processing dataset {i+1} on page {page}: {e}")
                    continue
            
            total_datasets += page_processed
            print(f"✅ Processed {page_processed}/{len(datasets)} datasets from page {page}")
            if page_errors > 0:
                print(f"⚠️  {page_errors} errors on this page")
            print(f"📈 Total processed so far: {total_datasets:,}")
            
            # Save checkpoint every 10 pages or if we have 1000+ new records
            if (page % 10 == 0) or (len(all_datasets) - last_checkpoint_size >= 1000):
                try:
                    print(f"💾 Saving checkpoint at page {page}...")
                    
                    # Create directories if needed
                    Path(checkpoint_file).parent.mkdir(parents=True, exist_ok=True)
                    
                    # Save checkpoint metadata
                    checkpoint_info = {
                        'next_page': page + 1,
                        'total_processed': total_datasets,
                        'last_checkpoint': str(time.time()),
                        'datasets_count': len(all_datasets)
                    }
                    
                    with open(checkpoint_file, 'w') as f:
                        json.dump(checkpoint_info, f, indent=2)
                    
                    # Save data checkpoint (handle schema issues gracefully)
                    if all_datasets:
                        try:
                            checkpoint_df = pl.DataFrame(all_datasets, infer_schema_length=None)
                            checkpoint_df.write_parquet(checkpoint_data_file, compression="snappy")
                            print(f"✅ Checkpoint saved: {len(all_datasets):,} records")
                            last_checkpoint_size = len(all_datasets)
                        except Exception as schema_error:
                            print(f"⚠️  Schema error in checkpoint, trying string conversion...")
                            # Fallback: save as strings
                            string_datasets = []
                            for dataset in all_datasets:
                                string_dataset = {k: str(v) if v is not None else None for k, v in dataset.items()}
                                string_datasets.append(string_dataset)
                            checkpoint_df = pl.DataFrame(string_datasets)
                            checkpoint_df.write_parquet(checkpoint_data_file, compression="snappy")
                            print(f"✅ Checkpoint saved as strings: {len(all_datasets):,} records")
                            last_checkpoint_size = len(all_datasets)
                        
                except Exception as checkpoint_error:
                    print(f"⚠️  Could not save checkpoint: {checkpoint_error}")
                    # Continue anyway - don't fail the whole download
            
            # Check if we've reached the last page
            page_info = response_data.get('page', {})
            current_page = page_info.get('number', page)
            total_pages = page_info.get('totalPages')
            
            if total_pages:
                print(f"📄 Progress: {current_page}/{total_pages} pages ({current_page/total_pages*100:.1f}%)")
                if current_page >= total_pages:
                    print(f"🎉 Reached last page ({total_pages}). Download complete!")
                    break
            
            # Rate limiting
            time.sleep(DELAY_BETWEEN_REQUESTS)
            page += 1
            
    except KeyboardInterrupt:
        print(f"\n🛑 Download interrupted by user at page {page}")
        print(f"📊 Progress saved: {len(all_datasets):,} datasets collected")
        # Save final checkpoint before exiting
        try:
            checkpoint_info = {
                'next_page': page,
                'total_processed': len(all_datasets),
                'interrupted': True,
                'last_checkpoint': str(time.time())
            }
            with open(checkpoint_file, 'w') as f:
                json.dump(checkpoint_info, f, indent=2)
            print(f"💾 Progress saved to resume later")
        except:
            pass
    
    except Exception as e:
        print(f"\n❌ Unexpected error: {e}")
        print(f"📊 Progress so far: {len(all_datasets):,} datasets")
        # Save emergency checkpoint
        try:
            checkpoint_info = {
                'next_page': page,
                'total_processed': len(all_datasets),
                'error': str(e),
                'last_checkpoint': str(time.time())
            }
            with open(checkpoint_file, 'w') as f:
                json.dump(checkpoint_info, f, indent=2)
        except:
            pass
    
    # Final summary
    print(f"\n🎯 Download Summary:")
    print(f"   Total datasets processed: {len(all_datasets):,}")
    print(f"   Pages fetched: {page - start_page + 1}")
    
    # Convert to Polars DataFrame with robust error handling
    if all_datasets:
        try:
            print("🔧 Creating final DataFrame...")
            df = pl.DataFrame(all_datasets, infer_schema_length=None)
            print(f"   DataFrame shape: {df.shape}")
            print(f"   Columns extracted: {len(df.columns)}")
            
            # Clean up checkpoint files on success
            try:
                if Path(checkpoint_file).exists():
                    Path(checkpoint_file).unlink()
                if Path(checkpoint_data_file).exists():
                    Path(checkpoint_data_file).unlink()
                print("🧹 Cleaned up checkpoint files")
            except:
                pass
            
            return df
            
        except Exception as e:
            print(f"⚠️  Error creating final DataFrame: {e}")
            print("🔧 Using string conversion fallback...")
            
            try:
                # Convert all to strings as fallback
                string_datasets = []
                for dataset in all_datasets:
                    string_dataset = {k: str(v) if v is not None else None for k, v in dataset.items()}
                    string_datasets.append(string_dataset)
                
                df = pl.DataFrame(string_datasets)
                print(f"✅ Fallback DataFrame created: {df.shape}")
                
                # Try to optimize numeric columns
                numeric_cols = ['id', 'storage_size', 'version_number', 'author_count', 
                               'keyword_count', 'location_count', 'funder_count', 
                               'related_works_count', 'downloads', 'views']
                
                for col in numeric_cols:
                    if col in df.columns:
                        try:
                            df = df.with_columns(pl.col(col).cast(pl.Int64, strict=False))
                        except:
                            pass
                
                return df
                
            except Exception as e2:
                print(f"❌ All DataFrame creation methods failed: {e2}")
                print(f"💾 Data is still saved in checkpoint file: {checkpoint_data_file}")
                return pl.DataFrame()
    else:
        print("❌ No datasets processed successfully!")
        return pl.DataFrame()

## Download Dryad Datasets

Execute the download process to fetch all datasets from the Dryad API:

In [6]:
# Execute the download
dryad_df = download_dryad_datasets()

if not dryad_df.is_empty():
    print(f"\n🔍 Quick preview of downloaded data:")
    print(f"Columns: {list(dryad_df.columns[:10])}{'...' if len(dryad_df.columns) > 10 else ''}")
    
    # Show sample data
    preview_cols = ['id', 'title', 'authors', 'publication_date', 'keywords']
    available_cols = [col for col in preview_cols if col in dryad_df.columns]
    if available_cols:
        print(f"\n📋 Sample records:")
        sample_df = dryad_df.head(3).select(available_cols)
        print(sample_df)
    
    # Quick stats
    print(f"\n📊 Quick statistics:")
    stats = {
        'Total records': len(dryad_df),
        'Non-null titles': dryad_df.filter(pl.col('title').is_not_null()).shape[0] if 'title' in dryad_df.columns else 'N/A',
        'Non-null abstracts': dryad_df.filter(pl.col('abstract').is_not_null()).shape[0] if 'abstract' in dryad_df.columns else 'N/A',
        'Non-null authors': dryad_df.filter(pl.col('authors').is_not_null()).shape[0] if 'authors' in dryad_df.columns else 'N/A'
    }
    
    for key, value in stats.items():
        print(f"   {key}: {value}")
    
else:
    print("❌ No data downloaded. Check API connectivity and parameters.")

🚀 Starting download from Dryad API: https://datadryad.org/api/v2/datasets
📊 Batch size: 100 | Rate limit: 0.5s between requests

📥 Fetching page 1...
✅ Processed 100/100 datasets from page 1
📈 Total processed so far: 100
✅ Processed 100/100 datasets from page 1
📈 Total processed so far: 100

📥 Fetching page 2...

📥 Fetching page 2...
✅ Processed 100/100 datasets from page 2
📈 Total processed so far: 200
✅ Processed 100/100 datasets from page 2
📈 Total processed so far: 200

📥 Fetching page 3...

📥 Fetching page 3...
✅ Processed 100/100 datasets from page 3
📈 Total processed so far: 300
✅ Processed 100/100 datasets from page 3
📈 Total processed so far: 300

📥 Fetching page 4...

📥 Fetching page 4...
✅ Processed 100/100 datasets from page 4
📈 Total processed so far: 400
✅ Processed 100/100 datasets from page 4
📈 Total processed so far: 400

📥 Fetching page 5...

📥 Fetching page 5...
✅ Processed 100/100 datasets from page 5
📈 Total processed so far: 500
✅ Processed 100/100 datasets from p

## Save to Parquet File

Save the downloaded datasets to a parquet file for efficient storage and future use:

In [7]:
# Save datasets to parquet file
if not dryad_df.is_empty():
    output_path = Path(OUTPUT_FILE)
    output_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
    
    print(f"\n💾 Saving {len(dryad_df):,} datasets to parquet...")
    print(f"   Output path: {output_path.absolute()}")
    
    # Save with compression for optimal storage
    dryad_df.write_parquet(OUTPUT_FILE, compression="snappy")
    
    # Verify and report
    if output_path.exists():
        file_size = output_path.stat().st_size
        print(f"✅ Successfully saved!")
        print(f"   File size: {file_size / (1024*1024):.2f} MB")
        print(f"   Records: {len(dryad_df):,}")
        print(f"   Columns: {len(dryad_df.columns)}")
    else:
        print("❌ Failed to save file")
else:
    print("❌ No data available to save")


💾 Saving 66,730 datasets to parquet...
   Output path: /Users/pietro/Desktop/VIDA-NYU/data-gatherer/scripts/output/gold/dryad_datasets.parquet
✅ Successfully saved!
   File size: 122.96 MB
   Records: 66,730
   Columns: 48
✅ Successfully saved!
   File size: 122.96 MB
   Records: 66,730
   Columns: 48


## Data Analysis

Let's do some basic analysis of the downloaded datasets:

In [8]:
# Comprehensive analysis of the Dryad datasets
if not dryad_df.is_empty():
    print("🔬 DRYAD DATASETS ANALYSIS")
    print("=" * 50)
    print(f"📊 Dataset Overview:")
    print(f"   Total datasets: {len(dryad_df):,}")
    print(f"   Total columns: {len(dryad_df.columns)}")
    
    # Status distributions
    if 'curation_status' in dryad_df.columns:
        print(f"\n📋 Curation Status Distribution:")
        status_counts = dryad_df.group_by("curation_status").agg(pl.count().alias("count")).sort("count", descending=True)
        for row in status_counts.iter_rows():
            print(f"   {row[0]}: {row[1]:,}")
    
    # Publication trends
    if 'publication_date' in dryad_df.columns:
        print(f"\n📅 Recent Publication Years (Top 10):")
        yearly_stats = (dryad_df
                       .with_columns(
                           pl.col("publication_date").str.slice(0, 4).cast(pl.Int32, strict=False).alias("year")
                       )
                       .filter(pl.col("year").is_not_null())
                       .group_by("year")
                       .agg(pl.count().alias("count"))
                       .sort("year", descending=True)
                       .head(10))
        for row in yearly_stats.iter_rows():
            print(f"   {row[0]}: {row[1]:,} datasets")
    
    # Storage insights
    if 'storage_size' in dryad_df.columns:
        print(f"\n💽 Storage Statistics:")
        size_stats = dryad_df.select([
            (pl.col("storage_size").sum() / (1024**3)).alias("total_gb"),
            (pl.col("storage_size").mean() / (1024**2)).alias("avg_mb"),
            (pl.col("storage_size").median() / (1024**2)).alias("median_mb"),
            pl.col("storage_size").count().alias("datasets_with_size")
        ]).row(0)
        
        print(f"   Total storage: {size_stats[0]:.2f} GB")
        print(f"   Average size: {size_stats[1]:.2f} MB")
        print(f"   Median size: {size_stats[2]:.2f} MB")
        print(f"   Datasets with size info: {size_stats[3]:,}")
    
    # Top institutions
    if 'author_affiliations' in dryad_df.columns:
        print(f"\n🏛️  Top 10 Author Affiliations:")
        affiliations = (dryad_df
                       .filter(pl.col("author_affiliations").is_not_null())
                       .select(pl.col("author_affiliations").str.split(";").alias("affiliation_list"))
                       .explode("affiliation_list")
                       .with_columns(pl.col("affiliation_list").str.strip_chars().alias("affiliation"))
                       .group_by("affiliation")
                       .agg(pl.count().alias("count"))
                       .sort("count", descending=True)
                       .head(10))
        
        for row in affiliations.iter_rows():
            print(f"   {row[0]}: {row[1]:,}")
    
    # Popular keywords
    if 'keywords' in dryad_df.columns:
        print(f"\n🏷️  Top 10 Keywords:")
        keywords = (dryad_df
                   .filter(pl.col("keywords").is_not_null())
                   .select(pl.col("keywords").str.split(";").alias("keyword_list"))
                   .explode("keyword_list")
                   .with_columns(pl.col("keyword_list").str.strip_chars().alias("keyword"))
                   .group_by("keyword")
                   .agg(pl.count().alias("count"))
                   .sort("count", descending=True)
                   .head(10))
        
        for row in keywords.iter_rows():
            print(f"   {row[0]}: {row[1]:,}")
    
    # License distribution
    if 'license' in dryad_df.columns:
        print(f"\n⚖️  License Distribution:")
        licenses = dryad_df.group_by("license").agg(pl.count().alias("count")).sort("count", descending=True)
        for row in licenses.iter_rows():
            print(f"   {row[0]}: {row[1]:,}")
    
    print(f"\n✅ Analysis complete!")
        
else:
    print("❌ No data available for analysis")

🔬 DRYAD DATASETS ANALYSIS
📊 Dataset Overview:
   Total datasets: 66,730
   Total columns: 48

📋 Curation Status Distribution:
   Published: 66,725
   Embargoed: 4
   Action required: 1

📅 Recent Publication Years (Top 10):
   2999: 1 datasets
   2027: 1 datasets
   2026: 1 datasets
   2025: 4,702 datasets
   2024: 6,002 datasets
   2023: 5,656 datasets
   2022: 6,852 datasets
   2021: 7,064 datasets
   2020: 6,185 datasets
   2019: 6,710 datasets

💽 Storage Statistics:
   Total storage: 164800.89 GB
   Average size: 2529.55 MB
   Median size: 0.45 MB
   Datasets with size info: 66,714

🏛️  Top 10 Author Affiliations:
   University of California, Davis: 1,286
   University of California, Berkeley: 1,061
   University of British Columbia: 887
   University of Oxford: 886
   Cornell University: 877
   Centre National de la Recherche Scientifique: 864
   University of Florida: 836
   Chinese Academy of Sciences: 800
   University of Washington: 792
   Michigan State University: 719

🏷️  To

(Deprecated in version 0.20.5)
  status_counts = dryad_df.group_by("curation_status").agg(pl.count().alias("count")).sort("count", descending=True)
(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count"))
(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count"))
(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count"))
(Deprecated in version 0.20.5)
  licenses = dryad_df.group_by("license").agg(pl.count().alias("count")).sort("count", descending=True)


In [10]:
dryad_df.columns

['id',
 'identifier',
 'title',
 'abstract',
 'storage_size',
 'related_publication_issn',
 'usageNotes',
 'publication_date',
 'last_modification_date',
 'version_number',
 'version_status',
 'curation_status',
 'version_changes',
 'visibility',
 'sharing_link',
 'license',
 'self_link',
 'download_link',
 'files_link',
 'versions_link',
 'all_links',
 'authors',
 'author_emails',
 'author_affiliations',
 'author_ror_ids',
 'author_count',
 'authors_full',
 'keywords',
 'keyword_count',
 'locations',
 'location_places',
 'location_coordinates',
 'location_count',
 'locations_full',
 'funders',
 'funder_names',
 'funder_awards',
 'funder_count',
 'funders_full',
 'related_works',
 'relation_types',
 'related_identifiers',
 'related_works_count',
 'related_works_full',
 'downloads',
 'views',
 'metric_citations',
 'metrics_full']

In [11]:
dryad_df[['id', 'title', 'abstract', 'related_publication_issn', 'usageNotes']].head(10)

id,title,abstract,related_publication_issn,usageNotes
i64,str,str,str,str
93,"""Distinct predatory behaviors i…","""<p>Over the Cenozoic, large ca…","""0960-9822""",
94,"""Data from: Phylogenomic insigh…","""The stinging wasps (Hymenopter…","""0960-9822""","""<div class=""o-metadata__file-u…"
95,"""Data from: Complex selection o…","""Adaptive variation in social b…","""0962-1083""","""<div class=""o-metadata__file-u…"
96,"""Data from: The evolution of ma…","""Understanding the causes of la…","""0003-0147""","""<div class=""o-metadata__file-u…"
97,"""Data from: Gene discovery in G…","""[No abstract filled]""","""1439-4227""","""<div class=""o-metadata__file-u…"
98,"""Data from: ""Comparative genomi…","""To advance comparative genomic…","""1755-0998""","""<div class=""o-metadata__file-u…"
99,"""Data from: Demographic modelli…","""Aim: Climate warming is causin…","""1365-2699""","""<div class=""o-metadata__file-u…"
100,"""Data from: A chromosomal-scale…","""Background: Teak, a member of …","""2047-217X""","""<div class=""o-metadata__file-u…"
101,"""Data from: The chinchilla as a…","""Several parameters are importa…","""2054-5703""","""<div class=""o-metadata__file-u…"
102,"""Data from: Community trees: id…","""Groups of codistributed specie…","""1558-5646""","""<div class=""o-metadata__file-u…"


## Load Previously Downloaded Data

For future analysis, you can load the saved parquet file:

In [9]:
# Load previously saved data
# loaded_df = pl.read_parquet(OUTPUT_FILE)
# print(f"📂 Loaded {len(loaded_df):,} datasets from {OUTPUT_FILE}")
# print(f"🔍 Preview:\n{loaded_df.head()}")

print("💡 Uncomment the lines above to load previously saved data")

💡 Uncomment the lines above to load previously saved data
