# Encyclopedia of Life Dataset Catalog

Systematic survey of biodiversity datasets available through the Encyclopedia of Life (EOL) open data portal.

## Objectives

1. **Dataset Discovery**: Catalog all available datasets
2. **Data Currency**: Assess update frequencies and recency  
3. **Data Scale**: Determine dataset size and scope
4. **Data Quality**: Evaluate metadata completeness

## Methodology

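Dataset listings are scraped page by page from the EOL open data portal. For each dataset the title, URL, description, and tags are extracted; the collected metadata is then summarized statistically and scored for completeness.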

In [2]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import json
import os
import time
import re
from datetime import datetime
from urllib.parse import urljoin, urlparse
import warnings
# Silence all Python warnings for the rest of the session so library notices do
# not clutter cell output.
# NOTE(review): this also hides warnings that may point at real problems;
# consider narrowing the filter.
warnings.filterwarnings('ignore')

# Plot appearance: default matplotlib style, seaborn 'husl' colour palette and a
# 14x8 default figure size.
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (14, 8)
# Show every DataFrame column when displaying frames instead of eliding some.
pd.set_option('display.max_columns', None)

## Dataset Discovery

In [3]:
def safe_request(url, retries=3, delay=1):
    """Fetch *url* with a GET request, retrying on failure.

    Each attempt uses a 15-second timeout and a browser-like User-Agent.
    Any ``requests.RequestException`` (including HTTP error statuses raised by
    ``raise_for_status``) counts as a failed attempt.  Between attempts the
    function sleeps ``delay * attempt_number`` seconds, so the wait grows
    linearly.

    Returns the successful response, or None once all *retries* attempts
    have failed.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    attempts_made = 0
    while attempts_made < retries:
        try:
            reply = requests.get(url, timeout=15, headers=request_headers)
            reply.raise_for_status()
        except requests.RequestException:
            attempts_made += 1
            # No point in sleeping after the final failed attempt.
            if attempts_made < retries:
                time.sleep(delay * attempts_made)
        else:
            return reply
    return None

# Dataset listing page of the EOL open data portal; used as the scraping entry
# point and as the base for resolving relative dataset links.
eol_portal_url = 'https://opendata.eol.org/dataset'

In [4]:
def detect_total_pages(base_url=eol_portal_url):
    """Return the highest page number referenced by links on *base_url*.

    Every ``<a href=...>`` on the page is scanned for a ``page=<number>``
    fragment and the largest number found is returned.  Returns None when the
    page cannot be fetched or no positive page number is linked.
    """
    listing = safe_request(base_url)
    if not listing:
        return None

    parsed = BeautifulSoup(listing.content, 'html.parser')
    page_numbers = []
    for anchor in parsed.find_all('a', href=True):
        hit = re.search(r'page=(\d+)', anchor.get('href', ''))
        if hit:
            page_numbers.append(int(hit.group(1)))

    highest = max(page_numbers, default=0)
    return highest if highest > 0 else None

def extract_dataset_info(element):
    """Build a metadata record from one dataset listing element.

    The record holds the title and absolute URL taken from the element's first
    link (or from the element itself when it is an ``<a>`` tag), an optional
    description capped at 500 characters, and a list of tag texts.  Returns
    None when no non-empty title can be found.
    """
    anchor = element if element.name == 'a' else element.find('a')
    if not anchor:
        return None

    title = anchor.get_text().strip()
    if not title:
        return None

    record = {
        'title': title,
        'url': urljoin(eol_portal_url, anchor.get('href', '')),
    }

    # First <p>, else the first element classed 'notes', else 'description'.
    summary = element.find('p') or element.find(class_='notes') or element.find(class_='description')
    if summary:
        record['description'] = summary.get_text().strip()[:500]

    # Any descendant whose class contains "tag" or "label" is treated as a tag.
    tag_pattern = re.compile('tag|label')
    record['tags'] = [node.get_text().strip() for node in element.find_all(class_=tag_pattern)]

    return record

def scrape_dataset_page(page_num):
    """Scrape one listing page and return its dataset records.

    The page ``<eol_portal_url>?page=<page_num>`` is fetched and searched with
    a list of candidate CSS selectors, in order.  The first selector that
    matches any element decides the result: each matched element is parsed
    with ``extract_dataset_info`` and the records that could be parsed are
    returned.  Returns an empty list when the page cannot be fetched or no
    selector matches.
    """
    url = f'{eol_portal_url}?page={page_num}'
    response = safe_request(url)
    if not response:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    selectors = ['li.dataset-item', 'div.dataset-item', '.package-item', 'li[class*="dataset"]']

    for selector in selectors:
        elements = soup.select(selector)
        if elements:
            # Parse each element exactly once, then drop the ones that did not
            # yield a record (previously every element was parsed twice).
            parsed = (extract_dataset_info(elem) for elem in elements)
            return [info for info in parsed if info]

    return []

# Probe the portal once for its page count; None means it could not be determined.
total_pages = detect_total_pages()
if total_pages:
    print(f'Detected {total_pages} pages')
else:
    print('Using dynamic detection')

Detected 37 pages


In [5]:
def collect_all_datasets(max_pages=None):
    """Scrape listing pages 1..N and return all dataset records in page order.

    N is *max_pages* if given, otherwise the detected ``total_pages``,
    otherwise 50.  An empty page within the first five pages is tolerated;
    the first empty page after page 5 ends the crawl.  The function pauses one
    second after every page it processes.
    """
    page_limit = max_pages or total_pages or 50
    collected = []

    for page_number in range(1, page_limit + 1):
        found = scrape_dataset_page(page_number)
        if found:
            collected.extend(found)
        elif page_number > 5:
            # Past the early pages an empty result is taken as the end.
            break
        time.sleep(1)

    return collected

# Crawl the portal listing (one request per page, with a one-second pause after
# each page) and keep the parsed records for the analysis cells below.
all_datasets = collect_all_datasets()
print(f'Collected {len(all_datasets)} datasets')

Collected 739 datasets


In [None]:
df = pd.DataFrame(all_datasets)

# extract_dataset_info() only sets 'description' when the listing has one, and
# an unsuccessful crawl yields no records at all, so a column can be missing.
# Add any absent column (filled with None) instead of failing with a KeyError;
# existing columns and their order are left untouched.
for _column in ('title', 'url', 'description', 'tags'):
    if _column not in df.columns:
        df[_column] = None

# Metadata completeness counts
total_datasets = len(df)
with_descriptions = df['description'].notna().sum()
with_tags = df['tags'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False).sum()
with_urls = df['url'].notna().sum()


def _share(count):
    """Return *count* as a percentage of all datasets (0.0 when there are none)."""
    return count / total_datasets * 100 if total_datasets else 0.0


print(f'Total datasets: {total_datasets}')
print(f'With descriptions: {with_descriptions} ({_share(with_descriptions):.1f}%)')
print(f'With tags: {with_tags} ({_share(with_tags):.1f}%)')
print(f'With URLs: {with_urls} ({_share(with_urls):.1f}%)')

In [None]:
# Tag frequency analysis
all_tags = [tag for tags in df['tags'] if isinstance(tags, list) for tag in tags]
# Give the empty fallback an explicit integer dtype: a bare pd.Series() relies on
# a default dtype that differs between pandas versions (and warns in 1.x).
tag_counts = pd.Series(all_tags).value_counts() if all_tags else pd.Series(dtype='int64')

# Quality assessment: a dataset is "high quality" when it has a description
# longer than 50 characters, at least one tag, and a URL.
has_long_description = df['description'].str.len() > 50
has_tags = df['tags'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False).astype(bool)
has_url = df['url'].notna()
high_quality = df[has_long_description & has_tags & has_url]

# Avoid a ZeroDivisionError when no datasets were collected.
high_quality_share = len(high_quality) / total_datasets * 100 if total_datasets else 0.0

print(f'Unique tags: {len(tag_counts)}')
print(f'Top tags: {", ".join(tag_counts.head(5).index.tolist()) if len(tag_counts) > 0 else "None"}')
print(f'High quality datasets: {len(high_quality)} ({high_quality_share:.1f}%)')

## Dataset Analysis

In [None]:
# 2x2 overview figure: description lengths, top tags, completeness, quality tiers
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_lengths, ax_tags), (ax_complete, ax_quality) = axes

# Top-left: histogram of description lengths (missing descriptions count as 0)
desc_lengths = df['description'].str.len().fillna(0)
ax_lengths.hist(desc_lengths, bins=20, edgecolor='black', alpha=0.7)
ax_lengths.set_xlabel('Description Length (characters)')
ax_lengths.set_ylabel('Count')
ax_lengths.set_title('Description Length Distribution')
ax_lengths.grid(alpha=0.3)

# Top-right: the ten most frequent tags, or a placeholder when none were found
if len(tag_counts) == 0:
    ax_tags.text(0.5, 0.5, 'No tags found', ha='center', va='center', transform=ax_tags.transAxes)
else:
    top_tags = tag_counts.head(10)
    bar_positions = range(len(top_tags))
    ax_tags.barh(bar_positions, top_tags.values)
    ax_tags.set_yticks(bar_positions)
    ax_tags.set_yticklabels(top_tags.index)
    ax_tags.set_xlabel('Frequency')
    ax_tags.set_title('Top Dataset Tags')
    ax_tags.grid(alpha=0.3)

# Bottom-left: number of datasets having each kind of metadata
completeness = {
    'URLs': with_urls,
    'Descriptions': with_descriptions,
    'Tags': with_tags,
    'High Quality': len(high_quality),
}

ax_complete.bar(completeness.keys(), completeness.values())
ax_complete.set_ylabel('Count')
ax_complete.set_title('Metadata Completeness')
ax_complete.tick_params(axis='x', rotation=45)
ax_complete.grid(alpha=0.3)

# Bottom-right: quality tiers -- Basic = has a URL; Medium = URL plus a
# description longer than 20 characters; High = the high_quality subset
quality_levels = ['Basic', 'Medium', 'High']
url_present = df['url'].notna()
medium_mask = (df['description'].str.len() > 20) & url_present
quality_counts = [
    url_present.sum(),
    len(df[medium_mask]),
    len(high_quality),
]

ax_quality.bar(quality_levels, quality_counts)
ax_quality.set_ylabel('Count')
ax_quality.set_title('Dataset Quality Levels')
ax_quality.grid(alpha=0.3)

plt.tight_layout()
plt.show()

basic_count, medium_count, high_count = quality_counts
print(f'Average description length: {desc_lengths.mean():.0f} characters')
print(f'Quality distribution: Basic={basic_count}, Medium={medium_count}, High={high_count}')

In [1]:
# Data currency and scale analysis
current_year = datetime.now().year
recent_cutoff = current_year - 5  # "recent" = mentions a year from the last five years
date_patterns = [r'\b(20\d{2})\b', r'\b(19\d{2})\b']

years_found = []       # every year mention; used for the overall year range
total_with_years = 0   # datasets whose description mentions at least one year
recent_datasets = 0    # datasets whose latest mentioned year is >= recent_cutoff

# Heuristic: any standalone 19xx/20xx number in a description is read as a year.
# Counting is done per dataset so the figures match their labels (previously
# every year mention was counted, so one description could count several times).
for desc in df['description'].fillna(''):
    years_in_desc = [
        int(year)
        for pattern in date_patterns
        for year in re.findall(pattern, desc)
    ]
    if not years_in_desc:
        continue
    years_found.extend(years_in_desc)
    total_with_years += 1
    if max(years_in_desc) >= recent_cutoff:
        recent_datasets += 1

# Number of descriptions mentioning each keyword (case-insensitive regex match)
scale_indicators = {
    'species': int(df['description'].str.contains('species', case=False, na=False).sum()),
    'records': int(df['description'].str.contains('records?', case=False, na=False).sum()),
    'traits': int(df['description'].str.contains('traits?', case=False, na=False).sum()),
    'observations': int(df['description'].str.contains('observations?', case=False, na=False).sum())
}

print('Data Currency')
print(f'Datasets with year references: {total_with_years}')
# Show the actual cutoff instead of a hard-coded "2020+" label.
print(f'Recent datasets ({recent_cutoff}+): {recent_datasets}')
print(f'Year range: {min(years_found) if years_found else "N/A"} - {max(years_found) if years_found else "N/A"}')

print('\nData Scale')
for indicator, count in scale_indicators.items():
    print(f'Datasets mentioning "{indicator}": {count}')

# Assemble the JSON summary. Counts that come from pandas are cast to plain
# ints because the json module cannot serialise numpy integer types.
metadata_counts = {
    'with_descriptions': int(with_descriptions),
    'with_tags': int(with_tags),
    'with_urls': int(with_urls),
    'high_quality': int(len(high_quality)),
}

if years_found:
    year_range = [min(years_found), max(years_found)]
else:
    year_range = None

currency_summary = {
    'datasets_with_years': total_with_years,
    'recent_datasets': recent_datasets,
    'year_range': year_range,
}

export_data = {
    'discovery_date': datetime.now().isoformat(),
    'total_datasets': int(total_datasets),
    'metadata_completeness': metadata_counts,
    'data_currency': currency_summary,
    'data_scale': scale_indicators,
}

# Default export directory (relative to the current working directory);
# created on demand.
default_path = '../data/raw'
os.makedirs(default_path, exist_ok=True)

summary_file = f'{default_path}/eol_dataset_summary.json'
csv_file = f'{default_path}/eol_datasets.csv'

# Write the analysis summary (JSON) and the full dataset table (CSV)
with open(summary_file, 'w') as summary_handle:
    json.dump(export_data, summary_handle, indent=2)
df.to_csv(csv_file, index=False)

export_dir = os.path.abspath(default_path)
print(f'\nExported {total_datasets} datasets to: {export_dir}')
print(f'- CSV file: eol_datasets.csv ({len(df)} datasets)')
print('- JSON summary: eol_dataset_summary.json (analysis results)')
print('\nFiles ready for analysis or sharing.')

# Optional: User-specified additional save location.
# Only the instructions are printed here; the interactive prompt itself is
# disabled (see the commented-out code below).
banner = '='*60
for _message in (
    '\n' + banner,
    'OPTIONAL: Additional Export Location',
    banner,
    'You can specify an additional location to save a copy of the files.',
    'Examples:',
    '  - C:/Users/YourName/Documents/EOL_Data',
    '  - ~/Desktop/biodiversity_data',
    '  - Leave blank to skip additional export',
    '',
):
    print(_message)

# Uncomment the lines below to enable interactive export location selection:
# additional_path = input('Enter additional save path (or press Enter to skip): ').strip()
# 
# if additional_path:
#     try:
#         os.makedirs(additional_path, exist_ok=True)
#         
#         with open(f'{additional_path}/eol_dataset_summary.json', 'w') as f:
#             json.dump(export_data, f, indent=2)
#         
#         df.to_csv(f'{additional_path}/eol_datasets.csv', index=False)
#         print(f'✓ Successfully exported to additional location: {additional_path}')
#         
#     except Exception as e:
#         print(f'✗ Error saving to {additional_path}: {e}')
# else:
#     print('Skipping additional export location.')

# Final recap of what was written and where
print('\nExport Summary:')
print(f'- CSV file: {total_datasets} datasets with full metadata')
print('- JSON summary: Complete analysis results and statistics')
print(f'- Default location: {os.path.abspath(default_path)}')

NameError: name 'datetime' is not defined

In [None]:
# Optional: Export to custom location using file dialog
# Run this cell if you want to save files to an additional location


def _export_to_directory(target_dir):
    """Write the JSON summary and the dataset CSV into *target_dir*.

    The directory is created if it does not exist.  Returns the pair
    ``(summary_path, csv_path)`` of the files written.
    """
    os.makedirs(target_dir, exist_ok=True)
    summary_path = os.path.join(target_dir, 'eol_dataset_summary.json')
    csv_path = os.path.join(target_dir, 'eol_datasets.csv')

    with open(summary_path, 'w') as f:
        json.dump(export_data, f, indent=2)
    df.to_csv(csv_path, index=False)

    return summary_path, csv_path


try:
    import tkinter as tk
    from tkinter import filedialog

    print('Export to Additional Location')
    print('='*40)
    print('Click "Run" to open file dialog and choose export location.')
    print('(A file browser window will appear)')

    # Hidden root window that owns the dialog
    root = tk.Tk()
    try:
        root.withdraw()  # Hide the main window
        root.attributes('-topmost', True)  # Bring dialog to front

        # Open folder selection dialog
        custom_path = filedialog.askdirectory(
            title="Select folder to export EOL dataset files",
            mustexist=False
        )
    finally:
        # Always tear the root window down, even if the dialog raises,
        # so a failed attempt does not leave a stray Tk window behind.
        root.destroy()

    if custom_path:
        custom_summary_file, custom_csv_file = _export_to_directory(custom_path)

        print('✓ Successfully exported files to:')
        print(f'  {os.path.abspath(custom_path)}')
        print('  - eol_dataset_summary.json')
        print('  - eol_datasets.csv')

        # Show file sizes
        summary_size = os.path.getsize(custom_summary_file)
        csv_size = os.path.getsize(custom_csv_file)
        print('\nFile sizes:')
        print(f'  - Summary JSON: {summary_size:,} bytes')
        print(f'  - Dataset CSV: {csv_size:,} bytes')

    else:
        print('No folder selected. Export cancelled.')

except ImportError:
    # tkinter is not installed: ask for the path on the console instead
    print('tkinter not available. Using text input fallback.')
    print('\nEnter custom export path:')
    print('Examples:')
    print('  Windows: C:/Users/YourName/Documents/EOL_Analysis')
    print('  Mac/Linux: ~/Documents/EOL_Analysis')

    custom_path = input('Enter save path (or press Enter to skip): ').strip()

    if custom_path:
        try:
            # Expand a leading "~" so home-relative paths work as advertised
            custom_path = os.path.expanduser(custom_path)
            _export_to_directory(custom_path)
            print(f'✓ Files exported to: {os.path.abspath(custom_path)}')

        except Exception as e:
            print(f'✗ Error: {e}')
    else:
        print('Export skipped.')

except Exception as e:
    # Best-effort cell: report the problem and leave the default export untouched
    print(f'✗ Error during export: {e}')
    print('Files remain available in default location.')