In [2]:
"""
ENA Sample Metadata Extractor

This script retrieves sample information (aliases, titles, accessions)
for given study accessions using the ENA Portal API.
"""

import requests
import pandas as pd
import sys
from typing import List, Dict, Optional

class ENASampleExtractor:
    """Class to extract sample metadata from ENA using study accessions."""

    def __init__(self):
        self.base_url = "https://www.ebi.ac.uk/ena/portal/api/search"
        self.default_fields = [
            "sample_accession",
            "secondary_sample_accession",
            "sample_alias",
            "sample_title",
            "description",
            "scientific_name",
            "study_accession"
        ]

    def get_samples_for_study(self, study_accession: str,
                            fields: Optional[List[str]] = None,
                            output_format: str = "json") -> Dict:
        """
        Get all samples associated with a study accession.

        Args:
            study_accession: Study accession (e.g., 'PRJNA1131598')
            fields: List of fields to retrieve (uses default if None)
            output_format: 'json' or 'tsv'

        Returns:
            Dictionary with sample data or error information
        """
        if fields is None:
            fields = self.default_fields

        params = {
            "result": "sample",
            "query": f'study_accession="{study_accession}"',
            "fields": ",".join(fields),
            "format": output_format,
            "limit": 0  # Get all results
        }

        try:
            print(f"Fetching samples for study: {study_accession}")
            response = requests.get(self.base_url, params=params, timeout=30)
            response.raise_for_status()

            if output_format == "json":
                data = response.json()
                return {
                    "success": True,
                    "study_accession": study_accession,
                    "sample_count": len(data),
                    "samples": data
                }
            else:  # TSV format
                return {
                    "success": True,
                    "study_accession": study_accession,
                    "data": response.text
                }

        except requests.exceptions.RequestException as e:
            return {
                "success": False,
                "error": f"API request failed: {str(e)}",
                "study_accession": study_accession
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Unexpected error: {str(e)}",
                "study_accession": study_accession
            }

    def process_multiple_studies(self, study_accessions: List[str]) -> Dict:
        """
        Process multiple study accessions and combine results.

        Args:
            study_accessions: List of study accessions

        Returns:
            Combined results from all studies
        """
        all_samples = []
        failed_studies = []

        for study_acc in study_accessions:
            result = self.get_samples_for_study(study_acc)

            if result["success"]:
                samples = result["samples"]
                print(f"✓ Found {len(samples)} samples for {study_acc}")
                all_samples.extend(samples)
            else:
                print(f"✗ Failed to get samples for {study_acc}: {result['error']}")
                failed_studies.append({
                    "study_accession": study_acc,
                    "error": result["error"]
                })

        return {
            "success": len(failed_studies) == 0,
            "total_samples": len(all_samples),
            "samples": all_samples,
            "failed_studies": failed_studies
        }

    def save_to_csv(self, samples_data: List[Dict], filename: str) -> bool:
        """
        Save sample data to CSV file.

        Args:
            samples_data: List of sample dictionaries
            filename: Output filename

        Returns:
            True if successful, False otherwise
        """
        try:
            df = pd.DataFrame(samples_data)
            df.to_csv(filename, index=False)
            print(f"✓ Saved {len(samples_data)} samples to {filename}")
            return True
        except Exception as e:
            print(f"✗ Failed to save to CSV: {str(e)}")
            return False

    def print_summary(self, samples_data: List[Dict]) -> None:
        """Print a summary of the retrieved samples."""
        if not samples_data:
            print("No samples found.")
            return

        print(f"\n=== SUMMARY ===")
        print(f"Total samples found: {len(samples_data)}")

        # Group by study
        studies = {}
        for sample in samples_data:
            study = sample.get('study_accession', 'Unknown')
            if study not in studies:
                studies[study] = []
            studies[study].append(sample)

        print(f"Studies processed: {len(studies)}")
        for study, samples in studies.items():
            print(f"  {study}: {len(samples)} samples")

        # Show first few samples as examples
        print(f"\n=== SAMPLE EXAMPLES ===")
        for i, sample in enumerate(samples_data[:3]):
            print(f"\nSample {i+1}:")
            print(f"  Accession: {sample.get('sample_accession', 'N/A')}")
            print(f"  Alias: {sample.get('sample_alias', 'N/A')}")
            print(f"  Title: {sample.get('sample_title', 'N/A')}")
            print(f"  Scientific name: {sample.get('scientific_name', 'N/A')}")

In [3]:
codes = list(pd.read_csv('disease.csv')['Accession #'])
extractor = ENASampleExtractor()
results = extractor.process_multiple_studies(codes)

Fetching samples for study: PRJNA1131598
✓ Found 169 samples for PRJNA1131598
Fetching samples for study: PRJNA938107
✓ Found 120 samples for PRJNA938107
Fetching samples for study: PRJNA819279
✓ Found 66 samples for PRJNA819279
Fetching samples for study: PRJEB47011
✓ Found 105 samples for PRJEB47011
Fetching samples for study: PRJNA945212
✓ Found 193 samples for PRJNA945212
Fetching samples for study: PRJNA950484
✓ Found 57 samples for PRJNA950484
Fetching samples for study: PRJNA877411
✓ Found 60 samples for PRJNA877411
Fetching samples for study: PRJEB43871
✓ Found 135 samples for PRJEB43871
Fetching samples for study: PRJEB47555
✓ Found 216 samples for PRJEB47555
Fetching samples for study: PRJNA647236
✓ Found 54 samples for PRJNA647236
Fetching samples for study: HRA004410
✓ Found 0 samples for HRA004410
Fetching samples for study: PRJNA1053658
✓ Found 80 samples for PRJNA1053658
Fetching samples for study: HRA006733
✓ Found 0 samples for HRA006733
Fetching samples for study: PRJ

In [4]:
import json
import os

# Create output directory for JSON files
output_dir = "accession_json_files"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Group samples by study accession
studies_data = {}
for sample in results["samples"]:
    study_acc = sample.get('study_accession', 'Unknown')
    if study_acc not in studies_data:
        studies_data[study_acc] = []
    studies_data[study_acc].append(sample)

# Create JSON file for each accession code
for study_acc, samples in studies_data.items():
    # Create the JSON structure with study accession, total samples, and dicts of lists
    json_data = {
        "study_accession": study_acc,
        "total_samples_found": len(samples),
        "alias": [sample.get('sample_alias', 'N/A') for sample in samples],
        "title": [sample.get('sample_title', 'N/A') for sample in samples]
    }

    # Save to JSON file
    filename = f"{output_dir}/{study_acc}_metadata.json"
    with open(filename, 'w') as f:
        json.dump(json_data, f, indent=2)

    print(f"✓ Created JSON file: {filename}")

print(f"\n✓ All JSON files created in '{output_dir}' directory")
print(f"✓ Total files created: {len(studies_data)}")

✓ Created JSON file: accession_json_files/PRJNA1131598_metadata.json
✓ Created JSON file: accession_json_files/PRJNA938107_metadata.json
✓ Created JSON file: accession_json_files/PRJNA819279_metadata.json
✓ Created JSON file: accession_json_files/PRJEB87787;PRJEB47011_metadata.json
✓ Created JSON file: accession_json_files/PRJNA945212_metadata.json
✓ Created JSON file: accession_json_files/PRJNA950484_metadata.json
✓ Created JSON file: accession_json_files/PRJNA877411_metadata.json
✓ Created JSON file: accession_json_files/PRJEB43871_metadata.json
✓ Created JSON file: accession_json_files/PRJEB47555_metadata.json
✓ Created JSON file: accession_json_files/PRJNA647236_metadata.json
✓ Created JSON file: accession_json_files/PRJNA1053658_metadata.json
✓ Created JSON file: accession_json_files/PRJNA895415_metadata.json
✓ Created JSON file: accession_json_files/PRJNA417939_metadata.json
✓ Created JSON file: accession_json_files/PRJNA1077687_metadata.json
✓ Created JSON file: accession_json_f