In [13]:
import pandas as pd 
from ena_sample_extractor import ENASampleExtractor

In [14]:
codes = list(pd.read_csv('disease.csv')['Accession #'])
codes

['PRJNA1131598',
 'PRJNA938107',
 'PRJNA819279',
 'PRJEB47011',
 'PRJNA945212',
 'PRJNA950484',
 'PRJNA877411',
 'PRJEB43871',
 'PRJEB47555',
 'PRJNA647236',
 'HRA004410',
 'PRJNA1053658',
 'HRA006733',
 'PRJNA895415',
 'PRJNA417939',
 'PRJNA1077687',
 'PRJEB66206',
 'PRJNA962831',
 'PRJNA1088637',
 'PRJNA1088637']

In [15]:
extractor = ENASampleExtractor()
results = extractor.process_multiple_studies(codes)

Fetching samples for study: PRJNA1131598
✓ Found 169 samples for PRJNA1131598
Fetching samples for study: PRJNA938107
✓ Found 120 samples for PRJNA938107
Fetching samples for study: PRJNA819279
✓ Found 66 samples for PRJNA819279
Fetching samples for study: PRJEB47011
✓ Found 105 samples for PRJEB47011
Fetching samples for study: PRJNA945212
✓ Found 193 samples for PRJNA945212
Fetching samples for study: PRJNA950484
✓ Found 57 samples for PRJNA950484
Fetching samples for study: PRJNA877411
✓ Found 60 samples for PRJNA877411
Fetching samples for study: PRJEB43871
✓ Found 135 samples for PRJEB43871
Fetching samples for study: PRJEB47555
✓ Found 216 samples for PRJEB47555
Fetching samples for study: PRJNA647236
✓ Found 54 samples for PRJNA647236
Fetching samples for study: HRA004410
✓ Found 0 samples for HRA004410
Fetching samples for study: PRJNA1053658
✓ Found 80 samples for PRJNA1053658
Fetching samples for study: HRA006733
✓ Found 0 samples for HRA006733
Fetching samples for study: PRJ

In [16]:
import json
import os

# Create output directory for JSON files
output_dir = "accession_json_files"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Group samples by study accession
studies_data = {}
for sample in results["samples"]:
    study_acc = sample.get('study_accession', 'Unknown')
    if study_acc not in studies_data:
        studies_data[study_acc] = []
    studies_data[study_acc].append(sample)

# Create JSON file for each accession code
for study_acc, samples in studies_data.items():
    # Create the JSON structure with study accession, total samples, and dicts of lists
    json_data = {
        "study_accession": study_acc,
        "total_samples_found": len(samples),
        "alias": [sample.get('sample_alias', 'N/A') for sample in samples],
        "title": [sample.get('sample_title', 'N/A') for sample in samples]
    }
    
    # Save to JSON file
    filename = f"{output_dir}/{study_acc}_metadata.json"
    with open(filename, 'w') as f:
        json.dump(json_data, f, indent=2)
    
    print(f"✓ Created JSON file: {filename}")

print(f"\n✓ All JSON files created in '{output_dir}' directory")
print(f"✓ Total files created: {len(studies_data)}")

✓ Created JSON file: accession_json_files/PRJNA1131598_metadata.json
✓ Created JSON file: accession_json_files/PRJNA938107_metadata.json
✓ Created JSON file: accession_json_files/PRJNA819279_metadata.json
✓ Created JSON file: accession_json_files/PRJEB87787;PRJEB47011_metadata.json
✓ Created JSON file: accession_json_files/PRJNA945212_metadata.json
✓ Created JSON file: accession_json_files/PRJNA950484_metadata.json
✓ Created JSON file: accession_json_files/PRJNA877411_metadata.json
✓ Created JSON file: accession_json_files/PRJEB43871_metadata.json
✓ Created JSON file: accession_json_files/PRJEB47555_metadata.json
✓ Created JSON file: accession_json_files/PRJNA647236_metadata.json
✓ Created JSON file: accession_json_files/PRJNA1053658_metadata.json
✓ Created JSON file: accession_json_files/PRJNA895415_metadata.json
✓ Created JSON file: accession_json_files/PRJNA417939_metadata.json
✓ Created JSON file: accession_json_files/PRJNA1077687_metadata.json
✓ Created JSON file: accession_json_f

In [3]:
pd.read_csv('ena_samples_metadata.csv')

Unnamed: 0,sample_accession,secondary_sample_accession,sample_alias,sample_title,description,scientific_name,study_accession
0,SAMN42227062,SRS21892939,5,MC06.2,Diet therapy for 1 months,gut metagenome,PRJNA1131598
1,SAMN42227065,SRS21892915,8,MC16.1,Baseline,gut metagenome,PRJNA1131598
2,SAMN42227069,SRS21892999,12,MC18.1,Baseline,gut metagenome,PRJNA1131598
3,SAMN42227074,SRS21892891,17,MC20.2,Diet therapy for 1 months,gut metagenome,PRJNA1131598
4,SAMN42227075,SRS21892895,18,MC21.1,Baseline,gut metagenome,PRJNA1131598
...,...,...,...,...,...,...,...
164,SAMN42227208,SRS21893014,151,MT38.2,Diet therapy for 1 months,gut metagenome,PRJNA1131598
165,SAMN42227209,SRS21893012,152,MT39.1,Baseline,gut metagenome,PRJNA1131598
166,SAMN42227210,SRS21893015,153,MT39.2,Diet therapy for 1 months,gut metagenome,PRJNA1131598
167,SAMN42227216,SRS21892894,159,MT42.1,Baseline,gut metagenome,PRJNA1131598
