Skip to content

Commit

Permalink
Merge pull request #275 from SystemsGenetics/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
spficklin committed Feb 3, 2023
2 parents 3107932 + 92c0c78 commit c7fe336
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 5 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v2.1.0 - [date]
## v2.1.1 - 2023-02-02

### `Fixed`

- A bug which prevented retrieval of NCBI's SRA metadata.

## v2.1.0 - 2022-02-11

This release is a major reconfiguration of GEMmaker to meet the [nf-core](https://nf-co.re/) standards. It also includes multiple bug fixes.

Expand Down
4 changes: 3 additions & 1 deletion bin/retrieve_sra_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def download_runs_meta(run_ids, meta_dir, page_size=100):
sys.stderr.write("ERROR Retrieving SRA Metadata: %s\n" % (e.reason))
continue

response_xml = response_obj.read().decode(response_obj.headers.get_content_charset())
response = response_obj.read()
charset = response_obj.headers.get_content_charset(failobj="utf-8")
response_xml = response.decode(charset)
response = xmltodict.parse(response_xml)

# Write out the XML for debugging purposes should something fail
Expand Down
102 changes: 102 additions & 0 deletions bin/sra_meta2biosample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python3

"""
A Python script for converting GEMmaker sample JSON meta data to a tab delimited file
.. module:: GEMmaker
:platform: UNIX, Linux
:synopsis: This script recursively reads in all the JSON files in a sample directory
and parses them to find NCBI BioSample details. Results are saved into a
tab-delimited file.
"""


import json
import glob
from pathlib import Path
import urllib
import xmltodict
import os
import argparse
import pandas as pd
import pprint
import json
import sys
import re

# Accessions accepted as samples: any single character, then "RS", then digits.
_SAMPLE_ACCESSION_RE = re.compile(r'^.RS\d+$')


def parse_meta(meta):
    """
    Extract NCBI BioSample details from one parsed metadata record.

    :param meta: the object loaded from a sample JSON file.  Only dict
        records carrying an '@accession' key are considered.
    :return: a dict describing the sample, or an empty dict when the record
        is not a sample record or has no BioSample external ID.  A populated
        dict holds 'ncbi_biosample_accession', one key per sample attribute
        TAG (value is None when the attribute has no VALUE), an optional
        'description', and 'sample_name' (set to the record's accession).
    """
    sample = {}

    # Anything that is not a JSON object cannot be a sample record.
    if not isinstance(meta, dict):
        return sample

    # Get the accession; without one there is nothing to report.
    accession = meta.get('@accession')
    if not isinstance(accession, str):
        return sample

    # If this is an SRA sample accession ID then continue
    if not _SAMPLE_ACCESSION_RE.search(accession):
        return sample

    # Look for the BioSample accession among the external IDs.  A single
    # EXTERNAL_ID element is parsed as a dict rather than a list of dicts.
    identifiers = meta.get('IDENTIFIERS') or {}
    external_ids = identifiers.get('EXTERNAL_ID') or []
    if isinstance(external_ids, dict):
        external_ids = [external_ids]
    for external_id in external_ids:
        if not isinstance(external_id, dict):
            continue
        # Use .get() so entries lacking '@namespace' are skipped instead of
        # raising KeyError (the old bitwise '&' evaluated both operands).
        if external_id.get('@namespace') != 'BioSample':
            continue
        biosample = external_id.get('#text')
        if biosample:
            sample['ncbi_biosample_accession'] = biosample

    # If there is no sample then skip this one, dumping the record to
    # stdout so it can be inspected.  A local printer is used so that this
    # function does not depend on a global created by the script entry point.
    if 'ncbi_biosample_accession' not in sample:
        pprint.PrettyPrinter(indent=4).pprint(meta)
        return sample

    # Add in any sample attributes.  As with external IDs, a single
    # SAMPLE_ATTRIBUTE is a dict rather than a list.
    attr_section = meta.get('SAMPLE_ATTRIBUTES') or {}
    attrs = attr_section.get('SAMPLE_ATTRIBUTE') or []
    if isinstance(attrs, dict):
        attrs = [attrs]
    for attr in attrs:
        if not isinstance(attr, dict) or attr.get('TAG') is None:
            continue
        # An attribute may legitimately have a TAG but no VALUE.
        sample[attr['TAG']] = attr.get('VALUE')

    # If there is a description for the sample, it takes precedence over any
    # attribute that happens to be tagged 'description'.
    if 'DESCRIPTION' in meta:
        sample['description'] = meta['DESCRIPTION']

    # use the sample name as the accession.
    sample['sample_name'] = accession
    return sample



if __name__ == "__main__":
    # Script entry point: collect the samples found in every JSON file under
    # the given directories and write them to one tab-delimited file.

    # Kept as a module-level name because other code in this file may refer
    # to `pp` when dumping records.
    pp = pprint.PrettyPrinter(indent=4)

    # Parse command-line arguments
    parser = argparse.ArgumentParser()

    parser.add_argument("--dir", help="A directory containing JSON files. Use this argument as often as needed", required=True, action='append')
    parser.add_argument("--out", help="The output file name", required=True)
    args = parser.parse_args()

    samples = []

    num_files = 0
    for metadir in args.dir:
        # rglob descends into sub-directories of each --dir argument.
        for path in Path(metadir).rglob('*.json'):
            num_files += 1
            # Use a context manager so each file handle is closed promptly
            # rather than leaking one per JSON file.
            with open(path, encoding="utf-8") as meta_file:
                meta = json.load(meta_file)
            sample = parse_meta(meta)
            if sample:
                samples.append(sample)

    samples = pd.DataFrame(samples)
    total_found = samples.shape[0]
    # When nothing was found the frame has no columns, and de-duplicating on
    # a missing column would raise KeyError; skip it in that case.
    if 'ncbi_biosample_accession' in samples.columns:
        samples = samples.drop_duplicates(['ncbi_biosample_accession'])
    num_samples = samples.shape[0]
    print("Parsed {} files. Found {} samples with {} unique.".format(num_files, total_found, num_samples), file=sys.stderr)
    samples.to_csv(args.out, sep="\t", index=False)
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# You can use this file to create a conda environment for this pipeline:
# conda env create -f environment.yml
name: systemsgenetics-gemmaker-2.1.0
name: systemsgenetics-gemmaker-2.1.1
channels:
- conda-forge
- bioconda
Expand Down
4 changes: 2 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ manifest {
description = 'GEMmaker is a Nextflow workflow for large-scale gene expression sample processing, expression-level quantification and Gene Expression Matrix (GEM) construction. Results from GEMmaker are useful for differential gene expression (DGE) and gene co-expression network (GCN) analyses. The GEMmaker workflow currently supports Illumina RNA-seq datasets.'
mainScript = 'main.nf'
nextflowVersion = '>=21.04.0'
version = '2.1.0'
version = '2.1.1'
}

// Function to ensure that resource requirements don't go beyond
Expand Down Expand Up @@ -245,4 +245,4 @@ def check_max(obj, type) {
return obj
}
}
}
}

0 comments on commit c7fe336

Please sign in to comment.