Merge pull request #275 from SystemsGenetics/dev

Dev
SystemsGenetics · Feb 3, 2023 · c7fe336 · c7fe336
2 parents 3107932 + 92c0c78
commit c7fe336
Show file tree

Hide file tree

Showing 5 changed files with 115 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,13 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v2.1.0 - [date]
+## v2.1.1 - 2023-02-02
+
+### `Fixed`
+
+- A bug which prevented retrieval of NCBI's SRA metadata.
+
+## v2.1.0 - 2022-02-11
 
 This release is a major reconfiguration of GEMmaker to meet the [nf-core](https://nf-co.re/) standards. It also includes multiple bug fixes.
 

diff --git a/bin/retrieve_sra_metadata.py b/bin/retrieve_sra_metadata.py
@@ -65,7 +65,9 @@ def download_runs_meta(run_ids, meta_dir, page_size=100):
             sys.stderr.write("ERROR Retrieving SRA Metadata: %s\n" % (e.reason))
             continue
 
-        response_xml = response_obj.read().decode(response_obj.headers.get_content_charset())
+        response = response_obj.read()
+        charset = response_obj.headers.get_content_charset(failobj="utf-8")
+        response_xml = response.decode(charset)
         response = xmltodict.parse(response_xml)
 
         # Write out the XML for debugging purposes should something fail

diff --git a/bin/sra_meta2biosample.py b/bin/sra_meta2biosample.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+"""
+A Python script for converting GEMmaker sample JSON metadata to a tab-delimited file.
+
+.. module:: GEMmaker
+    :platform: UNIX, Linux
+    :synopsis: This script recursively reads in all the JSON files in a sample directory
+        and parses them to find NCBI BioSample details. Results are saved into a
+        tab-delimited file.
+"""
+
+
+import json
+import glob
+from pathlib import Path
+import urllib
+import xmltodict
+import os
+import argparse
+import pandas as pd
+import pprint
+import json
+import sys
+import re
+
+def parse_meta(meta):
+   accession = None
+   sample = {}
+
+   # Get the accession of this metadata record
+   if '@accession' in meta.keys():
+       accession = meta['@accession'] 
+   else:
+       return sample
+
+   # If this is an SRA sample accession ID then continue
+   regexp = re.compile(r'^.RS\d+$')
+   if not regexp.search(accession):
+       return sample
+
+   if 'IDENTIFIERS' in meta.keys():
+       if 'EXTERNAL_ID' in meta['IDENTIFIERS'].keys():
+           external_ids = meta['IDENTIFIERS']['EXTERNAL_ID']
+           if isinstance(external_ids, dict):
+               external_ids = [external_ids]
+           for external_id in external_ids:
+               if ('@namespace' in external_id) & (external_id['@namespace'] == 'BioSample'):
+                   sample['ncbi_biosample_accession'] = external_id['#text']
+
+   # If no BioSample accession was found then skip this one.
+   if not ('ncbi_biosample_accession' in sample.keys()):
+       pp.pprint(meta)
+       return sample
+
+   # Add in any sample attributes
+   if 'SAMPLE_ATTRIBUTES' in meta.keys():
+       attrs = meta['SAMPLE_ATTRIBUTES']['SAMPLE_ATTRIBUTE']
+       if isinstance(attrs, dict):
+           attrs = [attrs]
+       for attr in attrs: 
+          sample[attr['TAG']] = attr['VALUE']
+
+   # If there is a description for the sample, store it under 'description',
+   # replacing any attribute value that used the same key.
+   if 'DESCRIPTION' in meta.keys():
+       sample['description'] = meta['DESCRIPTION']
+
+   # Use the accession as the sample name.
+   sample['sample_name'] = accession
+   return sample
+
+
+
+if __name__ == "__main__":
+    pp = pprint.PrettyPrinter(indent=4)
+
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dir", help="A directory containing JSON files. Use this argument as often as needed", required=True, action='append')
+    parser.add_argument("--out", help="The output file name", required=True)
+    args = parser.parse_args()
+
+    samples = []
+
+    num_files = 0;
+    for metadir in args.dir:
+       for path in Path(metadir).rglob('*.json'):
+           num_files = num_files + 1
+           meta_file = open(path)
+           meta = json.load(meta_file)
+           sample = parse_meta(meta)
+           if sample:
+               samples.append(sample)
+
+    samples = pd.DataFrame(samples)
+    total_found = samples.shape[0]
+    samples.drop_duplicates(['ncbi_biosample_accession'],inplace=True)
+    num_samples = samples.shape[0]
+    print("Parsed {} files. Found {} samples with {} unique.".format(num_files, total_found, num_samples), file=sys.stderr)
+    samples.to_csv(args.out, sep="\t", index=False)
diff --git a/environment.yml b/environment.yml
@@ -1,6 +1,6 @@
 # You can use this file to create a conda environment for this pipeline:
 #   conda env create -f environment.yml
-name: systemsgenetics-gemmaker-2.1.0
+name: systemsgenetics-gemmaker-2.1.1
 channels:
   - conda-forge
   - bioconda

diff --git a/nextflow.config b/nextflow.config
@@ -211,7 +211,7 @@ manifest {
       description = 'GEMmaker is a Nextflow workflow for large-scale gene expression sample processing, expression-level quantification and Gene Expression Matrix (GEM) construction. Results from GEMmaker are useful for differential gene expression (DGE) and gene co-expression network (GCN) analyses. The GEMmaker workflow currently supports Illumina RNA-seq datasets.'
       mainScript = 'main.nf'
       nextflowVersion = '>=21.04.0'
-      version = '2.1.0'
+      version = '2.1.1'
 }
 
 // Function to ensure that resource requirements don't go beyond
@@ -245,4 +245,4 @@ def check_max(obj, type) {
             return obj
         }
     }
-}
+}