# SCXA to H5AD conversion

Experiments to convert Single Cell Expression Atlas (SCXA) files (https://www.ebi.ac.uk/gxa/sc/experiments/E-CURD-134/downloads) to AnnData format.

In [1]:
## Import dependencies
import os
import scanpy as sc
import anndata as ad
import pandas as pd
from pandas.core.apply import frame_apply

import urllib.request as request
import zipfile

Download and prepare SCXA data. Download operation may take some time (file size is approx. 137.5 MB).

To speed up the process, you can manually download E-CURD-134-normalised-files.zip and copy it (renamed to normalised-files.zip) to the project root folder.

In [2]:
# Source URLs for the E-CURD-134 experiment on the Single Cell Expression Atlas.
normalised_data_url = "https://www.ebi.ac.uk/gxa/sc/experiment/E-CURD-134/download/zip?fileType=normalised&accessKey="
experiment_design_url = "https://www.ebi.ac.uk/gxa/sc/experiment/E-CURD-134/download?fileType=experiment-design&accessKey="


def _download(url, destination):
    """Download `url` to `destination` without ever leaving a partial file there.

    The payload is first written to `<destination>.part` and only moved into
    place once the transfer has finished. The cells below skip the download
    whenever the destination exists, so a truncated file at the final path
    would otherwise be mistaken for a complete one on every later run.
    """
    partial_path = destination + ".part"
    request.urlretrieve(url, partial_path)
    os.replace(partial_path, destination)


archive_path = os.path.join(os.getcwd(), "normalised-files.zip")
downloads_dir = os.path.join(os.getcwd(), "downloads")
design_path = os.path.join(downloads_dir, "ExpDesign.tsv")

# Suffix of each extracted file -> the file name the matrix-reading cell expects.
renamed_by_suffix = {
    ".mtx": "matrix.mtx",
    ".mtx_cols": "barcodes.tsv",
    ".mtx_rows": "genes.tsv",
}

if not os.path.exists(archive_path):
    # Download (skipped when the archive was copied in manually)
    _download(normalised_data_url, archive_path)

if not os.path.exists(os.path.join(downloads_dir, "matrix.mtx")):
    # Unzip
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(downloads_dir)
    # Rename files; the directory listing is taken once, so freshly renamed
    # files are not revisited.
    for filename in os.listdir(downloads_dir):
        for suffix, target_name in renamed_by_suffix.items():
            if filename.endswith(suffix):
                # os.replace overwrites leftovers from an earlier aborted run
                # on every platform.
                os.replace(os.path.join(downloads_dir, filename),
                           os.path.join(downloads_dir, target_name))
                break

if not os.path.exists(design_path):
    _download(experiment_design_url, design_path)

print('Experiment data is ready.')

Experiment data is ready.


Read downloaded data.

In [10]:
# Load the expression matrix from downloads/ into an AnnData object. The
# download cell renames the extracted files to matrix.mtx / barcodes.tsv /
# genes.tsv before this read.
# NOTE(review): var_names='gene_ids' is presumably chosen so that gene IDs
# (rather than symbols) become the variable names — confirm against the
# scanpy read_10x_mtx documentation for the installed version.
ann_data = sc.read_10x_mtx('downloads/', var_names='gene_ids')
# Preview the first four observation rows.
ann_data.obs.head(4)

SAMN13703199-AAACCTGAGCAGGTCA
SAMN13703199-AAACCTGAGCTTCGCG
SAMN13703199-AAACCTGAGGCTATCT
SAMN13703199-AAACCTGAGGTGCTAG


In [4]:
# Load the tab-separated experiment design table and key it by the 'Assay'
# column so rows can be looked up by assay identifier.
exp_design_path = os.path.join(os.getcwd(), "downloads/ExpDesign.tsv")
exp_design = pd.read_csv(exp_design_path, sep='\t', header=0)
exp_design = exp_design.set_index('Assay')
# Preview the first four rows.
exp_design.head(4)

Unnamed: 0_level_0,Sample Characteristic[organism],Sample Characteristic Ontology Term[organism],Sample Characteristic[individual],Sample Characteristic Ontology Term[individual],Sample Characteristic[strain],Sample Characteristic Ontology Term[strain],Sample Characteristic[developmental stage],Sample Characteristic Ontology Term[developmental stage],Sample Characteristic[sex],Sample Characteristic Ontology Term[sex],...,Sample Characteristic[genotype],Sample Characteristic Ontology Term[genotype],Sample Characteristic[disease],Sample Characteristic Ontology Term[disease],Factor Value[age],Factor Value Ontology Term[age],Factor Value[inferred cell type - ontology labels],Factor Value Ontology Term[inferred cell type - ontology labels],Factor Value[inferred cell type - authors labels],Factor Value Ontology Term[inferred cell type - authors labels]
Assay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMN13703199-AAACCTGAGCAGGTCA,Drosophila melanogaster,http://purl.obolibrary.org/obo/NCBITaxon_7227,Adult_3d_S,,Canton-S,http://www.ebi.ac.uk/efo/EFO_0001325,adult,http://www.ebi.ac.uk/efo/EFO_0001272,female,http://purl.obolibrary.org/obo/PATO_0000383,...,wild type genotype,,normal,http://purl.obolibrary.org/obo/PATO_0000461,3 day,,lamina monopolar neuron L1,http://purl.obolibrary.org/obo/FBbt_00003719,lamina monopolar neuron L1,http://purl.obolibrary.org/obo/FBbt_00003719
SAMN13703199-AAACCTGAGCTTCGCG,Drosophila melanogaster,http://purl.obolibrary.org/obo/NCBITaxon_7227,Adult_3d_S,,Canton-S,http://www.ebi.ac.uk/efo/EFO_0001325,adult,http://www.ebi.ac.uk/efo/EFO_0001272,female,http://purl.obolibrary.org/obo/PATO_0000383,...,wild type genotype,,normal,http://purl.obolibrary.org/obo/PATO_0000461,3 day,,T neuron T2a,http://purl.obolibrary.org/obo/FBbt_00003729,T neuron T2a,http://purl.obolibrary.org/obo/FBbt_00003729
SAMN13703199-AAACCTGAGGCTATCT,Drosophila melanogaster,http://purl.obolibrary.org/obo/NCBITaxon_7227,Adult_3d_S,,Canton-S,http://www.ebi.ac.uk/efo/EFO_0001325,adult,http://www.ebi.ac.uk/efo/EFO_0001272,female,http://purl.obolibrary.org/obo/PATO_0000383,...,wild type genotype,,normal,http://purl.obolibrary.org/obo/PATO_0000461,3 day,,lamina monopolar neuron L5,http://purl.obolibrary.org/obo/FBbt_00003725,lamina monopolar neuron L5,http://purl.obolibrary.org/obo/FBbt_00003725
SAMN13703199-AAACCTGAGGTGCTAG,Drosophila melanogaster,http://purl.obolibrary.org/obo/NCBITaxon_7227,Adult_3d_S,,Canton-S,http://www.ebi.ac.uk/efo/EFO_0001325,adult,http://www.ebi.ac.uk/efo/EFO_0001272,female,http://purl.obolibrary.org/obo/PATO_0000383,...,wild type genotype,,normal,http://purl.obolibrary.org/obo/PATO_0000461,3 day,,,,unknown 24,


Create a new dataframe based on obs and populate new columns. 

The resulting `obs` table should comply with this schema: https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#obs-cell-metadata

In [18]:
# Start from an empty frame that carries a copy of the AnnData obs index; the
# metadata columns are added to it in a later cell.
new_obs = pd.DataFrame(index=ann_data.obs.index.copy())

In [15]:
def iri_to_purl(iri_str):
    """Shorten an ontology IRI to its compact ``PREFIX:ID`` form.

    The text after the last ``/`` is kept and every ``_`` in it is replaced
    by ``:`` (e.g. ``.../obo/NCBITaxon_7227`` -> ``NCBITaxon:7227``).
    Despite the function name, the result is a compact identifier, not a URL.

    Values whose string form contains no ``/`` (plain labels, NaN, None, ...)
    are returned unchanged, as the original object.
    """
    text = str(iri_str)
    if '/' not in text:
        return iri_str
    _, _, local_name = text.rpartition('/')
    return local_name.replace("_", ":")

In [19]:
# Output obs column -> (experiment design column, shorten the IRI with iri_to_purl?).
# Insertion order defines the column order of new_obs.
# NOTE(review): unlike the other pairs, "cell_type" is filled with an ontology
# term ID rather than a label; kept as-is so the output columns do not change.
OBS_COLUMN_SOURCES = {
    "cell_type": ("Factor Value Ontology Term[inferred cell type - ontology labels]", True),
    "organism": ("Sample Characteristic[organism]", False),
    "organism_ontology_term_id": ("Sample Characteristic Ontology Term[organism]", True),
    "sex": ("Sample Characteristic[sex]", False),
    "sex_ontology_term_id": ("Sample Characteristic Ontology Term[sex]", True),
    "development_stage": ("Sample Characteristic[developmental stage]", False),
    "development_stage_ontology_term_id": ("Sample Characteristic Ontology Term[developmental stage]", True),
    "disease": ("Sample Characteristic[disease]", False),
    "disease_ontology_term_id": ("Sample Characteristic Ontology Term[disease]", True),
    "tissue": ("Sample Characteristic[organism part]", False),
    "tissue_ontology_term_id": ("Sample Characteristic Ontology Term[organism part]", True),
}

# Which barcodes have a row in the experiment design table?
in_design = new_obs.index.isin(exp_design.index)
for assay_id in new_obs.index[~in_design]:
    print("Barcode not found in the exp design data: " + assay_id)

# Align the design table to the barcode order once, instead of one .loc lookup
# per cell. Duplicate assay labels are collapsed to their first occurrence so
# that each barcode maps to exactly one design row.
design_unique = exp_design[~exp_design.index.duplicated(keep="first")]
aligned = design_unique.reindex(new_obs.index)

for obs_column, (design_column, is_iri) in OBS_COLUMN_SOURCES.items():
    values = aligned[design_column]
    if is_iri:
        values = values.map(iri_to_purl)
    if not in_design.all():
        # Barcodes without a design row get an empty string. Missing values
        # inside an existing design row are left untouched (NaN).
        values = values.astype(object).where(in_design, "")
    # Assign positionally; `aligned` already follows the order of new_obs.
    new_obs[obs_column] = values.to_numpy()

In [20]:
# Preview the first five rows of the populated obs table.
new_obs.head(5)

Unnamed: 0,cell_type,organism,organism_ontology_term_id,sex,sex_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,tissue,tissue_ontology_term_id
SAMN13703199-AAACCTGAGCAGGTCA,FBbt:00003719,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGCTTCGCG,FBbt:00003729,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGGCTATCT,FBbt:00003725,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGGTGCTAG,,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGTCAATAG,FBbt:00003826,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,


In [21]:
# Replace the AnnData object's obs table with the populated metadata frame.
# new_obs was created from a copy of ann_data.obs.index, so its index matches
# as long as neither has been modified since.
ann_data.obs = new_obs
# Preview the first five rows of the attached obs table.
ann_data.obs.head(5)

Unnamed: 0,cell_type,organism,organism_ontology_term_id,sex,sex_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,tissue,tissue_ontology_term_id
SAMN13703199-AAACCTGAGCAGGTCA,FBbt:00003719,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGCTTCGCG,FBbt:00003729,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGGCTATCT,FBbt:00003725,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGGTGCTAG,,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,
SAMN13703199-AAACCTGAGTCAATAG,FBbt:00003826,Drosophila melanogaster,NCBITaxon:7227,female,PATO:0000383,adult,EFO:0001272,normal,PATO:0000461,optic lobe,


Write the output h5ad file

In [None]:
# Nothing earlier in the notebook creates out/, so make sure it exists rather
# than relying on a manually created folder.
os.makedirs("out", exist_ok=True)
# Write the AnnData object as a gzip-compressed h5ad file.
ann_data.write_h5ad("out/E-CURD-134.h5ad", compression="gzip")