### Create anndata object from re-mapped E-MTAB-8901 samples
- **Developed by:** Anna Maguza
- **Affiliation:** Faculty of Medicine, Würzburg University
- **Creation date:** 23rd of October 2024
- **Last modified date:** 23rd of October 2024

### Import packages

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sci
import anndata as ad
from scipy import io,sparse
import os

+ We want to save the spliced/unspliced counts in our anndata object

In [2]:
def starsolo_velocity_anndata(input_dir):
    """
    Build an AnnData object from one directory of velocity count matrices.

    The input directory should contain barcodes.tsv, features.tsv and the three
    Matrix Market files spliced.mtx, ambiguous.mtx and unspliced.mtx.

    Each matrix is transposed after reading and converted to CSR. All three are
    stored as layers ('spliced', 'ambiguous', 'unspliced') and the spliced
    matrix is additionally used as X. Variable names are made unique before the
    object is returned.
    """
    def _load_counts(file_name):
        # Read one Matrix Market file, transpose it and convert the result to CSR.
        raw_counts = sci.io.mmread(os.path.join(input_dir, file_name))
        return sci.sparse.csr_matrix(raw_counts.T)

    barcodes = pd.read_csv(os.path.join(input_dir, 'barcodes.tsv'), header=None, index_col=0)
    # Drop the index name to make the table compliant with the anndata format
    barcodes.index.name = None

    features = pd.read_csv(
        os.path.join(input_dir, "features.tsv"),
        sep='\t',
        names=('gene_ids', 'feature_types'),
        index_col=1,
    )
    features.index.name = None

    # Matrices are loaded in the same order as before: spliced, ambiguous, unspliced.
    count_layers = {
        kind: _load_counts(f"{kind}.mtx")
        for kind in ("spliced", "ambiguous", "unspliced")
    }

    adata = ad.AnnData(
        X=count_layers["spliced"],
        obs=barcodes,
        var=features,
        layers=count_layers,
    )
    adata.var_names_make_unique()
    return adata

+ Load the sample description dataframe

In [3]:
# Load the tab-separated SDRF sample sheet of E-MTAB-8901.
samples = pd.read_table("/mnt/LaCIE/annaM/gut_project/raw_fastq_files/Elmentaite_2021/metadata/E-MTAB-8901.sdrf.txt")

In [4]:
# The sample name is the part of the read-2 file name before the first underscore.
samples['sample_name'] = samples['Comment[read2 file]'].str.split('_', n=1).str.get(0)

+ Base path for remapped samples

In [20]:
# Root folder of the remapped data; the loading loop below looks for each sample
# under <base_path>/<sample_name>/<UMI10_output|UMI12_output>/Velocyto/raw.
base_path = '/mnt/LaCIE/annaM/gut_project/raw_data/Elmentaite_2021/remapped_fetal_data_E-MTAB-8901_starsolo'

In [21]:
# Load every sample listed in the sample sheet, collect the per-sample AnnData
# objects and remember the samples for which no input directory could be read.
ann_data_list = []
failed_samples = []

# Output folders to try for each sample, in order of preference.
umi_output_dirs = ("UMI10_output", "UMI12_output")

for sample_name in samples['sample_name']:
    for umi_dir in umi_output_dirs:
        sample_path = f"{base_path}/{sample_name}/{umi_dir}/Velocyto/raw"
        try:
            sample_name_adata = starsolo_velocity_anndata(sample_path)
        except FileNotFoundError:
            # Not available under this folder; fall through to the next candidate.
            continue

        # Tag every cell with the sample it came from (used later to merge metadata)
        sample_name_adata.obs['sample_name'] = sample_name
        ann_data_list.append(sample_name_adata)
        break
    else:
        # None of the candidate folders could be read for this sample.
        failed_samples.append(sample_name)
        print(f"Sample {sample_name} not found in both UMI10 and UMI12 paths, skipping.")

# Merge all AnnData objects into one, if there are any.
# ad.concat replaces the deprecated AnnData.concatenate; label/index_unique
# reproduce its 'batch' column and the '-<batch>' suffix on the barcodes.
# NOTE(review): merge='same' keeps var columns only when they are identical in
# all samples, which is expected when every sample uses the same reference.
if ann_data_list:
    combined_adata = ad.concat(
        ann_data_list,
        join='outer',
        label='batch',
        index_unique='-',
        merge='same',
    )
else:
    combined_adata = None
    print("No valid AnnData objects found to merge.")

# List samples that were not processed
if failed_samples:
    print("The following samples were not processed:")
    print(*failed_samples, sep="\n")


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Sample 4918STDY7718974 not found in both UMI10 and UMI12 paths, skipping.


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Sample 4918STDY7718972 not found in both UMI10 and UMI12 paths, skipping.


  utils.warn_names_duplicates("var")


Sample 4918STDY7718973 not found in both UMI10 and UMI12 paths, skipping.


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Sample 4918STDY7718977 not found in both UMI10 and UMI12 paths, skipping.


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Sample 4918STDY7718975 not found in both UMI10 and UMI12 paths, skipping.


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Sample 4918STDY7718976 not found in both UMI10 and UMI12 paths, skipping.
Sample 4918STDY7901096 not found in both UMI10 and UMI12 paths, skipping.


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Sample 4918STDY7273964 not found in both UMI10 and UMI12 paths, skipping.
Sample 4918STDY7273965 not found in both UMI10 and UMI12 paths, skipping.


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")

See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
  concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique)
  if pd.api.types.is_categorical_dtype(dtype):
  concat_indices = concat_indices.str.cat(label_col.map(str), sep=index_unique)
  if pd.api.types.is_categorical_dtype(dtype):


The following samples were not processed:
4918STDY7718974
4918STDY7718972
4918STDY7718973
4918STDY7718977
4918STDY7718975
4918STDY7718976
4918STDY7901096
4918STDY7273964
4918STDY7273965


In [23]:
# Keep a copy of the current obs index (the cell barcodes) in a regular column,
# so it can be restored as the index after the metadata merge below.
combined_adata.obs['barcode'] = combined_adata.obs.index.copy()

In [24]:
# Attach the per-sample SDRF metadata to every cell.
# validate='many_to_one' fails early with a clear error if the sample sheet has
# more than one row per sample_name, which would otherwise multiply cells.
obs_with_metadata = combined_adata.obs.merge(
    samples,
    on='sample_name',
    how='left',
    suffixes=('', '_y'),
    validate='many_to_one',
)

# Drop the duplicated right-hand columns produced for overlapping column names.
obs_with_metadata = obs_with_metadata.loc[:, ~obs_with_metadata.columns.str.endswith('_y')]

# A column-based merge returns a fresh integer index. A left many-to-one merge
# keeps the row order, so the original cell index can be put back positionally
# before handing the table to AnnData.
obs_with_metadata.index = combined_adata.obs_names.copy()
combined_adata.obs = obs_with_metadata

In [26]:
# Use the barcodes saved in the 'barcode' column as the obs index again;
# the index thereby takes the name 'barcode'.
combined_adata.obs.index = combined_adata.obs['barcode']

In [27]:
# Display the per-cell annotation table to check the merged metadata.
combined_adata.obs

Unnamed: 0_level_0,sample_name,batch,barcode,Source Name,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[developmental stage],Characteristics[age],Unit[time unit],...,Comment[FASTQ_URI],Comment[read2 file],Comment[FASTQ_URI].1,Comment[index1 file],Comment[FASTQ_URI].2,Factor Value[disease],Factor Value[developmental stage],Factor Value[organism part],Factor Value[immunophenotype],Factor Value[growth condition]
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGAAACCAT-0,4918STDY8366756,0,AAACCTGAGAAACCAT-0,2206_p2_WNT3A_cells,ERS4414920,SAMEA6655451,Homo sapiens,embryonic human stage,,,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,embryonic human stage,ileum,total cells,organoid grown in conditioned medium
AAACCTGAGAAACCGC-0,4918STDY8366756,0,AAACCTGAGAAACCGC-0,2206_p2_WNT3A_cells,ERS4414920,SAMEA6655451,Homo sapiens,embryonic human stage,,,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,embryonic human stage,ileum,total cells,organoid grown in conditioned medium
AAACCTGAGAAACCTA-0,4918STDY8366756,0,AAACCTGAGAAACCTA-0,2206_p2_WNT3A_cells,ERS4414920,SAMEA6655451,Homo sapiens,embryonic human stage,,,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,embryonic human stage,ileum,total cells,organoid grown in conditioned medium
AAACCTGAGAAACGAG-0,4918STDY8366756,0,AAACCTGAGAAACGAG-0,2206_p2_WNT3A_cells,ERS4414920,SAMEA6655451,Homo sapiens,embryonic human stage,,,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,embryonic human stage,ileum,total cells,organoid grown in conditioned medium
AAACCTGAGAAACGCC-0,4918STDY8366756,0,AAACCTGAGAAACGCC-0,2206_p2_WNT3A_cells,ERS4414920,SAMEA6655451,Homo sapiens,embryonic human stage,,,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY8366756_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,embryonic human stage,ileum,total cells,organoid grown in conditioned medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCTTTACAC-45,4918STDY7389431,45,TTTGTCATCTTTACAC-45,T44_IL_cells,ERS4414974,SAMEA6655505,Homo sapiens,child stage,10,year,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,child stage,terminal ileum,total cells,primary tissue
TTTGTCATCTTTACGT-45,4918STDY7389431,45,TTTGTCATCTTTACGT-45,T44_IL_cells,ERS4414974,SAMEA6655505,Homo sapiens,child stage,10,year,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,child stage,terminal ileum,total cells,primary tissue
TTTGTCATCTTTAGGG-45,4918STDY7389431,45,TTTGTCATCTTTAGGG-45,T44_IL_cells,ERS4414974,SAMEA6655505,Homo sapiens,child stage,10,year,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,child stage,terminal ileum,total cells,primary tissue
TTTGTCATCTTTAGTC-45,4918STDY7389431,45,TTTGTCATCTTTAGTC-45,T44_IL_cells,ERS4414974,SAMEA6655505,Homo sapiens,child stage,10,year,...,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7389431_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,child stage,terminal ileum,total cells,primary tissue


In [28]:
# Save the combined object (counts, velocity layers and metadata) into the
# root folder of the remapped data.
combined_adata.write_h5ad(os.path.join(base_path, 'E-MTAB-8901_raw_velocity_anndata.h5ad'))

In [5]:
# The nine sample names reported as "not processed" by the loading loop above.
sample_names_to_keep = [
    '4918STDY7718974',
    '4918STDY7718972',
    '4918STDY7718973',
    '4918STDY7718977',
    '4918STDY7718975',
    '4918STDY7718976',
    '4918STDY7901096',
    '4918STDY7273964',
    '4918STDY7273965',
]

# Keep only the sample-sheet rows that belong to those samples.
filtered_samples = samples.loc[samples['sample_name'].isin(sample_names_to_keep)]
filtered_samples

Unnamed: 0,Source Name,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[developmental stage],Characteristics[age],Unit[time unit],Term Source REF,Term Accession Number,Characteristics[gestational age],...,Comment[read2 file],Comment[FASTQ_URI].1,Comment[index1 file],Comment[FASTQ_URI].2,Factor Value[disease],Factor Value[developmental stage],Factor Value[organism part],Factor Value[immunophenotype],Factor Value[growth condition],sample_name
7,BRC2134_CO_neg_cells,ERS4414927,SAMEA6655458,Homo sapiens,10th week post-fertilization human stage,,,,,12.0,...,4918STDY7718974_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7718974_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,10th week post-fertilization human stage,colon,EPCAM negative,primary tissue,4918STDY7718974
10,BRC2134_DU_neg_cells,ERS4414930,SAMEA6655461,Homo sapiens,10th week post-fertilization human stage,,,,,12.0,...,4918STDY7718972_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7718972_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,10th week post-fertilization human stage,duodenum,EPCAM negative,primary tissue,4918STDY7718972
12,BRC2134_IL_neg_cells,ERS4414932,SAMEA6655463,Homo sapiens,10th week post-fertilization human stage,,,,,12.0,...,4918STDY7718973_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7718973_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,10th week post-fertilization human stage,ileum,EPCAM negative,primary tissue,4918STDY7718973
19,BRC2134_CO_pos_cells,ERS4414939,SAMEA6655470,Homo sapiens,10th week post-fertilization human stage,,,,,12.0,...,4918STDY7718977_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7718977_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,10th week post-fertilization human stage,colon,EPCAM positive,primary tissue,4918STDY7718977
26,BRC2134_DU_pos_cells,ERS4414946,SAMEA6655477,Homo sapiens,10th week post-fertilization human stage,,,,,12.0,...,4918STDY7718975_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7718975_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,10th week post-fertilization human stage,duodenum,EPCAM positive,primary tissue,4918STDY7718975
33,BRC2134_IL_pos_cells,ERS4414953,SAMEA6655484,Homo sapiens,10th week post-fertilization human stage,,,,,12.0,...,4918STDY7718976_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7718976_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,normal,10th week post-fertilization human stage,ileum,EPCAM positive,primary tissue,4918STDY7718976
34,T176_IL_cells,ERS4414954,SAMEA6655485,Homo sapiens,child stage,11.0,year,EFO,UO_0000036,,...,4918STDY7901096_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7901096_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,Crohn disease,child stage,terminal ileum,total cells,primary tissue,4918STDY7901096
39,T017_IL_cells,ERS4414959,SAMEA6655490,Homo sapiens,adolescent stage,13.0,year,EFO,UO_0000036,,...,4918STDY7273964_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7273964_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,Crohn disease,adolescent stage,terminal ileum,total cells,primary tissue,4918STDY7273964
40,T019_IL_cells,ERS4414960,SAMEA6655491,Homo sapiens,child stage,12.0,year,EFO,UO_0000036,,...,4918STDY7273965_S1_L001_R2_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,4918STDY7273965_S1_L001_I1_001.fastq.gz,ftp://ftp.ebi.ac.uk/pub/databases/microarray/d...,Crohn disease,child stage,terminal ileum,total cells,primary tissue,4918STDY7273965


In [6]:
# Write the sample-sheet subset as a tab-separated file.
# index=False keeps the pandas row index out of the file; otherwise it is
# written as an extra, unnamed first column that is not part of the SDRF layout.
# NOTE(review): the derived 'sample_name' column is still written as the last column.
filtered_samples.to_csv(
    "/mnt/LaCIE/annaM/gut_project/raw_fastq_files/Elmentaite_2021/metadata/E-MTAB-8901_filtered.sdrf.txt",
    sep="\t",
    index=False,
)