# Combining the AnnData objects from the kallisto | bustools runs on the E-MTAB-12916 scRNA data

The raw data were downloaded from [ArrayExpress (E-MTAB-12916, SDRF)](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-12916/sdrf).

    Developed by: Christian Eger
    Würzburg Institute for Systems Immunology - Faculty of Medicine - Julius Maximilian Universität Würzburg
    Created on: 240415
    Last modified: 240419

In [1]:
import os
import pandas as pd
import scanpy as sc

In [2]:
# Load the tab-separated SDRF sheet of the study into a DataFrame and
# display it for a quick visual check.
sdrf_path = '../.data/meta_data/E-MTAB-12916.sdrf.txt'
meta_data = pd.read_csv(sdrf_path, sep='\t')
meta_data

Unnamed: 0,Source Name,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[age],Characteristics[developmental stage],Characteristics[sex],Characteristics[individual],Characteristics[organism part],Characteristics[disease],...,Assay Name,Technology Type,Comment[ENA_EXPERIMENT],Scan Name,Comment[SUBMITTED_FILE_NAME],Comment[ENA_RUN],Comment[FASTQ_URI],Comment[read_index],Comment[read_type],Factor Value[organism part]
0,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_I1_001.fastq.gz,HCAHeart9508627_S1_L001_I1_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index1,sample_barcode,heart left ventricle
1,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_I2_001.fastq.gz,HCAHeart9508627_S1_L001_I2_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index2,sample_barcode,heart left ventricle
2,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_R1_001.fastq.gz,HCAHeart9508627_S1_L001_R1_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read1,"cell_barcode,umi_barcode",heart left ventricle
3,HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,...,HCAHeart9508627,sequencing assay,ERX10811380,HCAHeart9508627_S1_L001_R2_001.fastq.gz,HCAHeart9508627_S1_L001_R2_001.fastq.gz,ERR11403589,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read2,single,heart left ventricle
4,HCAHeart9508628,ERS15408105,SAMEA113412974,Homo sapiens,60 to 65,adult,Male,D7,right cardiac atrium,normal,...,HCAHeart9508628,sequencing assay,ERX10811381,HCAHeart9508628_S1_L001_I1_001.fastq.gz,HCAHeart9508628_S1_L001_I1_001.fastq.gz,ERR11403590,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index1,sample_barcode,right cardiac atrium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,HCAHeartST13189996,ERS15408149,SAMEA113413018,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189996,sequencing assay,ERX10811425,HCAHeartST13189996_S1_L001_R2_001.fastq.gz,HCAHeartST13189996_S1_L001_R2_001.fastq.gz,ERR11403634,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read2,single,atrioventriculeft cardiac atriumr node
184,HCAHeartST13189997,ERS15408150,SAMEA113413019,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189997,sequencing assay,ERX10811426,HCAHeartST13189997_S1_L001_I1_001.fastq.gz,HCAHeartST13189997_S1_L001_I1_001.fastq.gz,ERR11403635,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index1,sample_barcode,atrioventriculeft cardiac atriumr node
185,HCAHeartST13189997,ERS15408150,SAMEA113413019,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189997,sequencing assay,ERX10811426,HCAHeartST13189997_S1_L001_I2_001.fastq.gz,HCAHeartST13189997_S1_L001_I2_001.fastq.gz,ERR11403635,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,index2,sample_barcode,atrioventriculeft cardiac atriumr node
186,HCAHeartST13189997,ERS15408150,SAMEA113413019,Homo sapiens,70 to 75,adult,Female,AV13,atrioventriculeft cardiac atriumr node,normal,...,HCAHeartST13189997,sequencing assay,ERX10811426,HCAHeartST13189997_S1_L001_R1_001.fastq.gz,HCAHeartST13189997_S1_L001_R1_001.fastq.gz,ERR11403635,ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR114/ERR114...,read1,"cell_barcode,umi_barcode",atrioventriculeft cardiac atriumr node


In [3]:
# Root folder that is later joined with each sample name to find its mapping output.
mapping_path = '../.data/mapping/'
# Distinct 'Source Name' entries, in order of first appearance in the SDRF sheet.
samples = meta_data['Source Name'].drop_duplicates().tolist()

In [4]:
def _shared_value_or_none(values):
    """Return the single distinct non-null value of *values*, else None.

    ``nunique()`` ignores missing entries, so the value is taken from the
    non-null entries; a plain ``iloc[0]`` could return NaN when the group
    happens to start with a missing value.
    """
    return values.dropna().iloc[0] if values.nunique() == 1 else None


# Collapse the SDRF rows to one row per 'Source Name': a column keeps its value
# for a sample only when that value is the same on all of the sample's rows.
# 'Source Name' stays as the index so rows can be looked up by sample name.
compressed_df = meta_data.groupby('Source Name').agg(_shared_value_or_none)
# Drop columns that ended up without a value for every sample.
compressed_df = compressed_df.dropna(axis=1, how='all')
compressed_df.head(5)

Unnamed: 0_level_0,Comment[ENA_SAMPLE],Comment[BioSD_SAMPLE],Characteristics[organism],Characteristics[age],Characteristics[developmental stage],Characteristics[sex],Characteristics[individual],Characteristics[organism part],Characteristics[disease],Characteristics[genotype],...,Comment[sample barcode offset],Comment[sample barcode read],Comment[sample barcode size],Protocol REF.3,Performer,Assay Name,Technology Type,Comment[ENA_EXPERIMENT],Comment[ENA_RUN],Factor Value[organism part]
Source Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart9508627,ERS15408104,SAMEA113412973,Homo sapiens,55 to 60,adult,Male,D3,heart left ventricle,normal,wild type genotype,...,0,index1/index2,8,P-MTAB-132606,Wellcome Sanger Institute,HCAHeart9508627,sequencing assay,ERX10811380,ERR11403589,heart left ventricle
HCAHeart9508628,ERS15408105,SAMEA113412974,Homo sapiens,60 to 65,adult,Male,D7,right cardiac atrium,normal,wild type genotype,...,0,index1/index2,8,P-MTAB-132606,Wellcome Sanger Institute,HCAHeart9508628,sequencing assay,ERX10811381,ERR11403590,right cardiac atrium
HCAHeart9508629,ERS15408106,SAMEA113412975,Homo sapiens,60 to 65,adult,Male,D7,heart left ventricle,normal,wild type genotype,...,0,index1/index2,8,P-MTAB-132606,Wellcome Sanger Institute,HCAHeart9508629,sequencing assay,ERX10811382,ERR11403591,heart left ventricle
HCAHeart9845431,ERS15408107,SAMEA113412976,Homo sapiens,45 to 50,adult,Male,D8,heart left ventricle,normal,wild type genotype,...,0,index1/index2,8,P-MTAB-132606,Wellcome Sanger Institute,HCAHeart9845431,sequencing assay,ERX10811383,ERR11403592,heart left ventricle
HCAHeart9845432,ERS15408108,SAMEA113412977,Homo sapiens,45 to 50,adult,Male,D8,apical region of left ventricle,normal,wild type genotype,...,0,index1/index2,8,P-MTAB-132606,Wellcome Sanger Institute,HCAHeart9845432,sequencing assay,ERX10811384,ERR11403593,apical region of left ventricle


In [5]:
def remove_columns_interactively(df, *, ask=None):
    """Ask for every column of *df* whether to keep it; drop the rest in place.

    For each column the name, dtype and first values are printed, then the
    user is prompted. A column is kept only on an explicit "yes" (or "y"),
    compared case-insensitively and ignoring surrounding whitespace; any
    other answer, including an empty one, marks the column for removal.
    Nothing is dropped until every column has been reviewed.

    Args:
        df: DataFrame to prune. It is modified in place.
        ask: Optional callable that receives the prompt text and returns the
            answer as a string. When omitted, the builtin ``input`` is looked
            up at call time.

    Returns:
        None.
    """
    if ask is None:
        ask = input

    keep_answers = {"yes", "y"}
    columns_to_remove = []
    for col in df.columns:
        print(f"Column Name: {col}")
        print(f"Data Type: {df[col].dtype}")
        print(f"Sample Values: {df[col].head().tolist()}")

        # Normalise the answer so "Yes ", "y" or "YES" are not silently
        # treated as a request to drop the column.
        decision = ask("Do you want to keep this column? (yes/no): ").strip().lower()
        if decision not in keep_answers:
            columns_to_remove.append(col)

    # Drop the selected columns
    df.drop(columns=columns_to_remove, inplace=True)

# Review the per-sample metadata columns one by one; compressed_df is pruned in place.
remove_columns_interactively(df=compressed_df)


Column Name: Comment[ENA_SAMPLE]
Data Type: object
Sample Values: ['ERS15408104', 'ERS15408105', 'ERS15408106', 'ERS15408107', 'ERS15408108']
Column Name: Comment[BioSD_SAMPLE]
Data Type: object
Sample Values: ['SAMEA113412973', 'SAMEA113412974', 'SAMEA113412975', 'SAMEA113412976', 'SAMEA113412977']
Column Name: Characteristics[organism]
Data Type: object
Sample Values: ['Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens', 'Homo sapiens']
Column Name: Characteristics[age]
Data Type: object
Sample Values: ['55 to 60', '60 to 65', '60 to 65', '45 to 50', '45 to 50']
Column Name: Characteristics[developmental stage]
Data Type: object
Sample Values: ['adult', 'adult', 'adult', 'adult', 'adult']
Column Name: Characteristics[sex]
Data Type: object
Sample Values: ['Male', 'Male', 'Male', 'Male', 'Male']
Column Name: Characteristics[individual]
Data Type: object
Sample Values: ['D3', 'D7', 'D7', 'D8', 'D8']
Column Name: Characteristics[organism part]
Data Type: object
Sample Values:

In [5]:
# Every remaining column of the per-sample table is copied onto the cells.
meta_data_of_interest = compressed_df.columns.tolist()
adata_list = []
missing_samples = []
for sample in samples:
    adata_path = os.path.join(mapping_path, sample, 'counts_unfiltered/adata.h5ad')
    if not os.path.isfile(adata_path):
        # Keep going, but remember the sample so the gap does not go unnoticed.
        missing_samples.append(sample)
        continue
    sample_adata = sc.read_h5ad(adata_path)
    for value in meta_data_of_interest:
        # Assigning a scalar gives every cell of this sample the same value.
        sample_adata.obs[value] = compressed_df.at[sample, value]
    adata_list.append(sample_adata)

if missing_samples:
    print(f"No adata.h5ad found for {len(missing_samples)} sample(s): {missing_samples}")
if not adata_list:
    raise FileNotFoundError(f"No adata.h5ad files found under {mapping_path!r}")

adata = sc.concat(adata_list)
# Concatenation warns about duplicated obs names, so make them unique.
adata.obs_names_make_unique()
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 1023204 × 70711
    obs: 'Comment[ENA_SAMPLE]', 'Comment[BioSD_SAMPLE]', 'Characteristics[organism]', 'Characteristics[age]', 'Characteristics[developmental stage]', 'Characteristics[sex]', 'Characteristics[individual]', 'Characteristics[organism part]', 'Characteristics[disease]', 'Characteristics[genotype]', 'Material Type', 'Description', 'Protocol REF', 'Protocol REF.1', 'Protocol REF.2', 'Extract Name', 'Comment[LIBRARY_LAYOUT]', 'Comment[LIBRARY_SELECTION]', 'Comment[LIBRARY_SOURCE]', 'Comment[LIBRARY_STRATEGY]', 'Comment[ORIENTATION]', 'Comment[input molecule]', 'Comment[library construction]', 'Comment[primer]', 'Comment[single cell isolation]', 'Comment[spike in]', 'Comment[cdna read]', 'Comment[cdna read offset]', 'Comment[cdna read size]', 'Comment[cell barcode offset]', 'Comment[cell barcode read]', 'Comment[cell barcode size]', 'Comment[umi barcode offset]', 'Comment[umi barcode read]', 'Comment[umi barcode size]', 'Comment[sample barco

In [7]:
# Save the combined AnnData object into the data folder of the 1-QC project.
output_path = '../../../1-QC/0-240416-E-MTAB12916/.data/adata/E-MTAB12916_kb_mapping_adata.h5ad'
adata.write_h5ad(output_path)