# Notebook to check h5ad files generated by STAR mapping from SRA

**Developed by** :Srivalli Kolla

**Created on** : 23 October, 2024

**Last modified** : 23 October, 2024

**Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

# Import Packages

In [1]:
import os
import anndata
import scanpy as sc
import numpy as np
import pandas as pd
import glob

# Import data

In [2]:
directory_path = '../ncbi_sra/data/'

pattern = os.path.join(directory_path, '*_filtered*.h5ad') #Check only for filtered files

filtered_files = glob.glob(pattern)
filtered_files

['../ncbi_sra/data/PRJNA847259_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJNA762100_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJNA767653_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJEB59734_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJNA1127309_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJNA1007964_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJNA934594_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJNA939636_sra_filtered_sk_23_10_2024.h5ad',
 '../ncbi_sra/data/PRJNA769125_sra_filtered_sk_23_10_2024.h5ad']

In [3]:
all_sample_names = []

for file in filtered_files:
    adata = sc.read_h5ad(file)
    all_sample_names.extend(adata.obs['sample_name'].values)


sample_names_df = pd.DataFrame(all_sample_names, columns=['sample_name'])

sample_name_counts = sample_names_df['sample_name'].value_counts()
sample_name_counts



sample_name
SRR23641833    1590
SRR23641836    1586
SRR23641827    1319
SRR23641830    1162
SRR23641829    1149
               ... 
SRR16145821       2
SRR16145831       2
SRR16145834       2
SRR16220106       1
SRR16145840       1
Name: count, Length: 304, dtype: int64

In [6]:
pd.set_option('display.max_rows', None)
sample_name_counts

sample_name
SRR23641833    1590
SRR23641836    1586
SRR23641827    1319
SRR23641830    1162
SRR23641829    1149
SRR23641834    1092
SRR23641832    1024
SRR15835824    1010
SRR23641835     973
SRR23391982     788
SRR23641838     740
SRR23391981     580
SRR25726227     524
SRR15835871     465
SRR15835870     454
SRR15835841     410
SRR23641831     408
SRR15835862     396
SRR23391987     390
SRR15835877     378
SRR15835866     375
SRR15835893     375
SRR15835894     375
SRR15835903     370
SRR15835876     367
SRR15835892     361
SRR15835875     361
SRR23391986     360
SRR15835839     359
SRR15835838     359
SRR23391983     357
SRR15835863     350
SRR15835900     348
SRR15835857     345
SRR15835854     337
SRR15835826     333
SRR15835829     332
SRR15835897     330
SRR15835865     327
SRR15835844     327
SRR15835828     326
SRR15835856     319
SRR15835843     317
SRR15835846     317
SRR23641777     313
SRR23641776     306
SRR23391985     298
SRR15835881     296
SRR23641822     295
SRR15835