# Notebook to convert STAR output files to h5ad files for SRA Project - PRJEB39602 - D3

- **Developed by**: Srivalli Kolla

- **Created date** : 31 October, 2024

- **Modification date** : 05 November, 2024

- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**

Env : Scanpy(Python 3.12.4)

# Import Packages

In [1]:
import os
import pandas as pd
import scipy.io
import anndata
import scanpy as sc
import os
import time


In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')
timestamp = time.strftime("%d_%m_%Y,%H:%M")

-----
anndata     0.10.8
scanpy      1.10.2
-----
PIL                 10.3.0
asttokens           NA
attr                23.2.0
cffi                1.16.0
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.2
decorator           5.1.1
defusedxml          0.7.1
distutils           3.12.4
django              5.0.6
executing           2.0.1
h5py                3.11.0
igraph              0.11.5
ipykernel           6.29.5
ipython_genutils    0.2.0
ipywidgets          8.1.3
jedi                0.19.1
joblib              1.4.2
kiwisolver          1.4.5
legacy_api_wrap     NA
leidenalg           0.10.2
llvmlite            0.43.0
louvain             0.8.2
matplotlib          3.8.4
mpl_toolkits        NA
natsort             8.4.0
numba               0.60.0
numexpr             2.10.1
numpy               1.26.4
packaging           24.1
pandas              2.2.2
parso               0.8.4
pkg_res

# Import sample_names

In [3]:
parent_folder = '/mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/mapped_files'
folders = ['ERR6449986',
'ERR7423245',
'ERR7423246',
'ERR7423472',
'ERR6449756',
'ERR6449758',
'ERR6449761',
'ERR6449775',
'ERR6449954',
'ERR6449985',
'ERR7423243',
'ERR7423262',
'ERR7423473',
'ERR6449755',
'ERR6449757',
'ERR6449759',
'ERR6449760',
'ERR6449762',
'ERR6449764',
'ERR6449770',
'ERR6449984',
'ERR6450000',
'ERR7423251',
'ERR7423441',
'ERR7423471',
'ERR7423487',
'ERR7423242',
'ERR7423244',
'ERR7423247',
'ERR7423248',
'ERR7423249',
'ERR7423257',]

In [4]:
def check_fastqs_in_folders(parent_folder, folders):
    """
    Check for the presence of FASTQ files in specified subfolders of a parent folder.

    Parameters:
    parent_folder (str): The path to the parent folder.
    folders (list): A list of subfolder names to check for FASTQ files.

    Returns:
    dict: A dictionary where keys are folder names and values are lists of found FASTQ files.
    """
    results = {}

    for folder in folders:
        folder_path = os.path.join(parent_folder, folder)
        fastq_files = []

        try:
            # List files in the current subfolder
            for file in os.listdir(folder_path):
                if file.endswith('.fastq') or file.endswith('.fastq.gz'):
                    fastq_files.append(file)

            # Store results
            results[folder] = fastq_files

        except FileNotFoundError:
            print(f"Folder not found: {folder_path}")
            results[folder] = []
        except Exception as e:
            print(f"Error checking for FASTQ files in {folder_path}: {e}")
            results[folder] = []

    return results

In [5]:
fastq_results = check_fastqs_in_folders('/mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/fastq_files', folders)
for folder, files in fastq_results.items():
    if files:
        print(f"FASTQ files found in {folder}: {files}")
    else:
        print(f"No FASTQ files found in {folder}.")

FASTQ files found in ERR6449986: ['HCAHeart7985088_S1_L001_R1_001.fastq.gz', 'HCAHeart7985088_S1_L001_I1_001.fastq.gz', 'HCAHeart7985088_S1_L001_R2_001.fastq.gz']
FASTQ files found in ERR7423245: ['HCAHeart7656536_S1_L001_I1_001.fastq.gz', 'HCAHeart7656536_S1_L001_R1_001.fastq.gz', 'HCAHeart7656536_S1_L001_R2_001.fastq.gz']
FASTQ files found in ERR7423246: ['HCAHeart7656535_S1_L001_I1_001.fastq.gz', 'HCAHeart7656535_S1_L001_R2_001.fastq.gz', 'HCAHeart7656535_S1_L001_R1_001.fastq.gz']
FASTQ files found in ERR7423472: ['HCAHeart7985086_S1_L001_I1_001.fastq.gz', 'HCAHeart7985086_S1_L001_R1_001.fastq.gz', 'HCAHeart7985086_S1_L001_R2_001.fastq.gz']
FASTQ files found in ERR6449756: ['HCAHeart7757638_S1_L001_R2_001.fastq.gz', 'HCAHeart7757638_S1_L001_I1_001.fastq.gz', 'HCAHeart7757638_S1_L001_R1_001.fastq.gz']
FASTQ files found in ERR6449758: ['HCAHeart7656536_S1_L001_I1_001.fastq.gz', 'HCAHeart7656536_S1_L001_R1_001.fastq.gz', 'HCAHeart7656536_S1_L001_R2_001.fastq.gz']
FASTQ files found in E

In [6]:
def convert_selected_to_h5ad_from_genefull(parent_folder, output_folder, folders):
    """Convert GeneFull/raw data to H5AD format for specified folders in the parent directory."""

    for sample_folder in folders:
        sample_path = os.path.join(parent_folder, sample_folder)
        
       
        gene_dir = os.path.join(sample_path, 'outputSolo.out', 'GeneFull', 'raw')
        

        obs_file = os.path.join(gene_dir, 'barcodes.tsv')
        var_file = os.path.join(gene_dir, 'features.tsv')
        spliced_file = os.path.join(gene_dir, 'matrix.mtx')  
        if not (os.path.isdir(gene_dir) and os.path.isfile(obs_file) and os.path.isfile(var_file) and os.path.isfile(spliced_file)):
            print(f"Required files missing in {gene_dir} for {sample_folder}. Skipping.")
            continue


        obs = pd.read_csv(obs_file, header=None, index_col=0)
        obs.index.name = None  
        
        var = pd.read_csv(var_file, sep='\t', names=['gene_ids', 'gene_names', 'gene_types'], index_col=1)
        var.index.name = None  

        
        spliced = scipy.sparse.csr_matrix(scipy.io.mmread(spliced_file).T)

        adata = anndata.AnnData(X=spliced, obs=obs, var=var)
        adata.layers['spliced'] = spliced
        adata.var_names_make_unique()


        os.makedirs(output_folder, exist_ok=True)
        output_file_path = os.path.join(output_folder, f"{sample_folder}_GeneFull_raw.h5ad")
        adata.write_h5ad(output_file_path)
        print(f"Converted {sample_folder} to {output_file_path}")

    print("Conversion complete for specified folders.")


output_folder = '/mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files'

convert_selected_to_h5ad_from_genefull(parent_folder, output_folder, folders)

  utils.warn_names_duplicates("var")


Converted ERR6449986 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449986_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423245 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423245_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423246 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423246_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423472 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423472_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449756 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449756_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449758 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449758_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449761 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449761_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449775 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449775_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449954 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449954_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449985 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449985_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423243 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423243_GeneFull_raw.h5ad
Required files missing in /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/mapped_files/ERR7423262/outputSolo.out/GeneFull/raw for ERR7423262. Skipping.


  utils.warn_names_duplicates("var")


Converted ERR7423473 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423473_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449755 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449755_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449757 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449757_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449759 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449759_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449760 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449760_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449762 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449762_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449764 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449764_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR6449770 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6449770_GeneFull_raw.h5ad
Required files missing in /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/mapped_files/ERR6449984/outputSolo.out/GeneFull/raw for ERR6449984. Skipping.


  utils.warn_names_duplicates("var")


Converted ERR6450000 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR6450000_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423251 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423251_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423441 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423441_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423471 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423471_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423487 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423487_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423242 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423242_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423244 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423244_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423247 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423247_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423248 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423248_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423249 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423249_GeneFull_raw.h5ad


  utils.warn_names_duplicates("var")


Converted ERR7423257 to /mnt/LaCIE/skolla/sc-heart-consortium/ncbi-sra/h5ad_files/ERR7423257_GeneFull_raw.h5ad
Conversion complete for specified folders.
