# Settings

In [1]:
import pandas as pd
import os
import gc
import subprocess
import concurrent.futures
from multiprocessing import Pool
from datetime import date

In [2]:
gc.enable()

In [32]:
# Set directories
base_dir = "/nfs/lab/projects/mega_heart/CAREHF/multiome/"
assets_dir = os.path.join(base_dir, "Assets")

cell_ranger_dir = os.path.join(base_dir, "cellranger.symlinks")

step1_dir = os.path.join(base_dir, "Analysys/1_preprocessing/")
step2_dir = os.path.join(base_dir, "Analysys/2_PeaksReformat/")
step3_dir = os.path.join(base_dir, "Analysys/3_SoupX/")
step4_dir = os.path.join(base_dir, "Analysys/4_Doublet_cleanup/amulet/")

In [33]:
# Load sample info
sample_info = pd.read_csv(os.path.join(assets_dir, "sample.info"), sep="\t", header=0)

# Build variables
sample_ls = sample_info["ID"].tolist()
cellranger_outs_ls = [
    f"{row['CellRanger']}{row['Chamber']}/{row['ID']}/outs/"
    for _, row in sample_info.iterrows()
]

# Check the paths are correct
print(sample_ls[0])
print(cellranger_outs_ls[0])

KA_49_1_2_KA_45_1_2
/nfs/lab/projects/CARE_HF/DATA/multiome/LA/KA_49_1_2_KA_45_1_2/outs/


# Make Single cell indices

In [53]:
def process_sample(i):
    gc.collect()
    # Set sample variable
    sample = sample_ls[i]
    print("Workin on: " + sample)
    sample_dir = cellranger_outs_ls[i]
    # load QC
    metrics = pd.read_csv(sample_dir + "per_barcode_metrics.csv", sep=',')
    #dropping columns we don't need from cellranger arc output
    metrics = metrics.drop(columns= ['gex_barcode', 'atac_barcode', 'excluded_reason',
       'gex_raw_reads', 'gex_mapped_reads', 'gex_conf_intergenic_reads',
       'gex_conf_exonic_reads',  'gex_conf_intronic_reads',
       'gex_conf_exonic_unique_reads', 'gex_conf_exonic_antisense_reads',
       'gex_conf_exonic_dup_reads', 'gex_exonic_umis',
       'gex_conf_intronic_unique_reads', 'gex_conf_intronic_antisense_reads',
       'gex_conf_intronic_dup_reads', 'gex_intronic_umis',
       'gex_conf_txomic_unique_reads', 'gex_umis_count', 'gex_genes_count'])
    #adding in columns found in cellranger atac but not cellranger arc outputs
    metrics['DNase_sensitive_region_fragments'] = 0
    metrics['enhancer_region_fragments'] = 0
    metrics['promoter_region_fragments'] = 0
    metrics['on_target_fragments'] = metrics['atac_TSS_fragments']
    metrics['blacklist_region_fragments'] = 0
    #reading in barcodes from our own "is cell" calls
    keep = pd.read_csv(step1_dir + sample + "_filtered.barcodes.txt", header = None)
    print(sample + " - Barcodes detected: " + str(len(keep)))
    #creating and setting new is_cell column
    print(sample + " - Creating new is_cell column")
    ind = 0
    new_is_cell = []
    for i in metrics['barcode']:
        if i in keep[0].values:
            new_is_cell.append(1)
        elif i not in keep[0].values:
            new_is_cell.append(0)
        if ind%100000 == 0:
            print(ind)
        ind = ind + 1
    metrics['is_cell'] = new_is_cell
    
    #replacing cellid column
    print(sample + " - Creating new cell_id column")
    numid = 1
    ind = 1
    cellid = []
    for i in metrics['is_cell']:
        if i == 0:
            cellid = cellid + ["None"]

        elif i == 1:
            cellid = cellid + ["_cell_" + str(numid)]
            numid = numid + 1

        ind = ind + 1
        if (ind%100000 == 0):
            print(ind)

    metrics['cellid'] = cellid           
    #reordering columns to matching cellranger atac output (single_cell.csv)
    order = ['barcode','atac_raw_reads','atac_dup_reads', 'atac_chimeric_reads', 'atac_unmapped_reads','atac_lowmapq',
             'atac_mitochondrial_reads','atac_fragments', 'cellid',  'is_cell','atac_TSS_fragments',
              'DNase_sensitive_region_fragments', 'enhancer_region_fragments','promoter_region_fragments', 
              'on_target_fragments','blacklist_region_fragments','atac_peak_region_fragments','atac_peak_region_cutsites']

    metrics = metrics[order]
    #save output
    metrics.to_csv(step4_dir + sample + "_singlecell.csv", index=False)

In [54]:
num_cores = 1 # set the number of cores to use here

with Pool(processes=num_cores) as pool:
    pool.map(process_sample, range(len(sample_ls)))

Workin on: QY_2047_1_2_QY_2046_1_2
QY_2047_1_2_QY_2046_1_2 - Barcodes detected: 8241
QY_2047_1_2_QY_2046_1_2 - Creating new is_cell column
0
100000
200000
300000
400000
500000
600000
QY_2047_1_2_QY_2046_1_2 - Creating new cell_id column
100000
200000
300000
400000
500000
600000
Workin on: QY_2041_1_2_QY_2040_1_2
QY_2041_1_2_QY_2040_1_2 - Barcodes detected: 11648
QY_2041_1_2_QY_2040_1_2 - Creating new is_cell column
0
100000
200000
300000
400000
500000
600000
700000
QY_2041_1_2_QY_2040_1_2 - Creating new cell_id column
100000
200000
300000
400000
500000
600000
700000
Workin on: QY_1967_1_2_QY_1966_1_2
QY_1967_1_2_QY_1966_1_2 - Barcodes detected: 8157
QY_1967_1_2_QY_1966_1_2 - Creating new is_cell column
0
100000
200000
300000
400000
500000
600000
700000
QY_1967_1_2_QY_1966_1_2 - Creating new cell_id column
100000
200000
300000
400000
500000
600000
700000
Workin on: QY_1882_1_2_QY_1881_1_2
QY_1882_1_2_QY_1881_1_2 - Barcodes detected: 9452
QY_1882_1_2_QY_1881_1_2 - Creating new is_cell co

# Running Amulet

In [55]:
# Perma dirs
amulet = "/nfs/lab/katha/multiomics/amulet_zip/AMULET.sh"
amulet_dir = "/nfs/lab/katha/multiomics/amulet_zip/"
autosomes_file = "/nfs/lab/katha/multiomics/AMULET/human_autosomes.txt"
blacklist_file = "/nfs/lab/katha/multiomics/AMULET/RepeatFilterFiles/blacklist_repeats_segdups_rmsk_hg38.bed"
output_dir = step4_dir

In [56]:
## Make sample dirs
for i in range(len(sample_ls)):
    # Set sample variable
    sample_ID = sample_ls[i]
    os.mkdir(step4_dir + sample_ID + "/")

FileExistsError: [Errno 17] File exists: '/nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/4_Doublet_cleanup/amulet/QY_2047_1_2_QY_2046_1_2/'

In [57]:
command_ls = []
for i in range(len(sample_ls)):
    # Set sample variable
    sample = sample_ls[i]
    sample_ID = sample_ls[i]
    bam_file = cellranger_outs_ls[i] + "atac_possorted_bam.bam"
    single_cell_csv = step4_dir + sample_ID + "_singlecell.csv"
    output_dir = step4_dir + sample_ID + "/"
    log = "2> /nfs/lab/projects/mega_heart/CAREHF/multiome/log/" + str(date.today()) + "_" + sample_ID + "_Amulet.log"
    # Build the command string
    command_ls.append([amulet + 
            " --forcesorted"+ " --bambc"+ " CB"+ " --bcidx"+  " 0" + " --cellidx"+ " 8"+ " --iscellidx"+ " 9" +
               " " + bam_file + " " + single_cell_csv +
               " " + autosomes_file + " " + blacklist_file + " " + output_dir + " " + amulet_dir +
               " " + log])

In [58]:
command_ls[0]

['/nfs/lab/katha/multiomics/amulet_zip/AMULET.sh --forcesorted --bambc CB --bcidx 0 --cellidx 8 --iscellidx 9 /nfs/lab/projects/CARE_HF/DATA/multiome/LA/QY_2047_1_2_QY_2046_1_2/outs/atac_possorted_bam.bam /nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/4_Doublet_cleanup/amulet/QY_2047_1_2_QY_2046_1_2_singlecell.csv /nfs/lab/katha/multiomics/AMULET/human_autosomes.txt /nfs/lab/katha/multiomics/AMULET/RepeatFilterFiles/blacklist_repeats_segdups_rmsk_hg38.bed /nfs/lab/projects/mega_heart/CAREHF/multiome/Analysys/4_Doublet_cleanup/amulet/QY_2047_1_2_QY_2046_1_2/ /nfs/lab/katha/multiomics/amulet_zip/ 2> /nfs/lab/projects/mega_heart/CAREHF/multiome/log/2024-09-27_QY_2047_1_2_QY_2046_1_2_Amulet.log']

In [59]:
# Define a function to run each command
def run_command(i):
    subprocess.run(command_ls[i], shell=True)

In [None]:
# Set the number of cores to use
cores = 1

# Create a ThreadPoolExecutor with the specified number of cores
with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    # Submit each command to the executor
    futures = [executor.submit(run_command, i) for i in range(len(command_ls))]

    # Wait for all the commands to finish
    concurrent.futures.wait(futures)

here 1
Reading BAM file.
0
10000000
20000000
30000000
40000000
50000000
60000000
70000000
80000000
90000000
100000000
110000000
120000000
130000000
140000000
150000000
160000000
170000000
180000000
190000000
200000000
210000000
220000000
230000000
240000000
250000000
260000000
270000000
280000000
290000000
300000000
310000000
320000000
330000000
340000000
350000000
360000000
370000000
380000000
390000000
400000000


In [30]:
# Re-running those who didn't make it - for reasons unknown

In [51]:
sample_ls = ['QY_2047_1_2_QY_2046_1_2', 'QY_2041_1_2_QY_2040_1_2',
             'QY_1967_1_2_QY_1966_1_2', 'QY_1882_1_2_QY_1881_1_2'
            ]

In [52]:
# Subset the dataframe based on the sample_ls while maintaining the order
subset_sample_info = sample_info.set_index("ID").loc[sample_ls].reset_index()

# Build cellranger_outs_ls ensuring the order matches sample_ls
cellranger_outs_ls = [
    f"{row['CellRanger']}{row['Chamber']}/{row['ID']}/outs/"
    for _, row in subset_sample_info.iterrows()
]

# Check the paths are correct
print(sample_ls[0])
print(cellranger_outs_ls[0])


QY_2047_1_2_QY_2046_1_2
/nfs/lab/projects/CARE_HF/DATA/multiome/LA/QY_2047_1_2_QY_2046_1_2/outs/
