# SIGNAL-seq ADT mapping with kallisto KITE workflow

1) Map the FASTQ data to the ADT barcode reference using the kallisto KITE pipeline
2) Collapse the polyA and random hexamer (rHex) RT barcodes into single cell barcodes (CBs)
3) Write out the data to a .h5ad file

## Env Setup

### Load packages

In [None]:
# Import packages
import matplotlib
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse, io
import sys, os, argparse

# Import utils
# Make the shared `utils` directory (two levels above the current working
# directory) importable. NOTE: this depends on the directory the notebook is
# started from.
current_dir = os.getcwd()
utils_path = os.path.join(current_dir, '../../', 'utils')
# Guard the append so that re-running this cell does not add duplicate entries.
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Import adt utils functions
from adt_utils import merge_rt_barcodes, pairwise_rt_comparison 

matplotlib.rcParams.update({'font.size': 12})
%config InlineBackend.figure_format = 'retina'

## Data input

1) Paired-end (PE) FASTQ input files
2) ADT antibody panel with barcodes and the metadata required to generate the mapping index

In [None]:
# Load the antibody (ADT) feature reference for this experiment
df = pd.read_csv('barcode_layouts/Ex0015_kite_panel.csv')

# Clean up the antibody names for problematic characters: spaces become
# underscores and parentheses are dropped.
# regex=False is passed explicitly: '(' and ')' are regex metacharacters and
# the default of `regex` differs between pandas versions, so the patterns must
# be declared as literal strings to behave the same everywhere.
df['Antigen'] = (
    df['Antigen']
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
df

In [None]:
# Generate the tsv raw feature file to input into Kallisto - KITE
df[['Barcode', 'Antigen']].to_csv('barcode_layouts/features.tsv', index=None, header=None, sep='\t')
!cat barcode_layouts/features.tsv

In [None]:
# Use kb to generate the mismatch kallisto index.
# Input : barcode_layouts/features.tsv (barcode <tab> antigen name).
# Output: the kallisto index (-i), the mismatch FASTA (-f1) and the
#         transcript-to-gene map (-g), all written under barcode_layouts/.
# --workflow kite selects the feature-barcoding (KITE) workflow and
# --overwrite replaces any outputs left over from a previous run.
!kb ref -i barcode_layouts/mismatch.idx -f1 barcode_layouts/mismatch.fa -g barcode_layouts/t2g.txt --workflow kite barcode_layouts/features.tsv --overwrite

## Run kallisto and bustools to generate a feature count matrix in H5AD format


In [None]:
# Run kb count pipeline
# Maps the paired FASTQ files against the mismatch index and writes the
# results (including an H5AD matrix, --h5ad) to split_adt/.
#   -w   barcode whitelist
#   -g   transcript-to-gene map
#   -x   custom technology string, "barcode:umi:sequence", where each entry is
#        file,start,stop. NOTE(review): read as three 8-nt barcode segments
#        (10-18, 48-56, 78-86) and a 10-nt UMI (0-10) taken from the second
#        FASTQ, with the whole first FASTQ used as the feature sequence --
#        confirm against the kallisto bus documentation.
#   -t 2 run with two threads; --keep-tmp keeps the temporary files;
#        --overwrite replaces existing output.
# The command is continued onto a second line with a trailing backslash.
#%%time
!kb count --h5ad -i barcode_layouts/mismatch.idx -o split_adt/ -w barcode_layouts/split_seqv2_barcode_wlist.txt -g barcode_layouts/t2g.txt -x 1,10,18,1,48,56,1,78,86:1,0,10:0,0,0 --workflow kite -t 2 --keep-tmp --overwrite\
~/PATH/ex0015_adt_80_L001_S5_R1_001.fastq.gz ~/PATH/ex0015_adt_80_L001_S5_R2_001.fastq.gz


In [None]:
# Use bustools to dump the BUS records to plain text so the reads can be
# inspected by UMI, ADT barcode or split barcode.
#   split_adt/output.bus            -> split_adt/bus_text_raw.txt
#   split_adt/output.unfiltered.bus -> split_adt/bus_text_pp.txt
# Can filter based on whitelist here too, if needed in the future
!bustools text -o split_adt/bus_text_raw.txt split_adt/output.bus 
!bustools text -o split_adt/bus_text_pp.txt split_adt/output.unfiltered.bus 

## Generate anndata object for preprocessing

In [None]:
# Figure output directory used by scanpy when saving plots
sc.settings.figdir = 'pre_processing_figures'

# Load the kallisto ADT x barcode count matrix written by kb count
# (unfiltered counts) and display a summary of the AnnData object
adata = sc.read_h5ad('split_adt/counts_unfiltered/adata.h5ad')
adata

In [None]:
# Generate QC counts for CBs
# inplace=True stores the computed metrics on adata itself rather than
# returning them; log1p=True also requests the log1p-transformed versions and
# percent_top=None skips the "top features" percentages (see the scanpy docs
# for the exact column names).
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=True, inplace=True)

## Collapse RT and Random Hex barcodes (BC 1)

### Assign barcodes to well_ids and store as anndata .obs

In [None]:
# Split every cell barcode (the adata.obs index) into its component barcodes.
# The slices below treat the barcode as [BC3 (first 8 nt)][BC2 (next 8 nt)]...[BC1 (last 8 nt)].
# NOTE(review): assumes a 24-nt cell barcode made of three 8-nt segments -- confirm
# against the technology string passed to kb count.
cell_barcodes = adata.obs.index.values

# RT1: last 8 nt
ubc_1 = [cb[-8:] for cb in cell_barcodes]
# L2: nt 8-16
ubc_2 = [cb[8:16] for cb in cell_barcodes]
# L3: first 8 nt
ubc_3 = [cb[:8] for cb in cell_barcodes]
# Both ligation BCs together: first 16 nt
ubc_23 = [cb[:16] for cb in cell_barcodes]

# Store each barcode subsequence as a column of adata.obs
adata.obs["barcode_1"] = ubc_1
adata.obs["barcode_2"] = ubc_2
adata.obs["barcode_3"] = ubc_3
adata.obs["barcode_2_3"] = ubc_23

# View adata
adata

In [None]:
# Load the RT barcode mapping table (well ID, barcode sequence, sample ID) and
# give its three columns fixed names, whatever the header in the CSV says.
# 1-48 are PolyA, 49-96 are rHex
mapping_table = pd.read_csv("barcode_layouts/barcodes_v2_id_map.csv").set_axis(
    ["ID", "barcode_1", "sample_id"], axis="columns"
)

mapping_table

In [None]:
# Assign RT barcodes sample_id and well index
# Build a barcode -> (well ID, sample_id) lookup once, instead of scanning the
# whole mapping table for every cell barcode.
rt_lookup = {}
for rt_id, rt_barcode, rt_sample in zip(
    mapping_table["ID"].values,
    mapping_table["barcode_1"].values,
    mapping_table["sample_id"].values,
):
    # setdefault keeps the first row if a barcode is listed more than once
    rt_lookup.setdefault(rt_barcode, (rt_id, rt_sample))

indices = []
sample_id = []

# Loop through the RT barcodes
for barcode_1 in adata.obs["barcode_1"]:
    rt_match = rt_lookup.get(barcode_1)

    if rt_match is not None:
        # Barcode is in the mapping table: record its well ID and sample ID
        indices.append(rt_match[0])
        sample_id.append(rt_match[1])
    else:
        # Barcode is not in the mapping table: flag it and warn
        indices.append(-1)
        sample_id.append("invalid")
        print("WLIST ERROR, INVALID INDICES PRESENT!!")

# Annotate adata with corresponding well and sample_id data
adata.obs["index_1"] = indices
adata.obs["sample_id"] = sample_id

In [None]:
# Load the mapping table shared by ligation barcodes 2 and 3 and give its
# three columns fixed names.
mapping_table_l23 = pd.read_csv("barcode_layouts/barcodes_v1.csv").set_axis(
    ["ID", "barcode", "sample_id"], axis="columns"
)

mapping_table_l23

In [None]:
# Assign the well index for each L2 (ligation barcode 2) sequence
# Build a barcode -> well ID lookup once, instead of scanning the whole mapping
# table for every cell barcode.
l2_lookup = {}
for lig_id, lig_barcode in zip(
    mapping_table_l23["ID"].values, mapping_table_l23["barcode"].values
):
    # setdefault keeps the first row if a barcode is listed more than once
    l2_lookup.setdefault(lig_barcode, lig_id)

indices = []

# Loop through the L2 barcodes
for barcode_2 in adata.obs["barcode_2"]:

    if barcode_2 in l2_lookup:
        # Assign if valid
        indices.append(l2_lookup[barcode_2])
    else:
        # Barcode is not in the mapping table: flag it and warn
        indices.append(-1)
        print("WLIST ERROR, INVALID INDICES PRESENT!!")

# Annotate adata with corresponding indices
adata.obs["index_2"] = indices

In [None]:
# Assign the well index for each L3 (ligation barcode 3) sequence
# Build a barcode -> well ID lookup once, instead of scanning the whole mapping
# table for every cell barcode.
l3_lookup = {}
for lig_id, lig_barcode in zip(
    mapping_table_l23["ID"].values, mapping_table_l23["barcode"].values
):
    # setdefault keeps the first row if a barcode is listed more than once
    l3_lookup.setdefault(lig_barcode, lig_id)

indices = []
for barcode_3 in adata.obs["barcode_3"]:

    if barcode_3 in l3_lookup:
        # Assign if valid
        indices.append(l3_lookup[barcode_3])
    else:
        # Barcode is not in the mapping table: flag it and warn
        indices.append(-1)
        print("WLIST ERROR, INVALID INDICES PRESENT!!")

# Annotate adata with corresponding indices
adata.obs["index_3"] = indices

In [None]:
# Write out indexing well counts to files
# unmerged index data for further analysis
# Make sure the output directory exists first; to_csv does not create it.
os.makedirs("./pre_processing_figures/data", exist_ok=True)

# Number of cell barcodes per RT (barcode 1) well
rt_index_counts = adata.obs['index_1'].value_counts()
rt_index_counts.to_csv("./pre_processing_figures/data/rt_index_counts.csv")

# Number of cell barcodes per ligation barcode 2 well
lig_bc2 = adata.obs['index_2'].value_counts()
lig_bc2.to_csv("./pre_processing_figures/data/lig_bc2_index_counts.csv")

# Number of cell barcodes per ligation barcode 3 well
# (this file previously received the barcode 2 counts by mistake)
lig_bc3 = adata.obs['index_3'].value_counts()
lig_bc3.to_csv("./pre_processing_figures/data/lig_bc_3_index_counts.csv")


In [None]:
# Check abundance of sample_ID cell barcodes: number of cell barcodes assigned
# to each sample_id (barcodes missing from the RT mapping table are counted
# under "invalid")
adata.obs['sample_id'].value_counts()

### Merge PolyA and rHex barcodes

In [None]:

# Get unique barcode_2_3 sequences and their counts. barcodes_and_counts stores
# both the sequence ([0]) and the number of cell barcodes sharing it ([1]).
barcodes_and_counts = np.unique(adata.obs["barcode_2_3"].values, return_counts = True)

# barcode_2_3 sequences carried by exactly two cell barcodes in adata
two_counts_barcodes_2_3 = barcodes_and_counts[0][barcodes_and_counts[1] == 2]

# Pairs of cell barcodes whose RT well IDs do NOT differ by exactly 48.
# These pairs are left unmerged and recorded here for inspection.
unmatched_barcodes = []

# Iterate through barcodes with 2 counts
for barcode in two_counts_barcodes_2_3:
    # Select the two rows of adata.obs that share this barcode_2_3
    selection = adata.obs[adata.obs["barcode_2_3"].values == barcode]

    # Use .iloc for positional access: the obs index holds barcode strings,
    # and integer keys on such a Series are deprecated as positions.
    rt_ids = selection["index_1"]
    difference = rt_ids.iloc[0] - rt_ids.iloc[1]

    # We want to remove the barcode with the higher index, so pass the
    # lower-index barcode to merge_rt_barcodes first and the higher-index one
    # second. NOTE(review): argument semantics are defined in adt_utils.
    if difference == -48:
        adata = merge_rt_barcodes(adata, selection.index[0], selection.index[1])
    elif difference == 48:
        adata = merge_rt_barcodes(adata, selection.index[1], selection.index[0])
    else:
        unmatched_barcodes.append(tuple(selection.index))


In [None]:
# barcode_2_3 sequences that are shared by more than two cell barcodes
multiple_counts_barcodes_2_3 = barcodes_and_counts[0][barcodes_and_counts[1] > 2]

# Handle each multi-match barcode_2_3 in turn
for barcode in multiple_counts_barcodes_2_3:
    # Rows of adata.obs that currently carry this barcode_2_3
    index = adata.obs["barcode_2_3"] == barcode
    group = adata.obs[index]

    # Ask the helper which RT well IDs in this group belong together.
    # NOTE(review): each returned item is used below to index the group's cell
    # barcodes, so it is presumably a pair of positions -- see adt_utils.
    matched = pairwise_rt_comparison(group["index_1"].values)

    # Translate every matched pair into the corresponding cell barcode sequences.
    # All pairs are resolved before any merge changes adata.
    matching_barcodes = [group.index.values[pair] for pair in matched]

    # Merge each matched pair of cell barcodes
    for matching_barcode in matching_barcodes:
        first_barcode = matching_barcode[0]
        second_barcode = matching_barcode[1]
        adata = merge_rt_barcodes(adata, first_barcode, second_barcode)



In [None]:
# Recount the cell barcodes per RT well index now that the merge steps have
# run, and save the table next to the kb count output
rt_index_counts = adata.obs['index_1'].value_counts()
rt_index_counts.to_csv("split_adt/rt_index_counts_merged.csv")


In [None]:
# Recompute the QC metrics in place after the barcode merge steps
# (presumably so the stored totals reflect the merged barcodes -- depends on
# what merge_rt_barcodes does to the count matrix)
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=True, inplace=True)

In [None]:
# Save the adata object to an .h5ad file in the current working directory
adata.write('ex0015_adt_80_merged_scanpy.h5ad')