Importing and setting up global vars...

In [2]:
import pandas as pd
import os
from umi_tools import UMIClusterer

# Root directory holding the count files, cell annotations and downloaded tables.
# The COHEN_RETINA_DATA_ROOT environment variable overrides it so the notebook
# can run outside the original account; the default is the original path.
DATA_ROOT=os.environ.get(
    "COHEN_RETINA_DATA_ROOT",
    "/home/mcn26/palmer_scratch/tabula_data/raw_recap/cohen_retina",
)
# Lengths (in bases) used to slice the concatenated barcode string.
umi_len=12
pBC_len=8

Load MPRA barcode data into a table.

In [3]:
def break_up_mpra_count(file_name_in):
    """Parse an MPRA count file into a DataFrame with one row per input line.

    Each line is expected to look like ``<count> <cellBC>_<umi><pBC><rBC>``
    (surrounding whitespace is stripped): the number of occurrences, a single
    space, a 16-base cell barcode, an underscore, and then one string of
    ``umi_len + pBC_len + rBC_len`` bases.

    Parameters
    ----------
    file_name_in : str
        Path of the count file, relative to ``DATA_ROOT``.

    Returns
    -------
    pandas.DataFrame
        Columns ``num_occurrences`` (int) and ``cell_barcode``, ``umi``,
        ``pBC``, ``rBC`` (str), in file order.

    Raises
    ------
    AssertionError
        If a line does not match the expected layout.
    """
    # Length of the trailing barcode that follows the UMI and the pBC.
    rBC_len = 24
    # Expected length of the concatenated UMI + pBC + rBC string; derived from
    # the segment lengths so the check and the slicing below cannot drift apart.
    barcode_len = umi_len + pBC_len + rBC_len

    # List to store parsed data
    data = []

    # Read file and process lines
    with open(os.path.join(DATA_ROOT, file_name_in)) as fin:
        for line in fin:
            line = line.strip()

            # Break off number of occurrences
            parts = line.split(" ")
            assert len(parts) == 2, "Error: multiple spaces"
            num_occurrences = int(parts[0])

            # Break off cell barcode
            subparts = parts[1].split("_")
            assert len(subparts) == 2, "Error: multiple underscores"
            cell_barcode = subparts[0]
            assert len(cell_barcode) == 16, "Error: malformed cell barcode"

            # Break out remaining barcodes
            barcode_seq = subparts[1]
            assert len(barcode_seq) == barcode_len, "Error: malformed remaining barcode"

            pBC_start = umi_len
            rBC_start = umi_len + pBC_len
            umi = barcode_seq[:pBC_start]
            pBC = barcode_seq[pBC_start:rBC_start]
            rBC = barcode_seq[rBC_start:]

            # Append parsed data
            data.append([num_occurrences, cell_barcode, umi, pBC, rBC])

    # Create DataFrame
    df = pd.DataFrame(data, columns=["num_occurrences", "cell_barcode", "umi", "pBC", "rBC"])

    return df

# Parse the r1 MPRA count file (path is relative to DATA_ROOT); this table is used by the cells below.
mpra_r1 = break_up_mpra_count("counts/retina_mpra_r1.count")

Take a look...

In [4]:
# Display the parsed MPRA table to sanity-check the column split.
mpra_r1

Unnamed: 0,num_occurrences,cell_barcode,umi,pBC,rBC
0,1,AAAAAACAGCGCATCC,CATGCTTATCCC,CACGGTAG,ATCCTACTAACGCATTGTGAATTG
1,2,AAAAAACAGCGCATCC,TATCTTAATTGG,CGGAAGGA,TTAATATGCTTAGTTTTTTCATGA
2,2,AAAAAACAGGTAAGTT,TCCCACAATCTT,ATAAGAGG,CTCTCGTTCGATATGGCATACTTG
3,1,AAAAAACGTTTACGAC,GCTCTCAGGAGA,CGGAAGGA,GGAAGAACCCATTCCTTGCTTTTA
4,1,AAAAAACGTTTACGAC,GTTTGTTGATAT,CGAATATC,AGAATCCGGAGAGGCCTTGCAGTA
...,...,...,...,...,...
8735410,1,TTTTTTTATTTTTTTT,TTTTTTTTTTTT,ATCTAGCG,CCAACAGGTAATGGCTTTGTTATA
8735411,1,TTTTTTTTTTTTTTTT,TTTTTTTTTTTT,AATAGTGG,AACATAAAGCGATTCCGATCATCA
8735412,1,TTTTTTTTTTTTTTTT,TTTTTTTTTTTT,CAGAACCT,ATATCATGTTTGCCAAACGCGCAT
8735413,1,TTTTTTTTTTTTTTTT,TTTTTTTTTTTT,CGGAAGGA,TTTCGATGACTTAATTGGCATCGT


Looks good!

Let's load the u6 data..

In [5]:
def break_up_u6_count(filename):
    """Parse a U6 count file into a DataFrame with one row per input line.

    Each line is expected to look like ``<count> <cellBC>_<umi><pBC>``
    (leading/trailing whitespace is stripped): the number of occurrences, a
    single space, a 16-base cell barcode, an underscore, and then a 20-base
    string whose first ``umi_len`` bases are the UMI and whose remainder is
    the pBC (8 bases when ``umi_len`` is 12).

    Parameters
    ----------
    filename : str
        Path of the count file, relative to ``DATA_ROOT``.

    Returns
    -------
    pandas.DataFrame
        Columns ``num_occurrences`` (int) and ``cell_barcode``, ``umi``,
        ``pBC`` (str), in file order.

    Raises
    ------
    AssertionError
        If a line does not match the expected layout.
    """
    # List to store parsed data
    data = []

    with open(f"{DATA_ROOT}/{filename}") as fin:
        for line in fin:
            # Break off number of occurrences
            parts = line.strip().split(" ")
            assert len(parts) == 2, "err, multiple spaces"
            num_occurrences = int(parts[0])

            # Break off cell barcode
            subparts = parts[1].split("_")
            assert len(subparts) == 2, "err, multiple underscore"
            cell_barcode = subparts[0]
            assert len(cell_barcode) == 16, "err, malformed cell barcode"

            # Break out remaining barcodes
            barcode_seq = subparts[1]
            assert len(barcode_seq) == 20, "err, malformed remaining barcode"

            umi = barcode_seq[:umi_len]
            pBC = barcode_seq[umi_len:]

            data.append([num_occurrences, cell_barcode, umi, pBC])

    return pd.DataFrame(data, columns=["num_occurrences", "cell_barcode", "umi", "pBC"])



# Parse the r1 and r2 U6 count files (paths are relative to DATA_ROOT).
u6_r1=break_up_u6_count("counts/retina_u6_r1.count")
u6_r2=break_up_u6_count("counts/retina_u6_r2.count")

In [6]:
# Display the parsed r1 U6 table to sanity-check the column split.
u6_r1

Unnamed: 0,num_occurrences,cell_barcode,umi,pBC
0,1,AAAAAACAGCTCATAT,CACGCAAATTTA,AAGGAGCT
1,1,AAAAAACAGCTCATAT,CTAAAACGATTT,CCACGACT
2,1,AAAAAACAGCTCATAT,GTAGGACGATGC,CCTGAGTT
3,1,AAAAAACGTACGATCT,CACTACCGACTA,CCTCATAC
4,1,AAAAAACGTCACCAGC,CGGCAGGGATAT,ACGAGTCC
...,...,...,...,...
2676482,1,TTTTTGTTCTCGTCTA,ATAAAATATCCC,CGGAAGGA
2676483,1,TTTTTGTTCTCGTCTA,ATTAAACTAACT,ACGAGTCC
2676484,1,TTTTTTGAGGTCGCTT,TTTTAACATACC,CCAATTAC
2676485,1,TTTTTTGCATCAAGTC,AAACACTACATT,CAGCCGAT


Perfect, now load the list of valid cells and their types...

In [7]:
# Read the per-cell annotations, rename the barcode column to the name used by
# the count tables, and shift the replicate ids up by one.
# NOTE(review): the +1 presumably converts 0-based replicate ids to 1-based — confirm against the TSV.
cell_types = (
    pd.read_csv(
        f"{DATA_ROOT}/cell_names/retina_cell_types.tsv",
        sep="\t",
        index_col=0,
    )
    .rename(columns={'cbc': 'cell_barcode'})
    .assign(replicate=lambda annotations: annotations['replicate'] + 1)
)
cell_types


Unnamed: 0,cell_barcode,leiden,replicate
0,AAACCCAAGACAAGCC,Rod,1
1,AAACCCAAGAGATCGC,Rod,1
2,AAACCCAAGCTGAAAT,Rod,1
3,AAACCCAAGTACGTCT,Rod,1
4,AAACCCACAACCCGCA,Rod,1
...,...,...,...
22152,TTTGTTGGTTTACCAG,Rod,2
22153,TTTGTTGTCACCACAA,Rod,2
22154,TTTGTTGTCGGCAGTC,Rod,2
22155,TTTGTTGTCGTTAGAC,Interneuron,2


So to begin with, let's try an intersection of MPRA barcode rep 1 & cell-types. 
Specifically, I want an inner join (MPRA observations, cell information) on the cell barcode (`cell_barcode`), so that each observation gets annotated with its cell type, while observations without an annotation get dropped.

To avoid duplicating observations, the `cell_barcode` column of the cell-type df must have no duplicates (within each replicate). We'll check that first:

In [8]:
# Guard for the join below: within replicate 1 every cell barcode must appear
# exactly once in the annotation table, otherwise the merge would duplicate
# observations.
rep1_cells=cell_types[cell_types['replicate']==1]
rep1_uniq_cells=len(rep1_cells['cell_barcode'].unique())
rep1_total_rows=len(rep1_cells)

assert rep1_uniq_cells==rep1_total_rows, "duplicate cell barcodes in the replicate-1 annotations"

Ok, that requires no additional munging. Let's do the intersection:

In [9]:
# Display the replicate-1 annotation rows (the right-hand side of the join below).
cell_types[cell_types['replicate']==1]

Unnamed: 0,cell_barcode,leiden,replicate
0,AAACCCAAGACAAGCC,Rod,1
1,AAACCCAAGAGATCGC,Rod,1
2,AAACCCAAGCTGAAAT,Rod,1
3,AAACCCAAGTACGTCT,Rod,1
4,AAACCCACAACCCGCA,Rod,1
...,...,...,...
11337,TTTGTTGCACATTCTT,Rod,1
11338,TTTGTTGGTGACACAG,Rod,1
11339,TTTGTTGTCAAGTGGG,Rod,1
11340,TTTGTTGTCACCTTGC,Rod,1


In [10]:
# Display the MPRA table again (the left-hand side of the join below).
mpra_r1

Unnamed: 0,num_occurrences,cell_barcode,umi,pBC,rBC
0,1,AAAAAACAGCGCATCC,CATGCTTATCCC,CACGGTAG,ATCCTACTAACGCATTGTGAATTG
1,2,AAAAAACAGCGCATCC,TATCTTAATTGG,CGGAAGGA,TTAATATGCTTAGTTTTTTCATGA
2,2,AAAAAACAGGTAAGTT,TCCCACAATCTT,ATAAGAGG,CTCTCGTTCGATATGGCATACTTG
3,1,AAAAAACGTTTACGAC,GCTCTCAGGAGA,CGGAAGGA,GGAAGAACCCATTCCTTGCTTTTA
4,1,AAAAAACGTTTACGAC,GTTTGTTGATAT,CGAATATC,AGAATCCGGAGAGGCCTTGCAGTA
...,...,...,...,...,...
8735410,1,TTTTTTTATTTTTTTT,TTTTTTTTTTTT,ATCTAGCG,CCAACAGGTAATGGCTTTGTTATA
8735411,1,TTTTTTTTTTTTTTTT,TTTTTTTTTTTT,AATAGTGG,AACATAAAGCGATTCCGATCATCA
8735412,1,TTTTTTTTTTTTTTTT,TTTTTTTTTTTT,CAGAACCT,ATATCATGTTTGCCAAACGCGCAT
8735413,1,TTTTTTTTTTTTTTTT,TTTTTTTTTTTT,CGGAAGGA,TTTCGATGACTTAATTGGCATCGT


In [11]:
# Cast the join key to str in both tables before merging.
# NOTE(review): mpra_r1's barcodes are already str (sliced from text lines in
# break_up_mpra_count), so the first cast looks like a defensive no-op — confirm.
# Caveat: astype(str) would turn a missing barcode into the literal string "nan".
mpra_r1['cell_barcode'] = mpra_r1['cell_barcode'].astype(str)
cell_types['cell_barcode'] = cell_types['cell_barcode'].astype(str)


In [12]:
# Annotate every MPRA observation with its replicate-1 cell annotation; rows
# whose cell barcode has no annotation are dropped (inner join).
# validate="many_to_one" makes pandas raise MergeError if a cell barcode occurs
# more than once on the annotation side, instead of silently duplicating rows.
inner_mpra_rep1=mpra_r1.merge(
    cell_types[cell_types['replicate']==1],
    on="cell_barcode",
    how="inner",
    validate="many_to_one",
)

In [13]:
# Display the annotated MPRA table produced by the inner join.
inner_mpra_rep1

Unnamed: 0,num_occurrences,cell_barcode,umi,pBC,rBC,leiden,replicate
0,10,AAACCCAAGACAAGCC,ATCATCCCACCA,CGCCTCAA,TGTTACCTTTTTTTTGTGTGACAT,Rod,1
1,3,AAACCCAAGACAAGCC,CAATTTCGTCAG,ACGTTCAA,ATTATAAAAGATACTCATGAGTCT,Rod,1
2,1,AAACCCAAGACAAGCC,CAATTTCGTCAG,CAATGGAC,AAACCTTACCCTTCCTTTACGCGG,Rod,1
3,1,AAACCCAAGACAAGCC,CATGTTCCAAGT,ACTATCTG,TTCCCGGGAAATGAGTCTTCAATC,Rod,1
4,29,AAACCCAAGACAAGCC,CATGTTCCAAGT,CGGACTCT,AACTAGTCTATTAAAGCTAGCAGA,Rod,1
...,...,...,...,...,...,...,...
3392490,1,TTTGTTGTCGCTAATG,CTATCGTGCTAT,CAGCCGAT,CGCAGAGTACATGGAAAAAAAAAA,Interneuron,1
3392491,1,TTTGTTGTCGCTAATG,CTGAAGCTTCGT,AGATGCGA,CTTTATGAAGTTATTTTTTTTCTA,Interneuron,1
3392492,1,TTTGTTGTCGCTAATG,CTGAAGCTTCGT,AGGTAACG,CGCAGTACCTCTCGTATCCGATAT,Interneuron,1
3392493,1,TTTGTTGTCGCTAATG,GGTGATTGCACT,CAAGTCTG,GTATTGTGCTGTCTCTTCCATGGA,Interneuron,1


In [14]:
# Row counts, in order: joined table, raw MPRA table, replicate-1 annotations.
for table in (inner_mpra_rep1, mpra_r1, cell_types[cell_types['replicate']==1]):
    print(len(table))

3392495
8735415
11342


This is not so bad a loss. I will go ahead with this for now, and implement a more sophisticated system using the umi-tools API later.

In [15]:
# Annotate every U6 observation with its replicate-1 cell annotation; rows
# whose cell barcode has no annotation are dropped (inner join).
# validate="many_to_one" makes pandas raise MergeError if a cell barcode occurs
# more than once on the annotation side, instead of silently duplicating rows.
inner_u6_rep1=u6_r1.merge(
    cell_types[cell_types['replicate']==1],
    on="cell_barcode",
    how="inner",
    validate="many_to_one",
)

In [16]:
# Display the annotated U6 table produced by the inner join.
inner_u6_rep1

Unnamed: 0,num_occurrences,cell_barcode,umi,pBC,leiden,replicate
0,29,AACAAGACACGGTGAA,TTGCTCTTAACG,AATAGTGG,Rod,1
1,45,AACCCAACACAAGTTC,CTTCCGATCTAC,ATCTAGCG,Mueller Glia,1
2,8,AAGTCGTGTGGACTAG,TCTTCTCCTTAA,AATCTCCA,Rod,1
3,1,AATCACGGTCTACAGT,AGTACTCTTCCC,CGGAAGGA,Rod,1
4,114,AATCACGGTCTACAGT,AGTGCTCTTCCC,CGGAAGGA,Rod,1
...,...,...,...,...,...,...
549,2,TTTGATCAGACCATAA,TGTCTCAAGGAC,CCAATTAC,Rod,1
550,1,TTTGATCAGACCATAA,TTAATGAGGACC,CCAATTAC,Rod,1
551,1,TTTGATCAGACCATAA,TTTATCCCGACC,CCAATTAC,Rod,1
552,2,TTTGATCTCGCCATAA,GCTCACCAGGAA,ATGAAAAA,Rod,1


Now let's add the CRE to each pBC (again, doing this in a rather unsophisticated way, just based on perfect matches)...

In [17]:
#download the supplementary tables.
# `import urllib` on its own does not load the `request` submodule; import it
# explicitly so this cell does not rely on another library having done so.
import urllib.request
urllib.request.urlretrieve("https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01278-7/MediaObjects/41588_2022_1278_MOESM3_ESM.xlsx",f"{DATA_ROOT}/sup.xlsx")

('/home/mcn26/palmer_scratch/tabula_data/raw_recap/cohen_retina/sup.xlsx',
 <http.client.HTTPMessage at 0x1475df3a5400>)

In [18]:
# Load the table that pairs each CRE name with its barcode, keep only those two
# columns, and call the barcode column `pBC` so it can be joined on directly.
CRE_IDs = (
    pd.read_excel(
        f"{DATA_ROOT}/sup.xlsx",
        sheet_name="Supplementary Table 5",
        skiprows=1,
    )
    .drop(columns=["order_name", "sequence"])
    .rename(columns={'barcode': 'pBC'})
)
# Both the names and the barcodes must be unique across the table.
assert len(CRE_IDs["name"].unique()) == len(CRE_IDs)
assert len(CRE_IDs["pBC"].unique()) == len(CRE_IDs)
CRE_IDs

Unnamed: 0,name,pBC
0,ebox_gcagctgg_to_acagctgg,AACAACAC
1,ebox_gcagctgg_to_ccagctgg,AACAAGGT
2,ebox_gcagctgg_to_tcagctgg,AACACTGA
3,ebox_gcagctgg_to_gaagctgg,AACATTCC
4,ebox_gcagctgg_to_ggagctgg,AACCAGCC
...,...,...
110,combo_Q50_crx3_crx2,CGTAACAC
111,combo_Q50_crx3_crx1,CGTAGCTT
112,combo_Q50_crx2_crx1,CGTCTATG
113,wt_1,CGTTCTCG


In [38]:
# Attach the CRE name to every MPRA observation by exact pBC match; rows whose
# pBC is not in the CRE table are dropped (inner join).
# validate="many_to_one" makes pandas raise MergeError if a pBC occurs more
# than once in CRE_IDs, instead of silently duplicating observations.
final_mpra_rep1=inner_mpra_rep1.merge(
    CRE_IDs,
    how="inner",
    on="pBC",
    validate="many_to_one",
)
final_mpra_rep1

Unnamed: 0,num_occurrences,cell_barcode,umi,pBC,rBC,leiden,replicate,name
0,10,AAACCCAAGACAAGCC,ATCATCCCACCA,CGCCTCAA,TGTTACCTTTTTTTTGTGTGACAT,Rod,1,combo_Q50_crx5_crx4
1,3,AAACCCAAGACAAGCC,CAATTTCGTCAG,ACGTTCAA,ATTATAAAAGATACTCATGAGTCT,Rod,1,tail_conserve1_159_to_164
2,1,AAACCCAAGACAAGCC,CAATTTCGTCAG,CAATGGAC,AAACCTTACCCTTCCTTTACGCGG,Rod,1,swap_crx3_to_ctaatccc_forward
3,1,AAACCCAAGACAAGCC,CATGTTCCAAGT,ACTATCTG,TTCCCGGGAAATGAGTCTTCAATC,Rod,1,tail_conserve1_169_to_174
4,29,AAACCCAAGACAAGCC,CATGTTCCAAGT,CGGACTCT,AACTAGTCTATTAAAGCTAGCAGA,Rod,1,combo_Q50_crx4_crx3
...,...,...,...,...,...,...,...,...
2881724,1,TTTGTTGTCGCTAATG,CTATCGTGCTAT,CAGCCGAT,CGCAGAGTACATGGAAAAAAAAAA,Interneuron,1,swap_crx1_to_gggcttag_reverse
2881725,1,TTTGTTGTCGCTAATG,CTGAAGCTTCGT,AGATGCGA,CTTTATGAAGTTATTTTTTTTCTA,Interneuron,1,tail_conserve3_210_to_215
2881726,1,TTTGTTGTCGCTAATG,CTGAAGCTTCGT,AGGTAACG,CGCAGTACCTCTCGTATCCGATAT,Interneuron,1,tail_conserve3_240_to_245
2881727,1,TTTGTTGTCGCTAATG,GGTGATTGCACT,CAAGTCTG,GTATTGTGCTGTCTCTTCCATGGA,Interneuron,1,swap_crx4_to_ctaatccc_forward


In [20]:
# Smallest read count in the table. Series.min() is vectorised, whereas the
# builtin min() iterates the column element by element in Python; int() keeps
# the displayed value a plain Python integer.
int(final_mpra_rep1["num_occurrences"].min())

1

In [21]:
# Row counts after and before the CRE join, in that order.
for table in (final_mpra_rep1, inner_mpra_rep1):
    print(len(table))

2881729
3392495


In [22]:
# Attach the CRE name to every U6 observation by exact pBC match; rows whose
# pBC is not in the CRE table are dropped (inner join).
# validate="many_to_one" makes pandas raise MergeError if a pBC occurs more
# than once in CRE_IDs, instead of silently duplicating observations.
final_u6_rep1=inner_u6_rep1.merge(
    CRE_IDs,
    how="inner",
    on="pBC",
    validate="many_to_one",
)
final_u6_rep1

Unnamed: 0,num_occurrences,cell_barcode,umi,pBC,leiden,replicate,name
0,29,AACAAGACACGGTGAA,TTGCTCTTAACG,AATAGTGG,Rod,1,ebox_gcagctgg_to_gcagcggg
1,45,AACCCAACACAAGTTC,CTTCCGATCTAC,ATCTAGCG,Mueller Glia,1,tail_conserve5_468_to_473
2,8,AAGTCGTGTGGACTAG,TCTTCTCCTTAA,AATCTCCA,Rod,1,ebox_gcagctgg_to_gcagctag
3,1,AATCACGGTCTACAGT,AGTACTCTTCCC,CGGAAGGA,Rod,1,combo_Q50_crx5_crx1
4,114,AATCACGGTCTACAGT,AGTGCTCTTCCC,CGGAAGGA,Rod,1,combo_Q50_crx5_crx1
...,...,...,...,...,...,...,...
404,1,TTTCCTCTCGTTGCCT,GCCAACGTCAGT,CGGTGAGA,Rod,1,combo_Q50_crx4_crx2
405,1,TTTCCTCTCGTTGCCT,TTCATGCGATTT,ATAAGAGG,Rod,1,tail_conserve5_443_to_448
406,2,TTTGATCAGACCATAA,TGTCTCAAGGAC,CCAATTAC,Rod,1,combo_Mute_crx2
407,1,TTTGATCAGACCATAA,TTAATGAGGACC,CCAATTAC,Rod,1,combo_Mute_crx2


OK, so now I *would* expand the dataframe to include all possible zeroes: treating each unobserved combination of `cell-barcode` & `name` as an observed zero. HOWEVER this only makes sense after *some level* of summary: by UMIs, or by MPRA barcodes or something. 

Now let's reformat the table to match our specifications and dump it to disk.

In [39]:
# Rename the columns in place to the output schema's names.
final_mpra_rep1.rename(
    columns={
        "num_occurrences": "reads",
        "cell_barcode": "cell_bc",
        "name": "cre_id",
        "leiden": "cell_type",
        "rBC": "mpra_bc",
        "replicate": "rep_id",
    },
    inplace=True,
)

In [41]:
# Keep only the output columns, in the output order (this drops `pBC`).
final_mpra_rep1 = final_mpra_rep1.loc[
    :,
    ["cell_bc", "rep_id", "cre_id", "cell_type", "mpra_bc", "umi", "reads"],
]
final_mpra_rep1

Unnamed: 0,cell_bc,rep_id,cre_id,cell_type,mpra_bc,umi,reads
0,AAACCCAAGACAAGCC,1,combo_Q50_crx5_crx4,Rod,TGTTACCTTTTTTTTGTGTGACAT,ATCATCCCACCA,10
1,AAACCCAAGACAAGCC,1,tail_conserve1_159_to_164,Rod,ATTATAAAAGATACTCATGAGTCT,CAATTTCGTCAG,3
2,AAACCCAAGACAAGCC,1,swap_crx3_to_ctaatccc_forward,Rod,AAACCTTACCCTTCCTTTACGCGG,CAATTTCGTCAG,1
3,AAACCCAAGACAAGCC,1,tail_conserve1_169_to_174,Rod,TTCCCGGGAAATGAGTCTTCAATC,CATGTTCCAAGT,1
4,AAACCCAAGACAAGCC,1,combo_Q50_crx4_crx3,Rod,AACTAGTCTATTAAAGCTAGCAGA,CATGTTCCAAGT,29
...,...,...,...,...,...,...,...
2881724,TTTGTTGTCGCTAATG,1,swap_crx1_to_gggcttag_reverse,Interneuron,CGCAGAGTACATGGAAAAAAAAAA,CTATCGTGCTAT,1
2881725,TTTGTTGTCGCTAATG,1,tail_conserve3_210_to_215,Interneuron,CTTTATGAAGTTATTTTTTTTCTA,CTGAAGCTTCGT,1
2881726,TTTGTTGTCGCTAATG,1,tail_conserve3_240_to_245,Interneuron,CGCAGTACCTCTCGTATCCGATAT,CTGAAGCTTCGT,1
2881727,TTTGTTGTCGCTAATG,1,swap_crx4_to_ctaatccc_forward,Interneuron,GTATTGTGCTGTCTCTTCCATGGA,GGTGATTGCACT,1


In [42]:
# Write the read-level table as a tab-separated file without the row index.
final_mpra_rep1.to_csv(
    f"{DATA_ROOT}/read_wise_mpra_rep_1.tsv",
    sep="\t",
    index=False,
)