# Import Packages and Define Utility Functions

This notebook is meant to be run once, to parse the raw data into a format usable by the new Bitome.

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
def to_strand_int(strand):
    """Normalize a strand label to a signed integer.

    Returns 1 for 'forward', '+' or 1; -1 for 'reverse', '-' or -1; and
    None for any other value.
    """
    # each entry pairs an integer code with the labels that map to it
    strand_codes = (
        (1, ('forward', '+', 1)),
        (-1, ('reverse', '-', -1)),
    )
    for code, labels in strand_codes:
        if strand in labels:
            return code
    return None

# Genes (Initial Parsing)

In [3]:
# directory the processed CSV tables are written to
DATA_OUTPUT_PATH = Path('..') / 'data' / 'bitome2' / 'm_florum'
# sub-directory holding the Excel workbooks that are read as input
DATA_PATH = DATA_OUTPUT_PATH / 'msb_matteau_2020'

Do some cleanup:
- parse gene name from last column
- add 1 to all left indices to make them 1-indexed (the right index is exclusive in the original 0-indexed scheme, so it needs no adjustment: the same number is the inclusive right end in the new 1-indexed scheme)
- convert strand to int
- re-index to locus tag

In [4]:
# Load the 'Functional categories' sheet, drop the row labelled 0 and the
# RefSeq gene-name column, and switch to the notebook's column names.
# NOTE(review): row 0 is presumably a non-data (sub-header) row - confirm.
gene_df = (
    pd.read_excel(DATA_PATH / 'gene.xlsx', sheet_name='Functional categories')
    .drop(index=[0], columns=['gene name (RefSeq)'])
    .rename(columns={
        'start': 'left',
        'end': 'right',
        'gene name (RAST)': 'locus_tag',
        'KO number': 'KO',
        'sub subcategory': 'subsubcategory',
        'gene product': 'product_name',
    })
    .assign(
        # shift the left coordinate up by one; the right coordinate is kept as is
        left=lambda df: df['left'] + 1,
        # strand labels become 1 / -1 (None when unrecognised)
        strand=lambda df: df['strand'].apply(to_strand_int),
    )
)

def gene_product_to_name(prod_str):
    """Return the part of a product description that precedes its first comma.

    A description without a comma is returned whole.
    """
    name, _, _ = prod_str.partition(',')
    return name
# the gene name is the text before the first comma of the product description
gene_df = gene_df.assign(name=gene_df['product_name'].apply(gene_product_to_name))

gene_df.head()

Unnamed: 0,left,right,strand,locus_tag,KO,category,subcategory,subsubcategory,product_name,name
1,1.0,1332.0,1,peg.1,K02313,Genetic Information Processing,DNA Maintenance,DNA replication and partition,"dnaA, chromosomal replication initiator protein",dnaA
2,1560.0,2681.0,1,peg.2,K02338,Genetic Information Processing,DNA Maintenance,DNA replication and partition,"dnaN, DNA polymerase III subunit beta [EC:2.7....",dnaN
3,2708.0,3682.0,-1,peg.3,K03569,Cellular Processes,Cytoskeleton,Cytoskeleton proteins,"mreB, rod shape-determining protein MreB and r...",mreB
4,3751.0,4278.0,1,peg.4,K05985,Genetic Information Processing,"Folding, Sorting and Degradation",Nucleases,"rnmV, ribonuclease M5 [EC:3.1.26.8]",rnmV
5,4280.0,5083.0,1,peg.5,K02528,Genetic Information Processing,Translation,Ribosome biogenesis,"ksgA, 16S rRNA (adenine1518-N6/adenine1519-N6)...",ksgA


In [5]:
# read the 'mRNA_reformat' sheet of the RNA-seq workbook
rna_seq_df = pd.read_excel(DATA_PATH / 'rna_seq.xlsx', sheet_name='mRNA_reformat')
rna_seq_df.head()

Unnamed: 0,gene name (RAST),mean FPKM,SD
0,peg.1,192.793077,140.445751
1,peg.2,642.952373,463.072151
2,peg.3,2046.205311,840.497959
3,peg.4,8.156475,19.047571
4,peg.5,215.862583,184.938361


In [6]:
# switch to the notebook's column names, then discard the standard-deviation column
rna_seq_df = (
    rna_seq_df
    .rename(columns={
        'gene name (RAST)': 'locus_tag',
        'mean FPKM': 'fpkm',
        'SD': 'sd_fpkm',
    })
    .drop(columns=['sd_fpkm'])
)
rna_seq_df.head()

Unnamed: 0,locus_tag,fpkm
0,peg.1,192.793077
1,peg.2,642.952373
2,peg.3,2046.205311
3,peg.4,8.156475
4,peg.5,215.862583


In [7]:
# left join on locus_tag: every gene row is kept, with fpkm attached where available
gene_rnaseq_df = pd.merge(gene_df, rna_seq_df, how='left', on='locus_tag')
gene_rnaseq_df.head()

Unnamed: 0,left,right,strand,locus_tag,KO,category,subcategory,subsubcategory,product_name,name,fpkm
0,1.0,1332.0,1,peg.1,K02313,Genetic Information Processing,DNA Maintenance,DNA replication and partition,"dnaA, chromosomal replication initiator protein",dnaA,192.793077
1,1560.0,2681.0,1,peg.2,K02338,Genetic Information Processing,DNA Maintenance,DNA replication and partition,"dnaN, DNA polymerase III subunit beta [EC:2.7....",dnaN,642.952373
2,2708.0,3682.0,-1,peg.3,K03569,Cellular Processes,Cytoskeleton,Cytoskeleton proteins,"mreB, rod shape-determining protein MreB and r...",mreB,2046.205311
3,3751.0,4278.0,1,peg.4,K05985,Genetic Information Processing,"Folding, Sorting and Degradation",Nucleases,"rnmV, ribonuclease M5 [EC:3.1.26.8]",rnmV,8.156475
4,4280.0,5083.0,1,peg.5,K02528,Genetic Information Processing,Translation,Ribosome biogenesis,"ksgA, 16S rRNA (adenine1518-N6/adenine1519-N6)...",ksgA,215.862583


# Transcription Units

In [8]:
# columns of the 'Transcription units' sheet that are not carried into the output table
tu_unused_columns = [
    'number of genes in TU', 'genes position in TU', 'genes start',
    'genes end', 'genes length', 'genes average FPKM', 'TSS RSPM', 'manual curation notes',
    'TU length', "5'-UTR length", "3'-UTR length", 'term stem-loop free energy',
    'term score', 'term stem-loop coordinates',
]
# NOTE: 'TU length' is dropped above, so it has no entry in the rename map
# (the previous 'TU length' -> 'length' mapping could never match a column).
tu_df = (
    pd.read_excel(DATA_PATH / 'tu.xlsx', sheet_name='Transcription units')
    .drop(columns=tu_unused_columns)
    .rename(columns={
        'TU start': 'left',
        'TU end': 'right',
        'TU name': 'name',
        'TU strand': 'strand',
        'TSS coordinates': 'tss',
        'TSS type': 'tss_type',
        '-10 box coordinates': 'box_10',
        'term coordinates': 'terminator',
        'genes name (RAST)': 'gene_names',
    })
    .assign(
        # the TU name doubles as its locus tag
        locus_tag=lambda df: df['name'],
        strand=lambda df: df['strand'].apply(to_strand_int),
        # shift the left coordinate up by one; the right coordinate is kept as is
        left=lambda df: df['left'] + 1,
    )
)

def tss_to_int(tss):
    """Return, as an int, the text after the last '-' of a coordinate string.

    A string that contains no '-' is converted whole.
    """
    _, _, last_field = tss.rpartition('-')
    return int(last_field)
# replace each TSS coordinate string with the integer after its last '-'
tu_df = tu_df.assign(tss=tu_df['tss'].apply(tss_to_int))

def _coord_fields(coords):
    """Split a 'left-right' coordinate string on '-'; None when no coordinates are given.

    Both the 'ND' placeholder and a missing value (NaN / None, e.g. an empty
    spreadsheet cell) count as "no coordinates".
    """
    if pd.isna(coords) or coords == 'ND':
        return None
    return coords.split('-')


def box_10_or_term_left(coords):
    """Return the left coordinate of a 'left-right' string, shifted up by one.

    Returns None when coords is 'ND' or missing (NaN / None).
    """
    fields = _coord_fields(coords)
    if fields is None:
        return None
    # the left coordinate gets +1; the right coordinate is used unchanged
    return int(fields[0]) + 1


def box_10_or_term_right(coords):
    """Return the right coordinate of a 'left-right' string, unchanged.

    Returns None when coords is 'ND' or missing (NaN / None).
    """
    fields = _coord_fields(coords)
    if fields is None:
        return None
    return int(fields[1])
# expand each 'left-right' coordinate column into separate left/right columns,
# then discard the original string columns
for coord_col in ('box_10', 'terminator'):
    tu_df[f'{coord_col}_left'] = tu_df[coord_col].apply(box_10_or_term_left)
    tu_df[f'{coord_col}_right'] = tu_df[coord_col].apply(box_10_or_term_right)
tu_df = tu_df.drop(columns=['box_10', 'terminator'])

tu_df.head()

Unnamed: 0,left,right,name,strand,gene_names,tss,tss_type,locus_tag,box_10_left,box_10_right,terminator_left,terminator_right
0,793209,1364,TU_001,1,peg.1,793209,gTSS,TU_001,793196.0,793201.0,1308.0,1364.0
1,1542,2720,TU_002,1,peg.2,1542,gTSS,TU_002,1529.0,1534.0,2657.0,2720.0
2,2675,3701,TU_003,-1,peg.3,3701,gTSS,TU_003,3708.0,3713.0,2675.0,2734.0
3,2675,3706,TU_004,-1,peg.3,3706,gTSS,TU_004,3713.0,3718.0,2675.0,2734.0
4,3729,5130,TU_005,1,peg.4;peg.5,3729,gTSS,TU_005,3717.0,3722.0,5071.0,5130.0


In [9]:
# HACK remove wrap-around TUs, i.e. rows whose left coordinate is greater than
# their right coordinate (the first row, TU_001: left 793209, right 1364).
# Filtering on the coordinates instead of slicing off the first row keeps this
# cell idempotent: re-running it does not drop further rows.
tu_df = tu_df[~(tu_df['left'] > tu_df['right'])].copy()

In [10]:
# write the cleaned TU table to the output directory
tu_df.to_csv(DATA_OUTPUT_PATH / 'tu.csv')

# Terminators

In [11]:
# Load the 'Terms' sheet, drop the sequence column, switch to the notebook's
# column names, and derive the adjusted coordinates, strand and locus tags.
term_df = (
    pd.read_excel(DATA_PATH / 'terminator.xlsx', sheet_name='Terms')
    .drop(columns='term sequence')
    .rename(columns={
        'term start': 'left',
        'term stop': 'right',
        'stem-loop start': 'stem_loop_left',
        'stem-loop stop': 'stem_loop_right',
        'upstream gene': 'upstream_gene',
        'stem-loop free energy': 'free_energy',
        'term score': 'score',
    })
    .assign(
        # both left coordinates are shifted up by one; right coordinates are kept as is
        left=lambda df: df['left'] + 1,
        stem_loop_left=lambda df: df['stem_loop_left'] + 1,
        strand=lambda df: df['strand'].apply(to_strand_int),
        # sequential tags term_1 .. term_N in row order
        locus_tag=lambda df: [f'term_{i}' for i in range(1, df.shape[0] + 1)],
    )
)
term_df.head()

Unnamed: 0,left,right,stem_loop_left,stem_loop_right,strand,upstream_gene,free_energy,score,locus_tag
0,1308,1364,1330,1349,1,peg.1,-6.2,2.5,term_1
1,2657,2720,2690,2705,1,peg.2,-7.0,3.9,term_2
2,2675,2734,2690,2705,-1,peg.3,-7.0,3.3,term_3
3,5071,5130,5099,5115,1,peg.5,-7.0,3.1,term_4
4,9689,9744,9718,9729,1,peg.7,-4.9,3.0,term_5


In [12]:
# write the cleaned terminator table to the output directory
term_df.to_csv(DATA_OUTPUT_PATH / 'terminator.csv')

# TSSes

In [18]:
# read at most 337 rows of the 'gTSS' sheet and discard rows that are entirely empty
# NOTE(review): the 337-row cap presumably excludes non-data content further down the sheet - confirm
gtss_df = (
    pd.read_excel(DATA_PATH / 'tss.xlsx', sheet_name='gTSS', nrows=337)
    .dropna(how='all')
)

In [19]:
# preview the first five rows of the raw gTSS sheet before any column selection
gtss_df.head(n=5)

Unnamed: 0,TSS start,TSS end,TSS strand,TSS base,TSS type,-10 box coordinates,-10 box sequence,MEME p-value,MAST p-value,spacing between TSS and -10 box,TSS raw signal height,TSS RSPM,closest downstream gene start,closest downstream gene end,closest downstream gene name (RAST),closest downstream gene strand,closest downstream gene name (RefSeq),closest downstream gene product
0,1541,1542,+,G,p-gTSS,1528-1534,GATAAT,1.3e-05,8.02e-08,7,3679,528,1559,2681,peg.2,+,mfl002,DNA_polymerase_III_beta_subunit_(EC_2.7.7.7)
1,3700,3701,-,G,p-gTSS,3707-3713,TATAAT,,2.94e-06,6,1879,270,2707,3682,peg.3,-,mfl003,Rod_shape-determining_protein_MreB
2,3705,3706,-,A,p-gTSS,3712-3718,TATACT,0.0018,,6,1984,285,2707,3682,peg.3,-,mfl003,Rod_shape-determining_protein_MreB
3,3728,3729,+,A,p-gTSS,3716-3722,TATAAT,0.00019,3.43e-06,6,295,42,3750,4278,peg.4,+,mfl004,Ribonuclease_M5_(EC_3.1.26.8)
4,5266,5267,+,A,p-gTSS,5253-5259,TATAAT,0.00038,3.99e-06,7,7571,1088,5301,7209,peg.6,+,mfl006,DNA_gyrase_subunit_B_(EC_5.99.1.3)


In [20]:
# sheet column -> output column; the key order is also the column selection order
gtss_columns = {
    'TSS end': 'tss',
    'TSS strand': 'strand',
    'TSS type': 'tss_type',
    '-10 box coordinates': 'box_10',
    'MEME p-value': 'MEME_p_value',
    'TSS RSPM': 'rspm',
    'spacing between TSS and -10 box': 'tss_box_10_dist',
}

# re-read the sheet, keep only the mapped columns, and split the -10 box
# coordinate string into separate left/right columns
gtss_df = (
    pd.read_excel(DATA_PATH / 'tss.xlsx', sheet_name='gTSS', nrows=337)
    .dropna(how='all')
    .loc[:, list(gtss_columns)]
    .rename(columns=gtss_columns)
    .assign(
        strand=lambda df: df['strand'].apply(to_strand_int),
        box_10_left=lambda df: df['box_10'].apply(box_10_or_term_left),
        box_10_right=lambda df: df['box_10'].apply(box_10_or_term_right),
    )
    .drop(columns='box_10')
)
gtss_df.head()

Unnamed: 0,tss,strand,tss_type,MEME_p_value,rspm,tss_box_10_dist,box_10_left,box_10_right
0,1542,1,p-gTSS,1.3e-05,528,7,1529,1534
1,3701,-1,p-gTSS,,270,6,3708,3713
2,3706,-1,p-gTSS,0.0018,285,6,3713,3718
3,3729,1,p-gTSS,0.00019,42,6,3717,3722
4,5267,1,p-gTSS,0.00038,1088,7,5254,5259


In [21]:
# sheet column -> output column; the key order is also the column selection order
itss_columns = {
    'TSS end': 'tss',
    'TSS strand': 'strand',
    'TSS type': 'tss_type',
    '-10 box coordinates': 'box_10',
    'MEME p-value': 'MEME_p_value',
    'TSS RSPM': 'rspm',
    'spacing between TSS and -10 box': 'tss_box_10_dist',
}

# read the whole 'iTSS' sheet, keep only the mapped columns, and split the
# -10 box coordinate string into separate left/right columns
itss_df = (
    pd.read_excel(DATA_PATH / 'tss.xlsx', sheet_name='iTSS')
    .dropna(how='all')
    .loc[:, list(itss_columns)]
    .rename(columns=itss_columns)
    .assign(
        strand=lambda df: df['strand'].apply(to_strand_int),
        box_10_left=lambda df: df['box_10'].apply(box_10_or_term_left),
        box_10_right=lambda df: df['box_10'].apply(box_10_or_term_right),
    )
    .drop(columns='box_10')
)
itss_df.head()

Unnamed: 0,tss,strand,tss_type,MEME_p_value,rspm,tss_box_10_dist,box_10_left,box_10_right
0,3389,-1,p-iTSS,0.0069,64,6,3396,3401
1,11037,-1,a-iTSS,0.0069,14,6,11044,11049
2,18051,1,p-iTSS,0.0018,54,6,18039,18044
3,32967,1,p-iTSS,0.0017,15,6,32955,32960
4,33629,-1,a-iTSS,0.0051,22,7,33637,33642


In [22]:
# Stack the gTSS and iTSS tables. ignore_index=True renumbers the rows, because
# each input carries its own row labels starting at 0 and keeping them would
# leave duplicate labels in the combined table (and in the CSV written from it).
full_tss_df = pd.concat([gtss_df, itss_df], ignore_index=True)
# sequential tags TSS_1 .. TSS_N in row order (gTSS rows first, then iTSS rows)
full_tss_df['locus_tag'] = [f'TSS_{i}' for i in range(1, full_tss_df.shape[0] + 1)]
full_tss_df.head()

Unnamed: 0,tss,strand,tss_type,MEME_p_value,rspm,tss_box_10_dist,box_10_left,box_10_right,locus_tag
0,1542,1,p-gTSS,1.3e-05,528,7,1529,1534,TSS_1
1,3701,-1,p-gTSS,,270,6,3708,3713,TSS_2
2,3706,-1,p-gTSS,0.0018,285,6,3713,3718,TSS_3
3,3729,1,p-gTSS,0.00019,42,6,3717,3722,TSS_4
4,5267,1,p-gTSS,0.00038,1088,7,5254,5259,TSS_5


In [23]:
# write the combined TSS table to the output directory
full_tss_df.to_csv(DATA_OUTPUT_PATH / 'tss.csv')

# Primary TU for Genes

In [26]:
# add in a handy lookup of TU IDs to gene IDs if we have a gene_names column in TU table
# the set of known gene IDs is the same for every TU, so build it once up front
known_gene_ids = set(gene_rnaseq_df['locus_tag'])
tu_to_genes = {}
for tu_row in tu_df.itertuples():
    # dict.fromkeys drops repeated IDs while keeping the order given in gene_names,
    # so the resulting list is deterministic
    listed_gene_ids = dict.fromkeys(tu_row.gene_names.split(';'))
    tu_to_genes[tu_row.locus_tag] = [
        gene_id for gene_id in listed_gene_ids if gene_id in known_gene_ids
    ]

# also prepare the reverse lookup, gene IDs to TU IDs
gene_to_tus = {}
for tu_id, tu_gene_ids in tu_to_genes.items():
    for tu_gene_id in tu_gene_ids:
        gene_to_tus.setdefault(tu_gene_id, []).append(tu_id)

In [33]:
def strongest_tu_for_gene(gene_row):
    """Return the locus tag of the gene's TU whose TSS has the highest rspm.

    Reads the notebook-level gene_to_tus, tu_df and full_tss_df.

    Parameters
    ----------
    gene_row : row object with a ``locus_tag`` attribute (e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The ``locus_tag`` of the selected TU, or None when the gene has no TU in tu_df.
    """
    # get TUs associated with gene
    tus_for_gene = gene_to_tus.get(gene_row.locus_tag, [])
    tus_for_gene_df = tu_df[tu_df['locus_tag'].isin(tus_for_gene)]
    if tus_for_gene_df.empty:
        return None

    # get the strongest TU specifically; previous work indicates that this helps
    def tss_to_rspm(tss):
        # NOTE(review): the match is on position only, not strand - confirm that
        # no two TSSes share a position on opposite strands
        matching_rspm = full_tss_df.loc[full_tss_df['tss'] == tss, 'rspm']
        if matching_rspm.empty:
            return None
        return matching_rspm.iloc[0]

    # assign() builds a new frame, so no column is set on a filtered slice of tu_df
    ranked_tus = tus_for_gene_df.assign(
        rspm=tus_for_gene_df['tss'].apply(tss_to_rspm)
    ).sort_values(by='rspm', ascending=False)  # TUs with no matching TSS sort last

    return ranked_tus.iloc[0].locus_tag

In [34]:
# pick, row by row, the TU chosen by strongest_tu_for_gene (None when a gene has no TU)
gene_rnaseq_df = gene_rnaseq_df.assign(
    primary_tu=gene_rnaseq_df.apply(strongest_tu_for_gene, axis=1)
)
gene_rnaseq_df.head()

Unnamed: 0,left,right,strand,locus_tag,KO,category,subcategory,subsubcategory,product_name,name,fpkm,primary_tu
0,1.0,1332.0,1,peg.1,K02313,Genetic Information Processing,DNA Maintenance,DNA replication and partition,"dnaA, chromosomal replication initiator protein",dnaA,192.793077,
1,1560.0,2681.0,1,peg.2,K02338,Genetic Information Processing,DNA Maintenance,DNA replication and partition,"dnaN, DNA polymerase III subunit beta [EC:2.7....",dnaN,642.952373,TU_002
2,2708.0,3682.0,-1,peg.3,K03569,Cellular Processes,Cytoskeleton,Cytoskeleton proteins,"mreB, rod shape-determining protein MreB and r...",mreB,2046.205311,TU_004
3,3751.0,4278.0,1,peg.4,K05985,Genetic Information Processing,"Folding, Sorting and Degradation",Nucleases,"rnmV, ribonuclease M5 [EC:3.1.26.8]",rnmV,8.156475,TU_005
4,4280.0,5083.0,1,peg.5,K02528,Genetic Information Processing,Translation,Ribosome biogenesis,"ksgA, 16S rRNA (adenine1518-N6/adenine1519-N6)...",ksgA,215.862583,TU_005


In [35]:
# write the annotated gene table to the output directory
gene_rnaseq_df.to_csv(DATA_OUTPUT_PATH / 'gene.csv')