# Processing data from Mass-spectrometry experiments

In [None]:
import numpy as np
import pandas as pd

from copy import deepcopy

import matplotlib.pyplot as plt

import regseq2

regseq2.viz.matplotlib_style()

## Parse individual files

In this data we are looking for enrichment of transcription factors bound to the DNA oligos relative to control sequences. We first normalize the abundance of all proteins within a sample, and then divide the normalized abundance of each protein in a sample by its normalized abundance in the control.

In [None]:
def parse_mass_spec(file, sample_dict, gc_dict):
    """Parse one mass-spec abundance table and compute enrichment over controls.

    Column headers of interest are expected to consist of a three-character
    sample ID followed by a growth-condition code (e.g. ``"058S"``).

    Parameters
    ----------
    file : str or path-like
        Path to a ``.csv`` or ``.xlsx`` table containing the columns
        ``'Sum PEP Score'``, ``'# Peptides'``, ``'Gene Symbol'`` and one
        abundance column per sample. The experiment date is taken from the
        name of the directory holding the file (text before the first ``_``).
    sample_dict : dict
        Maps the first three characters of a column header to a sample name.
        Samples whose name contains ``'Control'`` are used as controls.
    gc_dict : dict
        Maps the remainder of the column header to a condition label.

    Returns
    -------
    out_df_list : list of pandas.DataFrame
        One wide table per (condition, control) pair. Sample columns hold the
        normalized abundance divided by that of the control; the columns
        ``'Control'`` (three-character ID of the control) and ``'date'`` are
        added.
    control_list : list of tuple
        ``(control_id, condition)`` for each table in ``out_df_list``.

    Raises
    ------
    ValueError
        If the file extension is neither ``.csv`` nor ``.xlsx``.
    KeyError
        If a selected column's condition code is missing from ``gc_dict``.
    """
    from pathlib import Path

    path = Path(file)
    suffix = path.suffix.lower()
    # some files are csv, some are excel
    if suffix == '.csv':
        df = pd.read_csv(file)
    elif suffix == '.xlsx':
        df = pd.read_excel(file)
    else:
        raise ValueError(f"Unsupported file type {suffix!r}; expected .csv or .xlsx: {file}")

    # Date of the experiment: the data directories are named "<date>[_label]".
    # Using the parent directory instead of a fixed position in the path keeps
    # this correct regardless of how deep the notebook sits in the repository.
    date = path.parent.name.split('_')[0]

    id_cols = ['Sum PEP Score', '# Peptides', 'Gene Symbol']

    # Sample columns are those whose first three characters are a known sample ID
    cols = [x for x in df.columns if isinstance(x, str) and x[:3] in sample_dict]

    # normalize each sample so that its abundances sum to one
    for col in cols:
        df[col] = df[col] / df[col].sum()

    # reshape dataframe to one row per (protein, sample column)
    df_parsed = pd.melt(df, id_vars=id_cols, value_vars=cols, var_name='sample', value_name='abundance')

    # split each header into sample name and growth condition
    headers = list(df_parsed['sample'].values)
    df_parsed['sample'] = [sample_dict[s[:3]] for s in headers]
    df_parsed['condition'] = [gc_dict[s[3:]] for s in headers]

    # sample name -> first sample ID carrying that name
    name_to_key = {}
    for key, val in sample_dict.items():
        name_to_key.setdefault(val, key)

    # split by conditions and compute enrichment against every control
    index_cols = id_cols + ['condition']
    out_df_list = []
    control_list = []
    for condition, gdf in df_parsed.groupby('condition'):
        wide = pd.pivot(gdf, index=index_cols, columns='sample', values='abundance').reset_index()

        sample_cols = list(wide.columns[len(index_cols):])
        control_cols = [col for col in sample_cols if 'Control' in col]
        filtered_cols = [col for col in sample_cols if 'Control' not in col]

        for control in control_cols:
            # Only proteins detected in the control can be used as a reference;
            # copy so the column assignments below do not act on a view.
            _df = wide[wide[control] > 0].copy()
            for col in filtered_cols:
                _df[col] = _df[col] / _df[control]

            _df = _df.drop(columns=control_cols)
            control_key = name_to_key[control]
            _df['Control'] = control_key
            _df['date'] = date

            out_df_list.append(_df.dropna(subset=['Gene Symbol']).fillna(0))
            control_list.append((control_key, condition))
    return out_df_list, control_list

Now we go through each experiment and compute enrichments. Then, all experiments are concatenated for easier evaluation later. In each experiment, the samples are named by the DNA oligo used and the condition in which the cells used to make the lysate were grown.

In [None]:
# Collects the per-control enrichment tables of every experiment parsed below.
df_list = []

file = "../../../data/mass-spec/20240905_salt_stat-phase/HJ_20240905_OTE_AurTS25_TMTpro_1.csv"

# three-character sample ID (start of the column header) -> oligo name
sample_dict = {
    "058": "Control",
    "208": "yjbJ",
    "210": "yadE",
    "212": "yadI",
    "214": "ybaY",
    "216": "yagB",
    "218": "yqjE"
}

# condition code (rest of the column header) -> growth condition
gc_dict = {
    "S": 'stationary',
    "N": 'salt'
}

# parse_mass_spec reads the file itself, so no separate read is needed here
out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)
print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
sample_dict = {
    "058": "Control",
    "208": "yjbJ",
    "210": "yadE",
    "212": "yadI",
    "214": "ybaY",
    "216": "yagB",
    "218": "yqjE"
}

gc_dict = {
    "SA": 'stationary, 37C',
    "NA": 'salt, 37C'
}

out_df_list, control_list = parse_mass_spec("../../../data/mass-spec/20240905_salt_stat-phase/HJ_20240905_OTE_AurTS25_TMTpro_2.csv", sample_dict, gc_dict)
print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241031/HJ_20241031_OTE_AurTS25_sc90min_1.xlsx")
df.columns

In [None]:
sample_dict = {
    "058": "Control",
    "496": "mglB_1",
    "241": "araB_2",
    "839": "Control_long",
    "837": "Control_medium",
    "835": "Control_short",
}

gc_dict = {
    "C": 'cAMP, 4C',
    "CH": 'cAMP, 37C',
    "H": 'glucose, 37C',
    "G": 'glucose, 4C'
}

out_df_list, control_list = parse_mass_spec("../../../data/mass-spec/20241031/HJ_20241031_OTE_AurTS25_sc90min_1.xlsx", sample_dict, gc_dict)
print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241031/HJ_20241031_OTE_AurTS25_sc90min_2.xlsx")
df.columns

In [None]:
sample_dict = {
    "058": "Control",
    "054": "lacI_Oid",
    "208": "yjbJ",
    "216": "yagB",
}

gc_dict = {
    'S': 'stationary',
    'N': 'salt',
    'G': 'glucose',
    'SA': 'stationary, 37C',
    'NA': 'salt, 37C',
    'GA': 'glucose, 37C'
}

file = "../../../data/mass-spec/20241031/HJ_20241031_OTE_AurTS25_sc90min_2.xlsx"
out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)
print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
sample_dict = {
    "058": "Control",
    "054": "lacI_Oid",
    "208": "yjbJ",
    "216": "yagB",
}

gc_dict = {
    'S': 'stationary, 4C',
    'N': 'salt, 4C',
    'G': 'glucose, 4C',
    'SA': 'stationary, 37C',
    'NA': 'salt, 37C',
    'GA': 'glucose, 37C'
}

file = "../../../data/mass-spec/20241105/HJ_20241031_OTE_AurTS25_sc90min_2.xlsx"
out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)
print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241125/HJ_20241125_OTE_AurTS25_set1_proteins.xlsx")
df.columns

In [None]:
sample_dict = {
    "208": "yjbJ",
    "216": "yagB",
    '212': 'yadI',
    '233': 'gyrA',
    '829': 'acrZ',
    '835': 'Control',
    '837': 'Control_2',
    '839': 'Control_3',
    '174': 'mhpRp2',
    '176': 'ygiW_1',
    '178': 'ygiW_2',
    '180': 'sohA',
    '186': 'elaB',
    '188': 'rcsBp2',
    '192': 'galEp1',
    '196': 'tisB'
}

gc_dict = {
    'S': 'stationary',
}

file = "../../../data/mass-spec/20241125/HJ_20241125_OTE_AurTS25_set1_proteins.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)
print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241125/HJ_20241125_OTE_AurTS25_set2_proteins.xlsx")
df.columns

In [None]:
sample_dict = {
    "975": "pheMp",
    "977": "furpb",
    '979': 'flda',
    '981': 'ihfAp',
    '983': 'ompRp3',
    '985': 'ecnb',
    '098': 'yqaE',
    '208': 'yjbJ',
    '214': 'ybaY',
    '216': 'yagB',
    '837': 'Control'
}

gc_dict = {
    'P': 'phenazine',
    'N1': 'salt',
    'N2': 'salt2'
}

file = "../../../data/mass-spec/20241125/HJ_20241125_OTE_AurTS25_set2_proteins.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241216/HJ_20241216_OTE_AurTS25_setR.xlsx")
df.columns

In [None]:
sample_dict = {
    "098": "yqaE",
    "174": "mhpRp2",
    '176': 'ygiW_1',
    '178': 'ygiW_2',
    '180': 'sohA',
    '182': 'kbpp4',
    '186': 'elaB',
    '188': 'rcsBp2',
    '190': 'flda',
    '192': 'galEp1',
    '194': 'ompRp3',
    '196': 'tisB',
    '233': 'gyrA',
    '829': 'acrZ',
    '837': 'Control',
    '839': 'Control2'
}

gc_dict = {
    'S': 'stationary',
}

file = "../../../data/mass-spec/20241216/HJ_20241216_OTE_AurTS25_setR.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241216/HJ_20241216_OTE_AurTS25_setB.xlsx")
df.columns

In [None]:
# three-character sample ID (start of the column header) -> oligo name
# NOTE(review): several IDs in this cell disagree with the other cells of this
# notebook: '843' is 'aceBp', '851' is 'yqjE_2' and '192' is 'galEp1'
# everywhere else, 'gyrA' is otherwise '233', '975' is 'pheMp' in the
# 20241125 and 20250303 cells, and '839' is otherwise 'Control_long' (with
# '835' being 'Control_short'). Confirm against the sample sheet for this run.
sample_dict = {
    "975": "yqaE",
    "977": "furpb",
    '979': 'flda',
    '981': 'ihfAp',
    '983': 'ompRp3',
    '985': 'ecnb',
    '829': 'acrZ',
    '831': 'acrZ_2',
    '833': 'marRp',
    '841': 'dicCp',
    '843': 'yqjE_2',
    '851': 'galEp1',
    '192': 'gyrA',
    '837': 'Control_medium',
    '839': 'Control_short'
}


gc_dict = {
    'P': 'phenazine',
    'D': 'dipyridyl',
}

file = "../../../data/mass-spec/20241216/HJ_20241216_OTE_AurTS25_setB.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241221/HJ_20241221_OTE_AurTS25_setR.xlsx")
df.columns

In [None]:
sample_dict = {
    "231": "yahC",
    "233": "gyrA",
    '839': 'Control',
    '192': 'galEp1',
    '481': 'rcsBp2',
    '829': 'acrZ',
    '831': 'acrZ_2',
    '833': 'marRp',
    '841': 'dicCp',
    '843': 'aceBp',
    '851': 'yqjE_2',
    '182': 'kbpp4',
    '849': 'rcsDp',
    '853': 'araCp'
}


gc_dict = {
    'A': 'arabinose',
    'D': 'dipyridyl',
    'X': 'xylose',
    
}

file = "../../../data/mass-spec/20241221/HJ_20241221_OTE_AurTS25_setR.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20241221/HJ_20241221_OTE_AurTS25_setB.xlsx")
df.columns

In [None]:
sample_dict = {
    "235": "TSS_1414_1",
    "237": "TSS_1414_2",
    '239': 'araB_1',
    '241': 'araB_2',
    '484': 'tmaR_1',
    '489': 'tmaR_2',
    '496': 'mglB_1',
    '492': 'tmaR_3',
    '498': 'rspA_1',
    '500': 'rspA_2',
    '186': 'elaB',
    '214': 'ybaY',
    '855': 'mglB_2',
    '837': 'Control_medium',
    '839': 'Control_long',
    '981': 'ihfAp'
}


gc_dict = {
    'A': 'arabinose',
    'G': 'glucose',
    'D': 'dipyridyl',
    'X': 'Xylose'
}

file = "../../../data/mass-spec/20241221/HJ_20241221_OTE_AurTS25_setB.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250113_analysis/HJ_20250112_OTE_AurTS25_setR.xlsx")
df.columns

In [None]:
# three-character sample ID (start of the column header) -> oligo name
sample_dict = {
    '738': 'cpxR',
    "740": "intE",
    "742": "yadE2",
    '744': 'ybeDp',
    '746': 'ompRp2',
    '748': 'acrA',
    '839': 'Control_long',
    '186': 'elaB',
    '214': 'ybaY',
    '837': 'Control_medium',
    '855': 'mglB_2',
    '981': 'ihfAp',
    '182': 'kbpp4',
    '849': 'rcsDp',
    '853': 'araCp',
}


gc_dict = {
    'J': 'gentamicin',
    'G': 'glucose',
    'X': 'Xylose'
}

file = "../../../data/mass-spec/20250113_analysis/HJ_20250112_OTE_AurTS25_setR.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250113_analysis/HJ_20250112_OTE_AurTS25_setB.xlsx")
df.columns

In [None]:
sample_dict = {
    '231': 'yahC',
    "233": "gyrA",
    "235": "TSS_1414_1",
    '237': 'TSS_1414_2',
    '239': 'araB_1',
    '241': 'araB_2',
    '481': 'rcsBp2',
    '484': 'tmaR_1',
    '489': 'tmaR_2',
    '492': 'tmaR_3',
    '496': 'mglB_1',
    '498': 'rspA_1',
    '500': 'rspA_2',
    '839': 'Control_long',
    '837': 'Control_medium',
    '208': 'yjbJ'
}


gc_dict = {
    'S': 'stationary',
    'A': 'arabinose',
}

file = "../../../data/mass-spec/20250113_analysis/HJ_20250112_OTE_AurTS25_setB.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250124_analysis/HJ_20250122_OTE_AurTS25_setB.xlsx")
df.columns

In [None]:
# three-character sample ID (start of the column header) -> oligo name
# NOTE(review): '975' is 'pheMp' in the 20241125 and 20250303 cells and
# 'elaB' is '186' elsewhere in this notebook -- confirm the '975' entry.
sample_dict = {
    '738': 'cpxR',
    "740": "intE",
    "742": "yadE2",
    '744': 'ybeDp',
    '746': 'ompRp2',
    '748': 'acrA',
    '839': 'Control_long',
    '975': 'elaB',
    '977': 'furpb',
    '979': 'flda',
    '981': 'ihfAp',
    '983': 'ompRp3',
    '985': 'ecnb',
    '212': 'yadI',
}


gc_dict = {
    'S': 'stationary',
    'J': 'gentamicin',
    'P': 'phenazine'
}

file = "../../../data/mass-spec/20250124_analysis/HJ_20250122_OTE_AurTS25_setB.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250124_analysis/HJ_20250122_OTE_AurTS25_setR.xlsx")
df.columns

In [None]:
sample_dict = {
    '875': 'uofp',
    "878": "lpp1",
    "882": "lpp2",
    '885': 'blr',
    '889': 'ybiY1',
    '892': 'ybiY2',
    '895': 'ybiY3',
    '898': 'araCp_2',
    '905': 'zapB1',
    '083': 'zapB2',
    '085': 'zapB3',
    '837': 'Control_medium',
    '839': 'Control_long',
    '087': 'ompRp2',
    '089': 'acrA',
}


gc_dict = {
    'G': 'glucose',
    'S': 'stationary',
}

file = "../../../data/mass-spec/20250124_analysis/HJ_20250122_OTE_AurTS25_setR.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250213_analysis/HJ_20250210_OTE_AurTS25_setB.xlsx")
df.columns

In [None]:
sample_dict = {
    '853': 'araCp',
    "849": "rcsDp",
    "839": "Control_long",
    '182': 'kbpp4',
    '837': 'Control_medium',
    '218': 'yqjE',
    '214': 'ybaY',
    '212': 'yadI',
    '210': 'yadE',
    '208': 'yjbJ',
    '098': 'yqaE',
}


gc_dict = {
    'X': 'xylose',
    'S': 'stationary',
    'N': 'salt'
}

file = "../../../data/mass-spec/20250213_analysis/HJ_20250210_OTE_AurTS25_setB.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250213_analysis/HJ_20250210_OTE_AurTS25_setR.xlsx")
df.columns

In [None]:
sample_dict = {
    '905': 'zapB1',
    "981": "ihfAp",
    "839": "Control_long",
    '898': 'araCp_2',
    '895': 'ybiY3',
    '892': 'ybiY2',
    '889': 'ybiY1',
    '885': 'blr',
    '882': 'lpp2',
    '878': 'lpp1',
    '875': 'uofp',
    '855': 'mglB_2',
    '214': 'ybaY',
    '186': 'elaB',
    '085': 'zapB3',
    '083': 'zapB2'
}


gc_dict = {
    'G': 'glucose',
    'X': 'xylose'
}

file = "../../../data/mass-spec/20250213_analysis/HJ_20250210_OTE_AurTS25_setR.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250303_analysis/HJ_20250303_OTE_AurTS25_set1.xlsx")
df.columns

In [None]:
sample_dict = {
    '851': 'yqjE_2',
    "843": "aceBp",
    "841": "dicCp",
    '839': 'Control_long',
    '833': 'marRp',
    '831': 'acrZ_2',
    '829': 'acrZ',
    '192': 'galEp1',
    '985': 'ecnb',
    '983': 'ompRp3',
    '981': 'ihfAp',
    '979': 'flda',
    '977': 'furpb',
    '975': 'pheMp',
    '837': 'Control_medium',
}


gc_dict = {
    'D': 'dipyridyl',
    'P': 'phenazine'
}

file = "../../../data/mass-spec/20250303_analysis/HJ_20250303_OTE_AurTS25_set1.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250303_analysis/HJ_20250303_OTE_AurTS25_set2.xlsx")
df.columns

In [None]:
sample_dict = {
    '188': 'rcsBp2',
    "186": "elaB",
    "176": "ygiW_1",
    '174': 'mhpRp2',
    '098': 'yqaE',
    '089': 'acrA',
    '087': 'ompRp2',
    '837': 'Control_medium',
    '218': 'yqjE',
    '214': 'ybaY',
    '210': 'yadE',
    '839': 'Control_long',
    '194': 'ompRp3',
    '190': 'fldA',
}


gc_dict = {
    '$': 'stationary, high vol',
    'N': 'salt'
}

file = "../../../data/mass-spec/20250303_analysis/HJ_20250303_OTE_AurTS25_set2.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250307/HJ_20250325_OTE_AurTS25_0307.xlsx")
df.columns

In [None]:
sample_dict = {
    '839': 'Control_long',
    "738": "cpxR",
    "655": "cusC2",
    '653': 'cusC',
    '837': 'Control_medium',
    '196': 'tisB',
    '186': 'elaB',
    '985': 'ecnb',
    '651': 'yagB2',
    '216': 'yagB',
}


gc_dict = {
    'P': 'H2O2',
    'H': 'heatshock',
    'C1': 'copper1',
    'C2': 'copper2',
    'L': 'Leucine'
}

file = "../../../data/mass-spec/20250307/HJ_20250325_OTE_AurTS25_0307.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

In [None]:
df = pd.read_excel("../../../data/mass-spec/20250326/HJ_20250325_OTE_AurTS25_0313.xlsx")
df.columns

In [None]:
sample_dict = {
    '839': 'Control_long',
    "738": "cpxR",
    "657": "icdC",
    '212': 'yadI',
    '837': 'Control_medium',
    '196': 'tisB',
    '186': 'elaB',
    '985': 'ecnb',
    '651': 'yagB2',
    '216': 'yagB',
}


gc_dict = {
    'P': 'H2O2',
    'H': 'heatshock',
    'LB1': 'LB1',
    'LB2': 'LB2',
    'Le': 'Leucine'
}

file = "../../../data/mass-spec/20250326/HJ_20250325_OTE_AurTS25_0313.xlsx"

out_df_list, control_list = parse_mass_spec(file, sample_dict, gc_dict)

print(control_list)
display(out_df_list[0].head(5))
df_list.extend(out_df_list)

## Combine all the data sets

Now we can combine all datasets into a single dataframe.

In [None]:
# Columns that identify a measurement; every remaining column of a
# per-control table is one sample's enrichment.
id_vars = ['Sum PEP Score', '# Peptides', 'Gene Symbol', 'condition', 'Control', 'date']

# Name the melted column explicitly: the later filters select on
# df_combined['sample'], which otherwise only exists if each table still
# carries 'sample' as the name of its column index.
df_combined = pd.concat(
    [pd.melt(df, id_vars=id_vars, var_name='sample', value_name='enrichment') for df in df_list]
)

In [None]:
# reset_index returns a new frame; assign it so that df_combined really gets
# a unique 0..n-1 index instead of the repeated per-table indices from concat.
df_combined = df_combined.reset_index(drop=True)
df_combined

## Filter for DNA binding proteins

In some samples we find enrichment for proteins that are not DNA binding proteins. We use RegulonDB to get a list of all transcription factors.

In [None]:
from bson import decode_all

# Read the raw BSON dump. The handle is deliberately not called `file`, so the
# `file` path variable used by the parsing cells is not replaced by a closed
# file object.
with open("../../../data/metadata/regulonDatamart.bson", "rb") as bson_file:
    data = bson_file.read()

# Decode all BSON documents into a list of dictionaries
documents = decode_all(data)
df = pd.DataFrame(documents)

# Each entry of the 'regulator' column is a dict; 'synonyms' is optional and
# falls back to a list holding one empty string.
regulators = df['regulator'].values
name_list = [reg['abbreviatedName'] for reg in regulators]
syn_list = [reg['synonyms'] if 'synonyms' in reg.keys() else [''] for reg in regulators]
type_list = [reg['type'] for reg in regulators]

df_TF = pd.DataFrame({"name": name_list, 'synonyms': syn_list, 'type': type_list})

## Plot specific samples

Using the dataframe, we can make enrichment plots for specific examples shown in the paper.

In [None]:
def plot_mass_spec(df, sample, filter_binding_proteins=False, highlighted_genes=None, filter_control=None):
    """Plot the enrichment of all proteins for one sample, one panel per experiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with the columns ``'sample'``, ``'Gene Symbol'``,
        ``'Control'``, ``'condition'``, ``'date'`` and ``'enrichment'``.
    sample : str
        Value of the ``'sample'`` column to plot.
    filter_binding_proteins : bool, default False
        If True, keep only proteins whose lower-cased gene symbol matches a
        name or synonym in the module-level ``df_TF`` table.
    highlighted_genes : list of str, optional
        Gene symbols drawn again on top of the other points (each in the next
        color of the cycle). Defaults to no highlighting.
    filter_control : str, optional
        If given, keep only rows whose ``'Control'`` value equals it.

    Returns
    -------
    fig : matplotlib.figure.Figure
    ax : sequence of matplotlib.axes.Axes
        One axis per (condition, date) group.
    df_filtered : pandas.DataFrame
        The rows that were plotted, sorted by decreasing enrichment.

    Raises
    ------
    ValueError
        If no rows are left after filtering.
    """
    # Avoid a shared mutable default argument
    if highlighted_genes is None:
        highlighted_genes = []

    df_filtered = df[df['sample'] == sample]

    if filter_binding_proteins:
        # Create a set of lowercased binding protein names from df_TF
        binding_names = set(df_TF['name'].str.lower())
        binding_names |= {syn.lower() for synonyms in df_TF['synonyms'] for syn in synonyms}

        # Filter df_filtered by checking if the lower-case gene symbol is in the set
        df_filtered = df_filtered[df_filtered['Gene Symbol'].str.lower().isin(binding_names)]

    if filter_control is not None:
        df_filtered = df_filtered[df_filtered['Control'] == filter_control]

    gdf = df_filtered.groupby(['condition', 'date'])
    n_conditions = len(gdf)
    if n_conditions == 0:
        # Fail with a clear message instead of matplotlib's complaint about
        # a grid with zero columns.
        raise ValueError(
            f"No data left to plot for sample {sample!r} "
            f"(filter_control={filter_control!r}, "
            f"filter_binding_proteins={filter_binding_proteins!r})."
        )

    fig, ax = plt.subplots(1, n_conditions, figsize=(1*n_conditions, 2), sharey=True)
    if n_conditions == 1:
        # plt.subplots returns a bare Axes for a single panel
        ax = [ax]

    for i, ((condition, date), _df) in enumerate(gdf):
        enrichment = _df['enrichment'].values
        # random horizontal jitter so that points do not overlap
        x = np.random.randn(len(enrichment))
        ax[i].scatter(x, enrichment, s=12, linewidth=0)
        for gene in highlighted_genes:
            ind = np.where(_df['Gene Symbol'] == gene)
            ax[i].scatter(x[ind], enrichment[ind], s=12)
        ax[i].set_yscale('log')
        ax[i].set_title(condition+'\n'+date)
        ax[i].set_xticks([])
        ax[i].grid(True)

    ax[0].set_ylabel("enrichment")
    plt.tight_layout()

    return fig, ax, df_filtered.sort_values('enrichment', ascending=False)
        

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ihfAp', filter_binding_proteins=False, highlighted_genes=['ihfA', 'ihfB'], filter_control='837')
fig.savefig("ihfAp_mass-spec.pdf")
# Keep the table as the last expression so that the notebook displays it
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'dicCp', filter_binding_proteins=True, highlighted_genes=['arcA', 'ygbI'], filter_control='839')
fig.savefig("dicCp_mass-spec.pdf")
# Keep the table as the last expression so that the notebook displays it
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'mhpRp2', filter_binding_proteins=True, highlighted_genes=['arcA', 'galS'], filter_control='837')
fig.savefig("mhpRp2_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ybiY1', filter_binding_proteins=True, highlighted_genes=['yciT'])#, filter_control='837')
fig.savefig("ybiY1_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ybiY2', filter_binding_proteins=True, highlighted_genes=['yciT'])#, filter_control='837')
fig.savefig("ybiY2_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ybiY3', filter_binding_proteins=True, highlighted_genes=['yciT'])#, filter_control='837')
fig.savefig("ybiY3_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'gyrA', filter_binding_proteins=True, highlighted_genes=['ygbI', "galR"], filter_control='839')
fig.savefig("gyrA_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'sohA', highlighted_genes=['prlF'], filter_control='839')
fig.savefig("sohA_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'TSS_1414_2', filter_binding_proteins=True, highlighted_genes=['ihfA', 'ihfB'])#, filter_control='837')
#fig.savefig("yahM_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined[['stationary' in x for x in df_combined['condition']]], 'yagB', filter_binding_proteins=True, highlighted_genes=['xynR'])#, filter_control='837')
fig.savefig("yagB_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'cpxR', filter_binding_proteins=True, highlighted_genes=['cpxR'])#, filter_control='837')
fig.savefig("cpxR_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'TSS_1414_2', filter_binding_proteins=False, highlighted_genes=['ihfA', 'ihfB'])#, filter_control='837')
fig.savefig("yahM_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'intE', filter_binding_proteins=True, highlighted_genes=['yhaJ'])#, filter_control='837')
fig.savefig("ymfH_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined[df_combined['condition'] == 'stationary'], 'galEp1', filter_binding_proteins=True, highlighted_genes=['galR'], filter_control='837')
fig.savefig("galE_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'cusC', filter_binding_proteins=True)#, highlighted_genes=['galR'], filter_control='837')
fig.savefig("cusC_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'elaB', filter_binding_proteins=True)#, filter_control='837')
fig.savefig("elaB_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ybaY', filter_binding_proteins=True)#, filter_control='837')
fig.savefig("ybaY_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ygiW_2', filter_binding_proteins=True, highlighted_genes=['tyrR'], filter_control='839')
fig.savefig("ygiW_mass-spec.pdf")
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'yqjE_2', filter_binding_proteins=True)#), highlighted_genes=['tyrR'], filter_control='839')
#fig.savefig("ygiW_mass-spec.pdf")
df.head(20)

In [None]:
df_comp = pd.read_csv("../../../data/sequence_scanning/arcA_dicCp.csv", comment='#')
# The part of 'Target_ID' before the first underscore is used as the TF name
df_comp['TF'] = [x.split('_')[0] for x in df_comp['Target_ID']]
display(df_comp.head())

fig, ax = plt.subplots(figsize=(2, 4))
# random horizontal jitter, drawn once and used for the scatter
x = np.random.randn(len(df_comp))
ax.scatter(x, np.log10(df_comp['p-value'].values), s=15)
ax.set_xticks([])
fig.savefig("dicCp_tomtom.pdf")

In [None]:
df_comp = pd.read_csv("../../../data/sequence_scanning/arcA_mhpRp2.csv", comment='#')
# The part of 'Target_ID' before the first underscore is used as the TF name
df_comp['TF'] = [x.split('_')[0] for x in df_comp['Target_ID']]
display(df_comp.head())

fig, ax = plt.subplots(figsize=(2, 4))
# random horizontal jitter, drawn once and used for the scatter
x = np.random.randn(len(df_comp))
ax.scatter(x, np.log10(df_comp['p-value'].values), s=15)
ax.set_xticks([])
fig.savefig("mhpRp2_tomtom.pdf")

In [None]:
df_comp = pd.read_csv("../../../data/sequence_scanning/CRP_mhpRp2.csv", comment='#')
# The part of 'Target_ID' before the first underscore is used as the TF name
df_comp['TF'] = [x.split('_')[0] for x in df_comp['Target_ID']]
display(df_comp.head())

fig, ax = plt.subplots(figsize=(2, 4))
# random horizontal jitter, drawn once and used for the scatter
x = np.random.randn(len(df_comp))
ax.scatter(x, np.log10(df_comp['p-value'].values), s=15)
ax.set_xticks([])
fig.savefig("mhpRp2_CRP_tomtom.pdf")

In [None]:
# NOTE(review): `fig` is whatever figure was created last. When the notebook is
# run top to bottom that is the CRP/mhpRp2 tomtom scatter from the previous
# cell, so this overwrites the "mhpRp2_mass-spec.pdf" written by the mhpRp2
# enrichment cell further up. Confirm this cell is still wanted.
fig.savefig("mhpRp2_mass-spec.pdf")

In [None]:
# NOTE(review): saves whatever `fig` currently refers to; when run top to
# bottom that is not a dicCp enrichment plot. The dicCp cell further up saves
# to "dicCp_mass-spec.pdf" (with a "p"). Confirm this cell is still wanted.
fig.savefig("dicC_mass-spec.pdf")

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'araB_1', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'elaB', filter_binding_proteins=True)
df[df['condition'] == 'H2O2']

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'cpxR', filter_binding_proteins=True)
df.head(20)

In [None]:
# Assign the sorted table; previously the result was discarded and the
# df.head(20) below showed the frame left over from the preceding cell.
df = df_combined[df_combined['sample'] == 'cpxR'].sort_values('enrichment', ascending=False)
df.head(20)

In [None]:
_, _, df = plot_mass_spec(df_combined, 'araB_1', filter_binding_proteins=True)
gdf = df.groupby(['condition', 'date'])
# One row of the figure per (condition, date) group
n_conditions = len(gdf)

fig, ax = plt.subplots(n_conditions, 1, figsize=(2, 0.75*n_conditions), sharex=True)
if n_conditions == 1:
    # plt.subplots returns a bare Axes for a single panel
    ax = [ax]

for i, ((condition, date), _df) in enumerate(gdf):
    # random vertical jitter so that points do not overlap
    y = np.random.randn(len(_df['enrichment']))
    ax[i].scatter(_df['enrichment'].values, y, s=10)#, s=_df['Sum PEP Score'].values**(1/4)*3)
    # Highlight araC only in the groups where it was detected
    if 'araC' in _df['Gene Symbol'].values:
        ind = np.where(_df['Gene Symbol'] == 'araC')[0][0]
        ax[i].scatter([_df['enrichment'].values[ind]], [y[ind]], s=10)#, s=_df['Sum PEP Score'].values[ind]**(1/4)*3)

    ax[i].set_yticks([])
    ax[i].grid(True)
plt.tight_layout()
# Label the bottom panel, whatever the number of groups
ax[-1].set_xlabel('enrichment')
fig.savefig('araB_1_mass-spec.pdf')
    

In [None]:
_, _, df = plot_mass_spec(df_combined, 'tisB', filter_binding_proteins=True)
# Keep only proteins with a 'Sum PEP Score' above 5
df = df[df['Sum PEP Score'] > 5]

gdf = df.groupby(['condition', 'date'])
n_conditions = len(gdf)
fig, ax = plt.subplots(n_conditions, 1, figsize=(1, 0.2*n_conditions), sharex=True)
if n_conditions == 1:
    # plt.subplots returns a bare Axes for a single panel
    ax = [ax]

for i, ((condition, date), _df) in enumerate(gdf):
    # random vertical jitter so that points do not overlap
    y = np.random.randn(len(_df['enrichment']))
    ax[i].scatter(_df['enrichment'].values, y, s=2)#, s=_df['Sum PEP Score'].values**(1/4)*3)
    # Highlight lexA only in the groups where it was detected
    if 'lexA' in _df['Gene Symbol'].values:
        ind = np.where(_df['Gene Symbol'] == 'lexA')[0][0]
        ax[i].scatter([_df['enrichment'].values[ind]], [y[ind]], s=2)#, s=_df['Sum PEP Score'].values[ind]**(1/4)*3)

    ax[i].set_yticks([])
    ax[i].grid(True)
plt.tight_layout()
# Label the bottom panel; a fixed index fails with fewer than four groups
ax[-1].set_xlabel('enrichment')
fig.savefig('tisB_mass-spec.pdf')
    

In [None]:
_, _, df = plot_mass_spec(df_combined, 'tisB', filter_binding_proteins=True)
# Keep only proteins with a 'Sum PEP Score' above 5
df = df[df['Sum PEP Score'] > 5]

gdf = df.groupby(['condition', 'date'])
n_conditions = len(gdf)
fig, ax = plt.subplots(n_conditions, 1, figsize=(1, 0.2*n_conditions), sharex=True)
if n_conditions == 1:
    # plt.subplots returns a bare Axes for a single panel
    ax = [ax]

for i, ((condition, date), _df) in enumerate(gdf):
    # random vertical jitter so that points do not overlap
    y = np.random.randn(len(_df['enrichment']))
    ax[i].scatter(_df['enrichment'].values, y, s=2)#, s=_df['Sum PEP Score'].values**(1/4)*3)
    # Highlight lexA only in the groups where it was detected
    if 'lexA' in _df['Gene Symbol'].values:
        ind = np.where(_df['Gene Symbol'] == 'lexA')[0][0]
        ax[i].scatter([_df['enrichment'].values[ind]], [y[ind]], s=2)#, s=_df['Sum PEP Score'].values[ind]**(1/4)*3)

    ax[i].set_yticks([])
    ax[i].grid(True)
plt.tight_layout()
# Label the bottom panel; a fixed index fails with fewer than four groups
ax[-1].set_xlabel('enrichment')
fig.savefig('tisB_mass-spec.pdf')
    

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'tisB', filter_binding_proteins=True)
df[df['Sum PEP Score'] > 5]

In [None]:
_, _, df = plot_mass_spec(df_combined, 'tisB', filter_binding_proteins=True)
df = df[df['Sum PEP Score'] > 5]

gdf = df.groupby(['condition', 'date'])

n_conditions = len(gdf)
fig, ax = plt.subplots(1, n_conditions, figsize=(0.75*n_conditions, 2), sharey=True)
if n_conditions == 1:
    ax = [ax]
        
for i, ((condition, date), _df) in enumerate(gdf):
    y = np.random.randn(len(_df['enrichment']))
    ax[i].scatter(y, _df['enrichment'].values, s=10)#, s=_df['Sum PEP Score'].values**(1/4)*3)
    if 'lexA' in _df['Gene Symbol'].values:
        ind = np.where(_df['Gene Symbol'] == 'lexA')[0][0]
        ax[i].scatter([y[ind]], [_df['enrichment'].values[ind]], s=10)#, s=_df['Sum PEP Score'].values[ind]**(1/4)*3)
    
    #ax[i].set_xlabel('enrichment')
    ax[i].set_xticks([])
    ax[i].grid(True)
plt.tight_layout()
ax[0].set_ylabel('enrichment')
fig.savefig('tisB_mass-spec.pdf')   
    

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'cusC')
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'yagB', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'yagB2', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'intE', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ybiY1', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'yjbJ', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'yqjE', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'yadI', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'yadE', filter_binding_proteins=True)
df.head(20)

In [None]:
fig, ax, df = plot_mass_spec(df_combined, 'ygiW_2', filter_binding_proteins=True)
df.head(20)

In [None]:
df[df['Gene Symbol']=='sucB']

In [None]:
df_comp = pd.read_csv("../../../data/cases/yadI_predicted_-45_-20.tsv", sep='\t', comment='#')
df_comp['TF'] = [x.split('_')[0] for x in df_comp['Target_ID']]
display(df_comp.head())

fig, ax = plt.subplots(figsize=(2, 4))
x = np.random.randn(len(df_comp))

inds = np.where(df_comp['TF'] == 'CRP')[0][0:6]
ax.scatter(x, np.log10(df_comp['p-value'].values ), s=15)
ax.scatter(x[inds], np.log10(df_comp['p-value'].values )[inds], s=15)
ax.set_ylabel("log10 p-value")
ax.set_xticks([])
fig.tight_layout()
fig.savefig("yadI_CRP_tomtom.pdf")