# Create_RegulonDB_TRN

In [1]:
import pandas as pd

# Load the RegulonDB regulator -> gene network. The file is tab-separated
# (read_table's default delimiter); '#' starts a comment that the parser skips.
RegulonDB_TRN = pd.read_table(
    '../Data/RegulonDB_v13_08042024/NetworkRegulatorGene.tsv',
    comment='#',
)

In [2]:
# Preview the regulator -> gene table exactly as it was loaded.
RegulonDB_TRN

Unnamed: 0,1)regulatorId,2)regulatorName,3)RegulatorGeneName,4)regulatedId,5)regulatedName,6)function,7)confidenceLevel
0,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00006,hisM,-,W
1,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00030,agp,+,W
2,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00039,cysQ,+,W
3,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00065,argI,+,W
4,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00067,argS,-,W
...,...,...,...,...,...,...,...
7122,RDBECOLITFC00238,FimZ,fimZ,RDBECOLIGNC03170,sfmC,+,S
7123,RDBECOLITFC00238,FimZ,fimZ,RDBECOLIGNC03507,puuD,+,W
7124,RDBECOLITFC00239,DicD,dicD,RDBECOLIGNC00220,dicA,-,W
7125,RDBECOLITFC00239,DicD,dicD,RDBECOLIGNC00221,dicB,-,W


In [3]:
# The confidence column's header in the source file ends with a space
# ('7)confidenceLevel '); give it a clean label for the cells that follow.
RegulonDB_TRN = RegulonDB_TRN.rename(
    columns={'7)confidenceLevel ': 'confidenceLevel'}
)

In [4]:
# Tally the confidence labels. value_counts compares the raw strings, so
# whitespace-padded variants of the same letter are counted as separate values.
RegulonDB_TRN['confidenceLevel'].value_counts()

S     3011
W     2744
C     1366
?        5
W        1
Name: confidenceLevel, dtype: int64

In [5]:
# Keep only the interactions whose confidence level is 'S' or 'C'.
# The raw labels are padded inconsistently (the recorded value_counts output
# lists 'W' twice, i.e. with and without a trailing space), so strip whitespace
# before comparing instead of matching the padded literals 'S ' / 'C '.
# .copy() gives an independent frame, matching how the sigma table is filtered.
RegulonDB_TRN_confident = RegulonDB_TRN[
    RegulonDB_TRN['confidenceLevel'].str.strip().isin(['S', 'C'])
].copy()

In [6]:
# Preview the rows that passed the confidence filter.
RegulonDB_TRN_confident

Unnamed: 0,1)regulatorId,2)regulatorName,3)RegulatorGeneName,4)regulatedId,5)regulatedName,6)function,confidenceLevel
309,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC00341,ftsZ,-,S
310,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC00559,manX,-,S
311,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC00793,pykA,-,S
312,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC02444,xylR,-,S
313,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00450,hns,-,S
...,...,...,...,...,...,...,...
7116,RDBECOLITFC00237,PtrR,ptrR,RDBECOLIGNC03662,yneG,+,S
7117,RDBECOLITFC00237,PtrR,ptrR,RDBECOLIGNC03663,glsB,+,S
7118,RDBECOLITFC00237,PtrR,ptrR,RDBECOLIGNC03664,sad,+,S
7121,RDBECOLITFC00238,FimZ,fimZ,RDBECOLIGNC03169,sfmA,+,S


In [7]:
# Build a gene x regulator indicator matrix from the confident interactions:
# rows are regulated genes ('5)regulatedName'), columns are regulator names
# ('2)regulatorName').
RegulonDB_TRN_confident_df = RegulonDB_TRN_confident.pivot_table(
    index='5)regulatedName',
    columns='2)regulatorName',
    aggfunc='size',  # number of rows for each (gene, regulator) pair
    fill_value=0,    # pairs that never occur become 0
)

# 'size' is a count, so a pair listed more than once would come out as 2+.
# Cap at 1 so every cell is a presence (1) / absence (0) flag.
RegulonDB_TRN_confident_df = RegulonDB_TRN_confident_df.clip(upper=1)

# Remove the axis titles left over from the pivot.
RegulonDB_TRN_confident_df.index.name = None
RegulonDB_TRN_confident_df.columns.name = None

# Display the resulting dataframe
RegulonDB_TRN_confident_df

Unnamed: 0,AcrR,Ada,AdiY,AgaR,AidB,AlaS,AllR,AraC,ArcA,ArcZ,...,YiaU,YjjQ,YqhC,ZntR,ZraR,Zur,carbon storage regulator CsrA,ribonuclease E,small heat shock protein IbpA,threonine&mdash;tRNA ligase
aaeA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abgA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
znuC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
zraP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
zraR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
zraS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [8]:
# Remove the regulators that are listed under a descriptive product name rather
# than a short regulator symbol. Dropping by label (instead of "the last four
# columns") does not depend on column order and is safe to re-run: labels that
# are already gone are ignored rather than removing four more columns.
# NOTE(review): labels are copied from the recorded pivot output, including the
# literal '&mdash;' — confirm they match the data exactly.
RegulonDB_TRN_confident_df = RegulonDB_TRN_confident_df.drop(
    columns=[
        'carbon storage regulator CsrA',
        'ribonuclease E',
        'small heat shock protein IbpA',
        'threonine&mdash;tRNA ligase',
    ],
    errors='ignore',
)

In [9]:
# Re-inspect the matrix after the trailing columns were removed.
RegulonDB_TRN_confident_df

Unnamed: 0,AcrR,Ada,AdiY,AgaR,AidB,AlaS,AllR,AraC,ArcA,ArcZ,...,YefM,YefMB,YgiV,YhaJ,YiaU,YjjQ,YqhC,ZntR,ZraR,Zur
aaeA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abgA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
znuC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
zraP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
zraR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
zraS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Drop the RpoN column from the regulator matrix. sigma54 is relabelled 'RpoN'
# in the sigma-factor table built later in this notebook, so keeping it here
# would leave two columns with the same name when the tables are merged
# (presumably the reason for this step — confirm).
# Reassigning with errors='ignore' makes a re-run of this cell a no-op instead
# of a KeyError.
RegulonDB_TRN_confident_df = RegulonDB_TRN_confident_df.drop(
    columns=['RpoN'], errors='ignore'
)

## Add in Sigma Factors

In [11]:
# Load the RegulonDB sigma factor -> gene network. The file is tab-separated
# (read_table's default delimiter); '#' starts a comment that the parser skips.
RegulonDB_Sigma = pd.read_table(
    "../Data/RegulonDB_v13_08042024/NetworkSigmaGene.tsv",
    comment='#',
)

In [12]:
# Count how many rows each sigma factor label has in the raw table.
RegulonDB_Sigma['1)sigmaName'].value_counts()

sigma70    1718
sigma38     340
sigma32     177
sigma54     141
sigma24     132
sigma28      80
sigma19       5
Name: 1)sigmaName, dtype: int64

In [13]:
# Preview the sigma factor -> gene table exactly as it was loaded.
RegulonDB_Sigma

Unnamed: 0,1)sigmaName,2)regulatedName,3)function,4)promoterEvidence,5)confidenceLevel
0,sigma19,fecA,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
1,sigma19,fecB,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
2,sigma19,fecC,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
3,sigma19,fecD,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
4,sigma19,fecE,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
...,...,...,...,...,...
2588,sigma70,ytiD,+,"[COMP-HINF,COMP-AINF,COMP-HINF-POSITIONAL-IDEN...",S
2589,sigma70,zinT,+,"[COMP-HINF-POSITIONAL-IDENTIFICATION,EXP-IDA-T...",S
2590,sigma70,zntA,+,"[EXP-IDA-TRANSCRIPTION-INIT-MAPPING,COMP-AINF,...",S
2591,sigma70,znuB,+,"[EXP-IDA-TRANSCRIPTION-INIT-MAPPING,COMP-AINF]",S


In [14]:
# Keep only the sigma-factor interactions with confidence level 'S' or 'C'.
# The column label itself ends with a space in the source file, so it is
# addressed as '5)confidenceLevel '. The values are stripped before comparing
# so that both padded ('S ') and unpadded ('S') labels are accepted.
RegulonDB_Sigma = RegulonDB_Sigma[
    RegulonDB_Sigma['5)confidenceLevel '].str.strip().isin(['S', 'C'])
].copy()

In [15]:
# Sigma factor labels as they appear in the table -> names used as column
# headers in the final matrix.
sigma_mapping = dict(
    sigma70='RpoD',
    sigma38='RpoS',
    sigma32='RpoH',
    sigma54='RpoN',
    sigma24='RpoE',
    sigma28='FliA',
    sigma19='FecI',
)

# Swap every sigma label for its mapped name; a label that is not a key of the
# mapping is left unchanged.
RegulonDB_Sigma['1)sigmaName'] = RegulonDB_Sigma['1)sigmaName'].replace(
    to_replace=sigma_mapping
)

In [16]:
# Check the table after the sigma labels were replaced with their mapped names.
RegulonDB_Sigma

Unnamed: 0,1)sigmaName,2)regulatedName,3)function,4)promoterEvidence,5)confidenceLevel
0,FecI,fecA,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
1,FecI,fecB,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
2,FecI,fecC,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
3,FecI,fecD,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
4,FecI,fecE,+,"[COMP-HINF-SIMILAR-TO-CONSENSUS,COMP-HINF-POSI...",C
...,...,...,...,...,...
2587,RpoD,ytiC,+,"[COMP-HINF,COMP-AINF,COMP-HINF-POSITIONAL-IDEN...",S
2588,RpoD,ytiD,+,"[COMP-HINF,COMP-AINF,COMP-HINF-POSITIONAL-IDEN...",S
2589,RpoD,zinT,+,"[COMP-HINF-POSITIONAL-IDENTIFICATION,EXP-IDA-T...",S
2590,RpoD,zntA,+,"[EXP-IDA-TRANSCRIPTION-INIT-MAPPING,COMP-AINF,...",S


In [17]:
# Build a gene x sigma-factor indicator matrix:
# rows are regulated genes ('2)regulatedName'), columns are sigma factor names
# ('1)sigmaName').
RegulonDB_Sigma_df = RegulonDB_Sigma.pivot_table(
    index='2)regulatedName',
    columns='1)sigmaName',
    aggfunc='size',  # number of rows for each (gene, sigma factor) pair
    fill_value=0,    # pairs that never occur become 0
)

# 'size' is a count, so a pair listed more than once would come out as 2+.
# Cap at 1 so every cell is a presence (1) / absence (0) flag.
RegulonDB_Sigma_df = RegulonDB_Sigma_df.clip(upper=1)

# Remove the axis titles left over from the pivot.
RegulonDB_Sigma_df.index.name = None
RegulonDB_Sigma_df.columns.name = None

# Discard the first row of the pivoted table.
# NOTE(review): the label being removed is not visible in this notebook —
# presumably a blank or placeholder gene name; confirm, and prefer dropping it
# by label so a change in row order cannot remove a real gene.
RegulonDB_Sigma_df = RegulonDB_Sigma_df.drop(index=RegulonDB_Sigma_df.index[0])

In [18]:
# Inspect the gene x sigma-factor matrix.
RegulonDB_Sigma_df

Unnamed: 0,FecI,FliA,RpoD,RpoE,RpoH,RpoN,RpoS
accB,0,0,1,0,0,0,0
accC,0,0,1,0,0,0,0
accD,0,0,1,0,0,0,0
aceA,0,0,1,0,0,0,0
aceB,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
ytiD,0,0,1,0,0,0,0
zinT,0,0,1,0,0,0,0
zntA,0,0,1,0,0,0,0
zntR,0,0,0,0,1,0,0


In [19]:
# A regulator name present in both tables would be silently renamed by merge
# with '_x' / '_y' suffixes, splitting one regulator across two columns.
# Fail loudly instead so the clash is resolved explicitly upstream.
overlapping_regulators = RegulonDB_Sigma_df.columns.intersection(
    RegulonDB_TRN_confident_df.columns
)
if len(overlapping_regulators) > 0:
    raise ValueError(
        "Regulator columns present in both tables: "
        + ", ".join(map(str, overlapping_regulators))
    )

# Outer join on the gene-name index so genes that appear in only one of the
# two tables are kept.
merged_df = RegulonDB_Sigma_df.merge(
    RegulonDB_TRN_confident_df,
    how='outer',
    left_index=True,
    right_index=True,
)

# A gene missing from one table has no recorded interaction there: fill with 0.
merged_df = merged_df.fillna(0)

In [20]:
# Inspect the combined matrix (sigma factor columns followed by regulator columns).
merged_df

Unnamed: 0,FecI,FliA,RpoD,RpoE,RpoH,RpoN,RpoS,AcrR,Ada,AdiY,...,YefM,YefMB,YgiV,YhaJ,YiaU,YjjQ,YqhC,ZntR,ZraR,Zur
aaeA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaeB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaeR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaeX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abgA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
znuC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
zraP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
zraR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
zraS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
# Write the combined matrix to disk; the index (gene names) is written as the
# first CSV column.
merged_df.to_csv(path_or_buf='../Data/RegulonDB_TRN_C_and_S.csv')