# Create_RegulonDB_TRN

In [1]:
import pandas as pd

# Load the tab-separated regulator/gene network table into a DataFrame
RegulonDB_TRN = pd.read_csv(
    filepath_or_buffer='../Data/NetworkRegulatorGene.tsv',
    sep='\t',
)

In [2]:
# Preview the freshly loaded table (pandas abbreviates the middle rows)
RegulonDB_TRN

Unnamed: 0,regulatorId,regulatorName,RegulatorGeneName,regulatedId,regulatedName,function,confidenceLevel
0,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00006,hisM,-,W
1,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00030,agp,+,W
2,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00039,cysQ,+,W
3,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00065,argI,+,W
4,RDBECOLICNC00063,ppGpp,,RDBECOLIGNC00067,argS,-,W
...,...,...,...,...,...,...,...
7052,RDBECOLITFC00238,FimZ,fimZ,RDBECOLIGNC03170,sfmC,+,S
7053,RDBECOLITFC00238,FimZ,fimZ,RDBECOLIGNC03507,puuD,+,W
7054,RDBECOLITFC00239,DicD,dicD,RDBECOLIGNC00220,dicA,-,W
7055,RDBECOLITFC00239,DicD,dicD,RDBECOLIGNC00221,dicB,-,W


In [3]:
# Remove stray whitespace from every column header; the raw header for the
# confidence column is 'confidenceLevel ' (trailing space), which later cells
# address as 'confidenceLevel'.
# Rebinding the name (instead of inplace=True) avoids mutating the frame object
# that the earlier cell displayed.
RegulonDB_TRN = RegulonDB_TRN.rename(columns=str.strip)

In [4]:
# Tally how often each confidence label occurs (most frequent first).
# NOTE(review): the stored output lists 'W' twice, which suggests the raw labels
# differ in surrounding whitespace — confirm against the TSV.
RegulonDB_TRN.loc[:, 'confidenceLevel'].value_counts()

S     2962
W     2704
C     1385
?        5
W        1
Name: confidenceLevel, dtype: int64

In [5]:
# Keep only the rows whose confidenceLevel is 'S' or 'C'.
# The raw labels are padded inconsistently (the value counts list 'W' twice), so
# whitespace is stripped before comparing instead of matching exact padded
# strings such as 'S '. Missing labels stay NaN after stripping and are excluded.
RegulonDB_TRN_confident = RegulonDB_TRN[
    RegulonDB_TRN['confidenceLevel'].str.strip().isin(['S', 'C'])
]

In [6]:
# Preview the rows that passed the confidence filter
RegulonDB_TRN_confident

Unnamed: 0,regulatorId,regulatorName,RegulatorGeneName,regulatedId,regulatedName,function,confidenceLevel
309,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC00341,ftsZ,-,S
310,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC00559,manX,-,S
311,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC00793,pykA,-,S
312,RDBECOLIPDC00328,DicF,dicF,RDBECOLIGNC02444,xylR,-,S
313,RDBECOLIPDC00358,DsrA,dsrA,RDBECOLIGNC00450,hns,-,S
...,...,...,...,...,...,...,...
7046,RDBECOLITFC00237,PtrR,ptrR,RDBECOLIGNC03662,yneG,+,S
7047,RDBECOLITFC00237,PtrR,ptrR,RDBECOLIGNC03663,glsB,+,S
7048,RDBECOLITFC00237,PtrR,ptrR,RDBECOLIGNC03664,sad,+,S
7051,RDBECOLITFC00238,FimZ,fimZ,RDBECOLIGNC03169,sfmA,+,S


In [7]:
# Pivot the filtered interactions into a presence/absence matrix:
# rows are 'regulatedName', columns are 'regulatorName'; a cell is 1 when the
# pair occurs in the table and 0 otherwise.
RegulonDB_TRN_confident_df = RegulonDB_TRN_confident.pivot_table(
    index='regulatedName',
    columns='regulatorName',
    aggfunc='size',  # number of rows for each (regulated, regulator) pair
    fill_value=0     # pairs that never occur
).clip(upper=1)      # a pair listed more than once would otherwise show a count above 1

# Clear the axis names so no 'regulatedName'/'regulatorName' header is carried along
RegulonDB_TRN_confident_df.index.name = None
RegulonDB_TRN_confident_df.columns.name = None

# Display the resulting dataframe
RegulonDB_TRN_confident_df

Unnamed: 0,AcrR,Ada,AdiY,AgaR,AidB,AlaS,AllR,AraC,ArcA,ArcZ,...,YiaU,YjjQ,YqhC,ZntR,ZraR,Zur,carbon storage regulator CsrA,ribonuclease E,small heat shock protein IbpA,threonine&mdash;tRNA ligase
aaeA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abgA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
znuC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
zraP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
zraR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
zraS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [8]:
# Remove the regulator columns labelled with descriptive product names; these
# were the four trailing columns of the pivot output. Selecting them by name
# (rather than slicing off the last four positions) keeps this cell safe to
# re-run: a second execution no longer discards four more regulators.
EXCLUDED_REGULATOR_LABELS = [
    'carbon storage regulator CsrA',
    'ribonuclease E',
    'small heat shock protein IbpA',
    'threonine&mdash;tRNA ligase',
    # NOTE(review): same label in case the dash is stored as a real em dash
    # rather than the HTML entity shown in the rendered output — confirm.
    'threonine\u2014tRNA ligase',
]
is_excluded_column = RegulonDB_TRN_confident_df.columns.str.strip().isin(
    EXCLUDED_REGULATOR_LABELS
)
RegulonDB_TRN_confident_df = RegulonDB_TRN_confident_df.loc[:, ~is_excluded_column]

In [9]:
# Inspect the matrix after the previous cell's column removal
RegulonDB_TRN_confident_df

Unnamed: 0,AcrR,Ada,AdiY,AgaR,AidB,AlaS,AllR,AraC,ArcA,ArcZ,...,YefM,YefMB,YgiV,YhaJ,YiaU,YjjQ,YqhC,ZntR,ZraR,Zur
aaeA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaeX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abgA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
znuC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
zraP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
zraR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
zraS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Write the matrix to disk; the row labels are stored as the CSV's first column
RegulonDB_TRN_confident_df.to_csv(
    path_or_buf='../Data/RegulonDB_TRN_C_and_S.csv',
    index=True,
)