### 导入结点

In [1]:
import pandas as pd
gene = pd.read_csv('./Gene_Entity_v2.csv')[['id','amino_acid_seq:String','~label']]
gene = gene[gene['amino_acid_seq:String']!='none']
metabolite = pd.read_csv('./Metabolite_Entity_v5.csv')[['id','label']]
pathway = pd.read_csv('./Pathway_Entity_v3.csv')[['id','label']]
reaction = pd.read_csv('./Reaction_Entity_v5.csv')[['id','label']]
node = pd.concat([gene,metabolite,pathway,reaction])
# node = node[node["~label"]!="Gene;SIGMA"]
node = node.drop_duplicates(subset='id')
node.index = range(len(node)) 
node.label=node.label.fillna('Gene')
node['~label'] = node['~label'].replace(['Gene','Gene;TF','Gene;SIGMA'],['Non-TF','TF','TF'])
node['~label'].value_counts()

~label
Non-TF    4106
TF         216
Name: count, dtype: int64

In [2]:
node

Unnamed: 0,id,amino_acid_seq:String,~label,label
0,b0001,MKRISTTITTTITITTGNGAG,Non-TF,Gene
1,b0002,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,Non-TF,Gene
2,b0003,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,Non-TF,Gene
3,b0004,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,Non-TF,Gene
4,b0005,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,Non-TF,Gene
...,...,...,...,...
8236,SUCCt1pp,,,Reaction
8237,QUINDH,,,Reaction
8238,LCARSyi,,,Reaction
8239,BIOMASS_Ec_iML1515_core_75p37M,,,Reaction


### 导入边关系

In [3]:
edge = pd.read_csv('./Edge20220217_V3.csv')
edge = edge[['eid','source','target','label']]
edge.eid = edge.eid.apply(lambda x:x.replace("Edge_",""))
edge_order = ['TFGI','CPI','sRGI','GRI','MRI','RMI','SFGI','PPI','RPI']
edge['label'] = edge['label'].replace(edge_order,[0,1,8,2,3,4,5,6,7]) # 2 1 3 5 4 7 0 6
edge = edge.sort_values('label')
edge

Unnamed: 0,eid,source,target,label
79,79,b3025,b3026,0
80,80,b4113,b3026,0
81,81,b3083,b3082,0
82,82,b3082,b3083,0
83,83,b3255,b3256,0
...,...,...,...,...
11854,11854,b3864,b1479,8
11853,11853,b3864,b3603,8
11852,11852,b3864,b2988,8
11835,11835,b4451,b3365,8


### 处理边关系

In [4]:
# 对节点数据进行编号
node['nid'] = range(len(node))
# 创建一个字典，将节点的id映射到nid
id_to_nid = dict(zip(node['id'], node['nid']))

# 替换边关系数据中的source和target列为对应的nid值
edge['source'] = edge['source'].map(id_to_nid)
edge['target'] = edge['target'].map(id_to_nid)
edge = edge.dropna()
edge.index = range(len(edge))
edge = edge.astype(int)
edge.eid = range(len(edge))
edge

Unnamed: 0,eid,source,target,label
0,0,2829,2830,0
1,1,3833,2830,0
2,2,2885,2884,0
3,3,2884,2885,0
4,4,3045,3046,0
...,...,...,...,...
35858,35858,6744,5499,7
35859,35859,6837,5499,7
35860,35860,6926,5499,7
35861,35861,6929,5499,7


In [5]:
edge_nodes = set(edge['source']).union(set(edge['target']))
node = node[node['nid'].isin(edge_nodes)]
# 按照给定索引顺序创建一个新的DataFrame
index_order = ['Gene', 'Reaction', 'Metabolite', 'Pathway']
node = node[node['label'].isin(index_order)].sort_values(by=['label'], key=lambda x: x.map({label: i for i, label in enumerate(index_order)}))
node

Unnamed: 0,id,amino_acid_seq:String,~label,label,nid
4305,b4687,MTDCRYLIKRVIKIIIAVLQLILLFL,Non-TF,Gene,4305
4314,b4702,MEPDPTPLPRRRLKLFR,Non-TF,Gene,4314
4316,b4705,MNEFKRCMRVFSHSPFKVRLMLLSMLCDMVNNKPQQDKPSDK,Non-TF,Gene,4316
13,b0015,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,Non-TF,Gene,13
16,b0019,MKHLHRFFSSDASGGIILIIAAILAMIMANSGATSGWYHDFLETPV...,Non-TF,Gene,16
...,...,...,...,...,...
5503,g13,,,Pathway,5503
5504,g14,,,Pathway,5504
5505,g15,,,Pathway,5505
5506,g16,,,Pathway,5506


In [6]:
node.label.value_counts()

label
Gene          3035
Reaction      2373
Metabolite    1059
Pathway         38
Name: count, dtype: int64

In [7]:
edge

Unnamed: 0,eid,source,target,label
0,0,2829,2830,0
1,1,3833,2830,0
2,2,2885,2884,0
3,3,2884,2885,0
4,4,3045,3046,0
...,...,...,...,...
35858,35858,6744,5499,7
35859,35859,6837,5499,7
35860,35860,6926,5499,7
35861,35861,6929,5499,7


In [8]:
node_mapping = {}
new_nid = 0
for index, row in node.iterrows():
    node_mapping[row['nid']] = new_nid
    new_nid += 1
edge['source'] = edge['source'].map(node_mapping)
edge['target'] = edge['target'].map(node_mapping)
node['nid'] = node['nid'].map(node_mapping)
node

Unnamed: 0,id,amino_acid_seq:String,~label,label,nid
4305,b4687,MTDCRYLIKRVIKIIIAVLQLILLFL,Non-TF,Gene,0
4314,b4702,MEPDPTPLPRRRLKLFR,Non-TF,Gene,1
4316,b4705,MNEFKRCMRVFSHSPFKVRLMLLSMLCDMVNNKPQQDKPSDK,Non-TF,Gene,2
13,b0015,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,Non-TF,Gene,3
16,b0019,MKHLHRFFSSDASGGIILIIAAILAMIMANSGATSGWYHDFLETPV...,Non-TF,Gene,4
...,...,...,...,...,...
5503,g13,,,Pathway,6500
5504,g14,,,Pathway,6501
5505,g15,,,Pathway,6502
5506,g16,,,Pathway,6503


In [9]:
edge

Unnamed: 0,eid,source,target,label
0,0,1969.0,1970.0,0
1,1,2780.0,1970.0,0
2,2,1999.0,1998.0,0
3,3,1998.0,1999.0,0
4,4,2120.0,2121.0,0
...,...,...,...,...
35858,35858,3753.0,6486.0,7
35859,35859,3686.0,6486.0,7
35860,35860,3808.0,6486.0,7
35861,35861,3811.0,6486.0,7


In [10]:
# 使用 dropna() 方法删除含有 NA 的行并返回新的 DataFrame
df_without_na = edge.dropna()

# 或者使用布尔索引筛选出含有 NA 的行而不删除它们
df_with_na = edge[edge.isna().any(axis=1)]
df_with_na

Unnamed: 0,eid,source,target,label
4552,4552,,1864.0,1
4553,4553,,1535.0,1
4556,4556,,2430.0,1
4557,4557,,1622.0,1
4558,4558,,227.0,1
...,...,...,...,...
21983,21983,4971.0,,4
22003,22003,4966.0,,4
22004,22004,4977.0,,4
22005,22005,4978.0,,4


In [11]:
node['label'].value_counts()

label
Gene          3035
Reaction      2373
Metabolite    1059
Pathway         38
Name: count, dtype: int64

In [12]:
edge = edge.dropna()

### 写入文件

In [13]:
label_df = node[node['~label'].notna()]
label_df['~label'] = label_df['~label'].replace(['Non-TF','TF'],[0,1])

# 随机打乱数据行的顺序
# label_df = label_df.sample(frac=1, random_state=2020).reset_index(drop=True)
label_df = label_df.sample(frac=1, random_state=1).reset_index(drop=True)


# 指定测试集所占的比例
test_ratio = 0.3

# 确定测试集的大小
test_size = int(len(label_df) * test_ratio)

# 划分为训练集和测试集
train_data = label_df.iloc[test_size:]
test_data = label_df.iloc[:test_size]

# 将数据写入 label.dat 和 label.dat.test 文件
with open('../ERM/label.dat', 'w', encoding='utf-8') as label_file:
    for index, row in train_data.iterrows():
        label_file.write('{}\t\t{}\t{}\n'.format(row['nid'], 0, row['~label']))

with open('../ERM/label.dat.test', 'w', encoding='utf-8') as test_label_file:
    for index, row in test_data.iterrows():
        test_label_file.write('{}\t\t{}\t{}\n'.format(row['nid'], 0, row['~label']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df['~label'] = label_df['~label'].replace(['Non-TF','TF'],[0,1])


In [14]:
train_data.index = range(len(train_data))
train_data.to_csv("train_node.csv",index=False)
test_data.to_csv("test_node.csv",index=False)

In [15]:
def pandas_to_fasta(dataframe, id_column, sequence_column, output_file):
    with open(output_file, 'w') as f:
        for index, row in dataframe.iterrows():
            identifier = row[id_column]
            sequence = row[sequence_column]
            f.write(f">{identifier}\n{sequence}\n")

# 假设你的Pandas DataFrame 名称为df，其中'id'列包含序列标识符，'sequence'列包含序列数据
pandas_to_fasta(node[['nid','amino_acid_seq:String']], 'nid', 'amino_acid_seq:String', 'input.fasta')

In [16]:
edge['source']=edge['source'].astype(int)
edge['target']=edge['target'].astype(int)
edge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edge['source']=edge['source'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edge['target']=edge['target'].astype(int)


Unnamed: 0,eid,source,target,label
0,0,1969,1970,0
1,1,2780,1970,0
2,2,1999,1998,0
3,3,1998,1999,0
4,4,2120,2121,0
...,...,...,...,...
35858,35858,3753,6486,7
35859,35859,3686,6486,7
35860,35860,3808,6486,7
35861,35861,3811,6486,7


In [17]:
edge = edge.drop_duplicates(subset=['source','target'], keep='first', inplace=False)
edge = edge.sort_values(by=['label','source','target'])
edge

Unnamed: 0,eid,source,target,label
3632,3632,5,4,0
3633,3633,5,5,0
3622,3622,5,641,0
3621,3621,5,642,0
3620,3620,5,643,0
...,...,...,...,...
34383,34383,5403,6481,7
34384,34384,5404,6481,7
34456,34456,5405,6480,7
34385,34385,5406,6481,7


In [18]:
nodedat = open('../ERM/link.dat', 'w', encoding='utf-8')
for index,row in edge.iterrows():
    nodedat.write('{}\t{}\t{}\t{}\n'.format(row['source'], row['target'], row['label'],1.0))
nodedat.close()

In [19]:
link = pd.read_csv('../ERM/link.dat',sep='\t',header=None)
link

Unnamed: 0,0,1,2,3
0,5,4,0,1.0
1,5,5,0,1.0
2,5,641,0,1.0
3,5,642,0,1.0
4,5,643,0,1.0
...,...,...,...,...
31470,5403,6481,7,1.0
31471,5404,6481,7,1.0
31472,5405,6480,7,1.0
31473,5406,6481,7,1.0


In [20]:
link[0].min()

3

In [21]:
link[1].min()

0

### ESM-2蛋白质转化

In [22]:
import os
import torch
def find_image_file(source_path, file_lst):
    """
    递归寻找 文件夹以及子目录的 图片文件。
    :param source_path: 源文件夹路径
    :param file_lst: 输出 文件路径列表
    :return:
    """
    image_ext = ['.pt']
    for dir_or_file in os.listdir(source_path):
        file_path = os.path.join(source_path, dir_or_file)
        if os.path.isfile(file_path):  # 判断是否为文件
            file_name_ext = os.path.splitext(os.path.basename(file_path))  # 文件名与后缀
            if len(file_name_ext) < 2:
                continue
            if file_name_ext[1] in image_ext:  # 后缀在后缀列表中
                file_lst.append(file_path)
            else:
                continue
        elif os.path.isdir(file_path):  # 如果是个dir，则再次调用此函数，传入当前目录，递归处理。
            find_image_file(file_path, file_lst)
        else:
            print('文件夹没有环境' + os.path.basename(file_path))
env_path_list=[]
find_image_file('/home/linjw/GNNs/HGB/NC/benchmark/output',env_path_list)
label = [x.split('/')[-1].replace('.pt','') for x in env_path_list]
class_labels = ['dandelion','daisy','sunflower']
rawdata = pd.DataFrame([env_path_list,label],index=['path','label']).T
rawdata

Unnamed: 0,path,label
0,/home/linjw/GNNs/HGB/NC/benchmark/output/b4618.pt,b4618
1,/home/linjw/GNNs/HGB/NC/benchmark/output/b1265.pt,b1265
2,/home/linjw/GNNs/HGB/NC/benchmark/output/b1715.pt,b1715
3,/home/linjw/GNNs/HGB/NC/benchmark/output/b2598.pt,b2598
4,/home/linjw/GNNs/HGB/NC/benchmark/output/b2018.pt,b2018
...,...,...
4317,/home/linjw/GNNs/HGB/NC/benchmark/output/b4676.pt,b4676
4318,/home/linjw/GNNs/HGB/NC/benchmark/output/b4588.pt,b4588
4319,/home/linjw/GNNs/HGB/NC/benchmark/output/b4648.pt,b4648
4320,/home/linjw/GNNs/HGB/NC/benchmark/output/b4671.pt,b4671


In [23]:
protein_embedding = rawdata.path.apply(lambda x :torch.load(x)['mean_representations'][0].tolist())
rawdata['embedding'] = protein_embedding

In [24]:
import numpy as np
# 创建一个字典，用于存储label和embedding的对应关系
embedding_dict = dict(zip(rawdata['label'], rawdata['embedding']))

# 定义一个函数，用于根据label获取对应的embedding
def get_embedding(label):
    try:
        return embedding_dict[label]
    except:
        return np.nan

# 将embedding补全到node数据中
node['embedding'] = node['id'].apply(get_embedding)

node

Unnamed: 0,id,amino_acid_seq:String,~label,label,nid,embedding
4305,b4687,MTDCRYLIKRVIKIIIAVLQLILLFL,Non-TF,Gene,0,"[0.014961499720811844, 0.005771731957793236, -..."
4314,b4702,MEPDPTPLPRRRLKLFR,Non-TF,Gene,1,"[0.02268233150243759, -0.017017479985952377, -..."
4316,b4705,MNEFKRCMRVFSHSPFKVRLMLLSMLCDMVNNKPQQDKPSDK,Non-TF,Gene,2,"[0.006886598654091358, 0.00012860269634984434,..."
13,b0015,MAKQDYYEILGVSKTAEEREIRKAYKRLAMKYHPDRNQGDKEAEAK...,Non-TF,Gene,3,"[-0.02172139286994934, -0.014070559293031693, ..."
16,b0019,MKHLHRFFSSDASGGIILIIAAILAMIMANSGATSGWYHDFLETPV...,Non-TF,Gene,4,"[-0.002686335239559412, -0.0003937475266866386..."
...,...,...,...,...,...,...
5503,g13,,,Pathway,6500,
5504,g14,,,Pathway,6501,
5505,g15,,,Pathway,6502,
5506,g16,,,Pathway,6503,


In [25]:
nodedat = open('../ERM/node.dat', 'w', encoding='utf-8')
node['label'] = node['label'].replace(index_order,[0,1,2,3]) # 都变成0
for index,row in node.iterrows():
    try:
        nodedat.write('{}\t{}\t{}\t{}\n'.format(row['nid'], row['amino_acid_seq:String'], row['label'], ','.join([str(item) for item in row['embedding']])))
    except:
        nodedat.write('{}\t{}\t{}\n'.format(row['nid'], row['amino_acid_seq:String'], row['label']))
nodedat.close()

In [26]:
node['label'].astype(int)

4305    0
4314    0
4316    0
13      0
16      0
       ..
5503    3
5504    3
5505    3
5506    3
5507    3
Name: label, Length: 6505, dtype: int64

In [27]:
node.label.value_counts()

label
0    3035
1    2373
2    1059
3      38
Name: count, dtype: int64

In [28]:
import json
# edge_order = ['TFGI','CPI','sRGI','GRI','MRI','RMI','SFGI','PPI','RPI']
# edge['label'] = edge['label'].replace(edge_order,[0,1,2,3,4,5,6,7,8]) # 2 1 3 5 4 7 0 6
info = {
'node.dat': {'node type': {0: 'Gene', 1: 'Reaction', 2: 'Metabolite', 3: 'Pathway'}},

'label.dat': {'node type': {0: {0: 'Non-TF', 1: 'TF'}}},

'link.dat': {
        "link type": {
            "0": {
                "start": 0,
                "end": 0,
                "meaning": "TFGI"
            },
            "1": {
                "start": 2,
                "end": 0,
                "meaning": "CPI"
            },
            "2": {
                "start": 0,
                "end": 1,
                "meaning": "GRI"
            },
            "3": {
                "start": 2,
                "end": 1,
                "meaning": "MRI"
            },
            "4": {
                "start": 1,
                "end": 2,
                "meaning": "RMI"
            },
            "5": {
                "start": 0,
                "end": 0,
                "meaning": "SFGI"
            },
            "6": {
                "start": 0,
                "end": 0,
                "meaning": "PPI"
            },
            "7": {
                "start": 1,
                "end": 3,
                "meaning": "RPI"
            }
        }}
}

with open('../ERM/info.dat', 'w', encoding='utf-8') as info_file:
    json.dump(info, info_file, indent=4)

In [29]:
aa = node["~label"].value_counts()
aa

~label
Non-TF    2819
TF         216
Name: count, dtype: int64

In [30]:
index_order

['Gene', 'Reaction', 'Metabolite', 'Pathway']

In [31]:
edge_order

['TFGI', 'CPI', 'sRGI', 'GRI', 'MRI', 'RMI', 'SFGI', 'PPI', 'RPI']

In [32]:
node["label"].value_counts()

label
0    3035
1    2373
2    1059
3      38
Name: count, dtype: int64

In [33]:
edge["label"].value_counts()

label
6    9059
0    4552
2    4297
1    3644
4    2710
3    2627
7    2375
5    2211
Name: count, dtype: int64