### 导入结点

#### 特征处理

In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np

nums_col = ["Len:Int","LeftEnd:Int","RightEnd:Int","Cs:Float","MW:Float","Orthology_org:String","ProteinOntology:String"]
cls_col = ["Ori:String","Operon:String","Uber_Operon:String"]
other_col = ["ENZYME:String","GO:String[]"]
node_info = pd.read_csv('./Gene_Entity_v2.csv')
gene = node_info[['id','amino_acid_seq:String','~label']]
gene = gene[gene['amino_acid_seq:String']!='none']
def get_term(go_term):
    try:
        name = list(set([x['go_term_name'] for x in eval(go_term).values()]))
        belong = list(set([x['go_belong'] for x in eval(go_term).values()]))
        return name ,belong
    except:
        return "",""

In [2]:
# other 
mlb = MultiLabelBinarizer()
term_encode = mlb.fit_transform([x[0] for x in node_info["GO:String[]"].apply(get_term).values])
mlb = MultiLabelBinarizer()
go_belong_encode = mlb.fit_transform([x[1] for x in node_info["GO:String[]"].apply(get_term).values])
ec_num = node_info["ENZYME:String"].fillna("-1.").apply(lambda x:x.split(".")[0]).values.astype(int).reshape(-1,1)

# cls
label_encoder = LabelEncoder()
cls_encoded = node_info[cls_col].apply(lambda col: label_encoder.fit_transform(col))

df_float = node_info[nums_col].fillna(-1)
df_float = df_float.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(-1).astype(float)

node_feat = np.concatenate((term_encode, go_belong_encode, ec_num, cls_encoded.values, df_float.values), axis=1)
# MinMaxScaler
scaler = MinMaxScaler()
node_feat = scaler.fit_transform(node_feat)
node_feat_dict = dict(zip(node_info.id.values,node_feat))
node_feat.shape

(4502, 2912)

In [3]:
gene = pd.read_csv('./Gene_Entity_v2.csv')[['id','amino_acid_seq:String','~label']]
gene = gene[gene['amino_acid_seq:String']!='none']
metabolite = pd.read_csv('./Metabolite_Entity_v5.csv')[['id','label']]
pathway = pd.read_csv('./Pathway_Entity_v3.csv')[['id','label']]
reaction = pd.read_csv('./Reaction_Entity_v5.csv')[['id','label']]
node = pd.concat([gene,metabolite,pathway,reaction])
# node = node[node["~label"]!="Gene;SIGMA"]
node = node.drop_duplicates(subset='id')
node.index = range(len(node)) 
node.label=node.label.fillna('Gene')
node['~label'] = node['~label'].replace(['Gene','Gene;TF','Gene;SIGMA'],['Non-TF','TF','TF'])
node['~label'].value_counts()

~label
Non-TF    4106
TF         216
Name: count, dtype: int64

In [4]:
node.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8241 entries, 0 to 8240
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     8241 non-null   object
 1   amino_acid_seq:String  4322 non-null   object
 2   ~label                 4322 non-null   object
 3   label                  8241 non-null   object
dtypes: object(4)
memory usage: 257.7+ KB


### 导入边关系

In [5]:
edge = pd.read_csv('./Edge20220217_V3.csv')
edge = edge[['eid','source','target','attribute','label']]
edge.eid = edge.eid.apply(lambda x:x.replace("Edge_",""))
edge_order = ['TFGI','CPI','sRGI','GRI','MRI','RMI','SFGI','PPI','RPI']
edge['label'] = edge['label'].replace(edge_order,[0,1,8,2,3,4,5,6,7]) # 2 1 3 5 4 7 0 6
edge = edge.sort_values('label')
edge = edge.fillna("")
label_encoder = LabelEncoder()
edge["attribute"] = label_encoder.fit_transform(edge["attribute"].values)
edge

Unnamed: 0,eid,source,target,attribute,label
0,0,b2217,b0076,39,0
3159,3159,b3773,b3773,46,0
3158,3158,b3773,b3774,39,0
3157,3157,b4264,b4265,39,0
3156,3156,b4264,b4264,39,0
...,...,...,...,...,...
11894,11894,b4443,b1920,43,8
11893,11893,b4443,b3089,43,8
11892,11892,b4443,b3661,43,8
11908,11908,b4439,b0889,43,8


### 处理边关系

In [6]:
# 对节点数据进行编号
node['nid'] = range(len(node))
# 创建一个字典，将节点的id映射到nid
id_to_nid = dict(zip(node['id'], node['nid']))

# 替换边关系数据中的source和target列为对应的nid值
edge['source'] = edge['source'].map(id_to_nid)
edge['target'] = edge['target'].map(id_to_nid)
edge = edge.dropna()
edge.index = range(len(edge))
edge = edge.astype(int)
edge.eid = range(len(edge))
edge

Unnamed: 0,eid,source,target,attribute,label
0,0,2083,71,39,0
1,1,3529,3529,46,0
2,2,3529,3530,39,0
3,3,3975,3976,39,0
4,4,3975,3975,39,0
...,...,...,...,...,...
35858,35858,7000,5499,26,7
35859,35859,7001,5499,26,7
35860,35860,7003,5499,26,7
35861,35861,6972,5499,26,7


In [7]:
edge_nodes = set(edge['source']).union(set(edge['target']))
node = node[node['nid'].isin(edge_nodes)]
# 按照给定索引顺序创建一个新的DataFrame
index_order = ['Gene', 'Reaction', 'Metabolite', 'Pathway']
node = node[node['label'].isin(index_order)].sort_values(by=['label'], key=lambda x: x.map({label: i for i, label in enumerate(index_order)}))
node

Unnamed: 0,id,amino_acid_seq:String,~label,label,nid
0,b0001,MKRISTTITTTITITTGNGAG,Non-TF,Gene,0
2862,b3059,MSAIAPGMILIAYLCGSISSAILVCRLCGLPDPRTSGSGNPGATNV...,Non-TF,Gene,2862
2863,b3060,MLNSWPLAKDLQVLVEIVHSGSFSAAAATLGQTPAFVTKRIQILEN...,Non-TF,Gene,2863
2864,b3061,MMSESNKQQAVNKLTEIVANFTAMISTRMPDDVVDKLKQLKDAETS...,Non-TF,Gene,2864
2865,b3062,MKKILTTPIKAEDLQDIRVGDVIYLTGTLVTCRDVCHRRLIELKRP...,Non-TF,Gene,2865
...,...,...,...,...,...
5506,g16,,,Pathway,5506
5507,g17,,,Pathway,5507
5508,g18,,,Pathway,5508
5500,g10,,,Pathway,5500


In [8]:
node.label.value_counts()

label
Gene          3035
Reaction      2373
Metabolite    1059
Pathway         38
Name: count, dtype: int64

In [9]:
node_mapping = {}
new_nid = 0
for index, row in node.iterrows():
    node_mapping[row['nid']] = new_nid
    new_nid += 1
edge['source'] = edge['source'].map(node_mapping)
edge['target'] = edge['target'].map(node_mapping)
node['nid'] = node['nid'].map(node_mapping)
node

Unnamed: 0,id,amino_acid_seq:String,~label,label,nid
0,b0001,MKRISTTITTTITITTGNGAG,Non-TF,Gene,0
2862,b3059,MSAIAPGMILIAYLCGSISSAILVCRLCGLPDPRTSGSGNPGATNV...,Non-TF,Gene,1
2863,b3060,MLNSWPLAKDLQVLVEIVHSGSFSAAAATLGQTPAFVTKRIQILEN...,Non-TF,Gene,2
2864,b3061,MMSESNKQQAVNKLTEIVANFTAMISTRMPDDVVDKLKQLKDAETS...,Non-TF,Gene,3
2865,b3062,MKKILTTPIKAEDLQDIRVGDVIYLTGTLVTCRDVCHRRLIELKRP...,Non-TF,Gene,4
...,...,...,...,...,...
5506,g16,,,Pathway,6500
5507,g17,,,Pathway,6501
5508,g18,,,Pathway,6502
5500,g10,,,Pathway,6503


In [10]:
edge

Unnamed: 0,eid,source,target,attribute,label
0,0,2514.0,2078.0,39,0
1,1,1474.0,1474.0,46,0
2,2,1474.0,1475.0,39,0
3,3,1061.0,1062.0,39,0
4,4,1061.0,1061.0,39,0
...,...,...,...,...,...
35858,35858,3094.0,6495.0,26,7
35859,35859,3095.0,6495.0,26,7
35860,35860,3096.0,6495.0,26,7
35861,35861,3125.0,6495.0,26,7


In [11]:
# 使用 dropna() 方法删除含有 NA 的行并返回新的 DataFrame
df_without_na = edge.dropna()

# 或者使用布尔索引筛选出含有 NA 的行而不删除它们
df_with_na = edge[edge.isna().any(axis=1)]
df_with_na

Unnamed: 0,eid,source,target,attribute,label
4552,4552,,2008.0,40,1
4553,4553,,2672.0,40,1
4556,4556,,2407.0,40,1
4557,4557,,2406.0,40,1
4558,4558,,3003.0,40,1
...,...,...,...,...,...
22007,22007,3151.0,,0,4
22008,22008,3152.0,,0,4
22009,22009,3153.0,,0,4
22159,22159,3865.0,,0,4


In [12]:
node['label'].value_counts()

label
Gene          3035
Reaction      2373
Metabolite    1059
Pathway         38
Name: count, dtype: int64

In [13]:
edge = edge.dropna()

### 写入文件

In [14]:
label_df = node[node['~label'].notna()]
label_df['~label'] = label_df['~label'].replace(['Non-TF','TF'],[0,1])

# 随机打乱数据行的顺序
# label_df = label_df.sample(frac=1, random_state=2020).reset_index(drop=True)
label_df = label_df.sample(frac=1, random_state=1).reset_index(drop=True)


# 指定测试集所占的比例
test_ratio = 0.3

# 确定测试集的大小
test_size = int(len(label_df) * test_ratio)

# 划分为训练集和测试集
train_data = label_df.iloc[test_size:]
test_data = label_df.iloc[:test_size]

# 将数据写入 label.dat 和 label.dat.test 文件
with open('../ERM_ELP/label.dat', 'w', encoding='utf-8') as label_file:
    for index, row in train_data.iterrows():
        label_file.write('{}\t\t{}\t{}\n'.format(row['nid'], 0, row['~label']))

with open('../ERM_ELP/label.dat.test', 'w', encoding='utf-8') as test_label_file:
    for index, row in test_data.iterrows():
        test_label_file.write('{}\t\t{}\t{}\n'.format(row['nid'], 0, row['~label']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df['~label'] = label_df['~label'].replace(['Non-TF','TF'],[0,1])


In [15]:
train_data.index = range(len(train_data))
train_data.to_csv("train_node.csv",index=False)
test_data.to_csv("test_node.csv",index=False)

In [16]:
def pandas_to_fasta(dataframe, id_column, sequence_column, output_file):
    with open(output_file, 'w') as f:
        for index, row in dataframe.iterrows():
            identifier = row[id_column]
            sequence = row[sequence_column]
            f.write(f">{identifier}\n{sequence}\n")

# 假设你的Pandas DataFrame 名称为df，其中'id'列包含序列标识符，'sequence'列包含序列数据
pandas_to_fasta(node[['nid','amino_acid_seq:String']], 'nid', 'amino_acid_seq:String', 'input.fasta')

In [17]:
edge['source']=edge['source'].astype(int)
edge['target']=edge['target'].astype(int)
edge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edge['source']=edge['source'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edge['target']=edge['target'].astype(int)


Unnamed: 0,eid,source,target,attribute,label
0,0,2514,2078,39,0
1,1,1474,1474,46,0
2,2,1474,1475,39,0
3,3,1061,1062,39,0
4,4,1061,1061,39,0
...,...,...,...,...,...
35858,35858,3094,6495,26,7
35859,35859,3095,6495,26,7
35860,35860,3096,6495,26,7
35861,35861,3125,6495,26,7


In [18]:
edge = edge.drop_duplicates(subset=['source','target'], keep='first', inplace=False)
edge = edge.sort_values(by=['label','source','target'])
edge

Unnamed: 0,eid,source,target,attribute,label
2182,2182,12,11,46,0
2181,2181,12,12,46,0
4016,4016,15,16,46,0
4015,4015,15,17,46,0
639,639,33,33,46,0
...,...,...,...,...,...
34864,34864,5403,6489,2,7
34865,34865,5404,6489,2,7
33804,33804,5405,6472,25,7
34863,34863,5406,6489,2,7


In [19]:
nodedat = open('../ERM_ELP/link.dat', 'w', encoding='utf-8')
for index,row in edge.iterrows():
    nodedat.write('{}\t{}\t{}\t{}\t{}\n'.format(row['source'], row['target'], row['label'], row['attribute'] ,1.0))
nodedat.close()

In [20]:
link = pd.read_csv('../ERM_ELP/link.dat',sep='\t',header=None)
link

Unnamed: 0,0,1,2,3,4
0,12,11,0,46,1.0
1,12,12,0,46,1.0
2,15,16,0,46,1.0
3,15,17,0,46,1.0
4,33,33,0,46,1.0
...,...,...,...,...,...
31470,5403,6489,7,2,1.0
31471,5404,6489,7,2,1.0
31472,5405,6472,7,25,1.0
31473,5406,6489,7,2,1.0


### ESM-2蛋白质转化

In [21]:
import os
import torch
def find_image_file(source_path, file_lst):
    """
    递归寻找 文件夹以及子目录的 图片文件。
    :param source_path: 源文件夹路径
    :param file_lst: 输出 文件路径列表
    :return:
    """
    image_ext = ['.pt']
    for dir_or_file in os.listdir(source_path):
        file_path = os.path.join(source_path, dir_or_file)
        if os.path.isfile(file_path):  # 判断是否为文件
            file_name_ext = os.path.splitext(os.path.basename(file_path))  # 文件名与后缀
            if len(file_name_ext) < 2:
                continue
            if file_name_ext[1] in image_ext:  # 后缀在后缀列表中
                file_lst.append(file_path)
            else:
                continue
        elif os.path.isdir(file_path):  # 如果是个dir，则再次调用此函数，传入当前目录，递归处理。
            find_image_file(file_path, file_lst)
        else:
            print('文件夹没有环境' + os.path.basename(file_path))
env_path_list=[]
find_image_file('/home/linjw/GNNs/HGB/NC/benchmark/output',env_path_list)
label = [x.split('/')[-1].replace('.pt','') for x in env_path_list]
class_labels = ['dandelion','daisy','sunflower']
rawdata = pd.DataFrame([env_path_list,label],index=['path','label']).T
rawdata

Unnamed: 0,path,label
0,/home/linjw/GNNs/HGB/NC/benchmark/output/b4618.pt,b4618
1,/home/linjw/GNNs/HGB/NC/benchmark/output/b1265.pt,b1265
2,/home/linjw/GNNs/HGB/NC/benchmark/output/b1715.pt,b1715
3,/home/linjw/GNNs/HGB/NC/benchmark/output/b2598.pt,b2598
4,/home/linjw/GNNs/HGB/NC/benchmark/output/b2018.pt,b2018
...,...,...
4317,/home/linjw/GNNs/HGB/NC/benchmark/output/b4676.pt,b4676
4318,/home/linjw/GNNs/HGB/NC/benchmark/output/b4588.pt,b4588
4319,/home/linjw/GNNs/HGB/NC/benchmark/output/b4648.pt,b4648
4320,/home/linjw/GNNs/HGB/NC/benchmark/output/b4671.pt,b4671


In [22]:
protein_embedding = rawdata.path.apply(lambda x :torch.load(x)['mean_representations'][0].tolist())
rawdata['embedding'] = protein_embedding

In [23]:
import numpy as np
# 创建一个字典，用于存储label和embedding的对应关系
embedding_dict = dict(zip(rawdata['label'], rawdata['embedding']))

# 定义一个函数，用于根据label获取对应的embedding
def get_embedding(label):
    try:
        protein_emb = embedding_dict[label]
        node_feat_emb = node_feat_dict.get(label).tolist()
        if node_feat_emb!= None:
            protein_emb.extend(node_feat_emb)
        else:
            protein_emb.extend([0]*node_feat.shape[1])
        return protein_emb
    except:
        return np.nan

# 将embedding补全到node数据中
node['embedding'] = node['id'].apply(get_embedding)

node

Unnamed: 0,id,amino_acid_seq:String,~label,label,nid,embedding
0,b0001,MKRISTTITTTITITTGNGAG,Non-TF,Gene,0,"[-0.05258099362254143, 0.034376759082078934, -..."
2862,b3059,MSAIAPGMILIAYLCGSISSAILVCRLCGLPDPRTSGSGNPGATNV...,Non-TF,Gene,1,"[-0.015170845203101635, 0.0016907488461583853,..."
2863,b3060,MLNSWPLAKDLQVLVEIVHSGSFSAAAATLGQTPAFVTKRIQILEN...,Non-TF,Gene,2,"[-0.005344230215996504, -0.012884224765002728,..."
2864,b3061,MMSESNKQQAVNKLTEIVANFTAMISTRMPDDVVDKLKQLKDAETS...,Non-TF,Gene,3,"[-0.014897570013999939, -0.0018042243318632245..."
2865,b3062,MKKILTTPIKAEDLQDIRVGDVIYLTGTLVTCRDVCHRRLIELKRP...,Non-TF,Gene,4,"[-0.01663176156580448, -0.004438449628651142, ..."
...,...,...,...,...,...,...
5506,g16,,,Pathway,6500,
5507,g17,,,Pathway,6501,
5508,g18,,,Pathway,6502,
5500,g10,,,Pathway,6503,


In [24]:
nodedat = open('../ERM_ELP/node.dat', 'w', encoding='utf-8')
node['label'] = node['label'].replace(index_order,[0,1,2,3]) # 都变成0
for index,row in node.iterrows():
    try:
        nodedat.write('{}\t{}\t{}\t{}\n'.format(row['nid'], row['amino_acid_seq:String'], row['label'], ','.join([str(item) for item in row['embedding']])))
    except:
        nodedat.write('{}\t{}\t{}\n'.format(row['nid'], row['amino_acid_seq:String'], row['label']))
nodedat.close()

In [25]:
node['label'].astype(int)

0       0
2862    0
2863    0
2864    0
2865    0
       ..
5506    3
5507    3
5508    3
5500    3
5496    3
Name: label, Length: 6505, dtype: int64

In [26]:
node.label.value_counts()

label
0    3035
1    2373
2    1059
3      38
Name: count, dtype: int64

In [27]:
import json
# edge_order = ['TFGI','CPI','sRGI','GRI','MRI','RMI','SFGI','PPI','RPI']
# edge['label'] = edge['label'].replace(edge_order,[0,1,2,3,4,5,6,7,8]) # 2 1 3 5 4 7 0 6
info = {
'node.dat': {'node type': {0: 'Gene', 1: 'Reaction', 2: 'Metabolite', 3: 'Pathway'}},

'label.dat': {'node type': {0: {0: 'Non-TF', 1: 'TF'}}},

'link.dat': {
        "link type": {
            "0": {
                "start": 0,
                "end": 0,
                "meaning": "TFGI"
            },
            "1": {
                "start": 2,
                "end": 0,
                "meaning": "CPI"
            },
            "2": {
                "start": 0,
                "end": 1,
                "meaning": "GRI"
            },
            "3": {
                "start": 2,
                "end": 1,
                "meaning": "MRI"
            },
            "4": {
                "start": 1,
                "end": 2,
                "meaning": "RMI"
            },
            "5": {
                "start": 0,
                "end": 0,
                "meaning": "SFGI"
            },
            "6": {
                "start": 0,
                "end": 0,
                "meaning": "PPI"
            },
            "7": {
                "start": 1,
                "end": 3,
                "meaning": "RPI"
            }
        }}
}

with open('../ERM_ELP/info.dat', 'w', encoding='utf-8') as info_file:
    json.dump(info, info_file, indent=4)

In [28]:
aa = node["~label"].value_counts()
aa

~label
Non-TF    2819
TF         216
Name: count, dtype: int64

In [29]:
index_order

['Gene', 'Reaction', 'Metabolite', 'Pathway']

In [30]:
edge_order

['TFGI', 'CPI', 'sRGI', 'GRI', 'MRI', 'RMI', 'SFGI', 'PPI', 'RPI']

In [31]:
node["label"].value_counts()

label
0    3035
1    2373
2    1059
3      38
Name: count, dtype: int64

In [32]:
edge["label"].value_counts()

label
6    9059
0    4552
2    4297
1    3644
4    2710
3    2627
7    2375
5    2211
Name: count, dtype: int64

### Test_Link

In [33]:
link

Unnamed: 0,0,1,2,3,4
0,12,11,0,46,1.0
1,12,12,0,46,1.0
2,15,16,0,46,1.0
3,15,17,0,46,1.0
4,33,33,0,46,1.0
...,...,...,...,...,...
31470,5403,6489,7,2,1.0
31471,5404,6489,7,2,1.0
31472,5405,6472,7,25,1.0
31473,5406,6489,7,2,1.0


In [34]:
link = pd.read_csv('../ERM_ELP/link.dat',sep='\t',header=None)
link.columns = ['source','target','label','attribute','weight']
link = link.astype(int)
link

Unnamed: 0,source,target,label,attribute,weight
0,12,11,0,46,1
1,12,12,0,46,1
2,15,16,0,46,1
3,15,17,0,46,1
4,33,33,0,46,1
...,...,...,...,...,...
31470,5403,6489,7,2,1
31471,5404,6489,7,2,1
31472,5405,6472,7,25,1
31473,5406,6489,7,2,1


In [35]:
selected_rows = link[link["label"] == 0]
# 计算要删除的行数（10%）
rows_to_delete = int(0.1 * len(selected_rows))
# 随机选择10%的行作为测试集
test_df = selected_rows.sample(n=rows_to_delete)
train_df = link.drop(test_df.index)
train_df

Unnamed: 0,source,target,label,attribute,weight
0,12,11,0,46,1
1,12,12,0,46,1
2,15,16,0,46,1
3,15,17,0,46,1
5,33,46,0,46,1
...,...,...,...,...,...
31470,5403,6489,7,2,1
31471,5404,6489,7,2,1
31472,5405,6472,7,25,1
31473,5406,6489,7,2,1


In [36]:
nodedat = open('../ERM_ELP/link.dat', 'w', encoding='utf-8')
for index,row in train_df.iterrows():
    nodedat.write('{}\t{}\t{}\t{}\t{}\n'.format(row['source'], row['target'], row['label'], row['attribute'] ,1.0))
nodedat.close()
                                                                                                                        

nodedat = open('../ERM_ELP/link.dat.test', 'w', encoding='utf-8')
for index,row in test_df.iterrows():
    nodedat.write('{}\t{}\t{}\t{}\t{}\n'.format(row['source'], row['target'], row['label'], row['attribute'] ,1.0))
nodedat.close()