In [None]:
import ast
import itertools

import numpy as np
import pandas as pd

## Entity & Relation Pool

In [None]:
# drug_gene_pool, gene_gene_pool, gene_disease_pool

drug_gene_pool = [
    'DRUG_ACTIVATION_GENE',
    'DRUG_BINDING_GENE',
    'DRUG_CATALYSIS_GENE',
    'DRUG_INHIBITION_GENE',
    'DRUG_BINDACT_GENE',
    'DRUG_BINDINH_GENE',
    'DRUG_REACTION_GENE',
]

gene_gene_pool = [
    'GENE_GENE',
    'GENE_ACTIVATION_GENE',
    'GENE_BINDING_GENE',
    'GENE_CATALYSIS_GENE',
    'GENE_EXPRESSION_GENE',
    'GENE_INHIBITION_GENE',
    'GENE_REACTION_GENE',
    'GENE_PTMOD_GENE',
]

gene_disease_pool = ['GENE_DIS']

## Data Preparation

In [None]:
def clean_entity(e):
    """Load an entity-embedding text file into a DataFrame labelled with entity names.

    Parameters
    ----------
    e : str or path-like
        Embedding file. Every non-blank line has the form ``<key>: <vector>``,
        where the second underscore-separated field of ``<key>`` is the integer
        entity id and ``<vector>`` is a Python literal (e.g. a list of floats).

    Returns
    -------
    pandas.DataFrame
        One row per entity, indexed by entity id in ascending order. The first
        column is ``name`` (looked up in ``all\\entity2id.txt``; ids missing from
        that file get NaN), followed by ``dim_0 .. dim_{k-1}``.

    Raises
    ------
    ValueError, SyntaxError
        If a vector is not a plain Python literal.
    """
    entity2id_path = r'all\entity2id.txt'
    # The lookup file stores "name<TAB>id"; it is inverted here to id -> name.
    id2entity = {}
    with open(entity2id_path, 'r') as f:
        next(f, None)  # the first line of the lookup file is skipped, not parsed
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, idx = line.split('\t')
            id2entity[int(idx)] = name

    entity_embedding = {}
    with open(e, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank or trailing empty lines
            entity, vector = line.split(': ', 1)
            entity_num = int(entity.split('_')[1])
            # literal_eval only accepts literals, so file content is never executed.
            entity_embedding[entity_num] = ast.literal_eval(vector.strip())

    embedding_df = pd.DataFrame.from_dict(entity_embedding, orient='index')
    embedding_df.columns = [f'dim_{i}' for i in range(embedding_df.shape[1])]
    embedding_df.insert(0, 'name', embedding_df.index.map(id2entity))
    return embedding_df.sort_index()


def clean_relation(r):
    """Load a relation-embedding text file into a DataFrame labelled with relation names.

    Parameters
    ----------
    r : str or path-like
        Embedding file. Every non-blank line has the form ``<key>: <vector>``,
        where the second underscore-separated field of ``<key>`` is the integer
        relation id and ``<vector>`` is a Python literal (e.g. a list of floats).

    Returns
    -------
    pandas.DataFrame
        One row per relation, indexed by relation id in ascending order. The first
        column is ``name`` (looked up in ``all\\relation2id.txt``; ids missing from
        that file get NaN), followed by ``dim_0 .. dim_{k-1}``.

    Raises
    ------
    ValueError, SyntaxError
        If a vector is not a plain Python literal.
    """
    relation2id_path = r'all\relation2id.txt'
    # The lookup file stores "name<TAB>id"; it is inverted here to id -> name.
    id2relation = {}
    with open(relation2id_path, 'r') as f:
        next(f, None)  # the first line of the lookup file is skipped, not parsed
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, idx = line.split('\t')
            id2relation[int(idx)] = name

    relation_embedding = {}
    with open(r, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank or trailing empty lines
            relation, vector = line.split(': ', 1)
            relation_num = int(relation.split('_')[1])
            # literal_eval only accepts literals, so file content is never executed.
            relation_embedding[relation_num] = ast.literal_eval(vector.strip())

    embedding_df = pd.DataFrame.from_dict(relation_embedding, orient='index')
    embedding_df.columns = [f'dim_{i}' for i in range(embedding_df.shape[1])]
    embedding_df.insert(0, 'name', embedding_df.index.map(id2relation))
    return embedding_df.sort_index()


def clean_norm(n):
    """Load a per-relation norm-vector text file into a DataFrame labelled with relation names.

    Parameters
    ----------
    n : str or path-like
        Norm-vector file. Every non-blank line has the form ``<key>: <vector>``,
        where the THIRD underscore-separated field of ``<key>`` is the integer
        relation id and ``<vector>`` is a Python literal (e.g. a list of floats).

    Returns
    -------
    pandas.DataFrame
        One row per relation, indexed by relation id in ascending order. The first
        column is ``name`` (looked up in ``all\\relation2id.txt``; ids missing from
        that file get NaN), followed by ``dim_0 .. dim_{k-1}``.

    Raises
    ------
    ValueError, SyntaxError
        If a vector is not a plain Python literal.
    """
    relation2id_path = r'all\relation2id.txt'
    # The lookup file stores "name<TAB>id"; it is inverted here to id -> name.
    id2relation = {}
    with open(relation2id_path, 'r') as f:
        next(f, None)  # the first line of the lookup file is skipped, not parsed
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, idx = line.split('\t')
            id2relation[int(idx)] = name

    norm_embedding = {}
    with open(n, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank or trailing empty lines
            norm, vector = line.split(': ', 1)
            norm_num = int(norm.split('_')[2])
            # literal_eval only accepts literals, so file content is never executed.
            norm_embedding[norm_num] = ast.literal_eval(vector.strip())

    embedding_df = pd.DataFrame.from_dict(norm_embedding, orient='index')
    embedding_df.columns = [f'dim_{i}' for i in range(embedding_df.shape[1])]
    embedding_df.insert(0, 'name', embedding_df.index.map(id2relation))
    return embedding_df.sort_index()


def clean_matrix(m):
    """Parse a per-relation matrix dump into ``{relation name: DataFrame}``.

    The file `m` is a sequence of sections. A line starting with ``Matrix_``
    opens a section; its second underscore-separated field (with a trailing
    ``:`` removed) is the integer relation id. Every following non-blank line
    is one matrix row written as a Python literal. Relation ids are translated
    to names through ``all\\relation2id.txt``.
    """
    import pandas as pd
    import ast

    relation2id_path = r'all\relation2id.txt'
    id2relation = {}
    with open(relation2id_path, 'r') as lookup:
        next(lookup)  # the first line of the lookup file is skipped, not parsed
        for raw in lookup:
            entry = raw.strip()
            if entry:
                name, idx = entry.split('\t')
                id2relation[int(idx)] = name  # key: int idx, value: relation name

    rows_by_relation = {}
    # Rows seen before the first header are parsed but end up in this throwaway list.
    sink = []
    with open(m, 'r') as dump:
        for raw in dump:
            text = raw.strip()
            if not text:
                continue
            if text.startswith('Matrix_'):
                rel_id = int(text.split('_')[1].rstrip(':'))
                # A repeated header starts over with a fresh row list for that id.
                sink = rows_by_relation[rel_id] = []
            else:
                sink.append(ast.literal_eval(text))

    return {
        id2relation[rel_id]: pd.DataFrame(matrix_rows)
        for rel_id, matrix_rows in rows_by_relation.items()
    }

In [None]:
# TransE: entity and relation vectors.
entity, relation = r'transe_entity_embeddings.txt', r'transe_relation_embeddings.txt'
transe_entity, transe_relation = clean_entity(entity), clean_relation(relation)


# TransH: entity and relation vectors plus one norm vector per relation.
entity, relation, norm = (
    r'transh_entity_embeddings.txt',
    r'transh_relation_embeddings.txt',
    r'transh_norm_vector_embeddings.txt',
)
transh_entity, transh_relation, transh_norm = (
    clean_entity(entity),
    clean_relation(relation),
    clean_norm(norm),
)


# TransR: entity and relation vectors plus one matrix per relation
# (transr_matrix is a dict keyed by relation name).
entity, relation, matrix = (
    r'transr_entity_embeddings.txt',
    r'transr_relation_embeddings.txt',
    r'transr_matrix_embeddings.txt',
)
transr_entity, transr_relation, transr_matrix = (
    clean_entity(entity),
    clean_relation(relation),
    clean_matrix(matrix),
)


# ComplEx: separate files for the imaginary and real parts of entities and relations.
entity_im, entity_re = r'complex_entity_im_embeddings.txt', r'complex_entity_re_embeddings.txt'
relation_im, relation_re = r'complex_relation_im_embeddings.txt', r'complex_relation_re_embeddings.txt'
complex_im_entity, complex_re_entity = clean_entity(entity_im), clean_entity(entity_re)
complex_im_relation, complex_re_relation = clean_relation(relation_im), clean_relation(relation_re)


# RotatE: entity vectors hold two equally sized halves that are split below.

entity = r'rotate_entity_embeddings.txt'
relation = r'rotate_relation_embeddings.txt'
rotate_entity = clean_entity(entity)
rotate_relation = clean_relation(relation)

# Half-width derived from the loaded frame (every column except 'name') instead of
# assuming a 100-dimensional vector; for 100 dimensions this is the same 50/50 split.
rotate_half_dim = (rotate_entity.shape[1] - 1) // 2
# NOTE(review): the FIRST half is labelled imaginary and the SECOND half real. Several
# RotatE implementations store the real half first -- confirm against the exporter.
rotate_entity_im = rotate_entity[['name'] + [f'dim_{i}' for i in range(rotate_half_dim)]].copy()
rotate_entity_re = rotate_entity[
    ['name'] + [f'dim_{i}' for i in range(rotate_half_dim, 2 * rotate_half_dim)]
].copy()


# R-GCN: entity and relation vectors.
entity, relation = r'rgcn_entity_embeddings.txt', r'rgcn_relation_embeddings.txt'
rgcn_entity, rgcn_relation = clean_entity(entity), clean_relation(relation)


# HittER: entity and relation vectors.
entity, relation = r'hitter_entity_embeddings.txt', r'hitter_relation_embeddings.txt'
hitter_entity, hitter_relation = clean_entity(entity), clean_relation(relation)

In [None]:
# Entity names are sliced by row position out of the TransE entity frame.
# Disease: rows 0 ~ 204
disease_pool_final = transe_entity['name'].iloc[:205].tolist()
# Gene: rows 205 ~ 17065
gene_pool_final = transe_entity['name'].iloc[205:17066].tolist()
# Drug: rows 17066 ~ 18359 (everything from 17066 onward)
drug_pool_final = transe_entity['name'].iloc[17066:].tolist()

In [None]:
# Row i of every *_entity_embeddings array built from this list must hold the vector
# of entity id i: the score functions are called with entity ids (the index of
# transe_entity: disease 0-204, gene 205-17065, drug 17066+) as row positions.
# The names are therefore listed in id order -- disease, gene, drug. Listing drugs
# first would silently score the wrong entities.
entity_list = list(disease_pool_final) + list(gene_pool_final) + list(drug_pool_final)

In [None]:
# Keep only the pool relations that occur among the loaded TransE relation names.
drug_gene_pool_final = [rel for rel in drug_gene_pool if (transe_relation['name'].values == rel).any()]
gene_gene_pool_final = [rel for rel in gene_gene_pool if (transe_relation['name'].values == rel).any()]
gene_disease_pool_final = [rel for rel in gene_disease_pool if (transe_relation['name'].values == rel).any()]

In [None]:
# Position in this list is the relation index used by the score functions:
# drug-gene relations first, then gene-gene, then gene-disease.
relation_list = [*drug_gene_pool_final, *gene_gene_pool_final, *gene_disease_pool_final]

## Score Function

In [None]:
def get_vector(df, name):
    """Return the vector stored for `name` as a float array, or None when it is absent.

    `df` must have a ``name`` column in first position; all remaining columns of the
    first matching row are returned. A warning is printed when no row matches.
    """
    matches = df.loc[df['name'] == name]
    if len(matches) == 0:
        print(f"Warning: Name '{name}' not found in the DataFrame")
        return None
    return matches.iloc[0, 1:].values.astype(float)

In [None]:
def _entity_matrix(frame):
    """Stack the vectors of every name in entity_list, in that order, as float32 rows."""
    return np.array([get_vector(frame, entity_name) for entity_name in entity_list], dtype=np.float32)


transe_entity_embeddings = _entity_matrix(transe_entity)
transh_entity_embeddings = _entity_matrix(transh_entity)
transr_entity_embeddings = _entity_matrix(transr_entity)

complex_im_entity_embeddings = _entity_matrix(complex_im_entity)
complex_re_entity_embeddings = _entity_matrix(complex_re_entity)
rotate_entity_im_embeddings = _entity_matrix(rotate_entity_im)
rotate_entity_re_embeddings = _entity_matrix(rotate_entity_re)

rgcn_entity_embeddings = _entity_matrix(rgcn_entity)
hitter_entity_embeddings = _entity_matrix(hitter_entity)

In [None]:
def _relation_matrix(frame):
    """Stack the vectors of every name in relation_list, in that order, as float32 rows."""
    return np.array([get_vector(frame, relation_name) for relation_name in relation_list], dtype=np.float32)


transe_relation_embeddings = _relation_matrix(transe_relation)
transh_relation_embeddings = _relation_matrix(transh_relation)
transh_norm_embeddings = _relation_matrix(transh_norm)
transr_relation_embeddings = _relation_matrix(transr_relation)

complex_im_relation_embeddings = _relation_matrix(complex_im_relation)
complex_re_relation_embeddings = _relation_matrix(complex_re_relation)
rotate_relation_embeddings = _relation_matrix(rotate_relation)

rgcn_relation_embeddings = _relation_matrix(rgcn_relation)
hitter_relation_embeddings = _relation_matrix(hitter_relation)

In [None]:
# TransE
def transe_score(h, t, r):
    """Negative L2 norm of (head + relation - tail) under the TransE embeddings.

    `h` and `t` are row positions in transe_entity_embeddings, `r` a row position in
    transe_relation_embeddings. The result is <= 0; values closer to 0 mean the
    translated head lies nearer to the tail.
    """
    translated_head = transe_entity_embeddings[h] + transe_relation_embeddings[r]
    residual = translated_head - transe_entity_embeddings[t]
    return -np.linalg.norm(residual, ord=2)

# TransH
def transh_score(h, t, r):
    """Negative squared L2 norm of the translation error after projecting head and tail.

    Each entity vector has its component along the relation's norm vector removed
    (``v - (w . v) w``) before the relation vector is added to the head and the tail
    is subtracted. `h`/`t` index transh_entity_embeddings; `r` indexes both
    transh_relation_embeddings and transh_norm_embeddings.
    """
    normal = transh_norm_embeddings[r]
    translation = transh_relation_embeddings[r]

    def remove_normal_component(vec):
        # NOTE(review): `normal` is used as stored; nothing here rescales it to unit length.
        return vec - np.dot(normal, vec) * normal

    head_on_plane = remove_normal_component(transh_entity_embeddings[h])
    tail_on_plane = remove_normal_component(transh_entity_embeddings[t])
    return -np.linalg.norm(head_on_plane + translation - tail_on_plane, ord=2) ** 2

# TransR
def transr_score(h, t, r):
    """Negative squared L2 norm of ``M_r h + r_emb - M_r t`` under the TransR embeddings.

    Parameters
    ----------
    h, t : int
        Row positions in transr_entity_embeddings.
    r : int
        Row position in transr_relation_embeddings, i.e. a position in relation_list.

    Returns
    -------
    float
        Score <= 0; closer to 0 means a smaller translation error.
    """
    h_emb = transr_entity_embeddings[h]
    t_emb = transr_entity_embeddings[t]
    r_emb = transr_relation_embeddings[r]
    # Row r of transr_relation_embeddings was built from relation_list[r], so the
    # matrix must be looked up under that same name. (Looking it up through the
    # relation-id index of transe_relation only agrees if the two orders coincide.)
    # NOTE(review): assumes the stored matrix is applied as M_r @ vector -- confirm
    # the orientation used by the exporter.
    M_r = transr_matrix[relation_list[r]].values

    h_proj = M_r @ h_emb
    t_proj = M_r @ t_emb

    # The relation VECTOR is added here; adding the integer index r would shift
    # every component by the index value.
    score = -np.linalg.norm(h_proj + r_emb - t_proj, ord=2) ** 2

    return score

# ComplEx
def complex_score(h, t, r):
    """Sum over dimensions of the four real/imaginary triple products of head, relation, tail.

    `h`/`t` index the complex_{re,im}_entity_embeddings arrays and `r` the
    complex_{re,im}_relation_embeddings arrays. Three products are added and the
    (head_im, tail_re, rel_im) product is subtracted.
    """
    head_re, head_im = complex_re_entity_embeddings[h], complex_im_entity_embeddings[h]
    tail_re, tail_im = complex_re_entity_embeddings[t], complex_im_entity_embeddings[t]
    rel_re, rel_im = complex_re_relation_embeddings[r], complex_im_relation_embeddings[r]

    re_re_re = head_re * tail_re * rel_re
    im_im_re = head_im * tail_im * rel_re
    re_im_im = head_re * tail_im * rel_im
    im_re_im = head_im * tail_re * rel_im

    return np.sum(re_re_re + im_im_re + re_im_im - im_re_im)

# RotatE
def rotate_score(h, t, r):
    """Negative sum of per-dimension complex moduli of (head * rotation - tail).

    The relation vector is turned into a unit complex number per dimension
    (cos/sin of the phase), multiplied with the head, and compared with the tail.
    `h`/`t` index rotate_entity_{re,im}_embeddings; `r` indexes
    rotate_relation_embeddings.
    """
    head_re, head_im = rotate_entity_re_embeddings[h], rotate_entity_im_embeddings[h]
    tail_re, tail_im = rotate_entity_re_embeddings[t], rotate_entity_im_embeddings[t]
    embedding_range = np.pi

    # NOTE(review): the stored relation values are divided by pi before cos/sin.
    # Confirm this matches the phase scaling used when the model was trained.
    phase = rotate_relation_embeddings[r] / (embedding_range)
    rot_re, rot_im = np.cos(phase), np.sin(phase)

    # Complex product head * rotation, minus the tail, split into real/imaginary parts.
    delta_re = (head_re * rot_re - head_im * rot_im) - tail_re
    delta_im = (head_re * rot_im + head_im * rot_re) - tail_im

    return -np.sum(np.sqrt(delta_re ** 2 + delta_im ** 2))

# R-GCN
def rgcn_score(h, t, r):
    """Negative L2 norm of (head + relation - tail) under the R-GCN embeddings.

    `h`/`t` index rgcn_entity_embeddings and `r` indexes rgcn_relation_embeddings.
    NOTE(review): this is the same translational form as transe_score -- confirm it
    matches the decoder the R-GCN model was trained with.
    """
    translated_head = rgcn_entity_embeddings[h] + rgcn_relation_embeddings[r]
    residual = translated_head - rgcn_entity_embeddings[t]
    return -np.linalg.norm(residual, ord=2)

# HittER
def hitter_score(h, t, r):
    """Sum over dimensions of the element-wise product head * relation * tail.

    `h`/`t` index hitter_entity_embeddings and `r` indexes hitter_relation_embeddings.
    NOTE(review): confirm this trilinear product is the intended scoring rule for
    the exported HittER embeddings.
    """
    head_times_relation = hitter_entity_embeddings[h] * hitter_relation_embeddings[r]
    return np.sum(head_times_relation * hitter_entity_embeddings[t])

## Existing Edges Score Calculation

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from itertools import product
from tqdm import tqdm
from joblib import Parallel, delayed

In [None]:
# Load the known edges. Each record arrives as one column (0) holding five
# tab-separated fields; only the first three (head, relation, tail) are kept.
TP_edges = pd.read_csv(r'HQ_DIR\out\edges_relation_4type_filtered.csv', header=None)
TP_edges[['Entity1', 'Relation', 'Entity2', '1', '2']] = TP_edges[0].str.split('\t', expand=True)
TP_edges = TP_edges.drop(columns=[0, '1', '2'])

In [None]:
# Split the known edges by the entity types of their head (Entity1) and tail (Entity2).
TP_edges_drug_gene = TP_edges.loc[
    TP_edges.Entity1.isin(drug_pool_final) & TP_edges.Entity2.isin(gene_pool_final)
]
TP_edges_gene_gene = TP_edges.loc[
    TP_edges.Entity1.isin(gene_pool_final) & TP_edges.Entity2.isin(gene_pool_final)
]
TP_edges_gene_dis = TP_edges.loc[
    TP_edges.Entity1.isin(gene_pool_final) & TP_edges.Entity2.isin(disease_pool_final)
]
TP_edges_dis_drug = TP_edges.loc[
    TP_edges.Entity1.isin(disease_pool_final) & TP_edges.Entity2.isin(drug_pool_final)
]

In [None]:
# Score every known drug -> gene edge with hitter_score and write the result to CSV.
drug_gene_scores = []

# Relation name -> row position in the *_relation_embeddings arrays. Derived from
# relation_list (the order those arrays were built in) so the positions stay correct
# even if a pool relation was filtered out; with all seven present this is 0..6.
relation_to_index = {name: relation_list.index(name) for name in drug_gene_pool_final}

# One-off lookup entity name -> entity id (index of transe_entity), first occurrence
# wins. Replaces two full DataFrame scans per edge.
entity_id_by_name = {}
for ent_id, ent_name in transe_entity['name'].items():
    entity_id_by_name.setdefault(ent_name, ent_id)

for edge in tqdm(TP_edges_drug_gene.itertuples(index=False), total=len(TP_edges_drug_gene), desc="Computing scores", ncols=100):
    idx1 = entity_id_by_name.get(edge.Entity1)
    idx2 = entity_id_by_name.get(edge.Entity2)
    if idx1 is None or idx2 is None:
        continue  # endpoint unknown to the embedding tables

    relation_name = edge.Relation
    rel_idx = relation_to_index.get(relation_name)
    if rel_idx is None:
        continue  # not a drug-gene relation

    score = hitter_score(idx1, idx2, rel_idx)

    drug_gene_scores.append({
        'Drug': idx1,
        'Gene': idx2,
        'Relation': relation_name,
        'Score': score
    })

# Build the DataFrame
drug_gene_df = pd.DataFrame(drug_gene_scores)

# Save as CSV
drug_gene_df.to_csv(r'drug_gene_existing_hitter.csv', index=False)

In [None]:
# Score every known gene -> gene edge with hitter_score and write the result to CSV.
gene_gene_scores = []

# Relation name -> row position in the *_relation_embeddings arrays. Derived from
# relation_list (the order those arrays were built in) so the positions stay correct
# even if a pool relation was filtered out; with all relations present this is 7..14.
relation_to_index = {name: relation_list.index(name) for name in gene_gene_pool_final}

# One-off lookup entity name -> entity id (index of transe_entity), first occurrence
# wins. Replaces two full DataFrame scans per edge.
entity_id_by_name = {}
for ent_id, ent_name in transe_entity['name'].items():
    entity_id_by_name.setdefault(ent_name, ent_id)

for edge in tqdm(TP_edges_gene_gene.itertuples(index=False), total=len(TP_edges_gene_gene), desc="Computing scores", ncols=100):
    idx1 = entity_id_by_name.get(edge.Entity1)
    idx2 = entity_id_by_name.get(edge.Entity2)
    if idx1 is None or idx2 is None:
        continue  # endpoint unknown to the embedding tables

    relation_name = edge.Relation
    rel_idx = relation_to_index.get(relation_name)
    if rel_idx is None:
        continue  # not a gene-gene relation

    score = hitter_score(idx1, idx2, rel_idx)

    # NOTE(review): both endpoints are genes, yet the columns are labelled
    # 'Drug'/'Gene'. The labels are kept so the CSV schema does not change for
    # existing readers -- consider 'Gene1'/'Gene2' as in the predicted-edges file.
    gene_gene_scores.append({
        'Drug': idx1,
        'Gene': idx2,
        'Relation': relation_name,
        'Score': score
    })

# Build the DataFrame (under its own name, so the drug-gene frame is not overwritten)
gene_gene_df = pd.DataFrame(gene_gene_scores)

# Save as CSV
gene_gene_df.to_csv(r'gene_gene_existing_hitter.csv', index=False)

## Predicted Edges Score Calculation

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from itertools import product
from tqdm import tqdm
from joblib import Parallel, delayed

In [None]:
drugs = [*drug_pool_final]  # entity ids 17066~18359
genes = [*gene_pool_final]  # entity ids 205~17065
n_gene = len(genes)
n_drug = len(drugs)

# Aliases (same list objects) for the filtered relation pools.
rel_drug_gene = drug_gene_pool_final  # relation indices 0~6
rel_gene_gene = gene_gene_pool_final  # relation indices 7~14

### Prediction (Drug - Gene)

In [None]:
# Collect the (drug id, gene id) pairs of all known drug -> gene edges so they can be
# excluded from the prediction candidates.

# One-off lookup entity name -> entity id (index of transe_entity), first occurrence
# wins. Replaces two full DataFrame scans per edge.
entity_id_by_name = {}
for ent_id, ent_name in transe_entity['name'].items():
    entity_id_by_name.setdefault(ent_name, ent_id)

existing_edges = set()
for edge in TP_edges_drug_gene.itertuples(index=False):
    idx1 = entity_id_by_name.get(edge.Entity1)
    idx2 = entity_id_by_name.get(edge.Entity2)
    if idx1 is None or idx2 is None:
        continue  # endpoint unknown to the embedding tables
    existing_edges.add((idx1, idx2))

In [None]:
# Inclusive entity-id and relation-index bounds used for drug -> gene prediction.
drug_start, drug_end = 17066, 18359
gene_start, gene_end = 205, 17065
rel_start, rel_end = 0, 6

In [None]:
# Drug -> gene relation names (DRUG_<interaction>_GENE), listed in the order of
# relation indices 0..6; used as score-column labels below.
drug_gene_pool = [
    f'DRUG_{interaction}_GENE'
    for interaction in (
        'ACTIVATION', 'BINDING', 'CATALYSIS', 'INHIBITION',
        'BINDACT', 'BINDINH', 'REACTION',
    )
]

In [None]:
model = 'hitter'

# Every (drug id, gene id) pair that is not already a known edge. Genes form the
# outer loop, which fixes the row order of the output file.
candidate_pairs = [
    (d, g)
    for g in range(gene_start, gene_end + 1)
    for d in range(drug_start, drug_end + 1)
    if (d, g) not in existing_edges
]

# One score per relation index in [rel_start, rel_end] for every candidate pair.
# NOTE(review): the candidate list, this dict and `rows` below each hold one entry per
# pair; for the full id ranges that is a very large in-memory footprint.
drug_gene_scores = {
    (d, g): [hitter_score(d, g, r) for r in range(rel_start, rel_end + 1)]
    for d, g in tqdm(candidate_pairs, desc="Computing scores", ncols=100)
}

rows = [[d, g] + scores for (d, g), scores in drug_gene_scores.items()]

# Computed from the bounds so an empty candidate set does not raise StopIteration.
num_rel = rel_end - rel_start + 1
# Label each score column with the relation that was actually scored: row r of the
# relation-embedding arrays belongs to relation_list[r].
columns = ['Drug', 'Gene'] + relation_list[rel_start:rel_end + 1]

drug_gene_df = pd.DataFrame(rows, columns=columns)
drug_gene_df.to_csv(r'drug_gene_predicted_hitter.csv', index=False)

### Prediction (Gene - Gene)

In [None]:
# Entity name -> entity id (index of transe_entity); a later duplicate name overrides
# an earlier one.
name_to_idx = dict(zip(transe_entity['name'], transe_entity.index))

# (gene id, gene id) pairs of all known gene -> gene edges. A name missing from the
# lookup raises KeyError.
existing_edges = {
    (name_to_idx[edge.Entity1], name_to_idx[edge.Entity2])
    for edge in TP_edges_gene_gene.itertuples(index=False)
}

In [None]:
# Inclusive entity-id and relation-index bounds used for gene -> gene prediction.
gene_start, gene_end = 205, 17065
rel_start, rel_end = 7, 14

In [None]:
# Gene -> gene relation names, listed in the order of relation indices 7..14:
# the plain GENE_GENE edge (rel 7) first, then GENE_<interaction>_GENE (rel 8..14).
gene_gene_pool = ['GENE_GENE'] + [
    f'GENE_{interaction}_GENE'
    for interaction in (
        'ACTIVATION', 'BINDING', 'CATALYSIS', 'EXPRESSION',
        'INHIBITION', 'REACTION', 'PTMOD',
    )
]

In [None]:
# Every ordered (gene id, gene id) pair of distinct genes that is not already a known
# edge. Both loops use the same gene_start/gene_end bounds instead of mixing them
# with hard-coded literals.
# NOTE(review): this materialises on the order of (number of genes)^2 tuples; for
# the full gene range that is a very large list.
candidate_pairs = [
    (g1, g2)
    for g1 in range(gene_start, gene_end + 1)
    for g2 in range(gene_start, gene_end + 1)
    if g1 != g2 and (g1, g2) not in existing_edges
]

In [None]:
progress_bar = tqdm(total=len(candidate_pairs), desc="Computing scores")


def compute_scores_with_progress(g1, g2):
    """Return ``[g1, g2, score_rel_start, ..., score_rel_end]`` and advance the progress bar.

    One hitter_score is computed per relation index in [rel_start, rel_end].
    """
    result = [g1, g2] + [
        hitter_score(g1, g2, r)
        for r in range(rel_start, rel_end + 1)
    ]
    progress_bar.update(1)  # advance the tqdm bar from inside each task
    return result


# Run in parallel; the bar is closed even if a task fails.
try:
    rows = Parallel(n_jobs=-1, backend="threading")(
        delayed(compute_scores_with_progress)(g1, g2) for g1, g2 in candidate_pairs
    )
finally:
    progress_bar.close()

# Label each score column with the relation that was actually scored: row r of the
# relation-embedding arrays belongs to relation_list[r].
columns = ['Gene1', 'Gene2'] + relation_list[rel_start:rel_end + 1]
gene_gene_df = pd.DataFrame(rows, columns=columns)
gene_gene_df.to_csv(r'gene_gene_predicted_hitter.csv', index=False)