In [None]:
import ast
import pandas as pd
import numpy as np

# Data Preparation (Existing)

In [None]:
# Load the tab-separated edge list. The file is read without a header row, so
# the column names are supplied here; lines pandas cannot parse are skipped.
dataset = r'HQ_DIR\out\edges_relation_4type_filtered.csv'
col_names = ['entity1', 'relation', 'entity2', '0', '1']
fulldf = (
    pd.read_csv(
        dataset,
        sep='\t',
        header=None,
        names=col_names,
        encoding='utf-8',
        on_bad_lines='skip',
    )
    # Only the (entity1, relation, entity2) triple is kept; the two trailing
    # columns named '0' and '1' are dropped.
    .drop(columns=['0', '1'])
)

In [None]:
# Draw a 10% random sample of the edges; the fixed random_state makes the draw
# repeatable. The candidate frames built below are derived from this sample.
testdf = fulldf.sample(frac=0.1, random_state=42)

In [None]:
# Relation labels that compete with each other for a drug-gene edge.
drug_gene_relation_candidates = [
    'DRUG_ACTIVATION_GENE',
    'DRUG_BINDING_GENE',
    'DRUG_CATALYSIS_GENE',
    'DRUG_INHIBITION_GENE',
    'DRUG_BINDACT_GENE',
    'DRUG_BINDINH_GENE',
    'DRUG_REACTION_GENE',
]

# For every sampled drug-gene edge, emit one row per candidate relation:
# each row first receives the full candidate list, which explode() then
# unrolls into one row per list element.
drug_gene_test_df = (
    testdf.loc[testdf['relation'].isin(drug_gene_relation_candidates)]
    .assign(candidate=lambda edges: [drug_gene_relation_candidates] * len(edges))
    .explode('candidate')
    .reset_index(drop=True)
)

In [None]:
# Relation labels that compete with each other for a gene-gene edge.
gene_gene_relation_candidates = [
    'GENE_GENE',
    'GENE_ACTIVATION_GENE',
    'GENE_BINDING_GENE',
    'GENE_CATALYSIS_GENE',
    'GENE_EXPRESSION_GENE',
    'GENE_INHIBITION_GENE',
    'GENE_REACTION_GENE',
    'GENE_PTMOD_GENE',
]

# For every sampled gene-gene edge, emit one row per candidate relation:
# each row first receives the full candidate list, which explode() then
# unrolls into one row per list element.
gene_gene_test_df = (
    testdf.loc[testdf['relation'].isin(gene_gene_relation_candidates)]
    .assign(candidate=lambda edges: [gene_gene_relation_candidates] * len(edges))
    .explode('candidate')
    .reset_index(drop=True)
)

In [None]:
def clean_entity(e):
    """Parse an entity-embedding text dump into a DataFrame indexed by entity id.

    Parameters
    ----------
    e : str or path-like
        Embedding file. Every non-blank line must look like
        ``<prefix>_<id>: <python list literal>``, e.g. ``Entity_12: [0.1, 0.2]``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the integer id (sorted ascending), with a leading ``name``
        column looked up in ``all\\entity2id.txt`` followed by ``dim_0 .. dim_k``.
        Ids that are absent from the id file get a missing ``name``.

    Raises
    ------
    ValueError, SyntaxError
        If a vector is not a plain Python literal, or a line does not split
        into exactly a key and a vector.
    """
    entity2id_path = r'all\entity2id.txt'
    # The mapping is stored as id -> name so it can be applied to the id index.
    id2entity = {}
    with open(entity2id_path, 'r') as f:
        next(f)  # the first line of the id file is skipped
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, idx = line.split('\t')
            id2entity[int(idx)] = name

    entity_embedding = {}
    with open(e, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of failing on the unpack below.
                continue
            entity, vector = line.split(': ', 1)
            entity_num = int(entity.split('_')[1])
            # literal_eval only accepts literals, so file content can never be
            # executed as code (the previous eval() call would run it).
            entity_embedding[entity_num] = ast.literal_eval(vector.strip())

    embedding_df = pd.DataFrame.from_dict(entity_embedding, orient='index')
    embedding_df.columns = [f'dim_{i}' for i in range(embedding_df.shape[1])]
    embedding_df['name'] = embedding_df.index.map(id2entity)
    # Move 'name' to the front so that columns 1.. are exactly the vector.
    embedding_df = embedding_df[['name'] + [col for col in embedding_df.columns if col != 'name']]
    embedding_df = embedding_df.sort_index()

    return embedding_df


def clean_relation(r):
    """Parse a relation-embedding text dump into a DataFrame indexed by relation id.

    Parameters
    ----------
    r : str or path-like
        Embedding file. Every non-blank line must look like
        ``<prefix>_<id>: <python list literal>``, e.g. ``Relation_3: [0.1, 0.2]``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the integer id (sorted ascending), with a leading ``name``
        column looked up in ``all\\relation2id.txt`` followed by ``dim_0 .. dim_k``.
        Ids that are absent from the id file get a missing ``name``.

    Raises
    ------
    ValueError, SyntaxError
        If a vector is not a plain Python literal, or a line does not split
        into exactly a key and a vector.
    """
    relation2id_path = r'all\relation2id.txt'
    # The mapping is stored as id -> name so it can be applied to the id index.
    id2relation = {}
    with open(relation2id_path, 'r') as f:
        next(f)  # the first line of the id file is skipped
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, idx = line.split('\t')
            id2relation[int(idx)] = name

    relation_embedding = {}
    with open(r, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of failing on the unpack below.
                continue
            relation, vector = line.split(': ', 1)
            relation_num = int(relation.split('_')[1])
            # literal_eval only accepts literals, so file content can never be
            # executed as code (the previous eval() call would run it).
            relation_embedding[relation_num] = ast.literal_eval(vector.strip())

    embedding_df = pd.DataFrame.from_dict(relation_embedding, orient='index')
    embedding_df.columns = [f'dim_{i}' for i in range(embedding_df.shape[1])]
    embedding_df['name'] = embedding_df.index.map(id2relation)
    # Move 'name' to the front so that columns 1.. are exactly the vector.
    embedding_df = embedding_df[['name'] + [col for col in embedding_df.columns if col != 'name']]
    embedding_df = embedding_df.sort_index()

    return embedding_df


def clean_norm(n):
    """Parse a per-relation norm-vector text dump into a DataFrame indexed by relation id.

    Parameters
    ----------
    n : str or path-like
        Norm-vector file. Every non-blank line must look like
        ``<a>_<b>_<id>: <python list literal>`` (the id is the third
        underscore-separated field), e.g. ``Norm_Vector_3: [0.1, 0.2]``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the integer id (sorted ascending), with a leading ``name``
        column looked up in ``all\\relation2id.txt`` followed by ``dim_0 .. dim_k``.
        Ids that are absent from the id file get a missing ``name``.

    Raises
    ------
    ValueError, SyntaxError
        If a vector is not a plain Python literal, or a line does not split
        into exactly a key and a vector.
    """
    relation2id_path = r'all\relation2id.txt'
    # The mapping is stored as id -> name so it can be applied to the id index.
    id2relation = {}
    with open(relation2id_path, 'r') as f:
        next(f)  # the first line of the id file is skipped
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, idx = line.split('\t')
            id2relation[int(idx)] = name

    norm_embedding = {}
    with open(n, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of failing on the unpack below.
                continue
            norm, vector = line.split(': ', 1)
            norm_num = int(norm.split('_')[2])
            # literal_eval only accepts literals, so file content can never be
            # executed as code (the previous eval() call would run it).
            norm_embedding[norm_num] = ast.literal_eval(vector.strip())

    embedding_df = pd.DataFrame.from_dict(norm_embedding, orient='index')
    embedding_df.columns = [f'dim_{i}' for i in range(embedding_df.shape[1])]
    embedding_df['name'] = embedding_df.index.map(id2relation)
    # Move 'name' to the front so that columns 1.. are exactly the vector.
    embedding_df = embedding_df[['name'] + [col for col in embedding_df.columns if col != 'name']]
    embedding_df = embedding_df.sort_index()

    return embedding_df


def clean_matrix(m):
    """Parse a per-relation matrix dump into ``{relation name: DataFrame}``.

    The file at ``m`` is a sequence of sections. A line starting with
    ``Matrix_`` opens a section and carries the relation id
    (``Matrix_<id>:``); every following non-blank line is one matrix row
    written as a Python list literal. Relation ids are translated to names
    through ``all\\relation2id.txt``; an id missing from that file raises
    ``KeyError``.
    """
    id2relation = {}
    with open(r'all\relation2id.txt', 'r') as handle:
        next(handle)  # the first line of the id file is skipped
        for raw in handle:
            raw = raw.strip()
            if raw:
                name, idx = raw.split('\t')
                id2relation[int(idx)] = name  # int id -> relation name

    rows_by_id = {}
    active_rows = None  # row list of the section currently being read
    with open(m, 'r') as handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            if raw.startswith('Matrix_'):
                relation_id = int(raw.split('_')[1].rstrip(':'))
                # A repeated id starts over, so the last section for an id wins.
                active_rows = rows_by_id[relation_id] = []
            else:
                parsed_row = ast.literal_eval(raw)
                # Rows that appear before the first header are parsed but not kept.
                if active_rows is not None:
                    active_rows.append(parsed_row)

    return {
        id2relation[relation_id]: pd.DataFrame(rows)
        for relation_id, rows in rows_by_id.items()
    }

In [None]:
# TransE
# `entity` / `relation` are scratch path variables; every section below rebinds
# them, so after this cell they hold the paths of the last section (HittER).
entity = r'transe_entity_embeddings.txt'
relation = r'transe_relation_embeddings.txt'
transe_entity = clean_entity(entity)
transe_relation = clean_relation(relation)


# TransH
# In addition to entity/relation vectors, a third file is parsed with clean_norm.

entity = r'transh_entity_embeddings.txt'
relation = r'transh_relation_embeddings.txt'
norm = r'transh_norm_vector_embeddings.txt'
transh_entity = clean_entity(entity)
transh_relation = clean_relation(relation)
transh_norm = clean_norm(norm)


# TransR
# clean_matrix returns a dict {relation name: DataFrame}, unlike the other
# loaders, which return a single DataFrame.

entity = r'transr_entity_embeddings.txt'
relation = r'transr_relation_embeddings.txt'
matrix = r'transr_matrix_embeddings.txt'
transr_entity = clean_entity(entity)
transr_relation = clean_relation(relation)
transr_matrix = clean_matrix(matrix)


# ComplEx
# Real and imaginary parts are stored in separate files for both entities and relations.

entity_im = r'complex_entity_im_embeddings.txt'
entity_re = r'complex_entity_re_embeddings.txt'
relation_im = r'complex_relation_im_embeddings.txt'
relation_re = r'complex_relation_re_embeddings.txt'
complex_im_entity = clean_entity(entity_im)
complex_re_entity = clean_entity(entity_re)
complex_im_relation = clean_relation(relation_im)
complex_re_relation = clean_relation(relation_re)


# RotatE
# One entity file holds both parts; it is split by column below: dim_0..dim_49
# become the "im" frame and dim_50..dim_99 the "re" frame.
# NOTE(review): the 50/100 split is hard-coded, so this assumes exactly 100
# entity dimensions -- confirm. Also confirm that the first half really is the
# imaginary part in the training code that produced the file.

entity = r'rotate_entity_embeddings.txt'
relation = r'rotate_relation_embeddings.txt'
rotate_entity = clean_entity(entity)
rotate_relation = clean_relation(relation)
rotate_entity_im = rotate_entity[['name'] + [f'dim_{i}' for i in range(50)]].copy()
rotate_entity_re = rotate_entity[['name'] + [f'dim_{i}' for i in range(50, 100)]].copy()


# R-GCN
entity = r'rgcn_entity_embeddings.txt'
relation = r'rgcn_relation_embeddings.txt'
rgcn_entity = clean_entity(entity)
rgcn_relation = clean_relation(relation)


# HittER
entity = r'hitter_entity_embeddings.txt'
relation = r'hitter_relation_embeddings.txt'
hitter_entity = clean_entity(entity)
hitter_relation = clean_relation(relation)

# Score Functions

In [None]:
def get_vector(df, name):
    """Return the embedding stored for ``name`` as a float NumPy array.

    ``df`` must have a ``name`` column in first position; every remaining
    column of the first matching row is taken as the vector. When no row
    matches, a warning is printed and ``None`` is returned.
    """
    matches = df.loc[df['name'] == name]
    if len(matches) == 0:
        print(f"Warning: Name '{name}' not found in the DataFrame")
        return None
    # Column 0 is 'name'; everything after it is the vector itself.
    first_match = matches.iloc[0, 1:]
    return first_match.values.astype(float)

In [None]:
# Build one float32 matrix per model; row i holds the vector of entity_list[i].
# NOTE(review): `entity_list` is not defined in any cell visible in this
# notebook -- confirm where it comes from, otherwise this cell fails on a
# fresh kernel. Each lookup goes through get_vector, which scans the frame.
def _entity_matrix(frame):
    """Stack the vector of every name in ``entity_list``, looked up in ``frame``."""
    return np.array([get_vector(frame, n) for n in entity_list], dtype=np.float32)


transe_entity_embeddings = _entity_matrix(transe_entity)
transh_entity_embeddings = _entity_matrix(transh_entity)
transr_entity_embeddings = _entity_matrix(transr_entity)

complex_im_entity_embeddings = _entity_matrix(complex_im_entity)
complex_re_entity_embeddings = _entity_matrix(complex_re_entity)
rotate_entity_im_embeddings = _entity_matrix(rotate_entity_im)
rotate_entity_re_embeddings = _entity_matrix(rotate_entity_re)

rgcn_entity_embeddings = _entity_matrix(rgcn_entity)
hitter_entity_embeddings = _entity_matrix(hitter_entity)

In [None]:
# Build one float32 matrix per model; row i holds the vector of relation_list[i].
# NOTE(review): `relation_list` is not defined in any cell visible in this
# notebook -- confirm where it comes from, otherwise this cell fails on a
# fresh kernel.
def _relation_matrix(frame):
    """Stack the vector of every name in ``relation_list``, looked up in ``frame``."""
    return np.array([get_vector(frame, n) for n in relation_list], dtype=np.float32)


transe_relation_embeddings = _relation_matrix(transe_relation)
transh_relation_embeddings = _relation_matrix(transh_relation)
transh_norm_embeddings = _relation_matrix(transh_norm)
transr_relation_embeddings = _relation_matrix(transr_relation)

complex_im_relation_embeddings = _relation_matrix(complex_im_relation)
complex_re_relation_embeddings = _relation_matrix(complex_re_relation)
rotate_relation_embeddings = _relation_matrix(rotate_relation)

rgcn_relation_embeddings = _relation_matrix(rgcn_relation)
hitter_relation_embeddings = _relation_matrix(hitter_relation)

In [None]:
# TransE
def transe_score(h, t, r):
    """TransE plausibility of (h, r, t): the negated L2 norm of ``h + r - t``.

    ``h`` and ``t`` index rows of ``transe_entity_embeddings``; ``r`` indexes
    a row of ``transe_relation_embeddings``. Higher (closer to 0) is better.
    """
    head = transe_entity_embeddings[h]
    tail = transe_entity_embeddings[t]
    translation = transe_relation_embeddings[r]
    residual = head + translation - tail
    return -np.linalg.norm(residual, ord=2)

# TransH
def transh_score(h, t, r):
    """TransH plausibility of (h, r, t): negated squared L2 distance on the relation hyperplane.

    ``h`` and ``t`` index rows of ``transh_entity_embeddings``; ``r`` indexes
    both ``transh_relation_embeddings`` (translation) and
    ``transh_norm_embeddings`` (hyperplane normal).
    """
    head = transh_entity_embeddings[h]
    tail = transh_entity_embeddings[t]
    translation = transh_relation_embeddings[r]
    normal = transh_norm_embeddings[r]

    # Remove from each entity its component along the normal vector.
    head_on_plane = head - np.dot(normal, head) * normal
    tail_on_plane = tail - np.dot(normal, tail) * normal

    distance = np.linalg.norm(head_on_plane + translation - tail_on_plane, ord=2)
    return -(distance ** 2)

# TransR
def transr_score(h, t, r):
    """TransR plausibility of (h, r, t): ``-||M_r h + r_emb - M_r t||_2 ** 2``.

    Parameters
    ----------
    h, t : int
        Row positions in ``transr_entity_embeddings``.
    r : int
        Row position in ``transr_relation_embeddings``; also used as the index
        label in ``transr_relation`` to find the relation name that keys
        ``transr_matrix``.
        NOTE(review): this assumes the position in the relation array equals
        the relation id -- confirm how ``relation_list`` is ordered.

    Returns
    -------
    float
        Negated squared L2 distance; higher (closer to 0) is better.
    """
    h_emb = transr_entity_embeddings[h]
    t_emb = transr_entity_embeddings[t]
    r_emb = transr_relation_embeddings[r]
    # Look the name up in the TransR relation table (not the TransE one) so the
    # function depends only on TransR artefacts.
    M_r = transr_matrix[transr_relation.loc[r, 'name']].values

    # Project both entities with the relation-specific matrix.
    h_proj = M_r @ h_emb
    t_proj = M_r @ t_emb

    # The translation must be the relation *embedding*; the previous code added
    # the integer index `r` to every component instead.
    score = -np.linalg.norm(h_proj + r_emb - t_proj, ord=2) ** 2

    return score

# ComplEx
def complex_score(h, t, r):
    """ComplEx plausibility of (h, r, t): summed real part of ``h * r * conj(t)``.

    ``h`` and ``t`` index rows of the real/imaginary entity arrays; ``r``
    indexes rows of the real/imaginary relation arrays. Higher is better.
    """
    head_re, head_im = complex_re_entity_embeddings[h], complex_im_entity_embeddings[h]
    tail_re, tail_im = complex_re_entity_embeddings[t], complex_im_entity_embeddings[t]
    rel_re, rel_im = complex_re_relation_embeddings[r], complex_im_relation_embeddings[r]

    # Four-term expansion of the real part, kept in the original term order.
    per_dimension = (
          head_re * tail_re * rel_re
        + head_im * tail_im * rel_re
        + head_re * tail_im * rel_im
        - head_im * tail_re * rel_im
    )
    return np.sum(per_dimension)

# RotatE
def rotate_score(h, t, r):
    """RotatE plausibility of (h, r, t): negated sum of per-dimension complex moduli.

    The head is rotated by the relation phase (complex multiplication with
    ``cos(phase) + i*sin(phase)``), the tail is subtracted, and the modulus of
    every complex component is summed. Higher (closer to 0) is better.

    ``h`` and ``t`` index rows of the real/imaginary entity arrays; ``r``
    indexes a row of ``rotate_relation_embeddings``.

    NOTE(review): the phase is computed as ``r_emb / np.pi``; confirm this
    matches the scaling used by the code that trained the embeddings.
    """
    head_re, head_im = rotate_entity_re_embeddings[h], rotate_entity_im_embeddings[h]
    tail_re, tail_im = rotate_entity_re_embeddings[t], rotate_entity_im_embeddings[t]
    embedding_range = np.pi

    phase_relation = rotate_relation_embeddings[r] / (embedding_range)
    rot_re = np.cos(phase_relation)
    rot_im = np.sin(phase_relation)

    # (head_re + i*head_im) * (rot_re + i*rot_im), then subtract the tail.
    re_diff = (head_re * rot_re - head_im * rot_im) - tail_re
    im_diff = (head_re * rot_im + head_im * rot_re) - tail_im

    moduli = np.sqrt(re_diff ** 2 + im_diff ** 2)
    return - np.sum(moduli)

# R-GCN
def rgcn_score(h, t, r):
    """Score (h, r, t) on the R-GCN embeddings as the negated L2 norm of ``h + r - t``.

    ``h`` and ``t`` index rows of ``rgcn_entity_embeddings``; ``r`` indexes a
    row of ``rgcn_relation_embeddings``. Higher (closer to 0) is better.
    """
    head = rgcn_entity_embeddings[h]
    tail = rgcn_entity_embeddings[t]
    translation = rgcn_relation_embeddings[r]
    residual = head + translation - tail
    return -np.linalg.norm(residual, ord=2)

# HittER
def hitter_score(h, t, r):
    """Score (h, r, t) on the HittER embeddings as the sum of the element-wise product ``h * r * t``.

    ``h`` and ``t`` index rows of ``hitter_entity_embeddings``; ``r`` indexes
    a row of ``hitter_relation_embeddings``. Higher is better.
    """
    head = hitter_entity_embeddings[h]
    tail = hitter_entity_embeddings[t]
    relation_vec = hitter_relation_embeddings[r]
    per_dimension = head * relation_vec * tail
    return np.sum(per_dimension)

In [None]:
# TransE

# NOTE(review): transe_score indexes NumPy arrays with its arguments, while
# `candidate` is filled from lists of relation-name strings -- confirm the
# columns hold integer positions by the time this cell runs.
def _score_with_transe(frame):
    """Return a copy of ``frame`` with a ``score`` column computed by ``transe_score``."""
    scored = frame.copy()
    scored['score'] = scored.apply(
        lambda row: transe_score(row['entity1'], row['entity2'], row['candidate']),
        axis=1,
    )
    return scored


# Score both candidate frames, then stack them and persist the result.
drug_gene_test_df_transe = _score_with_transe(drug_gene_test_df)
gene_gene_test_df_transe = _score_with_transe(gene_gene_test_df)

test_transe = pd.concat([drug_gene_test_df_transe, gene_gene_test_df_transe], ignore_index=True)
test_transe.to_csv(r'test_transe.csv', index=False)

In [None]:
# TransH

# NOTE(review): transh_score indexes NumPy arrays with its arguments, while
# `candidate` is filled from lists of relation-name strings -- confirm the
# columns hold integer positions by the time this cell runs.
def _score_with_transh(frame):
    """Return a copy of ``frame`` with a ``score`` column computed by ``transh_score``."""
    scored = frame.copy()
    scored['score'] = scored.apply(
        lambda row: transh_score(row['entity1'], row['entity2'], row['candidate']),
        axis=1,
    )
    return scored


# Score both candidate frames, then stack them and persist the result.
drug_gene_test_df_transh = _score_with_transh(drug_gene_test_df)
gene_gene_test_df_transh = _score_with_transh(gene_gene_test_df)

test_transh = pd.concat([drug_gene_test_df_transh, gene_gene_test_df_transh], ignore_index=True)
test_transh.to_csv(r'test_transh.csv', index=False)

In [None]:
# TransR

# NOTE(review): transr_score indexes NumPy arrays with its arguments, while
# `candidate` is filled from lists of relation-name strings -- confirm the
# columns hold integer positions by the time this cell runs.
def _score_with_transr(frame):
    """Return a copy of ``frame`` with a ``score`` column computed by ``transr_score``."""
    scored = frame.copy()
    scored['score'] = scored.apply(
        lambda row: transr_score(row['entity1'], row['entity2'], row['candidate']),
        axis=1,
    )
    return scored


# Score both candidate frames, then stack them and persist the result.
drug_gene_test_df_transr = _score_with_transr(drug_gene_test_df)
gene_gene_test_df_transr = _score_with_transr(gene_gene_test_df)

test_transr = pd.concat([drug_gene_test_df_transr, gene_gene_test_df_transr], ignore_index=True)
test_transr.to_csv(r'test_transr.csv', index=False)

In [None]:
# ComplEx

# NOTE(review): complex_score indexes NumPy arrays with its arguments, while
# `candidate` is filled from lists of relation-name strings -- confirm the
# columns hold integer positions by the time this cell runs.
def _score_with_complex(frame):
    """Return a copy of ``frame`` with a ``score`` column computed by ``complex_score``."""
    scored = frame.copy()
    scored['score'] = scored.apply(
        lambda row: complex_score(row['entity1'], row['entity2'], row['candidate']),
        axis=1,
    )
    return scored


# Score both candidate frames, then stack them and persist the result.
drug_gene_test_df_complex = _score_with_complex(drug_gene_test_df)
gene_gene_test_df_complex = _score_with_complex(gene_gene_test_df)

test_complex = pd.concat([drug_gene_test_df_complex, gene_gene_test_df_complex], ignore_index=True)
test_complex.to_csv(r'test_complex.csv', index=False)

In [None]:
# RotatE

# NOTE(review): rotate_score indexes NumPy arrays with its arguments, while
# `candidate` is filled from lists of relation-name strings -- confirm the
# columns hold integer positions by the time this cell runs.
def _score_with_rotate(frame):
    """Return a copy of ``frame`` with a ``score`` column computed by ``rotate_score``."""
    scored = frame.copy()
    scored['score'] = scored.apply(
        lambda row: rotate_score(row['entity1'], row['entity2'], row['candidate']),
        axis=1,
    )
    return scored


# Score both candidate frames, then stack them and persist the result.
drug_gene_test_df_rotate = _score_with_rotate(drug_gene_test_df)
gene_gene_test_df_rotate = _score_with_rotate(gene_gene_test_df)

test_rotate = pd.concat([drug_gene_test_df_rotate, gene_gene_test_df_rotate], ignore_index=True)
test_rotate.to_csv(r'test_rotate.csv', index=False)

In [None]:
# R-GCN

# NOTE(review): rgcn_score indexes NumPy arrays with its arguments, while
# `candidate` is filled from lists of relation-name strings -- confirm the
# columns hold integer positions by the time this cell runs.
def _score_with_rgcn(frame):
    """Return a copy of ``frame`` with a ``score`` column computed by ``rgcn_score``."""
    scored = frame.copy()
    scored['score'] = scored.apply(
        lambda row: rgcn_score(row['entity1'], row['entity2'], row['candidate']),
        axis=1,
    )
    return scored


# Score both candidate frames, then stack them and persist the result.
drug_gene_test_df_rgcn = _score_with_rgcn(drug_gene_test_df)
gene_gene_test_df_rgcn = _score_with_rgcn(gene_gene_test_df)

test_rgcn = pd.concat([drug_gene_test_df_rgcn, gene_gene_test_df_rgcn], ignore_index=True)
test_rgcn.to_csv(r'test_rgcn.csv', index=False)

In [None]:
# HittER

# NOTE(review): hitter_score indexes NumPy arrays with its arguments, while
# `candidate` is filled from lists of relation-name strings -- confirm the
# columns hold integer positions by the time this cell runs.
def _score_with_hitter(frame):
    """Return a copy of ``frame`` with a ``score`` column computed by ``hitter_score``."""
    scored = frame.copy()
    scored['score'] = scored.apply(
        lambda row: hitter_score(row['entity1'], row['entity2'], row['candidate']),
        axis=1,
    )
    return scored


# Score both candidate frames, then stack them and persist the result.
drug_gene_test_df_hitter = _score_with_hitter(drug_gene_test_df)
gene_gene_test_df_hitter = _score_with_hitter(gene_gene_test_df)

test_hitter = pd.concat([drug_gene_test_df_hitter, gene_gene_test_df_hitter], ignore_index=True)
test_hitter.to_csv(r'test_hitter.csv', index=False)

# Normalization

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Reload the per-model score tables written by the scoring cells, in a fixed
# order. Note that the name `complex` shadows the Python builtin of that name.
transe, transh, transr, complex, rotate, rgcn, hitter = map(
    pd.read_csv,
    (
        r'test_transe.csv',
        r'test_transh.csv',
        r'test_transr.csv',
        r'test_complex.csv',
        r'test_rotate.csv',
        r'test_rgcn.csv',
        r'test_hitter.csv',
    ),
)

In [None]:
# Min-max scale each model's `score` column in place so that scores from
# different models are comparable before they are averaged.
# rgcn and hitter are included here: they were previously left on their raw
# scale, although the soft-voting ensembles average them with each other and
# with the already-scaled models. Scaling is monotonic per model, so
# single-model rankings and hard voting are unaffected.
scaler = MinMaxScaler()
for _model_scores in (transe, transh, transr, complex, rotate, rgcn, hitter):
    # fit_transform refits on every frame, so each model is scaled independently.
    _model_scores['score'] = scaler.fit_transform(_model_scores[['score']])

# Ensemble

In [None]:
# Tag every row with the model that produced it (adds a `model` column in place).
for _label, _frame in (
    ('transe', transe),
    ('transh', transh),
    ('transr', transr),
    ('complex', complex),
    ('rotate', rotate),
    ('rgcn', rgcn),
    ('hitter', hitter),
):
    _frame['model'] = _label

# Three model groups; every p*_df is the row-wise stack of one or more groups,
# always in group order 1, 2, 3.
_group1 = [transe, transh, transr]
_group2 = [complex, rotate]
_group3 = [rgcn, hitter]

p1_df = pd.concat(_group1, ignore_index=True)
p2_df = pd.concat(_group2, ignore_index=True)
p3_df = pd.concat(_group3, ignore_index=True)
p12_df = pd.concat(_group1 + _group2, ignore_index=True)
p23_df = pd.concat(_group2 + _group3, ignore_index=True)
p13_df = pd.concat(_group1 + _group3, ignore_index=True)
p123_df = pd.concat(_group1 + _group2 + _group3, ignore_index=True)

### Best Model

In [None]:
# Model evaluated on its own by the precision cell below; `hitter` is the
# frame loaded from test_hitter.csv. Rebind to another model frame to evaluate it.
single = hitter

In [None]:
# Precision

# For every (entity1, relation, entity2) triple keep only its highest-scoring
# candidate row.
single_filter = (
    single
    .sort_values('score', ascending=False)
    .drop_duplicates(subset=['entity1', 'relation', 'entity2'])
)

# Rank the surviving rows once; every cutoff then reads a prefix of that ranking.
_ranked_single = single_filter.sort_values('score', ascending=False)
for top_n in [1, 3, 5, 10, 50, 100, len(single_filter)]:
    top_n_df = _ranked_single.head(top_n)

    # A row counts as correct when the kept candidate equals the true relation.
    _n_correct = int((top_n_df['relation'] == top_n_df['candidate']).sum())
    precision_n = _n_correct / len(top_n_df)
    print(f"Top {top_n} precision: {precision_n:.10f}")

### Hard Voting

In [None]:
# Ensemble evaluated by the hard-voting cells below; p3_df stacks the rgcn and
# hitter frames. Rebind to another p*_df to evaluate a different combination.
all_df = p3_df

In [None]:
# One row per (triple, model): the candidate that model scored highest.
_by_score = all_df.sort_values('score', ascending=False)
top_per_model = _by_score.drop_duplicates(
    subset=['entity1', 'relation', 'entity2', 'model']
)


def _most_voted_candidate(votes):
    """Candidate chosen by the largest number of models for one triple."""
    return votes.value_counts().idxmax()


def _most_voted_count(votes):
    """How many models chose the most-voted candidate for one triple."""
    return votes.value_counts().max()


# Per triple: the winning candidate and the number of models that voted for it.
hardvoting = (
    top_per_model
    .groupby(['entity1', 'relation', 'entity2'])['candidate']
    .agg(candidate=_most_voted_candidate, cnt=_most_voted_count)
    .reset_index()
)

In [None]:
# Precision

# Rank triples by how many models agreed on the winning candidate; every
# cutoff then reads a prefix of that ranking.
_ranked_hardvoting = hardvoting.sort_values('cnt', ascending=False)
for top_n in [1, 3, 5, 10, 50, 100, len(hardvoting)]:
    hardvoting_df = _ranked_hardvoting.head(top_n)

    # A row counts as correct when the voted candidate equals the true relation.
    _n_correct = int((hardvoting_df['relation'] == hardvoting_df['candidate']).sum())
    precision_n = _n_correct / len(hardvoting_df)
    print(f"Top {top_n} precision: {precision_n:.10f}")

### Soft Voting

In [None]:
# Ensemble evaluated by the soft-voting cells below; p3_df stacks the rgcn and
# hitter frames. Rebind to another p*_df to evaluate a different combination.
all_df = p3_df

In [None]:
# Mean score over all models in the ensemble for every (triple, candidate).
_mean_scores = (
    all_df
    .groupby(['entity1', 'relation', 'entity2', 'candidate'])['score']
    .mean()
)
avg_score_df = _mean_scores.rename('avg score').reset_index()

# Per triple, keep only the candidate with the highest mean score.
_by_avg_score = avg_score_df.sort_values('avg score', ascending=False)
softvoting = (
    _by_avg_score
    .drop_duplicates(subset=['entity1', 'relation', 'entity2'])
    .reset_index(drop=True)
)

In [None]:
# Precision

# Rank triples by the mean score of their winning candidate; every cutoff
# then reads a prefix of that ranking.
_ranked_softvoting = softvoting.sort_values('avg score', ascending=False)
for top_n in [1, 3, 5, 10, 50, 100, len(softvoting)]:
    softvoting_df = _ranked_softvoting.head(top_n)

    # A row counts as correct when the winning candidate equals the true relation.
    _n_correct = int((softvoting_df['relation'] == softvoting_df['candidate']).sum())
    precision_n = _n_correct / len(softvoting_df)
    print(f"Top {top_n} precision: {precision_n:.10f}")

### Weighted Soft Voting

In [None]:
# MRR
# Per-model weights for weighted soft voting, keyed by the value of the
# `model` column. The weights are not required to sum to 1: the weighted
# average divides by the sum of the weights actually present.
weight = dict([
    ('transe', 0.094),
    ('transh', 0.097),
    ('transr', 0.101),
    ('complex', 0.117),
    ('rotate', 0.123),
    ('rgcn', 0.325),
    ('hitter', 0.238),
])

In [None]:
# Ensemble evaluated by the weighted soft-voting cells below; p1_df stacks the
# transe, transh and transr frames. `all_df` is an alias of p1_df, not a copy.
all_df = p1_df

In [None]:
# Attach each row's model weight (this adds a `weight` column to all_df in place).
all_df['weight'] = all_df['model'].map(weight)

# Per (triple, candidate): sum of weight * score and sum of weights over the
# models in the ensemble.
weighted_avg_score_df = (
    all_df
    .assign(weighted_score=all_df['score'] * all_df['weight'])
    .groupby(['entity1', 'relation', 'entity2', 'candidate'])
    .agg(
        weighted_score_sum=('weighted_score', 'sum'),
        weight_sum=('weight', 'sum'),
    )
)
# Weighted mean = weighted score sum / weight sum.
weighted_avg_score_df['weighted_avg'] = (
    weighted_avg_score_df['weighted_score_sum'] / weighted_avg_score_df['weight_sum']
)
weighted_avg_score_df = weighted_avg_score_df.reset_index()

# Per triple, keep only the candidate with the highest weighted mean.
_by_weighted_avg = weighted_avg_score_df.sort_values('weighted_avg', ascending=False)
weighted_softvoting = (
    _by_weighted_avg
    .drop_duplicates(subset=['entity1', 'relation', 'entity2'])
    .reset_index(drop=True)
)

In [None]:
# Precision

# Rank triples by the weighted mean score of their winning candidate; every
# cutoff then reads a prefix of that ranking.
_ranked_weighted = weighted_softvoting.sort_values('weighted_avg', ascending=False)
for top_n in [1, 3, 5, 10, 50, 100, len(weighted_softvoting)]:
    weighted_softvoting_df = _ranked_weighted.head(top_n)

    # A row counts as correct when the winning candidate equals the true relation.
    _n_correct = int(
        (weighted_softvoting_df['relation'] == weighted_softvoting_df['candidate']).sum()
    )
    precision_n = _n_correct / len(weighted_softvoting_df)
    print(f"Top {top_n} precision: {precision_n:.10f}")