In [None]:
import pandas as pd
from collections import defaultdict
import os

In [None]:
# Load the already-known drug-gene and gene-disease edge tables from the
# working directory.
drug_gene_existing, gene_dis_existing = (
    pd.read_csv(csv_name)
    for csv_name in (r'drug_gene_exist.csv', r'gene_dis_exist.csv')
)

In [None]:
# Keep only the predicted drug-gene edges whose Score is at or above the
# 0.9999 quantile of all predicted scores, and save that subset for the
# pathway-extraction cells.
drug_gene_pred = pd.read_csv(r'drug_gene_pred.csv')
threshold = drug_gene_pred['Score'].quantile(0.9999)
drug_gene_pred_001 = drug_gene_pred.loc[drug_gene_pred['Score'].ge(threshold)]
drug_gene_pred_001.to_csv(r'drug_gene_pred_001.csv', index=False)

In [None]:
# Keep only the predicted gene-gene edges whose Score is at or above the
# 0.9999 quantile of all predicted scores, and save that subset for the
# pathway-extraction cells.
gene_gene_pred = pd.read_csv(r'gene_gene_pred.csv')
threshold = gene_gene_pred['Score'].quantile(0.9999)
gene_gene_pred_001 = gene_gene_pred.loc[gene_gene_pred['Score'].ge(threshold)]
gene_gene_pred_001.to_csv(r'gene_gene_pred_001.csv', index=False)

# Pathway Extraction

In [None]:
from tqdm import tqdm
import pandas as pd

In [None]:
# Read the five edge tables used for pathway extraction: the three "existing"
# tables plus the two score-filtered prediction tables written by the earlier
# thresholding cells. Files are read in the order listed.
(
    drug_gene_existing,
    gene_gene_existing,
    gene_dis_existing,
    drug_gene_predicted,
    gene_gene_predicted,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        r'drug_gene_exist.csv',
        r'gene_gene_exist.csv',
        r'gene_dis_exist.csv',
        r'drug_gene_pred_001.csv',
        r'gene_gene_pred_001.csv',
    )
)

In [None]:
# Stack existing and predicted edges into one table per edge type, with a
# fresh 0..n-1 index. Gene-disease edges have no predicted counterpart here,
# so gene_dis is simply another name for the existing table (not a copy).
gene_dis = gene_dis_existing
drug_gene = pd.concat(objs=[drug_gene_existing, drug_gene_predicted], ignore_index=True)
gene_gene = pd.concat(objs=[gene_gene_existing, gene_gene_predicted], ignore_index=True)

In [None]:
# Distinct path start points (Entity1 of the drug-gene table) and end points
# (Entity2 of the gene-disease table).
unique_drugs = {start_node for start_node in drug_gene['Entity1']}
unique_diseases = {end_node for end_node in gene_dis['Entity2']}

In [None]:
# Length 2
# Enumerate every two-edge path drug -> gene -> disease: a drug_gene row and a
# gene_dis row are paired whenever the drug edge's Entity2 equals the disease
# edge's Entity1. One record per pair of matching rows is collected and the
# result is written to output_length2.csv.
all_results = []
edge_fields = ('Entity1', 'Relation', 'Entity2', 'Score')

# The disease side does not depend on the drug being processed, so slice
# gene_dis once per disease up front instead of re-scanning the whole table
# for every (drug, disease) combination. The list follows the iteration order
# of unique_diseases, so records come out in the same order as a plain nested
# loop over both sets.
disease_lookup = []
for disease in unique_diseases:
    disease_edges = gene_dis[gene_dis['Entity2'] == disease]
    disease_lookup.append((disease, disease_edges, set(disease_edges['Entity1'])))

for drug in tqdm(unique_drugs, desc="Drugs"):
    # Slice this drug's edges once; the per-gene lookups below only touch
    # this (small) slice rather than the full drug_gene table.
    drug_edges = drug_gene[drug_gene['Entity1'] == drug]
    drug_targets = set(drug_edges['Entity2'])

    for disease, disease_edges, disease_genes in tqdm(disease_lookup, desc="Diseases", leave=False):
        # Genes that are both an Entity2 of this drug and an Entity1 of this disease.
        common_genes = drug_targets & disease_genes

        for gene in common_genes:
            drug_rows = drug_edges[drug_edges['Entity2'] == gene]
            disease_rows = disease_edges[disease_edges['Entity1'] == gene]

            # Several rows may share the same endpoints; every combination of
            # a matching drug row and a matching disease row is its own path.
            for _, d_row in drug_rows.iterrows():
                for _, g_row in disease_rows.iterrows():
                    all_results.append({
                        'Length': 2,
                        'Path': [
                            {field: d_row[field] for field in edge_fields},
                            {field: g_row[field] for field in edge_fields},
                        ],
                        'StartDrug': drug,
                        'EndDisease': disease
                    })

result_df = pd.DataFrame(all_results)
result_df.to_csv(r'output_length2.csv', index=False)

In [None]:
# Length 3
# Enumerate every three-edge path drug -> gene1 -> gene2 -> disease in which
# gene1 is NOT itself linked to the disease (filtered_gene1 below) and gene2
# is. One record per combination of matching rows is collected and the result
# is written to output_length3.csv.
all_results = []
edge_fields = ('Entity1', 'Relation', 'Entity2', 'Score')

# The disease side does not depend on the drug being processed, so slice
# gene_dis once per disease up front instead of re-scanning it for every
# (drug, disease) combination. Order follows unique_diseases.
disease_lookup = []
for disease in unique_diseases:
    disease_edges = gene_dis[gene_dis['Entity2'] == disease]
    disease_lookup.append((disease, disease_edges, set(disease_edges['Entity1'])))

for drug in tqdm(unique_drugs, desc="Drugs"):
    # Slice this drug's edges, and the gene-gene edges leaving any of its
    # targets, once per drug; the disease loop only filters these slices.
    drug_edges = drug_gene[drug_gene['Entity1'] == drug]
    fromdrugtogene1 = set(drug_edges['Entity2'])
    outgoing_edges = gene_gene[gene_gene['Entity1'].isin(fromdrugtogene1)]

    for disease, disease_edges, fromdiseasetogene in tqdm(disease_lookup, desc="Diseases", leave=False):
        # First-hop genes must not already be linked to the disease directly.
        filtered_gene1 = fromdrugtogene1 - fromdiseasetogene
        bridge_edges = outgoing_edges[outgoing_edges['Entity1'].isin(filtered_gene1)]

        common_genes = set(bridge_edges['Entity2']) & fromdiseasetogene

        for gene in common_genes:
            # Only gene-gene rows that start at an allowed gene1 are used.
            # Looking these rows up in the full gene_gene table by Entity2
            # alone would re-admit gene1 values that filtered_gene1 excluded
            # whenever the same gene2 is also reachable from an allowed gene1.
            gene1_gene2_rows = bridge_edges[bridge_edges['Entity2'] == gene]
            gene_dis_rows = disease_edges[disease_edges['Entity1'] == gene]

            for _, gene_gene_row in gene1_gene2_rows.iterrows():
                gene1 = gene_gene_row['Entity1']
                drug_gene_rows = drug_edges[drug_edges['Entity2'] == gene1]

                # Every combination of matching drug row and disease row is
                # its own path through this gene1 -> gene2 edge.
                for _, drug_row in drug_gene_rows.iterrows():
                    for _, dis_row in gene_dis_rows.iterrows():
                        all_results.append({
                            'Length': 3,
                            'Path': [
                                {field: drug_row[field] for field in edge_fields},
                                {field: gene_gene_row[field] for field in edge_fields},
                                {field: dis_row[field] for field in edge_fields},
                            ],
                            'StartDrug': drug,
                            'EndDisease': disease
                        })

result_df = pd.DataFrame(all_results)
result_df.to_csv(r'output_length3.csv', index=False)

In [None]:
# Length 4
# Enumerate every four-edge path drug -> gene1 -> gene2 -> gene3 -> disease by
# walking forward from each drug edge. One record per combination of rows is
# collected and the result is written to output_length4.csv.
# NOTE(review): no check prevents a gene from repeating along a path (e.g.
# gene3 == gene1) and, unlike the length-3 cell, genes linked directly to the
# end disease are not excluded from earlier hops -- confirm this is intended.
all_results = []

# Index both tables by their source gene once. The same gene is reached from
# many different path prefixes, and a dictionary lookup replaces a full-table
# boolean scan for each of them. Rows inside each group keep their original
# table order, so records are produced in the same order as with the scans;
# rows whose Entity1 is missing are left out of the index, matching the fact
# that an equality comparison never selects them.
gene_edges_by_source = {
    source: rows for source, rows in gene_gene.groupby('Entity1', sort=False)
}
disease_edges_by_gene = {
    source: rows for source, rows in gene_dis.groupby('Entity1', sort=False)
}

for drug in tqdm(unique_drugs, desc="Drugs"):
    gene1s = drug_gene[drug_gene['Entity1'] == drug]

    for _, gene1_row in gene1s.iterrows():
        gene1 = gene1_row['Entity2']
        gene2s = gene_edges_by_source.get(gene1)
        if gene2s is None:
            # gene1 has no outgoing gene-gene edge.
            continue

        for _, gene2_row in gene2s.iterrows():
            gene2 = gene2_row['Entity2']
            gene3s = gene_edges_by_source.get(gene2)
            if gene3s is None:
                # gene2 has no outgoing gene-gene edge.
                continue

            for _, gene3_row in gene3s.iterrows():
                gene3 = gene3_row['Entity2']
                dis_rows = disease_edges_by_gene.get(gene3)
                if dis_rows is None:
                    # gene3 is not linked to any disease.
                    continue

                for _, dis_row in dis_rows.iterrows():
                    disease = dis_row['Entity2']

                    all_results.append({
                        'Length': 4,
                        'Path': [
                            {
                                'Entity1': drug,
                                'Relation': gene1_row['Relation'],
                                'Entity2': gene1,
                                'Score': gene1_row['Score']
                            },
                            {
                                'Entity1': gene1,
                                'Relation': gene2_row['Relation'],
                                'Entity2': gene2,
                                'Score': gene2_row['Score']
                            },
                            {
                                'Entity1': gene2,
                                'Relation': gene3_row['Relation'],
                                'Entity2': gene3,
                                'Score': gene3_row['Score']
                            },
                            {
                                'Entity1': gene3,
                                'Relation': dis_row['Relation'],
                                'Entity2': disease,
                                'Score': dis_row['Score']
                            }
                        ],
                        'StartDrug': drug,
                        'EndDisease': disease
                    })

result_df = pd.DataFrame(all_results)
result_df.to_csv(r'output_length4.csv', index=False)