In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hetionet-json/hetionet_training.json
/kaggle/input/hetionet-output/hetionet_training_output.json


## Install libraries

In [2]:
!pip install pykeen
!pip install torch_geometric

Collecting pykeen
  Downloading pykeen-1.11.0-py3-none-any.whl.metadata (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting more-click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl.metadata (4.3 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.7.0-py3-none-any.whl.metadata (17 kB)
Collecting docdata (from pykeen)
  Downloading docdata-0.0.4-py3-none-any.whl.metadata (13 kB)
Collecting class-resolver>=0.5.1 (from pykeen)
  Downloading class_resolver-0.5.4-py3-none-any.whl.metadata (14 kB)
Collecting torch-max-mem>=0.1.1 (from pykeen)
  Downloading torch_max_mem-0.1.3-py3-none-any.whl.metadata (7.4 kB)
Collecting torch-ppr>=0.0.7 (from pykeen)
  Downloading torch_ppr-0.0.8-py3-none-any.whl.metadata (9.3 kB)
Downloading pykeen-1.11.0-py3-none-any.wh

## Import libraries

In [3]:
import torch
import numpy as np
import networkx as nx
import json
from pykeen.datasets import Hetionet
from pykeen.pipeline import pipeline
from torch_geometric.nn import GCNConv
import torch.nn.functional as F


## Load the Hetionet dataset

In [4]:
# Load Hetionet Dataset
dataset = Hetionet()
triples = dataset.training.mapped_triples

# Extract Entities and Relations
entity_to_id = dataset.entity_to_id
relation_to_id = dataset.relation_to_id

# Filter Nodes
drug_nodes = {e for e in entity_to_id if "Compound" in e}  # Drugs
disease_nodes = {e for e in entity_to_id if "Disease" in e}  # Diseases

print(f"Drugs: {len(drug_nodes)}, Diseases: {len(disease_nodes)}")

Downloading hetionet-v1.0-edges.sif.gz: 0.00B [00:00, ?B/s]

Drugs: 1538, Diseases: 136


# Task-1 (Method-1)
### Using TransE for generating KGE embeddings and then recommend alternate drugs

In [6]:
# Train TransE Model for Task 1
kge_model = pipeline(
    model="TransE",
    dataset="Hetionet",
    training_loop="sLCWA",
    epochs=10,
)

# Get learned entity embeddings
kg_embeddings = kge_model.model.entity_representations[0]

# Function to Recommend Alternate Drugs (Task 1)
def recommend_alternate_drugs(disease_id, top_k=5):
    disease_embed = kg_embeddings(torch.tensor([disease_id])).detach().cpu()
    drug_embeddings = torch.stack([kg_embeddings(torch.tensor([entity_to_id[d]])) for d in drug_nodes])
    similarities = np.dot(drug_embeddings.squeeze().detach().cpu().numpy(), disease_embed.T)
    sorted_indices = np.argsort(similarities, axis=0)[::-1][:top_k]
    recommended_drugs = [list(drug_nodes)[idx.item()] for idx in sorted_indices]
    scores = [similarities[idx].item() for idx in sorted_indices]
    return recommended_drugs, scores


  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


Training epochs on cuda:0:   0%|          | 0/10 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/225k [00:00<?, ?triple/s]

# Task - 2 (Method-1)

In [7]:
# Task 2: Define GCN Model
class DrugGCN(torch.nn.Module):
    def __init__(self, num_nodes, input_dim, hidden_dim, output_dim):
        super(DrugGCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.node_embeddings = torch.nn.Embedding(num_nodes, input_dim)

    def forward(self, edge_index):
        x = self.node_embeddings.weight
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

# Convert Hetionet Graph to PyTorch Geometric Format
G = nx.DiGraph()
for head, relation, tail in triples:
    G.add_edge(head.item(), tail.item())

edge_index = torch.tensor(list(G.edges)).t().contiguous()
num_nodes = len(entity_to_id)

# Initialize GCN Model
gcn_model = DrugGCN(num_nodes, input_dim=100, hidden_dim=64, output_dim=32)
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.01)

# Train GCN Model (Task 2)
def train_gcn(epochs=100):
    gcn_model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        embeddings = gcn_model(edge_index)
        loss = embeddings.norm(2)  # Regularization
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch} | Loss: {loss.item()}")

train_gcn()


Epoch 0 | Loss: 611.4765625
Epoch 10 | Loss: 125.42223358154297
Epoch 20 | Loss: 54.50395965576172
Epoch 30 | Loss: 22.35668182373047
Epoch 40 | Loss: 8.734188079833984
Epoch 50 | Loss: 3.737562894821167
Epoch 60 | Loss: 2.239772319793701
Epoch 70 | Loss: 1.923384666442871
Epoch 80 | Loss: 1.1335484981536865
Epoch 90 | Loss: 1.0273619890213013


In [9]:
# Function to Recommend Drugs with Side Effect Constraints (Task 2)
def recommend_with_constraints(disease_id, side_effect_threshold=0.5, top_k=5):
    gcn_model.eval()
    embeddings = gcn_model(edge_index).detach().numpy()
    disease_embed = embeddings[disease_id]

    # Compute Similarity with Drug Nodes
    drug_embeddings = np.array([embeddings[entity_to_id[d]] for d in drug_nodes])
    similarities = np.dot(drug_embeddings, disease_embed.T)

    # Apply Side Effect Filtering
    filtered_drugs = [d for d, score in zip(drug_nodes, similarities) if score < side_effect_threshold]
    sorted_drugs = sorted(filtered_drugs, key=lambda d: -similarities[list(drug_nodes).index(d)])[:top_k]
    scores = [similarities[list(drug_nodes).index(d)] for d in sorted_drugs]

    return sorted_drugs, scores

## Generating recommendations for task-1 and task-2

In [18]:
import json

# Load reference data from hetionet_training_output.json
with open("/kaggle/input/hetionet-output/hetionet_training_output.json", "r") as ref_file:
    reference_outputs = json.load(ref_file)

# Create a lookup for reference outputs by type and disease_id
reference_dict = {(ref['type'], ref['disease_id']): ref for ref in reference_outputs}

# Generate JSON Data
def get_drug_recommendations(disease_id, method="alternate_drugs", top_k=5):
    if method == "alternate_drugs":
        recommended_drugs, scores = recommend_alternate_drugs(disease_id, top_k)
    elif method == "alternate_drugs_with_constraints":
        recommended_drugs, scores = recommend_with_constraints(disease_id, top_k=top_k)
    else:
        return None

    # Convert scores to native Python floats for JSON serialization
    scores = [float(score) for score in scores]

    return {
        "type": method,
        "disease_id": list(entity_to_id.keys())[list(entity_to_id.values()).index(disease_id)],
        "candidates": recommended_drugs,
        "scores": scores
    }

# Generate recommendations for a few diseases
sample_diseases = list(disease_nodes)[:3]
recommendations = []
hits_scores = []  # To store Hits scores

for disease in sample_diseases:
    disease_id = entity_to_id[disease]
    
    for method in ["alternate_drugs", "alternate_drugs_with_constraints"]:
        recommendation = get_drug_recommendations(disease_id, method=method)
        recommendations.append(recommendation)

        # Correct Hits@5 calculation
        ref_key = (method, recommendation['disease_id'])
        if ref_key in reference_dict:
            ref_candidates = set(reference_dict[ref_key]['candidates'])
            pred_candidates = recommendation['candidates'][:5]  # Top-5 predicted candidates
            # Check if any of the top-5 predictions are in the reference candidates
            hit = 1 if any(candidate in ref_candidates for candidate in pred_candidates) else 0
        else:
            hit = 0  # No reference found, assume no hit
        
          # Add hits score to the recommendation
        hits_scores.append(hit)

# Print sample recommendations with Hits scores
print("Sample Recommendations :")
for i in range(min(10, len(recommendations))):
    print(json.dumps(recommendations[i], indent=4))
    
    

# Save all recommendations to a new file
with open("new_recommendations.json", "w") as f:
    json.dump(recommendations, f, indent=4)

print("\nNew JSON file saved: new_recommendations.json")

# Print overall Hits@5 score
avg_hits_score = sum(hits_scores) / len(hits_scores) if hits_scores else 0



Sample Recommendations :
{
    "type": "alternate_drugs",
    "disease_id": "Disease::DOID:1964",
    "candidates": [
        "Compound::DB00793",
        "Compound::DB06713",
        "Compound::DB01431",
        "Compound::DB01102",
        "Compound::DB06150"
    ],
    "scores": [
        0.4438948929309845,
        0.41489940881729126,
        0.37776798009872437,
        0.36725741624832153,
        0.3615722060203552
    ]
}
{
    "type": "alternate_drugs_with_constraints",
    "disease_id": "Disease::DOID:1964",
    "candidates": [
        "Compound::DB01006",
        "Compound::DB01591",
        "Compound::DB04794",
        "Compound::DB00202",
        "Compound::DB00171"
    ],
    "scores": [
        0.00037804857129231095,
        0.00013582382234744728,
        0.00012203674123156816,
        0.00011764345254050568,
        0.00011015807103831321
    ]
}
{
    "type": "alternate_drugs",
    "disease_id": "Disease::DOID:11615",
    "candidates": [
        "Compound::DB06730"

# Task -1 (Method-2)

In [19]:

# Load the Hetionet dataset
dataset = Hetionet()
triples = dataset.training.mapped_triples
entity_to_id = dataset.entity_to_id
relation_to_id = dataset.relation_to_id


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [20]:
result = pipeline(
    model='TransE',
    dataset='Hetionet',
    training_loop='sLCWA',
    epochs=10
)
model = result.model

  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


Training epochs on cuda:0:   0%|          | 0/10 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/7032 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/225k [00:00<?, ?triple/s]

In [21]:
# Function to get directly connected drugs for a given disease ID
def get_direct_connections(disease_id, triples):
    direct_drugs = set()
    for head, relation, tail in triples:
        if head == disease_id and relation == relation_to_id['CtD']:
            direct_drugs.add(tail)
    return direct_drugs

# Function to get alternate drugs excluding direct connections
def get_alternate_drugs(disease_id, model, direct_drugs, top_k=10):
    disease_embedding = model.entity_representations[0](torch.tensor([disease_id]))
    scores = []

    for drug_id in range(len(entity_to_id)):
        if drug_id in direct_drugs:
            continue

        drug_embedding = model.entity_representations[0](torch.tensor([drug_id]))
        score = torch.nn.functional.cosine_similarity(disease_embedding, drug_embedding, dim=1)
        scores.append((drug_id, score.item()))

    sorted_candidates = sorted(scores, key=lambda x: x[1], reverse=True)
    return sorted_candidates[:top_k]



In [22]:
def ndcg_at_k(ranked_scores, k=10):
    dcg = 0.0
    for i in range(min(k, len(ranked_scores))):
        dcg += (2**ranked_scores[i] - 1) / np.log2(i + 2)
    sorted_scores = sorted(ranked_scores, reverse=True)
    idcg = 0.0
    for i in range(min(k, len(sorted_scores))):
        idcg += (2**sorted_scores[i] - 1) / np.log2(i + 2)
    return dcg / idcg if idcg > 0 else 0.0

# Function to calculate Hits@k
def hits_at_k(ranked_scores, threshold=0.5, k=10):
    hits = sum(1 for score in ranked_scores[:k] if score >= threshold)
    return hits / min(k, len(ranked_scores))

In [23]:
disease_configs = [
    {"disease_id": 85, "eval_metric": "Hits@10"},
    {"disease_id": 131, "eval_metric": "NDCG@10"}
]

# Process each disease ID
for config in disease_configs:
    disease_id = config["disease_id"]
    eval_metric = config["eval_metric"]

    direct_drugs = get_direct_connections(disease_id, triples)
    candidates = get_alternate_drugs(disease_id, model, direct_drugs)
    candidate_scores = [score for _, score in candidates]

    # Compute the requested evaluation metric
    if eval_metric == "NDCG@10":
        score = ndcg_at_k(candidate_scores, k=10)
        metric_name = "NDCG@10"
    elif eval_metric == "Hits@10":
        score = hits_at_k(candidate_scores, threshold=0.5, k=10)
        metric_name = "Hits@10"
    else:
        score = None
        metric_name = "Unknown Metric"

    # Print results
    print(f"Disease ID: {disease_id}")
    print("Candidate Drugs (Excluding Direct Connections):")
    for drug_id, drug_score in candidates:
        print(f"  Drug ID: {drug_id}, Similarity Score: {drug_score:.4f}")
    if score is not None:
        print(f"{metric_name} Score: {score:.4f}")
    print("-" * 50)

Disease ID: 85
Candidate Drugs (Excluding Direct Connections):
  Drug ID: 85, Similarity Score: 1.0000
  Drug ID: 39703, Similarity Score: 0.6009
  Drug ID: 40095, Similarity Score: 0.5881
  Drug ID: 44418, Similarity Score: 0.5875
  Drug ID: 384, Similarity Score: 0.5846
  Drug ID: 319, Similarity Score: 0.5817
  Drug ID: 34536, Similarity Score: 0.5713
  Drug ID: 13564, Similarity Score: 0.5704
  Drug ID: 42, Similarity Score: 0.5694
  Drug ID: 376, Similarity Score: 0.5671
Hits@10 Score: 1.0000
--------------------------------------------------
Disease ID: 131
Candidate Drugs (Excluding Direct Connections):
  Drug ID: 131, Similarity Score: 1.0000
  Drug ID: 28843, Similarity Score: 0.5908
  Drug ID: 139, Similarity Score: 0.5792
  Drug ID: 3031, Similarity Score: 0.5692
  Drug ID: 44638, Similarity Score: 0.5638
  Drug ID: 13564, Similarity Score: 0.5557
  Drug ID: 13187, Similarity Score: 0.5474
  Drug ID: 39274, Similarity Score: 0.5355
  Drug ID: 5772, Similarity Score: 0.5233
 

# Task - 2 (Method -2 )

In [24]:
# Loads the Hetionet dataset and extracts training triples, entity IDs, and relation IDs for graph processing
print("Loading Hetionet dataset...")
dataset = Hetionet()
triples_factory = dataset.training
triples = triples_factory.mapped_triples
entity_to_id = triples_factory.entity_to_id
relation_to_id = triples_factory.relation_to_id

Loading Hetionet dataset...


In [26]:
from collections import defaultdict
# Identifies key relation IDs and computes normalized side effect risk for compounds based on "Compound–causes–Side Effect" edges
treats_rel = relation_to_id['CtD']  # "Compound–treats–Disease"
side_effect_rel = relation_to_id['CcSE']  # "Compound–causes–Side Effect"

print("Computing side effect risks...")
side_effect_counts = defaultdict(int)
for head, rel, tail in triples:
    if rel.item() == side_effect_rel:
        compound_id = head.item()
        side_effect_counts[compound_id] += 1
max_count = max(side_effect_counts.values(), default=1)
side_effect_risk = {c: count / max_count for c, count in side_effect_counts.items()}

Computing side effect risks...


In [27]:
# Identify compound nodes (heads in 'CtD' or 'CcSE' relations)
compound_ids = set()
for head, rel, tail in triples:
    if rel.item() == treats_rel or rel.item() == side_effect_rel:
        compound_ids.add(head.item())
compound_ids = list(compound_ids)
print(f"Number of compounds identified: {len(compound_ids)}")

Number of compounds identified: 1085


In [29]:
from torch_geometric.data import Data
# Prepare graph data for PyTorch Geometric
edge_index = triples[:, [0, 2]].t()  # [2, num_edges]
edge_type = triples[:, 1]  # [num_edges]
num_nodes = len(entity_to_id)
num_relations = len(relation_to_id)
feature_dim = 16  # Random features for simplicity
x = torch.randn(num_nodes, feature_dim)
graph_data = Data(x=x, edge_index=edge_index, edge_type=edge_type)
print(f"Graph prepared with {num_nodes} nodes and {edge_index.size(1)} edges.")

Graph prepared with 45158 nodes and 1800157 edges.


In [30]:
# Define R-GCN model
class RGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_relations):
        super(RGCN, self).__init__()
        self.conv1 = RGCNConv(in_channels, hidden_channels, num_relations)
        self.conv2 = RGCNConv(hidden_channels, out_channels, num_relations)
    
    def forward(self, x, edge_index, edge_type):
        x = self.conv1(x, edge_index, edge_type)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_type)
        return x

In [32]:
from torch_geometric.nn import RGCNConv
# Initialize model and optimizer
model = RGCN(in_channels=feature_dim, hidden_channels=64, out_channels=32, num_relations=num_relations)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Extract positive "treats" edges for training
treats_edges = triples[triples[:, 1] == treats_rel][:, [0, 2]]  # [num_treats, 2]
print(f"Number of 'treats' edges: {len(treats_edges)}")

Number of 'treats' edges: 599


In [33]:
# Training function
def train():
    model.train()
    optimizer.zero_grad()
    z = model(graph_data.x, graph_data.edge_index, graph_data.edge_type)
    pos_edges = treats_edges
    pos_scores = (z[pos_edges[:, 0]] * z[pos_edges[:, 1]]).sum(dim=1)
    num_neg = len(pos_edges)
    neg_compounds = torch.randint(0, num_nodes, (num_neg,))
    neg_diseases = torch.randint(0, num_nodes, (num_neg,))
    neg_scores = (z[neg_compounds] * z[neg_diseases]).sum(dim=1)
    labels = torch.cat([torch.ones(num_neg), torch.zeros(num_neg)])
    scores = torch.cat([pos_scores, neg_scores])
    loss = F.binary_cross_entropy_with_logits(scores, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [34]:
# Train the model
print("Training R-GCN model...")
for epoch in range(100):
    loss = train()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

Training R-GCN model...
Epoch 0, Loss: 4.3005
Epoch 10, Loss: 1.1157
Epoch 20, Loss: 0.5969
Epoch 30, Loss: 0.4362
Epoch 40, Loss: 0.4188
Epoch 50, Loss: 0.4045
Epoch 60, Loss: 0.3843
Epoch 70, Loss: 0.3854
Epoch 80, Loss: 0.3623
Epoch 90, Loss: 0.3633


In [35]:
# Get final node embeddings
model.eval()
with torch.no_grad():
    z = model(graph_data.x, graph_data.edge_index, graph_data.edge_type)
print("Node embeddings computed.")

Node embeddings computed.


In [36]:
def get_top_drugs(disease_id, side_effect_threshold, top_k=3):
    z_d = z[disease_id]
    # Compute raw dot product scores and apply sigmoid to normalize scores between 0 and 1
    raw_scores = (z[compound_ids] @ z_d)
    scores = torch.sigmoid(raw_scores).cpu().numpy()
    
    # Get side effect risk for each candidate compound
    risks = [side_effect_risk.get(c, 0) for c in compound_ids]
    
    # Filter compounds by the side effect threshold
    filtered_indices = [i for i, r in enumerate(risks) if r <= side_effect_threshold]
    filtered_compounds = [compound_ids[i] for i in filtered_indices]
    filtered_scores = [scores[i] for i in filtered_indices]
    
    # Sort filtered compounds by score in descending order
    sorted_indices = sorted(range(len(filtered_scores)), key=lambda i: filtered_scores[i], reverse=True)
    top_indices = sorted_indices[:top_k]
    top_compounds = [filtered_compounds[i] for i in top_indices]
    top_scores = [float(filtered_scores[i]) for i in top_indices]
    
    return top_compounds, top_scores


In [40]:
# Loads the test cases and reference outputs JSON files from Kaggle input directory after uploading

test_cases_path = "/kaggle/input/hetionet-json/hetionet_training.json"  
reference_path =  "/kaggle/input/hetionet-output/hetionet_training_output.json" 

with open(test_cases_path, 'r') as f:
    test_cases = json.load(f)
with open(reference_path, 'r') as f:
    reference_outputs = json.load(f)

print(f"Loaded {len(test_cases)} test cases and {len(reference_outputs)} reference outputs.")

Loaded 28 test cases and 28 reference outputs.


In [42]:
# Processes only Task 2 test cases, generates predictions, saves them to JSON, and evaluates Hits@3 against reference outputs
output = []
hits_at_3 = []

# Create a lookup for reference outputs by type and disease_id
reference_dict = {(ref['type'], ref['disease_id']): ref for ref in reference_outputs}

for test_case in test_cases:
    if test_case['type'] == 'alternate_drug_narrowed':
        disease_id = test_case['disease_id']
        side_effect_threshold = test_case['criteria']['side_effect_threshold']
        top_compounds, top_scores = get_top_drugs(disease_id, side_effect_threshold, top_k=3)  # Changed top_k=10 to top_k=3
        
        # Generate output entry
        output_entry = {
            "type": "alternate_drug_narrowed",
            "disease_id": disease_id,
            "candidates": top_compounds,
            "scores": top_scores
        }
        output.append(output_entry)
        
       

# Print 3 sample entries from the output along with their Hits@3 scores
print("Sample Output Entries with Hits@3 Scores:")
for i in range(min(3, len(output))):
    print(json.dumps(output[i], indent=2))
  

# Save output to JSON file
output_path = '/kaggle/working/output_task2.json'
with open(output_path, 'w') as f:
    json.dump(output, f, indent=2)
print(f"Output saved to {output_path}")


Sample Output Entries with Hits@3 Scores:
{
  "type": "alternate_drug_narrowed",
  "disease_id": 5,
  "candidates": [
    14577,
    14041,
    13944
  ],
  "scores": [
    0.6917874813079834,
    0.6384003162384033,
    0.6270171403884888
  ]
}
{
  "type": "alternate_drug_narrowed",
  "disease_id": 63,
  "candidates": [
    13778,
    13869,
    14206
  ],
  "scores": [
    0.6313191056251526,
    0.6288237571716309,
    0.6138368844985962
  ]
}
{
  "type": "alternate_drug_narrowed",
  "disease_id": 21,
  "candidates": [
    13422,
    13869,
    13276
  ],
  "scores": [
    0.6372756361961365,
    0.5851418375968933,
    0.5822721123695374
  ]
}
Output saved to /kaggle/working/output_task2.json
