In [None]:
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase

# Connection settings for the Neo4j instance. Each value can be overridden with
# an environment variable (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD) so that real
# credentials never have to be written into the notebook; the fallbacks are the
# local-development defaults this notebook has always used.
URI = os.environ.get("NEO4J_URI", "neo4j://127.0.0.1:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "12345678"),
)

In [None]:
# Load ground truth
ground_truth_df = pd.read_csv('ground_truth_recommendations.csv')

# Fail fast on a malformed file: the evaluation cells index the frame by these
# two column names, and a missing one would otherwise only surface later as a
# bare KeyError far from the actual cause.
_missing_columns = {'query_trial', 'relevant_trial'} - set(ground_truth_df.columns)
if _missing_columns:
    raise ValueError(
        "ground_truth_recommendations.csv is missing required column(s): "
        f"{sorted(_missing_columns)}"
    )

In [None]:
def find_similar_trials_jaccard(driver, trial_id, top_n=10):
    """Rank other trials by Jaccard similarity of their graph neighbourhoods.

    The neighbourhood of a trial is the set of distinct ``ObjectNode`` nodes
    linked to its ``SubjectNode`` by a ``RELATIONSHIP`` edge (matched in either
    direction). For every other ``SubjectNode`` the similarity is
    ``|intersection| / |union|`` of the two neighbourhoods.

    Args:
        driver: Object exposing ``session()`` that returns a context manager
            with a ``run(query, **params)`` method (a Neo4j driver).
        trial_id: Value matched against the ``name`` property of the input
            ``SubjectNode``.
        top_n: Maximum number of rows to return.

    Returns:
        A list of ``(similarTrial, similarity)`` tuples in the order produced by
        the query: descending similarity, ties broken by trial name. Only
        trials with similarity > 0 are included. The list is empty when the
        query yields no rows.
    """
    query = """
    MATCH (input:SubjectNode {name: $trial_id})
    MATCH (input)-[:RELATIONSHIP]-(inputNeighbor:ObjectNode)
    WITH input, COLLECT(DISTINCT inputNeighbor) AS inputNeighbors

    MATCH (other:SubjectNode)
    WHERE other <> input

    MATCH (other)-[:RELATIONSHIP]-(otherNeighbor:ObjectNode)
    WITH input, inputNeighbors, other, COLLECT(DISTINCT otherNeighbor) AS otherNeighbors

    WITH input, other,
         inputNeighbors,
         otherNeighbors,
         [n IN inputNeighbors WHERE n IN otherNeighbors] AS intersection
    WITH input, other,
         SIZE(intersection) AS intersectionSize,
         SIZE(inputNeighbors) + SIZE(otherNeighbors) - SIZE(intersection) AS unionSize

    WITH other.name AS similarTrial,
         CASE WHEN unionSize = 0 THEN 0.0 
              ELSE toFloat(intersectionSize) / toFloat(unionSize) 
         END AS similarity

    WHERE similarity > 0
    RETURN similarTrial, similarity
    ORDER BY similarity DESC, similarTrial ASC
    LIMIT $top_n
    """
    # The secondary sort key above makes the ranking reproducible: without it,
    # trials with equal similarity may come back (and be cut off by LIMIT) in
    # any order, which would make rank-based metrics vary between runs.

    with driver.session() as session:
        result = session.run(query, trial_id=trial_id, top_n=top_n)
        # Consume the result while the session is still open.
        return [(record["similarTrial"], record["similarity"]) for record in result]

In [None]:
# Get recommendations
query_trials = ground_truth_df['query_trial'].unique()
all_recommendations = {}

# The driver is used as a context manager so that it is closed even when a
# query raises part-way through the loop; a trailing driver.close() would be
# skipped in that case and leave the connection pool open in the kernel.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for trial_id in query_trials:
        # Fetch 20 candidates per query trial and store them under its id.
        all_recommendations[trial_id] = find_similar_trials_jaccard(
            driver, trial_id, top_n=20
        )

In [None]:
# Calculate MRR
reciprocal_ranks = []

for query_trial, scored_trials in all_recommendations.items():
    # Ground-truth trials labelled as relevant for this query.
    is_query_row = ground_truth_df['query_trial'] == query_trial
    relevant_trials = ground_truth_df.loc[is_query_row, 'relevant_trial'].tolist()

    # 1-based position of the first relevant recommendation; 0 means "no hit".
    first_rank = next(
        (
            rank
            for rank, (recommended_trial, _score) in enumerate(scored_trials, start=1)
            if recommended_trial in relevant_trials
        ),
        0,
    )

    # A query without any relevant recommendation contributes 0 to the mean.
    reciprocal_ranks.append(1.0 / first_rank if first_rank else 0.0)

mrr = np.mean(reciprocal_ranks)

In [None]:
# Calculate Precision@5 and Recall@5
k = 5
precisions = []
recalls = []

for query_trial in all_recommendations.keys():
    recommendations = [trial for trial, score in all_recommendations[query_trial][:k]]

    # Use the set of distinct, non-missing relevant trials: duplicated or empty
    # ground-truth rows would otherwise inflate the recall denominator.
    is_query_row = ground_truth_df['query_trial'] == query_trial
    relevant_trials = set(
        ground_truth_df.loc[is_query_row, 'relevant_trial'].dropna()
    )

    # Set intersection counts each relevant trial at most once, so a trial that
    # appears twice in the top-k cannot push recall above 1.
    relevant_in_top_k = len(relevant_trials.intersection(recommendations))
    total_relevant = len(relevant_trials)

    # Precision is always divided by k, even when fewer than k trials came back.
    precision = relevant_in_top_k / k
    # A query with no relevant trials gets recall 0 rather than dividing by zero.
    recall = relevant_in_top_k / total_relevant if total_relevant > 0 else 0.0

    precisions.append(precision)
    recalls.append(recall)

precision_at_5 = np.mean(precisions)
recall_at_5 = np.mean(recalls)

In [None]:
# Report the three aggregate metrics, one per line, rounded to four decimals.
print(
    "EVALUATION",
    f"MRR: {mrr:.4f}",
    f"Precision@5: {precision_at_5:.4f}",
    f"Recall@5: {recall_at_5:.4f}",
    sep="\n",
)