# Finding Similar Clinical Trials Using Jaccard Similarity (Without GDS)

This notebook finds similar clinical trials using **Jaccard similarity** calculated with pure Cypher queries - **no Graph Data Science library required**.

In [None]:
import os

from neo4j import GraphDatabase
import pandas as pd

# Connection details for LOCAL Neo4j.
# Each value can be overridden with an environment variable so that real
# credentials do not have to be stored in the notebook; the fallbacks are the
# local-development defaults.
URI = os.environ.get("NEO4J_URI", "neo4j://127.0.0.1:7687")
AUTH = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "12345678"),
)

## Function to Calculate Jaccard Similarity

Jaccard Similarity = |A ∩ B| / |A ∪ B|

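For two clinical trials, A and B are the sets of ObjectNodes (conditions, drugs, outcomes, etc.) connected to each trial. The score ranges from 0 (no shared nodes) to 1 (identical sets of connected nodes).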

In [None]:
def find_similar_trials_jaccard(driver, trial_id, top_n=10):
    """
    Find the trials most similar to ``trial_id`` using Jaccard similarity.

    A trial is a ``SubjectNode``; its feature set is the set of distinct
    ``ObjectNode`` nodes it is connected to by a ``RELATIONSHIP`` edge (in
    either direction). The similarity of two trials is
    ``|A ∩ B| / |A ∪ B|`` over those two sets.

    Args:
        driver: Object whose ``session()`` returns a context manager that
            provides ``run(query, **params)`` (the Neo4j driver in this
            notebook).
        trial_id: Value matched against the ``name`` property of the input
            ``SubjectNode``.
        top_n: Maximum number of trials to return. Coerced with ``int()``
            because Cypher ``LIMIT`` only accepts integers.

    Returns:
        A list of ``(trial_name, similarity)`` tuples with ``similarity > 0``,
        ordered by descending similarity and then by trial name so that ties
        are returned in a stable order. The list is empty when ``top_n`` is 0
        or when the query yields no rows (for example, no matching trial or a
        trial without ``ObjectNode`` neighbors).

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    top_n = int(top_n)
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if top_n == 0:
        # LIMIT 0 can only produce an empty result; skip the round trip.
        return []

    query = """
    // Find the input trial node and collect its distinct neighbors
    MATCH (input:SubjectNode {name: $trial_id})
    MATCH (input)-[:RELATIONSHIP]-(inputNeighbor:ObjectNode)
    WITH input, COLLECT(DISTINCT inputNeighbor) AS inputNeighbors

    // Candidate trials: only those sharing at least one neighbor with the
    // input. Any other trial has similarity 0 and is filtered out below, so
    // there is no need to expand every SubjectNode in the graph.
    UNWIND inputNeighbors AS sharedNeighbor
    MATCH (sharedNeighbor)-[:RELATIONSHIP]-(other:SubjectNode)
    WHERE other <> input
    WITH DISTINCT input, inputNeighbors, other

    // Get all neighbors of each candidate trial
    MATCH (other)-[:RELATIONSHIP]-(otherNeighbor:ObjectNode)
    WITH input, inputNeighbors, other,
         COLLECT(DISTINCT otherNeighbor) AS otherNeighbors

    // Intersection and union sizes (|A ∪ B| = |A| + |B| - |A ∩ B|)
    WITH input, other,
         inputNeighbors,
         otherNeighbors,
         [n IN inputNeighbors WHERE n IN otherNeighbors] AS intersection
    WITH input, other,
         SIZE(intersection) AS intersectionSize,
         SIZE(inputNeighbors) + SIZE(otherNeighbors) - SIZE(intersection) AS unionSize

    // Jaccard coefficient, guarding against division by zero
    WITH other.name AS similarTrial,
         CASE WHEN unionSize = 0 THEN 0.0
              ELSE toFloat(intersectionSize) / toFloat(unionSize)
         END AS similarity

    WHERE similarity > 0
    RETURN similarTrial, similarity
    // Secondary sort key keeps the top-N selection stable when scores tie
    ORDER BY similarity DESC, similarTrial ASC
    LIMIT $top_n
    """

    with driver.session() as session:
        result = session.run(query, trial_id=trial_id, top_n=top_n)
        return [(record["similarTrial"], record["similarity"]) for record in result]

## Check if Trial Exists

In [None]:
def check_trial_exists(driver, trial_id):
    """
    Report whether at least one ``SubjectNode`` is named ``trial_id``.

    Runs a single count query in a fresh session and returns the value of
    the ``exists`` column from the one record it produces.
    """
    query = """
    MATCH (n:SubjectNode {name: $trial_id})
    RETURN COUNT(n) > 0 AS exists
    """

    with driver.session() as db:
        # The aggregate always yields one row, so single() is sufficient.
        record = db.run(query, trial_id=trial_id).single()
        return record["exists"]

## Test with Single Trial ID

In [None]:
# Single trial example: print the 10 most similar trials for one trial ID.
trial_id = "NCT00752622"  # Change this to any trial ID you want

try:
    # Use the driver as a context manager so it is closed on exit, and so a
    # failure while constructing it cannot cause a NameError (or close a
    # driver left over from an earlier cell) during cleanup.
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        # Check if trial exists
        if not check_trial_exists(driver, trial_id):
            print(f"Trial {trial_id} not found in database!")
        else:
            print(f"Finding similar trials to {trial_id}...\n")

            # Find similar trials
            similar_trials = find_similar_trials_jaccard(driver, trial_id, top_n=10)

            if similar_trials:
                print(f"Top 10 Similar Trials to {trial_id}:")
                print("=" * 60)
                for i, (similar_trial, similarity) in enumerate(similar_trials, 1):
                    print(f"{i:2d}. {similar_trial}: {similarity:.4f}")
            else:
                print("No similar trials found.")

except Exception as e:
    # Notebook-level boundary: report the problem instead of a traceback.
    print(f"Error: {e}")

## Test with Multiple Trial IDs

In [None]:
# Multiple trials example: print the 10 most similar trials for each ID.
trial_ids = ["NCT00385736", "NCT00386607", "NCT03518073"]

try:
    # Use the driver as a context manager so it is closed on exit, and so a
    # failure while constructing it cannot cause a NameError (or close a
    # driver left over from an earlier cell) during cleanup.
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        for trial_id in trial_ids:
            print(f"\n{'='*70}")
            print(f"Trial ID: {trial_id}")
            print(f"{'='*70}")

            # Check if trial exists; skip unknown IDs but keep processing the rest
            if not check_trial_exists(driver, trial_id):
                print(f"Trial {trial_id} not found in database!")
                continue

            # Find similar trials
            similar_trials = find_similar_trials_jaccard(driver, trial_id, top_n=10)

            if similar_trials:
                print(f"\nTop 10 Similar Trials:")
                for i, (similar_trial, similarity) in enumerate(similar_trials, 1):
                    print(f"{i:2d}. {similar_trial}: Similarity = {similarity:.4f}")
            else:
                print("No similar trials found.")

except Exception as e:
    # Notebook-level boundary: report the problem instead of a traceback.
    print(f"Error: {e}")

## Interactive Input

In [None]:
# Interactive version - get input from user
try:
    # Use the driver as a context manager so it is closed on exit, and so a
    # failure while constructing it cannot cause a NameError (or close a
    # driver left over from an earlier cell) during cleanup.
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        trial_input = input("Enter trial ID(s) separated by commas: ")
        # Drop surrounding whitespace and ignore empty entries such as "a,,b"
        trial_ids = [t.strip() for t in trial_input.split(',') if t.strip()]

        for trial_id in trial_ids:
            print(f"\n{'='*70}")
            print(f"Trial ID: {trial_id}")
            print(f"{'='*70}")

            # Check if trial exists; skip unknown IDs but keep processing the rest
            if not check_trial_exists(driver, trial_id):
                print(f"❌ Trial {trial_id} not found in database!")
                continue

            # Find similar trials
            similar_trials = find_similar_trials_jaccard(driver, trial_id, top_n=10)

            if similar_trials:
                print(f"\n✓ Top 10 Similar Trials:")
                for i, (similar_trial, similarity) in enumerate(similar_trials, 1):
                    print(f"{i:2d}. {similar_trial}: Similarity = {similarity:.4f}")
            else:
                print("No similar trials found.")

except Exception as e:
    # Notebook-level boundary: report the problem instead of a traceback.
    print(f"Error: {e}")