In [None]:
!pip install neomodel torch scikit-learn transformers numpy==1.26.4

In [None]:
import os

import torch
from neomodel import (StructuredNode, StringProperty, RelationshipTo, db, config, UniqueIdProperty)
from neomodel import db
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel


# The connection URL embeds credentials, so let it be supplied through the
# NEO4J_BOLT_URL environment variable; the original local URL stays the default.
config.DATABASE_URL = os.environ.get(
    "NEO4J_BOLT_URL", "bolt://neo4j:<PASSWORD>@localhost:7687"
)

In [None]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
def encode_text(text):
    """Return the BERT embedding of ``text`` as a NumPy array.

    The text is tokenized (padding and truncation enabled) and passed through
    the module-level ``model`` without gradient tracking. The last-layer
    hidden state at the first token position is taken as the embedding, the
    leading dimension is squeezed, and the result is moved to CPU.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**encoded)
    first_token_state = outputs.last_hidden_state[:, 0, :]
    return first_token_state.squeeze(0).cpu().numpy()

def store_embeddings_in_neo4j():
    """Compute and persist embeddings for every Paper node that lacks one.

    Selects all ``Paper`` nodes whose ``embedding`` property is NULL, encodes
    each abstract with ``encode_text`` and writes the vector back to the node
    as a plain list of floats.

    Papers whose abstract is missing or blank are skipped rather than passed
    to the encoder, so one incomplete node does not abort the whole run. They
    keep a NULL embedding and are picked up again on the next call.

    Prints how many embeddings were stored and, if any, how many papers were
    skipped. Returns nothing.
    """
    query = """
    MATCH (p:Paper)
    WHERE p.embedding IS NULL
    RETURN p.title AS title, p.abstract AS abstract, ID(p) AS id
    """
    results, _ = db.cypher_query(query)

    # The update statement is loop-invariant, so build it once.
    update_query = """
    MATCH (p:Paper)
    WHERE ID(p) = $id
    SET p.embedding = $embedding
    """

    stored = 0
    skipped = 0
    for _title, abstract, paper_id in results:
        # Guard against NULL / non-text / whitespace-only abstracts.
        if not isinstance(abstract, str) or not abstract.strip():
            skipped += 1
            continue

        embedding = encode_text(abstract).tolist()
        db.cypher_query(update_query, {'id': paper_id, 'embedding': embedding})
        stored += 1

    # Report what was actually written, not merely what was fetched.
    print(f"Stored embeddings for {stored} papers without embeddings.")
    if skipped:
        print(f"Skipped {skipped} papers with a missing or empty abstract.")



In [None]:
# Populate the embedding property on Paper nodes that do not have one yet.
store_embeddings_in_neo4j()

In [None]:
from scipy.spatial.distance import cosine

def fetch_filtered_papers(query):
    """Return Paper rows whose title or abstract contains ``query``.

    Cypher's ``CONTAINS`` operator is case-sensitive, so both the stored text
    and the search string are lower-cased with ``toLower`` to make the
    keyword pre-filter case-insensitive.

    Args:
        query: Substring to look for, passed as a Cypher parameter.

    Returns:
        The result rows from ``db.cypher_query``; each row holds the paper's
        title, its stored embedding (may be NULL) and its internal node id,
        in that order.
    """
    cypher_query = """
    MATCH (p:Paper)
    WHERE toLower(p.abstract) CONTAINS toLower($query)
       OR toLower(p.title) CONTAINS toLower($query)
    RETURN p.title AS title, p.embedding AS embedding, ID(p) AS id
    """
    results, _ = db.cypher_query(cypher_query, {'query': query})
    return results

def compute_similarity(query_embedding, paper_embedding):
    """Return the cosine similarity of two embedding vectors.

    Computed as one minus the cosine distance reported by
    ``scipy.spatial.distance.cosine``.
    """
    distance = cosine(query_embedding, paper_embedding)
    return 1 - distance


def semantic_search(query, top_n=5, similarity_threshold=0.2):
    """Rank keyword-matched papers by embedding similarity to ``query``.

    The query is embedded with ``encode_text``; candidate papers come from
    ``fetch_filtered_papers``. Candidates without a stored embedding are
    ignored, and only those whose similarity reaches
    ``similarity_threshold`` are kept.

    Returns:
        A list of at most ``top_n`` ``(title, similarity)`` tuples ordered
        from most to least similar, or a human-readable message string when
        no candidate matched or none was similar enough.
    """
    query_vector = encode_text(query)
    candidates = fetch_filtered_papers(query)

    if not candidates:
        return "No papers found that match the query."

    scored = []
    for row in candidates:
        stored_embedding = row[1]
        if stored_embedding is None:
            # Paper has not been embedded yet; nothing to compare against.
            continue
        score = compute_similarity(query_vector, torch.tensor(stored_embedding))
        if score >= similarity_threshold:
            scored.append((row[0], score))

    if not scored:
        return "No relevant papers found with sufficient similarity."

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


In [None]:
# Run an example search and print either the status message or the ranked hits.
query = "cosmological models"
top_papers = semantic_search(query)

if not isinstance(top_papers, str):
    # A non-string result is a list of (title, similarity) tuples.
    for title, score in top_papers:
        print(f"Title: {title}, Similarity: {score}")
else:
    # A string result is the "nothing found" message from semantic_search.
    print(top_papers)