In [38]:
# !pip install neomodel scikit-learn
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [26]:
import os

import numpy as np
import spacy
from neomodel import (StructuredNode, StringProperty, RelationshipTo, db, config, UniqueIdProperty)
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the small English spaCy pipeline once; process_query() uses it for POS tagging.
nlp = spacy.load('en_core_web_sm')

# Connection string used by neomodel. The NEO4J_BOLT_URL environment variable takes
# precedence so real credentials never need to be saved inside the notebook; the
# fallback is the original local-development URL.
config.DATABASE_URL = os.environ.get(
    "NEO4J_BOLT_URL",
    "bolt://neo4j:<PASSWORD>@localhost:7687",
)

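# NOTE(review): the Entity and Paper models below are commented out, yet process_papers()
# references both classes. Confirm they are defined elsewhere in the session, or restore
# these definitions before running the notebook on a fresh kernel.
# Define the Entity Node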
# class Entity(StructuredNode):
#     entity_id = UniqueIdProperty()
#     name = StringProperty(unique_index=True)
#     label = StringProperty()

# # Define the Paper Node
# class Paper(StructuredNode):
#     paper_id = UniqueIdProperty() 
#     title = StringProperty()
#     abstract = StringProperty()
#     mentions = RelationshipTo(Entity, "MENTIONS")

In [16]:
def extract_keywords_tfidf(abstracts, top_n=5):
    """Return the highest-scoring TF-IDF terms for each abstract.

    One ``TfidfVectorizer`` (English stop words removed) is fitted on the whole
    collection, and every abstract is then reduced to its best-scoring terms.

    Args:
        abstracts: Iterable of abstract texts. ``None`` entries are treated as
            empty strings so a missing abstract does not abort the whole run.
        top_n: Maximum number of keywords per abstract. Values <= 0 produce
            no keywords.

    Returns:
        A list with one entry per abstract, in input order. Each entry is a
        list of at most ``top_n`` terms ordered by ascending TF-IDF score
        (best term last). Only terms with a positive score are included, so an
        abstract with fewer than ``top_n`` scored terms yields a shorter list.
        An empty input yields ``[]``.
    """
    documents = ["" if text is None else text for text in abstracts]
    if not documents:
        # Nothing to fit; the vectorizer would otherwise fail on an empty corpus.
        return []
    if top_n <= 0:
        # Guard explicitly: the slice [-0:] below would select *every* term.
        return [[] for _ in documents]

    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    top_keywords = []
    for i in range(tfidf_matrix.shape[0]):
        tfidf_scores = tfidf_matrix[i].toarray().flatten()
        top_indices = np.argsort(tfidf_scores)[-top_n:]
        # Drop zero-score terms: they do not occur in this abstract and would
        # otherwise be reported as keywords for short documents.
        top_keywords.append(
            [str(feature_names[idx]) for idx in top_indices if tfidf_scores[idx] > 0]
        )

    return top_keywords

In [32]:
def process_papers():
    """Link every Paper node to Entity nodes for its top TF-IDF keywords.

    Reads ``paper_id`` and ``abstract`` of all ``Paper`` nodes through Cypher,
    extracts keywords with ``extract_keywords_tfidf`` and, for each keyword,
    reuses or creates the ``Entity`` node with that name and connects it to
    the paper via its ``mentions`` relationship. Existing entities and
    connections are detected first, so re-running does not duplicate them.

    Papers whose abstract is missing or blank are skipped, and nothing is done
    when no paper has a usable abstract.

    Note: relies on the ``Paper`` and ``Entity`` neomodel classes being
    defined in the running session.

    Returns:
        None. The function is used for its writes to the database.
    """
    query = "MATCH (p:Paper) RETURN p.paper_id AS paper_id, p.title AS title, p.abstract AS abstract"
    results, _ = db.cypher_query(query)

    # Keep (paper_id, abstract) only where there is real text: a null abstract
    # would make the vectorizer fail for the entire batch.
    usable = [
        (row[0], row[2])
        for row in results
        if isinstance(row[2], str) and row[2].strip()
    ]
    if not usable:
        return

    paper_ids = [paper_id for paper_id, _abstract in usable]
    abstracts = [abstract for _paper_id, abstract in usable]

    # Extract keywords using TF-IDF
    top_keywords = extract_keywords_tfidf(abstracts)

    # Keywords repeat across papers; remember resolved Entity nodes so each
    # name is looked up (or created) only once per run.
    entity_cache = {}

    for paper_id, keywords in zip(paper_ids, top_keywords):
        paper_node = Paper.nodes.get_or_none(paper_id=paper_id)
        if not paper_node:
            continue

        for keyword in keywords:
            entity_node = entity_cache.get(keyword)
            if entity_node is None:
                # Create or get the Entity node for this keyword
                entity_node = Entity.nodes.get_or_none(name=keyword)
                if not entity_node:
                    entity_node = Entity(name=keyword).save()
                entity_cache[keyword] = entity_node

            # Create the MENTIONS relationship if it doesn't exist
            if not paper_node.mentions.is_connected(entity_node):
                paper_node.mentions.connect(entity_node)

In [34]:
# Populate the graph: create Entity nodes and MENTIONS links for each paper's TF-IDF keywords.
process_papers()

In [35]:
def process_query(query):
    """Extract lowercase search keywords from a natural-language query.

    The text is run through the spaCy pipeline ``nlp``; only tokens tagged as
    nouns, proper nouns or adjectives are kept.

    Args:
        query: Free-text query string.

    Returns:
        The lowercased text of every kept token, in order of appearance.
    """
    wanted_pos = ('NOUN', 'PROPN', 'ADJ')
    keywords = []
    for token in nlp(query):
        if token.pos_ in wanted_pos:
            keywords.append(token.text.lower())
    return keywords

def search(query):
    """Find papers that mention any keyword extracted from ``query``.

    Keywords come from ``process_query`` and are printed for inspection. A
    Cypher query then matches ``Paper`` nodes that have a ``MENTIONS``
    relationship to an ``Entity`` whose name is one of those keywords.

    Args:
        query: Free-text search string.

    Returns:
        A list of dicts with the keys ``"title"``, ``"abstract"`` and
        ``"entities"`` (the matched entity names collected for that row).
    """
    entities = process_query(query)
    print(f"Extracted Entities: {entities}")

    cypher = """
    MATCH (p:Paper)-[:MENTIONS]->(e:Entity)
    WHERE e.name IN $entities
    RETURN p.title AS title, p.abstract AS abstract, COLLECT(e.name) AS entities
    """
    rows, _meta = db.cypher_query(cypher, {'entities': entities})

    papers = []
    for title, abstract, matched in rows:
        papers.append({"title": title, "abstract": abstract, "entities": matched})
    return papers

In [37]:
# Demo: run one natural-language query and print each matching paper.
search_query = "Find something about matrices"
search_results = search(search_query)

for result in search_results:
    # One print call per paper; sep="\n" puts each field on its own line.
    print(
        f"Title: {result['title']}",
        f"Abstract: {result['abstract']}",
        f"Entities: {', '.join(result['entities'])}",
        "-" * 80,
        sep="\n",
    )

Extracted Entities: ['matrices']
Title: A general approach to few-cycle intense laser interactions with complex
  atoms
Abstract:   A general {it ab-initio} and non-perturbative method to solve the
time-dependent Schrodinger equation (TDSE) for the interaction of a strong
attosecond laser pulse with a general atom, i.e., beyond the models of
quasi-one-electron or quasi-two-electron targets, is described. The field-free
Hamiltonian and the dipole matrices are generated using a flexible $B$-spline
$R$-matrix method. This numerical implementation enables us to construct
term-dependent, non-orthogonal sets of one-electron orbitals for the bound and
continuum electrons. The solution of the TDSE is propagated in time using the
Arnoldi-Lanczos method, which does not require the diagonalization of any large
matrices. The method is illustrated by an application to the multi-photon
excitation and ionization of Ne atoms. Good agreement with $R$-matrix Floquet
calculations for the generalized cros