In [1]:
import dotenv
import warnings

# Read settings from the local `.env` file; override=True lets values in the
# file replace variables already present in the process environment.
dotenv.load_dotenv(dotenv_path='.env', override=True)

# Keep cell output readable by hiding FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
import os
from huggingface_hub import InferenceClient

# Hugging Face access token read from the environment (None when it is unset).
API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")

# Chat model requested by every completion call in this notebook.
MISTRAL_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

# Shared inference client used by the chat-completion helper.
client = InferenceClient(api_key=API_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain_community.vectorstores import Neo4jVector
from transformers import AutoTokenizer, AutoModel
import torch
import os

class CustomHuggingFaceEmbeddings:
    """Embedding adapter built on a Hugging Face tokenizer/model pair.

    An instance is handed to ``Neo4jVector.from_existing_graph`` as the
    embedding object. Embeddings are produced by averaging the model's last
    hidden state over the real (non-padding) tokens of each text.

    NOTE(review): vectors already stored in the graph by an earlier version
    of this class (which averaged padding tokens too) may differ slightly for
    texts that were embedded in a padded batch — re-embed if exact
    consistency matters.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the tokenizer and model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def _encode(self, text):
        """Return pooled embeddings as a ``(batch, dim)`` tensor.

        Returns ``None`` (after printing the error) when tokenization fails,
        so the public methods can keep their "empty list on failure" contract.
        """
        try:
            inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        except Exception as e:
            print(f"Error during tokenization: {e}")
            return None
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state

        mask = inputs.get("attention_mask")
        if mask is None:
            # No mask available: fall back to a plain mean over all positions.
            return hidden.mean(dim=1)

        # Weight every token by its mask value so padding positions do not
        # contribute to the average when several texts are batched together.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1.0)  # avoid division by zero
        return (hidden * weights).sum(dim=1) / token_counts

    def embed_text(self, text):
        """Embed ``text`` and return the result as (nested) Python lists.

        Size-1 dimensions are squeezed away, so a single string yields a flat
        list of floats. Returns ``[]`` if tokenization fails.
        """
        pooled = self._encode(text)
        if pooled is None:
            return []
        return pooled.squeeze().tolist()

    def embed_query(self, text):
        """Embed a single query string; returns a flat list of floats."""
        return self.embed_text(text)

    def embed_documents(self, text):
        """Embed a collection of texts; returns one vector per text.

        Unlike ``embed_text``, the batch dimension is never squeezed, so a
        one-element list still yields ``[[...]]``. A bare string keeps the
        historical behaviour of returning a single flat vector.
        """
        if isinstance(text, str):
            return self.embed_text(text)
        texts = list(text)
        if not texts:
            return []
        pooled = self._encode(texts)
        if pooled is None:
            return []
        return pooled.tolist()

# Vector store over `Article` nodes: the text that gets embedded comes from the
# `name` and `abstract` properties, and vectors live in the `embedding`
# property under the index named "articles" (see the keyword arguments below).
vector_index = Neo4jVector.from_existing_graph(
    CustomHuggingFaceEmbeddings(),
    # Connection settings are taken from the environment.
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
    # Which nodes/properties are indexed.
    node_label="Article",
    text_node_properties=["name", "abstract"],
    embedding_node_property="embedding",
    index_name="articles",
)

In [4]:
def query_from_mistral(context: str, user_input: str):
    """Ask the configured Mistral chat model a question grounded in ``context``.

    ``context`` is embedded in the system prompt and ``user_input`` is sent as
    the user turn. The content of the first returned choice is handed back.
    """
    system_turn = {
        "role": "system",
        "content": f"Use the following context to answer the query:\n{context}",
    }
    user_turn = {"role": "user", "content": user_input}

    completion = client.chat.completions.create(
        model=MISTRAL_MODEL_NAME,
        messages=[system_turn, user_turn],
        max_tokens=500,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [5]:
from neo4j import GraphDatabase

# Neo4j driver shared by the Cypher helpers in this notebook; the URI and
# credentials are read from the environment.
driver = GraphDatabase.driver(
    os.environ['NEO4J_URI'],
    auth=(os.environ['NEO4J_USERNAME'], os.environ['NEO4J_PASSWORD']),
)

def query_article_keywords(name):
    """Return the ``k`` value of every row matching the article called ``name``.

    Runs a parameterised Cypher query that follows ``CONTAIN`` relationships
    from the ``Article`` whose ``name`` equals the argument to its ``Keyword``
    nodes. Rows are collected while the session is still open.
    """
    keyword_nodes = []
    with driver.session() as session:
        query = """
        MATCH (a:Article)-[:CONTAIN]->(k:Keyword)
        WHERE a.name = $name
        RETURN k
        """
        for record in session.run(query, name=name):
            keyword_nodes.append(record["k"])
    return keyword_nodes

In [6]:
def extract_data(documents):
    """Turn retrieved documents into plain dictionaries.

    Each document is expected to expose ``metadata`` (a mapping) and
    ``page_content`` (a string made of ``name: ...`` / ``abstract: ...``
    lines; prefixes are matched case-insensitively).

    Lines that do not start with a known prefix are treated as a continuation
    of the field currently being read, so values spanning several lines are
    kept whole instead of being cut after their first line. Lines appearing
    before any known prefix are ignored.

    Returns a list with one dict per document, holding the keys
    ``"Publication Date"``, ``"Title"`` and ``"Abstract"``; any value that
    cannot be found is ``"N/A"``.
    """
    # page_content prefix -> key used in the output dictionary
    field_prefixes = {"name:": "Title", "abstract:": "Abstract"}
    result = []

    for doc in documents:
        publication_date = doc.metadata.get('date_publication', "N/A")

        collected = {}        # output key -> list of raw line fragments
        current_key = None    # field that continuation lines belong to

        for line in doc.page_content.strip().split("\n"):
            lowered = line.lower()
            for prefix, key in field_prefixes.items():
                if lowered.startswith(prefix):
                    # A repeated prefix restarts the field (last one wins).
                    current_key = key
                    collected[key] = [line[len(prefix):]]
                    break
            else:
                if current_key is not None:
                    collected[current_key].append(line)

        def field_value(key):
            # Join continuation lines back together; "N/A" when never seen.
            if key not in collected:
                return "N/A"
            return "\n".join(collected[key]).strip()

        result.append({
            "Publication Date": publication_date,
            "Title": field_value("Title"),
            "Abstract": field_value("Abstract"),
        })

    return result


In [7]:
# Free-text question used both for retrieval and, later, as the user prompt.
query = "which articles discuss Molecular docking"

# Fetch the five closest articles from the vector index.
similar = vector_index.similarity_search(query, k=5)

# Show the text of the best match as a quick sanity check.
print(similar[0].page_content)


name: Identifying molecular targets of Aspiletrein-derived steroidal saponins in lung cancer using network pharmacology and molecular docking-based assessments
abstract: © 2023, The Author(s).Lung cancer is one of the leading cancers and causes of cancer-related deaths worldwide. Due to its high prevalence and mortality rate, its clinical management remains a significant challenge. Previously, the in vitro anticancer activity of Aspiletrein A, a steroid and a saponin from Aspidistra letreae, against non-small cell lung cancer (NSCLC) cells was reported. However, the anticancer molecular mechanism of other Aspiletreins from A. letreae remains unknown. Using in silico network pharmacology approaches, the targets of Aspiletreins were predicted using the Swiss Target Prediction database. In addition, key mediators in NSCLC were obtained from the Genetic databases. The compound-target interacting networks were constructed using the STRING database and Cytoscape, uncovering potential target

In [8]:
# One dict per retrieved article (title, abstract, publication date).
data_dict = extract_data(similar)

# Attach the comma-joined keyword texts of every article, looked up by title.
for data in data_dict:
    title = data['Title']
    keywords = [dict(node)['text'] for node in query_article_keywords(title)]
    data['keywords'] = ','.join(keywords)

# Flatten all article records into the text block sent to the LLM as context.
context = '\n'.join(
    f"Title: {doc['Title']}\n"
    f"Abstract: {doc['Abstract']}\n"
    f"Publication Date: {doc['Publication Date']}\n"
    f"Keywords: {doc['keywords']}"
    for doc in data_dict
)

print(context)


Title: Identifying molecular targets of Aspiletrein-derived steroidal saponins in lung cancer using network pharmacology and molecular docking-based assessments
Abstract: © 2023, The Author(s).Lung cancer is one of the leading cancers and causes of cancer-related deaths worldwide. Due to its high prevalence and mortality rate, its clinical management remains a significant challenge. Previously, the in vitro anticancer activity of Aspiletrein A, a steroid and a saponin from Aspidistra letreae, against non-small cell lung cancer (NSCLC) cells was reported. However, the anticancer molecular mechanism of other Aspiletreins from A. letreae remains unknown. Using in silico network pharmacology approaches, the targets of Aspiletreins were predicted using the Swiss Target Prediction database. In addition, key mediators in NSCLC were obtained from the Genetic databases. The compound-target interacting networks were constructed using the STRING database and Cytoscape, uncovering potential target

In [9]:
# Send the assembled context together with the original question to the model.
response = query_from_mistral(context=context.strip(), user_input=query)
print(response)

1. Title: Identifying molecular targets of Aspiletrein-derived steroidal saponins in lung cancer using network pharmacology and molecular docking-based assessments
   Abstract: This study uses molecular docking to investigate the interaction between key identified targets (STAT3, VEGFA, HSP90AA1, FGF2, and IL2) and Aspiletreins in non-small cell lung cancer. Publication Date: 2023-12-01

2. Title: Discovery of Novel EGFR Inhibitor Targeting Wild-Type and Mutant Forms of EGFR: In Silico and In Vitro Study
   Abstract: The study uses molecular docking to evaluate the potential of several compounds to suppress wild-type and mutant EGFR, and to understand the interactions of the most promising compound, PD13, with EGFR at the molecular level.Publication Date: 2023-04-01

3. Title: Structure–activity relationships and molecular docking analysis of Mcl-1 targeting renieramycin T analogues in patient-derived lung cancer cells
   Abstract: This study performs molecular docking to investigate t