In [1]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import ConfigurableField
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase
import os
from langchain_community.vectorstores import Neo4jVector
from langchain_community.graphs import Neo4jGraph
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.llms import Ollama

In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Settings the rest of the notebook reads from the process environment.
_REQUIRED_ENV_VARS = ("OPENAI_API_KEY", "NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")

# Fail fast with a readable message. Previously a missing variable surfaced as
# an opaque TypeError from `os.environ[...] = None`.
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: "
        + ", ".join(_missing_env_vars)
        + ". Define them in your .env file or shell before running this notebook."
    )

# The values already live in os.environ (that is where getenv reads them from),
# so writing them back is unnecessary; only the module-level names are kept.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

In [3]:
# Neo4j connection used by the notebook for graph writes and Cypher queries.
# No arguments are passed here; presumably the connection settings come from
# the NEO4J_* environment variables loaded earlier — confirm.
graph = Neo4jGraph()

In [4]:
# Load every .txt file in the dataset directory (replace 'pdfs-new' with your own dataset).
loader = DirectoryLoader('pdfs-new', glob="*.txt", show_progress=True)
docs = loader.load()
# Print a short summary rather than the full text of every document, which
# floods the notebook output and bloats the saved file.
print(f"Loaded {len(docs)} document(s):")
print(
    "\n".join(
        f"- {d.metadata.get('source', '<unknown>')}: {len(d.page_content)} characters"
        for d in docs
    )
)

100%|██████████| 1/1 [00:02<00:00,  2.08s/it]






In [5]:
# Split the loaded documents into token-based chunks of 512 tokens with a
# 24-token overlap between consecutive chunks.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
# Only the first three loaded documents are chunked.
documents = text_splitter.split_documents(docs[:3])

In [4]:
from langchain_openai import ChatOpenAI
# Shared chat model (temperature 0) used for graph extraction, entity
# extraction and the final answer chain.
llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")


# TODO: Add Ollama compatibility for LLMGraphTransformer


In [7]:
# Transformer that uses the shared `llm` to turn text chunks into graph documents.
llm_transformer = LLMGraphTransformer(llm=llm)

In [8]:
# Extract nodes and relationships from every chunk; each result keeps a
# reference to its source chunk (see the `source=Document(...)` in the output).
graph_documents = llm_transformer.convert_to_graph_documents(documents)

In [9]:
# Inspect the extracted graph documents (nodes, relationships, source chunk).
graph_documents 

[GraphDocument(nodes=[Node(id='Nasa', type='Organization'), Node(id='Discovery', type='Mission class'), Node(id='New Frontiers', type='Mission class'), Node(id='Principal Investigators', type='Person')], relationships=[Relationship(source=Node(id='Nasa', type='Organization'), target=Node(id='Discovery', type='Mission class'), type='COMPETED'), Relationship(source=Node(id='Nasa', type='Organization'), target=Node(id='New Frontiers', type='Mission class'), type='COMPETED'), Relationship(source=Node(id='Principal Investigators', type='Person'), target=Node(id='Nasa', type='Organization'), type='LEAD')], source=Document(page_content='Title: What are competed planetary missions? Link: https://www.planetary.org/articles/what-are-competed-planetary-missions Detailed Text: Casey Dreier•May 30, 2024 When The Planetary Society advocates for a balanced portfolio of planetary exploration missions at NASA, we frequently mention the importance ofcompetedmissions as part of that balance. In essence, 

In [10]:
# Persist the extracted graph documents to Neo4j.
# NOTE(review): baseEntityLabel=True is presumably what adds the `__Entity__`
# label that the full-text index created later targets, and include_source=True
# presumably stores the source chunks linked through the MENTIONS relationships
# that later queries exclude — confirm against the Neo4jGraph documentation.
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [11]:
# Default visualisation query: up to 50 (source, relationship, target) triples,
# skipping relationships of type MENTIONS.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

In [12]:
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase

In [13]:
def showGraph(cypher: str = default_cypher):
    """Run a Cypher query against Neo4j and render its result as a graph widget.

    Args:
        cypher: Cypher query whose result graph is displayed. Defaults to
            ``default_cypher``.

    Returns:
        The ``GraphWidget`` that was displayed.
    """
    # Context managers close the session and the driver even when the query
    # fails; previously both were left open on every call.
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    ) as driver, driver.session() as session:
        # Build the widget while the session is still open so the result graph
        # is fully consumed before the connection is released.
        widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    display(widget)
    return widget

In [14]:
# showGraph() already calls display(widget). The trailing semicolon stops the
# notebook from also echoing the returned widget, which rendered the graph twice.
showGraph();

GraphWidget(layout=Layout(height='800px', width='100%'))

GraphWidget(layout=Layout(height='800px', width='100%'))

In [15]:
from langchain_openai import OpenAIEmbeddings
# Vector index built on top of nodes that already exist in the graph:
# `Document` nodes, reading their `text` property and keeping the embedding in
# the `embedding` property. Embeddings come from OpenAIEmbeddings.
# NOTE(review): search_type="hybrid" presumably combines vector and keyword
# search — confirm against the Neo4jVector documentation.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

In [16]:
# Create (if not already present) the full-text index named `entity` over the
# `id` property of `__Entity__` nodes; structured_retriever queries it by name.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

[]

In [17]:
from langchain_core.pydantic_v1 import BaseModel, Field
# Extract entities from text
# Structured-output schema for the entity-extraction chain.
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the model as part of the schema by with_structured_output, so
# treat them as prompt text rather than plain documentation — confirm before
# rewording either.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Names of the entities found in the input text.
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

In [18]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate

In [19]:
# Prompt for the entity-extraction step: a fixed system instruction followed by
# a human turn that carries the text to analyse in `{question}`.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are extracting organization and person entities from the text."),
    ("human", "Use the given format to extract information from the following input: {question}"),
])

In [20]:
# Entity extractor: the extraction prompt piped into the shared LLM, with the
# output parsed into an `Entities` instance.
entity_chain = prompt | llm.with_structured_output(Entities)

In [21]:
# Quick check of the entity extractor on a sample question.
entity_chain.invoke({"question": "In which year detection of gravitational wave happened?"})

Entities(names=['detection', 'gravitational wave'])

In [22]:
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

In [23]:
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy Lucene full-text query from free text.

    Lucene special characters are stripped from ``input``. Every remaining
    whitespace-separated word gets the fuzzy suffix ``~2`` (tolerating small
    misspellings), and the terms are combined with ``AND``.

    Args:
        input: Raw text, typically an entity name.

    Returns:
        A query such as ``"planetary~2 AND society~2"``. Returns an empty
        string when no searchable word remains, where the previous version
        raised ``IndexError``.
    """
    words = [el for el in remove_lucene_chars(input).split() if el]
    # join() yields the same "a~2 AND b~2" shape as the manual loop did, and
    # handles the zero-word case without indexing into an empty list.
    return " AND ".join(f"{word}~2" for word in words)

In [24]:
# Fulltext index query
def structured_retriever(question: str) -> str:
    """Collect the graph neighbourhood of the entities mentioned in a question.

    Entities are extracted from ``question`` with ``entity_chain``. Each one is
    looked up in the ``entity`` full-text index (at most two matching nodes),
    and every non-MENTIONS relationship of those nodes, in either direction, is
    rendered as a ``source - TYPE -> target`` line.

    Args:
        question: Natural-language question to extract entities from.

    Returns:
        All relationship lines joined by newlines, or an empty string when
        nothing matched.
    """
    lines = []
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        # An entity made up only of Lucene special characters has no searchable
        # term; skip it rather than sending a broken query to the index.
        if not remove_lucene_chars(entity).split():
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        # Gather lines across all entities and join once at the end. Appending
        # a per-entity joined string glued the last line of one entity onto the
        # first line of the next.
        lines.extend(el['output'] for el in response)
    return "\n".join(lines)


In [25]:
# Quick check of the graph-side retriever on a sample question.
print(structured_retriever("What are competed planetary missions"))




In [26]:
def retriever(question: str):
    """Build the combined context string for a question.

    Logs the search query, then gathers graph relationships through
    ``structured_retriever`` and text passages through a similarity search on
    ``vector_index``, and returns both under labelled headings.
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question)
    similar_docs = vector_index.similarity_search(question)
    passages = "#Document ".join(doc.page_content for doc in similar_docs)
    return (
        "Structured data:\n"
        f"{graph_context}\n"
        "Unstructured data:\n"
        f"{passages}\n"
        "    "
    )

In [27]:
# Prompt text that rewrites a follow-up question into a standalone question,
# using the `{chat_history}` and `{question}` placeholders.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

In [28]:
# Prompt template built from `_template`; used by the chat-history branch of
# `_search_query`.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [29]:
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

In [30]:
# Turns the chain input into the query string handed to the retriever.
_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        # Reuse the notebook's shared `llm` (also temperature 0) instead of a
        # second ad-hoc ChatOpenAI, so a model change in one place applies to
        # the whole pipeline.
        | llm
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [44]:
# Answer prompt text: restricts the model to the retrieved `{context}` when
# answering `{question}`.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise and give a detailed explanation of the question with links and references.
Answer:"""

In [45]:
# Answer prompt for the final chain.
# NOTE: this rebinds `prompt`, which earlier held the entity-extraction prompt.
# `entity_chain` was built before this point and keeps the earlier object, but
# re-running the `entity_chain` cell after this one would pick up this prompt.
prompt = ChatPromptTemplate.from_template(template)

In [46]:

# Final question-answering chain: retrieve context for the (possibly condensed)
# question, fill the answer prompt, call the LLM and return plain text.
chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            # Forward only the question text. RunnablePassthrough() handed the
            # whole input dict to the prompt, so `{question}` was rendered as
            # the dict's repr (including any chat history).
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [47]:
# Run the full chain on a sample question (no chat history) and print the answer.
response = chain.invoke({"question": "What are competed planetary missions? give a link to the article as well"})
print(response)

Search query: What are competed planetary missions? give a link to the article as well
Competed planetary missions are missions where various scientific teams compete for funding opportunities released by NASA every few years to propose new planetary exploration missions. These missions fall under two classes: Discovery and New Frontiers, with cost caps of $500 million and $900 million respectively. Competed missions are led by Principal Investigators who assemble science and engineering teams to submit detailed proposals to NASA. These missions are science-focused, scientist-led, and historically cost less and fly more frequently than flagship missions. For more information, you can refer to the article "What are competed planetary missions?" on The Planetary Society website: [Link](https://www.planetary.org/articles/what-are-competed-planetary-missions).
