In [2]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_ollama import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough

In [None]:
import os
from langchain_community.graphs import Neo4jGraph

# Connect to the database and set up the graph.
# The URI, username and password are obtained from the Neo4j Aura console
# (https://console.neo4j.io) after creating an instance. The values below are
# placeholders (each one is just the variable's own name).
os.environ.update(
    {
        "NEO4J_URI": "NEO4J_URI",
        "NEO4J_USERNAME": "NEO4J_USERNAME",
        "NEO4J_PASSWORD": "NEO4J_PASSWORD",
    }
)

# Called with no arguments, so the connection settings presumably come from
# the NEO4J_* environment variables set above -- TODO confirm.
graph = Neo4jGraph()

In [22]:
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# Read the article
raw_documents = PyPDFLoader("PATH-TO-DOCUMENTS").load()

# Define chunking strategy: chunk_size=512 with an overlap of 24 between chunks
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_documents)

# Show a summary and a short preview instead of dumping every chunk of the
# corpus into the cell output.
print(f"Loaded {len(raw_documents)} source documents, split into {len(documents)} chunks")
if documents:
    print(documents[0].page_content[:500])

In [23]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.llms.ollama import Ollama 

# System instructions for graph extraction: ask the model to give every node a
# `Context` property that holds only the document's name.
# The literal "\n" escapes sit at the end of physical lines inside a
# triple-quoted string, so each of those lines ends with two newlines.
prompt_template = (
    """
    You are a knowledge graph engineer working on a project to extract structured data from a document. \n
    Your task is to extract structured data from the following input. \n
    Make sure each node contains the following property: \n
    - Context \n

    Context should only be the documents name. \n
"""
)

# Single-message template built from the same text.
# NOTE(review): `prompt` is not used in this cell and the name is rebound to
# the question-answering prompt later in the notebook.
prompt = ChatPromptTemplate.from_template(prompt_template)

# System + human message pair intended for the graph transformer.
# NOTE(review): `LLMGraphTransformer` below is constructed with `llm` only, so
# this prompt is never handed to it and currently has no effect on extraction.
# NOTE(review): neither message contains a `{...}` template variable, so no
# document text would be interpolated if this prompt were used as-is.
default_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            prompt_template,
        ),
        (
            "human",
            (
                "Tip: Make sure to answer in the correct format and do "
                "not include any explanations. "
                "Use the given format to extract information from the "
                "following input:"
            ),
        ),
    ]
)


# Ollama-served llama3.1 with temperature 0; this same `llm` object is reused
# by the entity extraction and question-answering cells further down.
llm = Ollama(temperature=0, model="llama3.1")
llm_transformer = LLMGraphTransformer(llm=llm)

# Extract graph data: turn the text chunks into graph documents.
graph_documents = llm_transformer.convert_to_graph_documents(documents)

# Persist the extracted graph in the Neo4j database.
# NOTE(review): baseEntityLabel=True presumably adds the `__Entity__` label
# that the fulltext index created later targets, and include_source=True
# presumably keeps the source chunks as `Document` nodes for the vector
# index -- confirm against the Neo4jGraph documentation.
graph.add_graph_documents(
    graph_documents,
    include_source=True,
    baseEntityLabel=True,
)

In [24]:
from langchain.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Neo4jVector

# Embedding model served by Ollama.
embedding = OllamaEmbeddings(model="jina/jina-embeddings-v2-base-en:latest")

# Build a hybrid-search index over the `Document` nodes already in the graph:
# the `text` property is the text source and vectors are kept in the
# `embedding` property.
vector_index = Neo4jVector.from_existing_graph(
    embedding=embedding,
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)


In [25]:
from typing import List
from pydantic import BaseModel, Field
from langchain.prompts import ChatPromptTemplate


# Define the Entities class
class Entities(BaseModel):
    """Identifying information about entities."""
    # Entity names found in a piece of text; `extract_entities` fills this
    # with the parsed LLM reply. `...` as the first `Field` argument means the
    # field has no default value.
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that appear in the text",
    )


# Prompt asking the LLM for a plain comma-separated list of entities; the
# reply is parsed by `process_response`.
# The adjacent string literals are concatenated, so each sentence must end
# with its own punctuation and trailing space.
prompt1 = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting any important entities from the text . "
            "Return only a list of string extracted entities and nothing else. "
            "Format to return should be something like entity1,entity2...",
        ),
        (
            "human",
            "extract information from the following input: {question}",
        ),
    ]
)

def extract_entities(question: str) -> Entities:
    """Ask the LLM for the entities mentioned in ``question``.

    Args:
        question: Free-text user question.

    Returns:
        An ``Entities`` instance whose ``names`` holds the parsed entity names
        (possibly empty).
    """
    # Fill the `{question}` slot of the extraction prompt.
    formatted_prompt = prompt1.format(question=question)

    response = llm.invoke(formatted_prompt)

    # Completion-style LLMs return a plain string, while chat models return a
    # message object carrying the text in `.content`; accept both.
    if isinstance(response, str):
        response_text = response
    else:
        response_text = str(getattr(response, "content", response))

    return Entities(names=process_response(response_text))

def process_response(response: str) -> List[str]:
    """Parse an LLM reply into a list of entity names.

    The expected reply is ``entity1,entity2,...``, but models often answer with
    one entity per line, wrap the list in square brackets, or quote each item.
    All of these are accepted.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        The non-empty entity names in their original order, stripped of
        surrounding whitespace and of one pair of matching quotes.
    """
    text = response.strip()
    # Unwrap a list-style reply such as ["Alice", "Bob"].
    if text.startswith("[") and text.endswith("]"):
        text = text[1:-1]

    names = []
    # Treat line breaks as separators as well as commas.
    for piece in text.replace("\n", ",").split(","):
        name = piece.strip()
        # Drop one pair of matching quotes only, so apostrophes inside or at
        # one end of a name survive.
        if len(name) >= 2 and name[0] == name[-1] and name[0] in "\"'":
            name = name[1:-1].strip()
        if name:
            names.append(name)
    return names



In [26]:
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

# Create the `entity` fulltext index over the `id` property of `__Entity__`
# nodes; IF NOT EXISTS keeps the cell safe to re-run.
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS "
    "FOR (e:__Entity__) ON EACH [e.id]"
)

def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is cleaned with ``remove_lucene_chars`` and split on whitespace.
    Each word gets a fuzzy-match suffix (``~2``, i.e. up to two changed
    characters) and the words are combined with the AND operator. Useful for
    mapping entities from user questions to database values while allowing
    for some misspellings.

    Args:
        input: Raw entity text taken from a user question.

    Returns:
        The query string, e.g. ``"John~2 AND Smith~2"``. An empty string is
        returned when no searchable word is left, instead of raising
        ``IndexError``.
    """
    # str.split() with no argument never yields empty strings, so no extra
    # filtering is needed.
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

In [27]:
# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned in the question.

    For every entity extracted from ``question``, the ``entity`` fulltext
    index is searched and the non-MENTIONS relationships of each hit are
    rendered as ``source - TYPE -> target`` lines.

    Args:
        question: Free-text user question.

    Returns:
        All relationship lines joined by newlines, or an empty string when
        nothing was found.
    """
    lines = []
    entities = extract_entities(question)
    for entity in entities.names:
        full_text_query = generate_full_text_query(entity)
        # Skip entities that leave no searchable word; an empty query string
        # is not a valid fulltext search.
        if not full_text_query:
            continue
        # `WITH node` imports the matched node into each branch of the
        # subquery. Without it the inner `node` is a fresh variable and the
        # MATCH is not restricted to the fulltext hit.
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:10})
            YIELD node, score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": full_text_query},
        )
        lines.extend(el["output"] for el in response)
    # Join once at the end so results of consecutive entities are separated by
    # a newline rather than fused together.
    return "\n".join(lines)

In [28]:

def retriever(question: str):
    """Build the combined context string for ``question``.

    The graph neighbourhood from ``structured_retriever`` and the page
    contents returned by the vector index are placed under the
    "Structured data:" and "Unstructured data:" headings respectively.
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question)

    passages = []
    for doc in vector_index.similarity_search(question):
        passages.append(doc.page_content)
    joined_passages = "#Document ".join(passages)

    # The layout, including indentation and trailing spaces, is part of the
    # returned text.
    return (
        "Structured data:\n"
        f"    {graph_context}\n"
        "    Unstructured data:\n"
        f"    {joined_passages}\n"
        "        "
    )

In [29]:
import json

def retrieverJson(question: str):
    """Return the page contents of the vector-index hits for ``question``.

    Only the unstructured passages are returned, so the graph lookup is not
    run: its result was never used and it cost an LLM call plus database
    queries on every invocation.

    Args:
        question: Free-text user question.

    Returns:
        A list with the ``page_content`` of each hit (possibly empty).
    """
    return [el.page_content for el in vector_index.similarity_search(question)]

In [30]:
# Condense a chat history and follow-up question into a standalone question.
# The template has two variables: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
# String prompt built from the template above; used by the `_search_query`
# branch when the input carries a chat history.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Flatten ``(human, ai)`` turn pairs into an alternating message list.

    Each pair contributes a ``HumanMessage`` followed by an ``AIMessage``, in
    the order the turns appear in ``chat_history``.
    """
    messages = []
    for user_text, assistant_text in chat_history:
        messages.extend(
            (HumanMessage(content=user_text), AIMessage(content=assistant_text))
        )
    return messages

# Condition: the input dict carries a non-empty "chat_history".
_has_chat_history = RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
    run_name="HasChatHistoryCheck"
)

# Condense the follow-up question and the chat history into a standalone
# question: format the history as messages, fill the prompt, call the LLM and
# keep the text of the reply.
_condense_question = (
    RunnablePassthrough.assign(
        chat_history=lambda x: _format_chat_history(x["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | llm
    | StrOutputParser()
)

_search_query = RunnableBranch(
    # With chat history: rewrite the follow-up as a standalone question.
    (_has_chat_history, _condense_question),
    # Without chat history: pass the question through unchanged.
    RunnableLambda(lambda x: x["question"]),
)

In [31]:
# Answering prompt: the model sees the retrieved context and the user's
# question. Template variables: {context} and {question}.
template = """Answer the question based only on the following context:
{context}

Question: {question}
System:

Use natural language and be concise.
Add contextual information in your answer. Eg 'the title of the document is: '
'The answer to your given question is' Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# Full question-answering chain. The input is a dict with a "question" key and
# an optional "chat_history" key (the shape `_search_query` reads).
chain = (
    RunnableParallel(
        {
            # Standalone search query -> combined graph + vector context.
            "context": _search_query | retriever,
            # Forward only the question text. Passing the whole input through
            # would render the dict itself (including any chat history) into
            # the "Question:" slot of the prompt.
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Smoke test: fetch the passages retrieved for a sample question and show the
# top one. The guard avoids an IndexError when nothing is retrieved.
ans = retrieverJson("Who all can avail car lease?")
print(ans[0] if ans else "No matching documents found.")