# QA Model using Knowledge Graph and RAG

### Install the requirements for this file

In [None]:
pip install -r requirments.txt

Import all necessary libraries

In [None]:
import os
from langchain_neo4j import Neo4jGraph
from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase
from typing import Tuple, List, Optional
from langchain_community.vectorstores import Neo4jVector
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import OpenAIEmbeddings
from pydantic import BaseModel, Field
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain.schema.output_parser import StrOutputParser
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.chains.openai_functions import create_structured_output_runnable
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda, RunnableBranch, RunnableParallel


Set Environment Variables

In [None]:
# Load API credentials and Neo4j connection settings from environment variables.
# os.getenv returns None for any variable that is not set.
api_key = os.getenv('OPENAI_API_KEY')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
url = os.getenv('NEO4J_URI')
# NOTE(review): NEO4J_DATABASE is read here but not referenced again in the visible code.
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')

Connect to the Neo4j Database

In [None]:
# Establish the Neo4j connection used below to store graph documents and run Cypher queries
graph = Neo4jGraph(url=url, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

**Option 1: load the source material from Wikipedia**

In [None]:
# Ask the user for a topic and load the matching Wikipedia articles as the raw documents
search = input("Enter the topic you want to train the model on: ")
raw_document = WikipediaLoader(query=search).load()

**Option 2: load the source material from a PDF file**

In [None]:
# Load the raw documents from a local PDF.
# This assigns the same `raw_document` name as the Wikipedia cell, so whichever
# of the two cells runs last determines the data used downstream.
pdf_path = "Constitution.pdf"  # Replace with the actual file path
loader = PyPDFLoader(pdf_path)
raw_document = loader.load()

Split the text into small chunks

In [None]:
# Split the loaded documents into token-based chunks (chunk_size=1024, chunk_overlap=48)
text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=48)
documents = text_splitter.split_documents(raw_document)

In [None]:
# Initialize the chat model reused for graph extraction, entity extraction and final answering.
# temperature=0 is set explicitly on this instance.
llm = ChatOpenAI(api_key=api_key, model_name="gpt-3.5-turbo", temperature=0)

Data to Graph

In [None]:
# Transformer that uses the LLM to convert text documents into graph documents
llm_transformer = LLMGraphTransformer(llm = llm)

In [None]:
# Run the LLM transformer over the text chunks to obtain graph documents
graph_document = llm_transformer.convert_to_graph_documents(documents)

In [None]:
# Persist the extracted graph documents into Neo4j.
# NOTE(review): baseEntityLabel presumably adds the shared `__Entity__` label that the
# full-text index created later in this notebook is built on, and include_source
# presumably stores the source chunks alongside the entities — confirm against the
# Neo4jGraph.add_graph_documents documentation.
graph.add_graph_documents(
    graph_document,
    baseEntityLabel = True,
    include_source = True
)

**Showing the Graph stored in the Neo4j Database**

In [None]:
# Default visualisation query: every relationship whose type is not MENTIONS,
# returned together with its start and end nodes.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t"

In [None]:
def showGraph(cypher: str=default_cypher):
    """Run a Cypher query and render its result graph as a notebook widget.

    Args:
        cypher: Cypher query whose result is visualised. Defaults to
            ``default_cypher``.

    Returns:
        The ``GraphWidget`` that was displayed.
    """
    # Context managers close the session and the driver once the result graph
    # has been handed to the widget; previously both were left open, leaking a
    # connection on every call.
    with GraphDatabase.driver(
        uri=url,
        auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
    ) as driver:
        with driver.session() as session:
            widget = GraphWidget(graph=session.run(cypher).graph())

    # Label each node with its `id` property.
    widget.node_label_mapping = 'id'
    # `display` is the notebook-provided display function (no import in this file).
    display(widget)
    return widget

In [None]:
# Render the stored graph using the default Cypher query
showGraph()

Create a vector index over the Document nodes of the graph

In [None]:
# Build a vector store on top of the existing graph: nodes labelled "Document"
# are embedded from their "text" property with OpenAI embeddings, the vector is
# kept in the "embedding" property, and the search type is set to "hybrid".
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(api_key = api_key),
    search_type = "hybrid",
    node_label = "Document",
    text_node_properties = ["text"],
    embedding_node_property = "embedding",
    url = url,
    username = NEO4J_USERNAME,
    password = NEO4J_PASSWORD
)

In [None]:
# Structured-output schema for entity extraction: the LLM is asked to fill this model.
class Entities(BaseModel):
    """Identifying information about entities"""
    # Names of the entities found in the input text.
    # The keyword was previously misspelled ("destription"), so the text was
    # never attached as the field description in the generated schema.
    names: List[str] = Field(
        ...,
        description="All entities that appear in the text",
    )

Prompt for extracting entities from the question

In [None]:
# Chat prompt for entity extraction: a fixed system instruction plus a human
# message carrying the user's text in the {question} placeholder.
# Note: the name `prompt` is reassigned to the answering prompt later in this notebook.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

Create a chain to show all the relationships

In [None]:
# Pipe the extraction prompt into the LLM configured to return an Entities object
entity_chain = prompt | llm.with_structured_output(Entities)

In [None]:
# Quick manual check of the entity-extraction chain on a sample question
entity_chain.invoke({"question": "Which is the largest Planet?"})

In [None]:
# Create (if it does not already exist) a full-text index named 'entity' over
# the `id` property of nodes labelled __Entity__.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

Generate the full-text search query for an entity name

In [None]:
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy full-text query string for the Neo4j full-text index.

    Lucene special characters are stripped from the input, the remainder is
    split on whitespace, and each word is suffixed with ``~2`` before the
    words are combined with ``OR``.

    Args:
        input: Free text (for example an entity name) to search for.

    Returns:
        The query string, or an empty string when no words remain after
        sanitising.
    """
    sanitized = remove_lucene_chars(input)
    fuzzy_terms = []
    for token in sanitized.split():
        # str.split() without arguments never yields empty tokens.
        fuzzy_terms.append(f"{token}~2")
    # Joining an empty list gives "", which covers the no-words case.
    return " OR ".join(fuzzy_terms)

Use the generated full-text query inside a Neo4j Cypher query to retrieve structured data

In [None]:
def structured_retriever(question: str, max_entities: int = 5, max_results: int = 100) -> str:
    """Retrieve graph relationships for the entities mentioned in a question.

    Entities are extracted from the question with ``entity_chain``. Each entity
    is looked up in the ``entity`` full-text index (top 5 matching nodes), and
    the incoming and outgoing MENTIONS / RELATED_TO / DEFINES relationships of
    those nodes are rendered as text lines.

    Args:
        question: The user's question.
        max_entities: Maximum number of extracted entities to look up (default: 5).
        max_results: Maximum number of relationship lines returned per entity
            (default: 100).

    Returns:
        A newline-separated string of relationship lines. Entities with no
        usable search terms or no matches contribute a
        "No results found for entity: ..." line. If entity extraction itself
        fails, an "Error extracting entities: ..." message is returned instead
        of raising.
    """
    try:
        entities = entity_chain.invoke({"question": question})
    except Exception as e:
        # Deliberate best effort: report the failure as text so the caller can
        # still fall back on the unstructured (vector) results.
        return f"Error extracting entities: {e}"

    # The structured-output call may produce no object at all; treat that the
    # same as "no entities found" instead of failing on attribute access.
    names = getattr(entities, "names", None) or []

    # Loop-invariant Cypher: look up matching nodes, then collect their
    # outgoing and incoming relationships of the selected types.
    cypher = """CALL db.index.fulltext.queryNodes('entity', $query)
                YIELD node,score
                WITH node ORDER BY score DESC LIMIT $entity_limit
                CALL {
                    WITH node
                    MATCH (node)-[r]->(neighbor)
                    WHERE type(r) IN ['MENTIONS', 'RELATED_TO', 'DEFINES']
                    RETURN node.id + ' -[' + type(r) + ']-> ' + neighbor.id + ': ' + coalesce(neighbor.text, '') AS output
                    UNION ALL
                    WITH node
                    MATCH (node)<-[r]-(neighbor)
                    WHERE type(r) IN ['MENTIONS', 'RELATED_TO', 'DEFINES']
                    RETURN neighbor.id + ' -[' + type(r) + ']-> ' + node.id + ': ' + coalesce(node.text, '') AS output
                }
                RETURN output LIMIT $result_limit
                """

    sections = []
    for entity in names[:max_entities]:
        full_text_query = generate_full_text_query(entity)
        if not full_text_query:
            # Nothing searchable is left after sanitising (e.g. the entity was
            # only punctuation); an empty query string is not sent to the index.
            sections.append(f"No results found for entity: {entity}")
            continue

        response = graph.query(
            cypher,
            {
                "query": full_text_query,
                "entity_limit": 5,
                "result_limit": max_results,
            },
        )

        # Rows whose concatenated output is null are dropped.
        lines = [el['output'] for el in response if el['output'] is not None]
        if lines:
            sections.append("\n".join(lines))
        else:
            sections.append(f"No results found for entity: {entity}")

    return "\n".join(sections).strip()

In [None]:
def retriever(question: str):
    """Collect graph-based and vector-based context for a question.

    Prints the search query, fetches structured data via
    ``structured_retriever`` and unstructured passages via
    ``vector_index.similarity_search``, and returns both combined in a single
    text block.
    """
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    hits = vector_index.similarity_search(question)
    # Passages are separated by the literal "#Document " marker.
    joined_documents = "#Document ".join(doc.page_content for doc in hits)
    final_data = f"""Structured data:
  {structured_data}
  unstructured data:
  {joined_documents}
  """
    return final_data


Template to extract Standalone question

In [None]:
# Prompt text used to rewrite a follow-up question into a standalone question;
# expects the {chat_history} and {question} placeholders to be filled in.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its own language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

In [None]:
# Prompt template built from the question-condensing text above
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

Create a conversation from the chat history

In [None]:
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
  buffer = []
  for human, ai in chat_history:
    buffer.append(HumanMessage(content=human))
    buffer.append(AIMessage(content=ai))
  return buffer

Add chat history and question to find the relationship

In [None]:
# Decide which text is used as the search query:
#   - if the input has a non-empty "chat_history", format the history into
#     messages and ask an LLM to condense history + follow-up into a
#     standalone question;
#   - otherwise use the incoming "question" unchanged.
# NOTE(review): the condensing step creates its own ChatOpenAI with default
# model/temperature instead of reusing `llm` — confirm this is intended.
_search_query = RunnableBranch(
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name = "HasChatHistoryCheck"
        ),
        RunnablePassthrough.assign(
            chat_history = lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(api_key = api_key)
        | StrOutputParser()

    ),
    RunnableLambda(lambda x: x["question"]),

)

Prompt Template to answer in Natural language

In [None]:
# Answering prompt text: the model must answer from the supplied {context}
# only, for the given {question}.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""

In [None]:
# Reassigns `prompt` (earlier the entity-extraction prompt) to the answering prompt.
# `entity_chain` was composed before this line, so it keeps the earlier prompt object.
prompt = ChatPromptTemplate.from_template(template)

Chain to search through the graph to get the answer

In [None]:
# Answering pipeline: resolve the search query, retrieve graph + vector
# context for it, then let the LLM answer the question from that context.
chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            # Forward only the question text. A bare RunnablePassthrough()
            # here hands the whole input dict to the prompt, which then
            # renders as "Question: {'question': '...'}".
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Interactive question loop: answer questions until the user types 'exit'.
while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " still quit.
        question = input("Enter your Question or 'exit' to exit: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the session cleanly instead of
        # surfacing a traceback.
        print("GoodBye")
        break
    if question.lower() == "exit":
        print("GoodBye")
        break
    if not question:
        # Skip blank input rather than spending an LLM call on an empty question.
        continue
    print(chain.invoke({"question": question}))