# QA Model using Knowledge Graph and RAG

Import all necessary libraries

In [1]:
import os
import glob
from langchain_neo4j import Neo4jGraph
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase
from typing import Tuple, List
from langchain_community.vectorstores import Neo4jVector
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import OpenAIEmbeddings
from pydantic import BaseModel, Field
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain.schema.output_parser import StrOutputParser
from langchain.schema import AIMessage, HumanMessage
from langchain.chains.openai_functions import create_structured_output_runnable
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda, RunnableBranch, RunnableParallel


Set Environment Variables

In [3]:
# Read credentials and connection settings from the process environment.
# Any variable that is not set resolves to None.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE')

Connect to the Neo4j Database

In [4]:
# Open the LangChain graph wrapper around the Neo4j instance
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

**Store all PDFs in a Single Folder**

In [None]:
def load_pdfs_from_folder(folder_path: str) -> dict:
  """Load the PDFs found in each immediate subfolder of ``folder_path``.

  Only direct subfolders are scanned (no recursion), and PDFs lying directly
  in ``folder_path`` are ignored. Subfolders and files are processed in sorted
  order so repeated runs produce the same document order. Progress is printed
  as a numbered list of subfolders and files.

  Args:
    folder_path: The path to the folder containing subfolders with PDF files.

  Returns:
    A dictionary mapping each subfolder name to the list of documents loaded
    from all "*.pdf" files in that subfolder (empty list if it has none).

  Raises:
    OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
  """
  folder_dict = {}
  # os.listdir gives entries in arbitrary order; sort for reproducibility.
  subfolders = sorted(
      name for name in os.listdir(folder_path)
      if os.path.isdir(os.path.join(folder_path, name))
  )
  for folder_number, subfolder in enumerate(subfolders, start=1):
    print(folder_number, " ", subfolder)
    subfolder_path = os.path.join(folder_path, subfolder)
    # Escape the directory part so characters such as "[" in a folder name
    # are not interpreted as glob patterns.
    pattern = os.path.join(glob.escape(subfolder_path), "*.pdf")
    documents = []
    for file_number, pdf_file in enumerate(sorted(glob.glob(pattern)), start=1):
      print("  ", file_number, ".  ", pdf_file)
      documents.extend(PyPDFLoader(pdf_file).load())
    folder_dict[subfolder] = documents
  return folder_dict

# Root folder of the textbook PDFs (one subfolder per key of the result).
# Set the TEXTBOOKS_DIR environment variable to run on another machine;
# when it is unset the original local path is used.
folder_path = os.getenv(
    "TEXTBOOKS_DIR",
    "/home/vaibhavksir01/Downloads/project/Knowledge Graph/Class 6 textbooks",
)
raw_document = load_pdfs_from_folder(folder_path)

Split the text into small chunks and sort them by source

In [7]:
# Split every loaded page into token-based chunks (1024 tokens, 48 overlap)
# and order the chunks of each folder by their 'source' metadata.
text_splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=48)
documents = {
    subject: sorted(
        text_splitter.split_documents(pages),
        key=lambda chunk: chunk.metadata['source'],
    )
    for subject, pages in raw_document.items()
}

In [8]:
# Normalise the metadata of every chunk in place.
for key, value in documents.items():
    for doc in value:
        meta = doc.metadata
        # Keep the text after the last ". " of the source and drop '.pdf'.
        unit = meta['source'].split(". ")[-1].replace('.pdf', '')
        # Tag the chunk with the folder it came from.
        meta['creator'] = key
        meta['producer'] = key
        # Remove the date fields when present.
        meta.pop('moddate', None)
        meta.pop('creationdate', None)
        meta['source'] = unit

In [5]:
# Chat model shared by the graph transformer, entity extraction and the final chain
llm = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    temperature=0,
)

Data to Graph

In [6]:
# Transformer that uses the LLM to turn text documents into graph documents
llm_transformer = LLMGraphTransformer(llm=llm)



In [12]:
# Convert the chunks of each folder into graph documents, one folder at a time,
# printing a start/complete marker around each conversion.
graph_document = {}
for key, value in documents.items():
    print(f"start {key}")
    graph_document[key] = llm_transformer.convert_to_graph_documents(value)
    print("complete", end="\n\n")

start English
complete

start Science
complete

start Social Science
complete



In [13]:
# Write the graph documents of each folder into the Neo4j graph
for key in graph_document:
    print(key)
    graph.add_graph_documents(graph_document[key], baseEntityLabel=True, include_source=True)

English
Science
Social Science


**Showing the Graph stored in the Neo4j Database**

In [7]:
# Default query for showGraph: returns every directed relationship path in the
# database. NOTE(review): there is no LIMIT clause, so the result grows with the graph.
default_cypher = "MATCH p=(s)-[r]->(t) RETURN p"

In [8]:
def showGraph(cypher: str = default_cypher):
  """Run a Cypher query against Neo4j and return the result as a graph widget.

  Args:
    cypher: The Cypher query whose result graph is displayed. Defaults to
      ``default_cypher``.

  Returns:
    A ``GraphWidget`` built from the query's result graph, with node labels
    mapped to the ``id`` property.
  """
  # The context managers close the session and the driver as soon as the
  # result graph has been handed to the widget, so repeated calls do not
  # leak connections.
  with GraphDatabase.driver(
      uri = NEO4J_URI,
      auth = (NEO4J_USERNAME, NEO4J_PASSWORD)
  ) as driver:
    with driver.session() as session:
      widget = GraphWidget(graph = session.run(cypher).graph())
  widget.node_label_mapping = 'id'
  return widget

In [9]:
# Display the graph returned by the default Cypher query
showGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

Create a vector index over the graph

In [10]:
# Build a hybrid-search vector index from the existing graph: the `text`
# property of `Document` nodes is embedded and stored in their `embedding` property.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(api_key=OPENAI_API_KEY),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)

In [11]:
class Entities(BaseModel):
  """Structured information extracted from a user's question.

  Attributes:
    names: Entity names found in the question. ``structured_retriever`` reads
      this field to look up matching nodes; it defaults to an empty list so
      instances built with only ``marks`` remain valid.
    marks: Number of questions requested for each mark value from 1 to 10.
  """
  names: List[str] = Field(
    default_factory=list,
    description="All the topics, concepts, people, places or other entities that appear in the text",
  )
  marks: dict = Field(
    ...,
    description="""Dictionary containing the number of questions for each mark from 1 to 10.
    Example- input generate 3questions of 1,2,5 marks respectively
    output- {'1': 1, '2': 1, '3': 0, '4': 0, '5': 1, '6': 0, '7': 0, '8': 0, '9': 0, '10': 0}
    """,
  )


Prompt for extracting information

In [12]:
# Two-message prompt used for entity extraction: a system instruction followed
# by the human message carrying the `question` input.
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are extracting entities from the text. The entities should be identified and categorized based on the given format.",
    ),
    (
        "human",
        "Use the given format to extract information from the following input: {question}",
    ),
])


Create a chain to show all the relationships

In [13]:
# Pipe the extraction prompt into the LLM configured to return an `Entities` object
entity_chain = prompt | llm.with_structured_output(Entities)

In [14]:
# Create (only if missing) a full-text index named `entity` over the `id`
# property of `__Entity__` nodes; structured_retriever queries it by that name.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

[]

Generate the full-text query for the question

In [15]:
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy full-text search string from free text.

    Lucene special characters are removed, the remainder is split on
    whitespace, and every word gets a ``~2`` suffix; the words are then
    combined with ``OR``.

    Args:
        input: The input string to generate the query from.

    Returns:
        The combined query string, or an empty string when no words remain.
    """
    cleaned = remove_lucene_chars(input)
    fuzzy_terms = [f"{token}~2" for token in cleaned.split() if token]
    if not fuzzy_terms:
        return ""
    return " OR ".join(fuzzy_terms).strip()

Structure the Query generated to work on the Neo4j Cypher query

In [16]:
def structured_retriever(question: str, max_entities = 5, max_results = 1000) -> str:
    """Retrieves information from the graph based on entities in the question.

    Entities are extracted from the question with ``entity_chain``. For each
    entity the ``entity`` full-text index is searched, and the incoming and
    outgoing MENTIONS / RELATED_TO / DEFINES relationships of the matching
    nodes are rendered as text lines.

    Args:
        question: The user's question.
        max_entities: Maximum number of entities to look up (default: 5).
        max_results: Maximum number of result rows per entity (default: 1000).

    Returns:
        A formatted string containing the retrieved information. It is an
        empty string when no entities were extracted, and an error message
        when the entity extraction itself fails.
    """

    result = ""
    try:
        entities = entity_chain.invoke({"question": question})
    except Exception as e:
        return f"Error extracting entities: {e}"

    # The extraction result may not carry a `names` attribute (or it may be
    # None); treat that as "no entities" instead of raising AttributeError.
    names = getattr(entities, "names", None) or []

    for entity in names[:max_entities]:
        full_text_query = generate_full_text_query(entity)
        if full_text_query:
            response = graph.query(
                """CALL db.index.fulltext.queryNodes('entity', $query)
                YIELD node,score
                WITH node ORDER BY score DESC LIMIT $entity_limit
                CALL {
                    WITH node
                    MATCH (node)-[r]->(neighbor)
                    WHERE type(r) IN ['MENTIONS', 'RELATED_TO', 'DEFINES']
                    RETURN node.id + ' -[' + type(r) + ']-> ' + neighbor.id + ': ' + coalesce(neighbor.text, '') AS output
                    UNION ALL
                    WITH node
                    MATCH (node)<-[r]-(neighbor)
                    WHERE type(r) IN ['MENTIONS', 'RELATED_TO', 'DEFINES']
                    RETURN neighbor.id + ' -[' + type(r) + ']-> ' + node.id + ': ' + coalesce(node.text, '') AS output
                }
                RETURN output LIMIT $result_limit
                """,
                {"query": full_text_query,
                 "entity_limit": 5,
                 "result_limit": max_results}
            )
        else:
            # Nothing searchable is left after removing Lucene characters;
            # skip the index call rather than sending an empty query string.
            response = []

        if response:
            result += "\n".join([el['output'] for el in response if el['output'] is not None]) + "\n"
        else:
            result += f"No results found for entity: {entity}\n"
    return result.strip()

In [17]:
def retriever(question: str):
  """Collect graph-based and vector-based context for a question.

  Prints the search query, then combines the output of
  ``structured_retriever`` with the page contents returned by the vector
  index's similarity search.

  Args:
    question: The search query.

  Returns:
    One string holding the structured data followed by the unstructured
    documents, the latter joined with "#Document ".
  """
  print(f"\nSearch query: {question}\n")
  structured_data = structured_retriever(question)
  matches = vector_index.similarity_search(question)
  unstructured_text = "#Document ".join(match.page_content for match in matches)
  return f"""Structured data:
  {structured_data}
  unstructured data:
  {unstructured_text}
  """

Template to extract Standalone question

In [18]:
_template = """
You are extracting entities from the text.
Use the given format to extract information from the following input: {question}
"""

In [19]:
# Prompt object built from `_template`; used by the chat-history branch of `_search_query`
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

Create a conversation from the chat history

In [20]:
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
  buffer = []
  for human, ai in chat_history:
    buffer.append(HumanMessage(content=human))
    buffer.append(AIMessage(content=ai))
  return buffer

Add chat history and question to find the relationship

In [21]:
# Condition: true when the input dict carries a non-empty chat history.
_has_chat_history = RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
    run_name = "HasChatHistoryCheck"
)

# With history: format it into messages, fill the condense prompt and let a
# chat model (created here with default settings, separate from `llm`)
# produce the search query as plain text.
_condense_question = (
    RunnablePassthrough.assign(
        chat_history = lambda x: _format_chat_history(x["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | ChatOpenAI(api_key = OPENAI_API_KEY)
    | StrOutputParser()
)

# Without history the question itself is used as the search query.
_search_query = RunnableBranch(
    (_has_chat_history, _condense_question),
    RunnableLambda(lambda x: x["question"]),
)

Prompt Template to answer in Natural language

In [22]:
# Final prompt text for the question generator. It has two placeholders:
# {context} (the retrieved data) and {question} (the user's request).
template = """you are a question generator, you generate questions based on following context:
{context}

the difficulty to answer the question dependes on the marks aloted to it with 1 marks being easiest and 10 being the most difficult

Question: {question}
Use natural language and be concise.
Answer:"""

In [23]:
# Build the question-generation prompt from `template`.
# NOTE(review): this rebinds the name `prompt`, which earlier held the
# entity-extraction prompt; re-running the `entity_chain` cell after this one
# would pick up this prompt instead.
prompt = ChatPromptTemplate.from_template(template)

Chain to search through the graph to get the answer

In [24]:
# End-to-end chain: retrieve context for the question, fill the
# question-generation prompt, call the LLM and return its text.
# The chain is invoked with a dict such as {"question": "..."}.
chain = (
    RunnableParallel(
        {"context": _search_query | retriever,
        # Extract the question string; a bare RunnablePassthrough() would
        # forward the whole input dict and the prompt would render
        # "Question: {'question': ...}".
        "question": RunnableLambda(lambda x: x["question"])}
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [25]:
import warnings
import logging

warnings.filterwarnings("ignore")

logging.getLogger().setLevel(logging.CRITICAL)

In [28]:
# Interactive prompt: keep answering until the user types 'exit'.
while True:
  try:
    # Strip surrounding whitespace so " exit " still ends the session.
    question = input("Enter your Question or 'exit' to exit: ").strip()
  except (EOFError, KeyboardInterrupt):
    # No more input is available (or the user interrupted): leave cleanly.
    print("\nGoodBye")
    break
  if question.lower() == "exit":
    print("\nGoodBye")
    break
  if not question:
    # Do not spend an LLM call on an empty line; ask again.
    continue
  print(chain.invoke({"question": question}))


Search query: create a question about si units for 5 marks with its answer

What is the significance of SI units in standard units of measurement? 
Answer: The SI units are internationally adopted standard units of measurement that provide consistency and accuracy in scientific and everyday measurements.

GoodBye
