<a href="https://colab.research.google.com/github/sunnysavita10/Indepth-GENAI/blob/main/RAG_With_Knowledge_graph(Neo4j).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **langchain-core**

contains simple, core abstractions that have emerged as a standard, as well as LangChain Expression Language as a way to compose these components together. This package is now at version 0.1 and all breaking changes will be accompanied by a minor version bump.

## **langchain-community**
contains all third-party integrations. We will work with partners on splitting key integrations out into standalone packages over the next month.

## **langchain**
contains higher-level and use-case specific chains, agents, and retrieval algorithms that are at the core of your application's cognitive architecture. We are targeting a launch of a stable 0.1 release for langchain in early January.

In [1]:
# Import statements for various modules and classes needed for the script.
import os  # Basic operating system functionality
from langchain_core.runnables import (  # Runnable classes for different types of operations
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate  # Template for chat prompts
from langchain_core.prompts.prompt import PromptTemplate  # Template for prompts
from typing import Tuple, List, Optional  # Type hints for function signatures
from langchain_core.messages import AIMessage, HumanMessage  # Message classes for AI and human interaction
from langchain_core.output_parsers import StrOutputParser  # Parser for string outputs
from langchain_core.runnables import ConfigurableField  # Configurable fields for operations
from yfiles_jupyter_graphs import GraphWidget  # Widget for displaying graphs in Jupyter
from neo4j import GraphDatabase  # Neo4j graph database connection
from langchain_community.vectorstores import Neo4jVector  # Vector store using Neo4j
from langchain_community.graphs import Neo4jGraph  # Graph operations using Neo4j
from langchain.text_splitter import TokenTextSplitter  # Text splitter based on tokens
from langchain_groq import ChatGroq  # Chat model client for LLMs served by Groq
from langchain_experimental.graph_transformers import LLMGraphTransformer  # Experimental graph transformer
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
import os
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file, if one is present, before reading them.
# load_dotenv is imported at the top of the notebook; without this call the
# import had no effect.
load_dotenv()

# Fail fast with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that assigning None to os.environ raises.
_REQUIRED_ENV_VARS = ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD", "GROQ_API_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.environ.get(name)]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

NEO4J_URI = os.environ["NEO4J_URI"]  # Neo4j connection URI
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]  # Neo4j username
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]  # Neo4j password
GROQ_API_KEY = os.environ["GROQ_API_KEY"]  # Groq API key

# The NEO4J_* and GROQ_API_KEY values are already present in os.environ at this
# point, so they do not need to be written back.
# Kept from the original cell for backward compatibility: it also exported the
# key under this mixed-case name.
# NOTE(review): nothing visible in this notebook reads 'Groq_api_key' — confirm
# whether it can be dropped.
os.environ['Groq_api_key'] = GROQ_API_KEY


In [3]:
# Open the LangChain wrapper around the Neo4j database. `graph` is used by
# later cells to store the extracted graph documents and to run Cypher queries.
graph = Neo4jGraph(
    url=NEO4J_URI,        # Connection URI read from the environment above
    username=NEO4J_USERNAME,  # Login user read from the environment above
    password=NEO4J_PASSWORD,  # Login password read from the environment above
)

  graph = Neo4jGraph(


In [None]:
from langchain.schema import Document

# Synthetic customer-support knowledge base used as the demo corpus.
# Each record has three string fields:
#   "query"               - the customer's question
#   "retrieved_knowledge" - the reference passage that answers it
#   "bot_response"        - a sample answer; not used by the Document conversion below
data = [
    {
        "query": "How can I reset my password?",
        "retrieved_knowledge": "To reset your password, go to the login page and click on 'Forgot Password'. Enter your registered email, and we will send you a reset link.",
        "bot_response": "You can reset your password by clicking on 'Forgot Password' on the login page. A reset link will be sent to your email. Let me know if you need further assistance!"
    },
    {
        "query": "What is your refund policy?",
        "retrieved_knowledge": "Our refund policy allows customers to request a refund within 30 days of purchase if they are not satisfied with the product. The refund will be processed within 5-7 business days.",
        "bot_response": "We offer a 30-day refund policy. If you're not satisfied with the product, you can request a refund, which will be processed within 5-7 business days. Need help initiating a refund?"
    },
    {
        "query": "Do you offer live support?",
        "retrieved_knowledge": "Yes, we offer live chat support from 9 AM to 6 PM (PST) on weekdays. You can also reach us via email outside these hours.",
        "bot_response": "Yes, we provide live chat support from 9 AM to 6 PM (PST) on weekdays. You can also email us anytime, and we'll get back to you as soon as possible!"
    },
    {
        "query": "Where is my order?",
        "retrieved_knowledge": "You can track your order by logging into your account and navigating to the 'Orders' section. If you need further assistance, contact our support team.",
        "bot_response": "You can track your order in the 'Orders' section of your account. If you need additional help, feel free to reach out!"
    },
    {
        "query": "How do I cancel my subscription?",
        "retrieved_knowledge": "To cancel your subscription, go to 'Account Settings' and select 'Manage Subscription'. Click on 'Cancel Subscription' and follow the on-screen instructions.",
        "bot_response": "You can cancel your subscription by going to 'Account Settings' > 'Manage Subscription' > 'Cancel Subscription'. Let me know if you need guidance!"
    },
    {
        "query": "How do I change my billing information?",
        "retrieved_knowledge": "To update your billing information, navigate to 'Account Settings' and select 'Billing'. You can update your payment method and billing details there.",
        "bot_response": "You can change your billing information by going to 'Account Settings' > 'Billing'. Let me know if you need any assistance!"
    }
]

# Wrap every knowledge passage in a LangChain Document. The passage becomes the
# page content and the originating customer query is kept as metadata.
raw_documents = [
    Document(
        page_content=entry["retrieved_knowledge"],
        metadata={"query": entry["query"]},
    )
    for entry in data
]


In [5]:
len(raw_documents)

6

In [6]:
raw_documents[:3]

[Document(metadata={'query': 'How can I reset my password?'}, page_content="To reset your password, go to the login page and click on 'Forgot Password'. Enter your registered email, and we will send you a reset link."),
 Document(metadata={'query': 'What is your refund policy?'}, page_content='Our refund policy allows customers to request a refund within 30 days of purchase if they are not satisfied with the product. The refund will be processed within 5-7 business days.'),
 Document(metadata={'query': 'Do you offer live support?'}, page_content='Yes, we offer live chat support from 9 AM to 6 PM (PST) on weekdays. You can also reach us via email outside these hours.')]

In [7]:
# Initialize a text splitter that divides text into chunks of 512 tokens
# with an overlap of 24 tokens to maintain context between chunks.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)

# Split every raw document (no subset is taken) into chunks with the splitter.
documents = text_splitter.split_documents(raw_documents)

In [8]:
# Groq-hosted chat model with temperature 0.
# NOTE(review): the following cell assigns `llm` again with identical arguments,
# so this cell is redundant.
llm = ChatGroq(temperature=0, model="gemma2-9b-it")

In [9]:
# Initialize a language model (LLM) using ChatGroq with a specified temperature setting.
# "temperature=0" is intended to reduce randomness in the responses.
# The model being used is "gemma2-9b-it".
llm = ChatGroq(temperature=0, model="gemma2-9b-it")

# Create an instance of LLMGraphTransformer, which will process documents
# into a graph-based structure using the LLM.
llm_transformer = LLMGraphTransformer(llm=llm)

# Convert the previously split text documents into graph documents
# (nodes and relationships, each keeping a reference to its source document).
graph_documents = llm_transformer.convert_to_graph_documents(documents)

In [10]:
graph_documents[:5]

[GraphDocument(nodes=[Node(id='Login Page', type='Webpage', properties={})], relationships=[Relationship(source=Node(id='Login Page', type='Webpage', properties={}), target=Node(id='Forgot Password', type='Button', properties={}), type='CONTAINS', properties={})], source=Document(metadata={'query': 'How can I reset my password?'}, page_content="To reset your password, go to the login page and click on 'Forgot Password'. Enter your registered email, and we will send you a reset link.")),
 GraphDocument(nodes=[Node(id='Refund Policy', type='Policy', properties={}), Node(id='Customer', type='Person', properties={}), Node(id='Product', type='Product', properties={})], relationships=[Relationship(source=Node(id='Refund Policy', type='Policy', properties={}), target=Node(id='Customer', type='Person', properties={}), type='ALLOWS', properties={}), Relationship(source=Node(id='Customer', type='Person', properties={}), target=Node(id='Product', type='Product', properties={}), type='PURCHASES', 

In [11]:
# Write the extracted nodes and relationships into the Neo4j database.
graph.add_graph_documents(
    graph_documents,   # Graph documents produced by the LLM transformer above.
    baseEntityLabel=True,  # Presumably adds the shared `__Entity__` label that the full-text index created later targets — verify.
    include_source=True  # Presumably also stores each source document and links it to its entities (the later queries exclude MENTIONS relationships) — verify.
)

In [12]:
# Default query for showGraph: return up to 50 (source, relationship, target)
# triples, skipping relationships of type MENTIONS.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

In [13]:
def showGraph(cypher: str = default_cypher):
    """Run a Cypher query and render the resulting graph in a notebook widget.

    A dedicated Neo4j driver and session are opened from the NEO4J_URI,
    NEO4J_USERNAME and NEO4J_PASSWORD environment variables. Both are closed
    again before returning, so repeated calls do not leak connections.

    Args:
        cypher: Cypher query whose result is rendered as a graph. Defaults to
            ``default_cypher``.

    Returns:
        The ``GraphWidget`` that was displayed.

    Raises:
        KeyError: If one of the NEO4J_* environment variables is not set.
    """
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    ) as driver, driver.session() as session:
        # Build the widget while the session is still open so the query result
        # is fully consumed before the connection is released.
        widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    # `display` is expected to be supplied by the notebook environment.
    display(widget)
    return widget

In [14]:
showGraph()

GraphWidget(layout=Layout(height='640px', width='100%'))

GraphWidget(layout=Layout(height='640px', width='100%'))

In [15]:
# Read the Gemini API key from the environment. os.environ.get returns None
# when GEMINI_API_KEY is unset; no validation is done here.
google_api_key = os.environ.get("GEMINI_API_KEY")


# Embedding model used by the vector index created in the next cell.
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=google_api_key)

In [16]:
# Build a vector index over nodes that are already stored in Neo4j.
# No connection arguments are passed here.
# NOTE(review): presumably the Neo4j connection details are taken from the
# NEO4J_* environment variables — verify.
vector_index = Neo4jVector.from_existing_graph(
    embedding=embeddings,  # Embedding model defined in the previous cell
    search_type="hybrid",  # Presumably combines vector and keyword search — verify
    node_label="Document",  # Only nodes with this label are indexed
    text_node_properties=["text"],  # Node property holding the text to embed
    embedding_node_property="embedding"  # Node property that holds the embedding vector
)

In [17]:
# Create, if it does not exist yet, a full-text index named `entity` over the
# `id` property of `__Entity__` nodes. structured_retriever queries this index.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

[]

In [18]:
from langchain_core.pydantic_v1 import BaseModel, Field
# Extract entities from text
# Output schema for the entity-extraction chain; it is handed to
# llm.with_structured_output(...) in a later cell.
# NOTE(review): the docstring and the Field description are presumably sent to
# the model as part of the schema, so treat them as runtime text — verify
# before rewording them.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Required field (`...` means no default): the entity names found in the text.
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [19]:
# Prompt for entity extraction: a system instruction plus a human message that
# carries the user's text in the {question} placeholder.
# NOTE: the name `prompt` is rebound later to the answer-generation prompt, so
# entity_chain must be built from this value before that cell runs.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

In [20]:
# Entity-extraction chain: the prompt feeds the LLM, whose output is requested
# in the shape of the Entities schema (so results expose a `.names` list).
entity_chain = prompt | llm.with_structured_output(Entities)

In [21]:
entity_chain.invoke({"question": "'Jessica Rios ?"}).names

['Jessica Rios']

In [22]:
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy full-text search query from free text.

    Lucene special characters are stripped with ``remove_lucene_chars``, the
    remainder is split on whitespace, and each word gets the suffix ``~2``
    (an allowance for up to two character edits, per Lucene fuzzy-search
    syntax). The words are then combined with ``AND``.

    Args:
        input: Raw text, typically an entity name. The parameter name shadows
            the builtin but is kept so existing keyword callers keep working.

    Returns:
        The query string, e.g. ``"Jessica~2 AND Rios~2"``. An empty string is
        returned when no searchable word remains; the original code raised
        ``IndexError`` in that case.
    """
    # str.split() with no argument never yields empty strings, so no extra
    # filtering is needed.
    words = remove_lucene_chars(input).split()
    if not words:
        return ""
    return " AND ".join(f"{word}~2" for word in words)


In [23]:
# Fulltext index query
def structured_retriever(question: str) -> str:
    """Collect graph relationships around the entities mentioned in a question.

    Entities are extracted from the question with ``entity_chain``. For each
    one, the ``entity`` full-text index is searched (at most 2 matching nodes)
    and the incoming and outgoing relationships of those nodes, other than
    MENTIONS, are rendered as ``source - TYPE -> target`` lines (at most 50
    per entity).

    Args:
        question: The user's natural-language question.

    Returns:
        All relationship lines joined with newlines; an empty string when no
        entity yields a result.
    """
    entities = entity_chain.invoke({"question": question})
    relationship_lines = []
    for entity in entities.names:
        fulltext_query = generate_full_text_query(entity)
        if not fulltext_query:
            # Nothing searchable is left in this entity name; skip the lookup.
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": fulltext_query},
        )
        relationship_lines.extend(el['output'] for el in response)
    # Join once across all entities. The original appended each entity's block
    # with no separator, gluing the last line of one entity to the first line
    # of the next.
    return "\n".join(relationship_lines)

In [24]:
print(structured_retriever("email of Marisa Obrien and 'Jessica Rios',??"))

  words = [el for el in remove_lucene_chars(input).split() if el]





In [25]:
def retriever(question: str):
    """Assemble the context for a question from the graph and the vector index.

    The search query is printed, then relationship lines are fetched with
    ``structured_retriever`` and passages with
    ``vector_index.similarity_search``. Both are returned in one text block
    with a "Structured data:" and an "Unstructured data:" section.
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question)
    hits = vector_index.similarity_search(question)
    # Passages are concatenated with the "#Document " marker between them.
    passages = "#Document ".join([hit.page_content for hit in hits])
    return f"""Structured data:
{graph_context}
Unstructured data:
{passages}
    """

In [26]:
# Prompt text for turning a follow-up question into a standalone question.
# Placeholders: {chat_history} (the earlier conversation) and {question}
# (the follow-up input).
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

In [27]:
# Prompt object built from _template; used by _search_query when chat history is present.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [28]:
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

In [29]:
# Produces the text used for retrieval.
# - With a non-empty "chat_history" in the input, the history is converted to
#   message objects and, together with the follow-up question, condensed by
#   the LLM into a standalone question.
# - Otherwise the "question" value is passed through unchanged.
_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        # Reuse the explicitly configured `llm` (temperature 0) rather than a
        # ChatGroq created without a model name, which depended on the
        # library's default model.
        | llm
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [30]:
# Answer-generation prompt text. {context} receives the retriever output and
# {question} the user's question.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""

In [31]:
# Rebinds `prompt`, which previously held the entity-extraction prompt, to the
# answer-generation prompt used by `chain`.
prompt = ChatPromptTemplate.from_template(template)

In [32]:
# End-to-end question-answering pipeline:
#   1. derive the search query and fetch context with `retriever`,
#   2. fill the answer prompt,
#   3. call the LLM and return its reply as a plain string.
# NOTE(review): "question" is fed by RunnablePassthrough(), i.e. the whole
# chain input. When the chain is invoked with a dict such as
# {"question": "..."}, the prompt's {question} slot receives that dict rather
# than the bare question string — confirm this is intended.
chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [34]:
chain.invoke({"question":  "What is your refund policy?"})

Search query: What is your refund policy?




"You can request a refund within 30 days of purchase if you're not satisfied with the product.  \n"