### Hybrid Search + Knowledge Graph
Combining hybrid (vector + keyword) and graph retrieval methods for Advanced RAG

In [9]:
import langchain, langchain_core, langchain_community
from langchain_groq import ChatGroq
import os

In [36]:
# Chat model shared by the graph-extraction step and the final answer chain.
# The API key is read from the GROQ_KEY environment variable.
llm = ChatGroq(
    temperature=0, model_name="gemma2-9b-it", api_key=os.environ.get("GROQ_KEY")
)

# Neo4j connection settings. They are read from the environment, like the Groq
# key above, so real credentials never have to be typed into (or committed
# with) the notebook. The "xxx" fallbacks are the original placeholders and
# are not working credentials.
NEO4J_URI = os.environ.get("NEO4J_URI", "xxx")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "xxx")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "xxx")

In [24]:
from langchain_community.graphs import Neo4jGraph

# Switch on LangChain's module-level verbose flag.
# NOTE(review): newer LangChain releases provide langchain.globals.set_verbose()
# for this — confirm which form the installed version expects.
langchain.verbose = True
# Handle to the Neo4j database, built from the NEO4J_* settings defined earlier.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

In [4]:
!pip --quiet install wikipedia


[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from langchain.document_loaders import WikipediaLoader
# Fetch the Wikipedia documents returned for the query "Sri Lanka".
# Only the first three of them are used further down.
raw_doc = WikipediaLoader(query="Sri Lanka").load()

In [5]:
!pip --quiet install tiktoken


[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


##### Split Documents

In [6]:
import tiktoken
from langchain.text_splitter import TokenTextSplitter
# Token-based splitter configured with chunk_size=512 and chunk_overlap=24.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
# Only the first three loaded documents are split.
# NOTE(review): presumably limited to keep the later LLM extraction small — confirm.
docs = text_splitter.split_documents(raw_doc[:3])

In [7]:
# Inspect the first chunk: a Document carrying the Wikipedia metadata (title, summary, ...).
docs[0]

Document(metadata={'title': 'Sri Lanka', 'summary': "Sri Lanka, historically known as Ceylon, and officially the Democratic Socialist Republic of Sri Lanka, is an island country in South Asia. It lies in the Indian Ocean, southwest of the Bay of Bengal, separated from the Indian peninsula by the Gulf of Mannar and the Palk Strait. It shares a maritime border with the Maldives in the southwest and India in the northwest.\nSri Lanka has a population of approximately 22 million and is home to many cultures, languages and ethnicities. The Sinhalese people form the majority of the population, followed by the Sri Lankan Tamils, who are the largest minority group and are concentrated in northern Sri Lanka; both groups have played an influential role in the island's history. Other long-established groups include the Moors, Indian Tamils, Burghers, Malays, Chinese, and Vedda.\nSri Lanka's documented history goes back 3,000 years, with evidence of prehistoric human settlements dating back 125,00

##### Documents to graph transformer

In [13]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
# Transformer that uses the chat model to convert documents into graph
# documents (see convert_to_graph_documents in the next cell).
llm_graph_transformer = LLMGraphTransformer(llm=llm)


In [14]:
# Convert the text chunks into GraphDocument objects (typed nodes plus relationships).
# NOTE(review): presumably issues LLM calls for every chunk, so this cell can be slow — confirm.
graph_docs = llm_graph_transformer.convert_to_graph_documents(docs)

In [15]:
# Display the extracted graph documents. The full list is shown, so the output is long.
graph_docs

[GraphDocument(nodes=[Node(id='Sri Lanka', type='Country'), Node(id='Ceylon', type='Historical_name'), Node(id='Democratic Socialist Republic Of Sri Lanka', type='Official_name'), Node(id='South Asia', type='Continent'), Node(id='Indian Ocean', type='Ocean'), Node(id='Bay Of Bengal', type='Bay'), Node(id='Gulf Of Mannar', type='Gulf'), Node(id='Palk Strait', type='Strait'), Node(id='Maldives', type='Country'), Node(id='India', type='Country'), Node(id='Sinhalese', type='Ethnicity'), Node(id='Sri Lankan Tamils', type='Ethnicity'), Node(id='Moors', type='Ethnicity'), Node(id='Indian Tamils', type='Ethnicity'), Node(id='Burghers', type='Ethnicity'), Node(id='Malays', type='Ethnicity'), Node(id='Chinese', type='Ethnicity'), Node(id='Vedda', type='Ethnicity'), Node(id='Pāli Canon', type='Text'), Node(id='Silk Road', type='Trade_route'), Node(id='Anuradhapura Period', type='Historical_period'), Node(id='Kingdom Of Kotte', type='Kingdom'), Node(id='Portuguese', type='Empire'), Node(id='Dutch 

In [20]:
# Re-read the schema from the database and print it. The saved output is
# empty here because the graph documents have not been written yet.
graph.refresh_schema()
print(graph.schema)

Node properties:

Relationship properties:

The relationships:



In [28]:
# Write the extracted nodes and relationships to Neo4j.
# NOTE(review): per the LangChain Neo4jGraph docs, baseEntityLabel=True adds a
# shared `__Entity__` label to every node (the full-text index created later
# targets that label), and include_source=True also stores the source chunks as
# Document nodes linked to the entities they mention — confirm against the
# installed version.
graph.add_graph_documents(
    graph_documents=graph_docs, baseEntityLabel=True, include_source=True
)

In [29]:
# Re-read and print the schema again; after the import it lists the node labels
# (Person, Document, Country, ...) and the relationship patterns.
graph.refresh_schema()
print(graph.schema)

Node properties:
Person {id: STRING}
Document {title: STRING, id: STRING, text: STRING, summary: STRING, source: STRING}
Country {id: STRING}
Historical_name {id: STRING}
Official_name {id: STRING}
Continent {id: STRING}
Ocean {id: STRING}
Bay {id: STRING}
Gulf {id: STRING}
Strait {id: STRING}
Ethnicity {id: STRING}
Text {id: STRING}
Trade_route {id: STRING}
Historical_period {id: STRING}
Kingdom {id: STRING}
Empire {id: STRING}
Group {id: STRING}
Military {id: STRING}
Location {id: STRING}
Conflict {id: STRING}
Territory {id: STRING}
Ethnic group {id: STRING}
Building {id: STRING}
Organization {id: STRING}
Government {id: STRING}
Nationality {id: STRING}
Place {id: STRING}
Language {id: STRING}
Team {id: STRING}
Sport_status {id: STRING}
Concept {id: STRING}
Venue {id: STRING}
Date {id: STRING}
Relationship properties:

The relationships:
(:Person)-[:DEMANDED]->(:Person)
(:Person)-[:REPLACEMENT]->(:Language)
(:Person)-[:MEMBER]->(:Organization)
(:Person)-[:NICKNAME]->(:Team)
(:Person)

In [30]:
from typing import Tuple, List, Optional

In [32]:
!pip --quiet install langchain-voyageai


[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
from langchain_community.vectorstores import Neo4jVector
from langchain_voyageai import VoyageAIEmbeddings
# Embedding model ("voyage-2") passed to the Neo4j vector store below.
# The API key is read from the VOYAGE_KEY environment variable.
embedd_model = VoyageAIEmbeddings(
    voyage_api_key=os.environ.get("VOYAGE_KEY"), model="voyage-2"
)

batch size None


In [71]:
# Build the retrieval index over the Document nodes already stored in Neo4j.
# The `text` property of each Document node is embedded and the vector is
# written back to the `embedding` property.
vector_store = Neo4jVector.from_existing_graph(
    embedding=embedd_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name="Country_index",
    # The notebook's stated goal is hybrid (vector + keyword) retrieval; without
    # an explicit search_type the store performs vector-only search.
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)



In [73]:
# Create a full-text index named `entity` over the `id` property of every
# `__Entity__` node. IF NOT EXISTS keeps the cell safe to re-run. The index is
# queried by name in structured_retriever.
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]"
)

[]

In [74]:
from langchain_core.pydantic_v1 import BaseModel, Field

# Structured-output schema used to extract entity names from a question.
# NOTE(review): the class docstring and the field description are presumably
# forwarded to the model as part of the schema, so treat them as prompt text
# rather than plain documentation — confirm before rewording either.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Entity names found in the input text; required field (no default).
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

In [75]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate


# Two-message chat prompt for entity extraction: a system instruction plus a
# human message that carries the user text in the `{question}` placeholder.
# Note: the name `prompt` is re-bound later to the answer prompt, so build
# entity_chain before that cell runs.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

In [76]:
# Entity-extraction chain: the prompt above piped into the chat model, with the
# model's output constrained to the Entities schema.
entity_chain = prompt | llm.with_structured_output(Entities)

In [77]:
# Smoke test: the chain should return the one name contained in the question.
entity_chain.invoke({"question": "Where was Amelia Earhart born?"}).names

['Amelia Earhart']

In [79]:
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars


def generate_full_text_query(input: str) -> str:
    """Build a fuzzy full-text query string from free text.

    Lucene special characters are removed with ``remove_lucene_chars``, the
    remaining text is split on whitespace, and every word gets a ``~2``
    suffix. The words are combined with ``AND``.

    Args:
        input: Raw text to search for, e.g. an entity name.

    Returns:
        The query string, e.g. ``"Amelia~2 AND Earhart~2"``, or an empty
        string when no searchable word is left after cleaning.
    """
    # str.split() without arguments never yields empty strings, so no extra
    # filtering of the words is needed.
    words = remove_lucene_chars(input).split()
    if not words:
        # Previously this case raised IndexError on words[-1].
        return ""
    return " AND ".join(f"{word}~2" for word in words)

In [80]:
# Fulltext index query
def structured_retriever(question: str) -> str:
    """Collect graph relationships around the entities mentioned in a question.

    The entity chain extracts names from ``question``. For each name, the
    ``entity`` full-text index is queried (top 2 matching nodes), and the
    incoming and outgoing relationships of those nodes are returned, excluding
    MENTIONS relationships and capped at 50 rows per entity.

    Args:
        question: The user question.

    Returns:
        One ``source - TYPE -> target`` line per relationship, joined with
        newlines; an empty string when nothing is found.
    """
    entities = entity_chain.invoke({"question": question})
    # Be tolerant of a missing parse result instead of failing on `.names`.
    names = getattr(entities, "names", None) or []
    lines = []
    for entity in names:
        # A blank name cannot be turned into a full-text query.
        if not entity or not entity.strip():
            continue
        full_text_query = generate_full_text_query(entity)
        if not full_text_query:
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": full_text_query},
        )
        lines.extend(el["output"] for el in response)
    # Join once at the end so rows belonging to different entities are also
    # separated by a newline (they used to be glued together).
    return "\n".join(lines)

In [84]:
# Quick check of the graph-only retriever (the saved output for this query is blank).
print(structured_retriever("Vadda People"))




In [86]:
def retriever(question: str):
    """Assemble the context string handed to the answer prompt.

    Logs the search query, gathers relationship lines from the graph via
    ``structured_retriever``, and gathers passage texts from the vector store's
    similarity search. Both are returned in one labelled text block.
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question)
    passages = []
    for hit in vector_store.similarity_search(question):
        passages.append(hit.page_content)
    joined_passages = "#Document ".join(passages)
    return f"""Structured data:
{graph_context}
Unstructured data:
{joined_passages}
    """

In [87]:
# Prompt text used to rewrite a follow-up question as a standalone question.
# It expects two variables: `chat_history` and `question`.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

In [88]:
# Prompt object built from the condense-question text above; used by _search_query.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [94]:
from langchain_core.messages import HumanMessage, AIMessage


def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

In [96]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.output_parsers import StrOutputParser

# Turns the chain input (a dict with "question" and, optionally,
# "chat_history") into a single search string for the retriever.
_search_query = RunnableBranch(
    # Branch 1: the input carries a non-empty chat_history, so the follow-up
    # question is condensed together with that history.
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condition above; the pipeline below produces the standalone question
        # The (human, ai) pairs are converted to message objects before being
        # inserted into the condense prompt.
        # NOTE(review): CONDENSE_QUESTION_PROMPT is a plain string template, so
        # the message list is presumably rendered via its string form — confirm
        # the resulting prompt text is what the model should see.
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | llm
        | StrOutputParser(),
    ),
    # Default branch: no chat history, so the question text is used as-is.
    RunnableLambda(lambda x: x["question"]),
)

In [99]:
# Answer prompt text: the model must answer from the retrieved context only.
# It expects two variables: `context` and `question`.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""

In [100]:
# Answer prompt used by the final chain.
# Caution: this re-binds the name `prompt`, which earlier referred to the
# entity-extraction prompt. entity_chain was built from the earlier object, but
# re-running the entity_chain cell after this one would pick up this prompt.
prompt = ChatPromptTemplate.from_template(template)

In [101]:
# End-to-end RAG chain. Input is a dict with "question" and, optionally,
# "chat_history"; output is the model's answer as a plain string.
chain = (
    RunnableParallel(
        {
            # Resolve the search string, then fetch graph + vector context.
            "context": _search_query | retriever,
            # Forward only the question text. Passing the whole input through
            # would render the dict itself (and any chat history) into the
            # "Question:" slot of the answer prompt.
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [104]:
# Run the full chain on a single question without chat history.
chain.invoke({"question": "Vadda People"})

Search query: Vadda People


'The Vedda are one of the long-established ethnic groups in Sri Lanka. \n'