In [13]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)

In [14]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate

In [15]:
from typing import Tuple, List, Optional

In [16]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser

In [17]:
from langchain_core.runnables import ConfigurableField

In [18]:
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase


In [19]:
import os
import json

from dotenv import load_dotenv
load_dotenv()

True

In [20]:
print(os.environ['NEO4J_URI'])
print(os.environ['NEO4J_USERNAME'])
print(os.environ['NEO4J_PASSWORD'])

neo4j://localhost:7999
neo4j
password


In [21]:
from langchain_community.vectorstores import Neo4jVector

In [22]:
with open("./config.json", "r") as f:
    config = json.load(f)

In [23]:
from langchain_community.graphs import Neo4jGraph

In [24]:
graph = Neo4jGraph()

In [25]:
# from langchain.document_loaders import WikipediaLoader
# raw_documents = WikipediaLoader(query="Elizabeth I").load()

In [26]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

DATA_SOURCE = config['DATA']

raw_documents = []

for i in os.listdir(DATA_SOURCE):
    if i.endswith(".md"):
        loader = UnstructuredMarkdownLoader(f"{DATA_SOURCE}/{i}")
        data = loader.load()
        raw_documents.append(data[0])

In [27]:
len(raw_documents)

45

In [28]:
raw_documents

[Document(metadata={'source': './data/0.md'}, page_content='LOLC Technologies Knowledge Base\n\nContact: +94 011 500 1121 (076 884 0426)\n\nAbout LOLC Technologies\n\nLOLC Technologies Limited, a fully owned subsidiary of LOLC Group, incorporated in 2004, is an expert in IT Systems integrations. The company holds ISO 9000, ISO 20000, and ISO 27001 certifications.\n\nMission: Facilitating businesses to reach greater heights through innovative and cutting-edge technology solutions. Powering impossibilities by providing tailor-made solutions to clients across the globe.\n\nWhat We Are\n\nBringing you tomorrows technology today.\n\nWhat We Do\n\nOur IT Solutions:\n\nERP and EPM: Solutions to ease your day-to-day financial and operational activities and increase your business efficiency. [Learn More]\n\nMulti-Cloud Technology: Gives organizations more flexibility to optimize performance, control costs, and leverage the best cloud technologies available. [Learn More]\n\nService Transformatio

In [29]:
from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_documents)

In [30]:
from langchain_core.rate_limiters import InMemoryRateLimiter

rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.2, 
    check_every_n_seconds=1,
)

In [31]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", rate_limiter=rate_limiter, verbose=True)

In [32]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
llm_transformer = LLMGraphTransformer(llm=llm)

In [33]:
import time
from tqdm import tqdm

graph_documents = []
for i, doc in enumerate(tqdm(documents)):
	graph_documents.extend(llm_transformer.convert_to_graph_documents([doc]))
	# time.sleep(1)

100%|██████████| 53/53 [05:07<00:00,  5.80s/it]


In [34]:
graph_documents

[GraphDocument(nodes=[], relationships=[], source=Document(metadata={'source': './data/0.md'}, page_content='LOLC Technologies Knowledge Base\n\nContact: +94 011 500 1121 (076 884 0426)\n\nAbout LOLC Technologies\n\nLOLC Technologies Limited, a fully owned subsidiary of LOLC Group, incorporated in 2004, is an expert in IT Systems integrations. The company holds ISO 9000, ISO 20000, and ISO 27001 certifications.\n\nMission: Facilitating businesses to reach greater heights through innovative and cutting-edge technology solutions. Powering impossibilities by providing tailor-made solutions to clients across the globe.\n\nWhat We Are\n\nBringing you tomorrows technology today.\n\nWhat We Do\n\nOur IT Solutions:\n\nERP and EPM: Solutions to ease your day-to-day financial and operational activities and increase your business efficiency. [Learn More]\n\nMulti-Cloud Technology: Gives organizations more flexibility to optimize performance, control costs, and leverage the best cloud technologies

In [35]:
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [36]:
# directly show the graph resulting from the given Cypher query
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

In [37]:
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase

In [38]:
from typing import Tuple, List, Optional

In [39]:
from langchain_community.vectorstores import Neo4jVector

In [40]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

vector_index = Neo4jVector.from_existing_graph(
    GoogleGenerativeAIEmbeddings(model="models/text-embedding-004"),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

In [41]:
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

[]

In [42]:
from langchain_core.pydantic_v1 import BaseModel, Field


class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [43]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate

In [44]:
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

In [45]:
entity_chain = prompt | llm.with_structured_output(Entities)

In [46]:
entity_chain.invoke({"question": "Who is Prasanna Siriwardane"}).names

['Prasanna Siriwardane']

In [47]:
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

In [48]:
def generate_full_text_query(input: str) -> str:
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()


In [49]:
import time

In [50]:
# Fulltext index query
def structured_retriever(question: str) -> str:
    start_time = time.time()
    result = ""
    entities = entity_chain.invoke({"question": question})
    entity_queries = [generate_full_text_query(entity) for entity in entities.names]

    if entity_queries:
        query = """
        UNWIND $queries AS query
        CALL db.index.fulltext.queryNodes('entity', query, {limit:2})
        YIELD node, score
        CALL (node, score) {
            MATCH (node)-[r:!MENTIONS]->(neighbor)
            RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
            UNION ALL
            MATCH (node)<-[r:!MENTIONS]-(neighbor)
            RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
        }
        RETURN output LIMIT 25
        """
        response = graph.query(query, {"queries": entity_queries})
        result = "\n".join([el['output'] for el in response])

    print("OPTIMIZED_STRUCTURED_RETRIEVER: --- %s seconds ---" % (time.time() - start_time))
    return result

In [51]:
print(structured_retriever("Who is Prasanna Siriwardane?"))

OPTIMIZED_STRUCTURED_RETRIEVER: --- 5.812939405441284 seconds ---
Prasanna Siriwardena - DEPUTY_CEO -> Lolc Technology Services Ltd
Prasanna Siriwardena - CHIEF_INFORMATION_OFFICER -> Lolc Holdings
Prasanna Siriwardena - DEPUTY_CEO_OF -> Lolc Technologies
Prasanna Siriwardena - HOLDS_POSITION -> Chief Information Officer
Prasanna Siriwardena - WORKS_AT -> Lolc Holdings
Prasanna Siriwardena - HOLDS_POSITION -> Deputy Ceo
Prasanna Siriwardena - WORKS_AT -> Lolc Technology Services Ltd


  words = [el for el in remove_lucene_chars(input).split() if el]


In [None]:
prompt = PromptTemplate(template=prompt_template, input_variables=["company_name"])

# Create an OpenAI LLM
llm = OpenAI(temperature=0)

# Create a LangChain LLMChain with the prompt template and LLM
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Query the chain
response = llm_chain.run(company_name="lolc tech")

In [116]:
def retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    print("STRUCTURED_DATA: ", structured_data)
    # structured_data = ""
    start_time = time.time()
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    print("OPTIMIZED_NORMAL_RETRIEVER: --- %s seconds ---" % (time.time() - start_time))
    final_data = f"""Structured data:
{structured_data}
Unstructured data:
{"#Document ". join(unstructured_data)}
    """
    return final_data

In [117]:
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

In [118]:
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [119]:
CONDENSE_QUESTION_PROMPT

PromptTemplate(input_variables=['chat_history', 'question'], input_types={}, partial_variables={}, template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,\nin its original language.\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:')

In [120]:
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

In [121]:
_search_query = RunnableBranch(
    # If input includes chat_history, include it in the follow up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatGoogleGenerativeAI(temperature=0, model="gemini-2.0-flash", rate_limiter=rate_limiter)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [122]:
template = """You are a helpful front desk assistant for LOLC Technologies. Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""

In [123]:
prompt = ChatPromptTemplate.from_template(template)

In [124]:
chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [127]:
question = "Who are the leadership team in your company? what are their details and backgrounds"

In [128]:
chain.invoke({"question": question})

Search query: Who are the leadership team in your company? what are their details and backgrounds
OPTIMIZED_STRUCTURED_RETRIEVER: --- 1.7180919647216797 seconds ---
STRUCTURED_DATA:  




OPTIMIZED_NORMAL_RETRIEVER: --- 0.3914673328399658 seconds ---


'The leadership team includes:\n\n*   **Conrad Dias:** Director of LOLC Holdings PLC, Director/CEO of LOLC Finance PLC, and Chairman of LOLC Technology Limited. He has over 25 years of experience in Trading, Banking, Finance, Assets Management, and Manufacturing. He holds an MBA from the University of Leicester.\n*   **Prasanna Siriwardena:** Deputy CEO of LOLC Technology Services Ltd, Chief Information Officer of LOLC Holdings, General – IT. He has over 18 years of experience in Information Security and Information Technology Management. He holds an MSc in Information Technology from Keele University – UK.\n*   **Chamini Attanayake:** Chief Operating Officer.'

In [107]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='You are a helpful front desk assistant for LOLC Technologies. Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\nUse natural language and be concise.\nAnswer:'), additional_kwargs={})])

In [131]:
chain.invoke(
    {
        "question": "Who are the leadership team in your company? what are their details and backgrounds",
        "chat_history": [("Who are you?", "Im an assistant for LOLC Technologies")],
    }
)

Search query: Who are the leadership team in LOLC Technologies? What are their details and backgrounds?
OPTIMIZED_STRUCTURED_RETRIEVER: --- 5.736302852630615 seconds ---
STRUCTURED_DATA:  Lolc Technologies - EXPERIENCE -> Services
Lolc Technologies - EXPERIENCE -> Telecommunication
Lolc Technologies - EXPERIENCE -> Manufacturing
Lolc Technologies - PROVIDE -> Oracle Supply Chain Management
Lolc Technologies - EXPERIENCE -> Bfsi
Lolc Technologies - PROVIDE -> Erp Implementation
Lolc Technologies - PROVIDE -> Oracle Ebs Financials
Lolc Technologies - PARTNER -> Oracle
Lolc Technologies - PROVIDE -> Oracle E-Business Suite
Lolc Technologies - PROVIDE -> Epm Implementation
Lolc Technologies - PROVIDE -> Cloud Services
Lolc Technologies - PROVIDE -> On-Premise Services
Lolc Technologies - PARTNERS_WITH -> Huawei
Lolc Technologies - DEVELOPS -> Chatbot
Lolc Technologies - PARTNER_WITH -> Microsoft
Lolc Technologies - PARTNER_WITH -> Yenasys
Lolc Technologies - ORGANIZED -> Sports Day 2024
Lo



OPTIMIZED_NORMAL_RETRIEVER: --- 0.38660550117492676 seconds ---


'The leadership team includes:\n\n*   **Conrad Dias:** Chairman of LOLC Technology Limited. He has over 25 years of experience in sectors such as Trading and Banking.\n*   **Prasanna Siriwardena:** Deputy CEO of LOLC Technologies.\n*   **Chamini Attanayake:** Chief Operating Officer.'

In [None]:
"The leadership team includes:\n\n*   **Mr. Conrad Dias:** Director – LOLC Holdings PLC, Director / CEO – LOLC Finance PLC, Chairman of LOLC Technology Limited. He has over 25 years of experience in Trading and Banking.\n*   **Prasanna Siriwardena:** Deputy CEO - LOLC Technology Services Ltd cum Chief Information Officer - LOLC Holdings, General – IT.\n*   **Chamini Attanayake:** Chief Operating Officer"

In [204]:
entities = [x['n']['id'] for x in graph.query("""MATCH (n)
WHERE n:Organization OR n:Person
RETURN n
""")]

In [205]:
entities

['Lolc Technologies Limited',
 'Conrad Dias',
 'Lolc Technology Limited',
 'Prasanna Siriwardena',
 'Lolc Technology Services Ltd',
 'Lolc Holdings',
 'Chamini Attanayake',
 'Leco',
 'Windforce Plc',
 'Rusiri Cooray',
 'Ntb',
 'Lolc Cambodia Plc',
 'Hutch',
 'Lalith Fernando',
 'Lolc Technologies',
 'Michael Jordan',
 'Thisan Samarasinghe',
 'Nileka Madurapperuma',
 'Bhanuka Gunathilake',
 'Abhishek Jayakody',
 'Imasha Udayangi',
 'Nalan Perera',
 'Jayani Botheju',
 'Sujeewa Premathilake',
 'Nimesh Lakshitha',
 'Dilantha Matharaarachchi',
 'Ashirwada Methsarani',
 'Customer',
 'Huawei',
 'Oracle',
 'Lolc Holdings Plc',
 'Lolc Finance Plc',
 'Hirdaramani Group Of Companies',
 'Vanik Incorporations Limited',
 'Eagle Ndb Fund Management Company Limited',
 'National Bank Of Kuwait',
 'Kmpg',
 'East West Information Systems Ltd',
 'Jayantha Ke',
 'Jayantha Kelegama']

In [None]:

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI



# Define the prompt template
prompt_template = """
Identify only the interchangeable entities or persons in the following context:
{companies}
____________________

For example: J.Smith interchangeable with John Smith
If one doesnt relate to any other dont include them. Only include entities youre highly confident. Return the interchangeable entities in the following format:
{{
    1 : ["entity1", "entity2"],
    2 : ["entity3", "entity4"]
}}
"""

# Create the PromptTemplate and LLMChain
prompt = PromptTemplate(template=prompt_template, input_variables=["companies"])

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", rate_limiter=rate_limiter, verbose=True, temperature=0)

# Create the LLMChain with the prompt and LLM
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Run the chain with the input entities
response = llm_chain.run(companies=entities)

# Output the response
print(response)


Here's the analysis of interchangeable entities:

{
    1 : ["Lolc Technologies Limited", "Lolc Technology Limited", "Lolc Technologies"],
    2 : ["Lolc Holdings", "Lolc Holdings Plc"],
    3 : ["Jayantha Ke", "Jayantha Kelegama"]
}


: 