In [7]:
pip install json-repair

Collecting json-repairNote: you may need to restart the kernel to use updated packages.

  Downloading json_repair-0.22.0-py3-none-any.whl.metadata (7.3 kB)
Downloading json_repair-0.22.0-py3-none-any.whl (12 kB)
Installing collected packages: json-repair
Successfully installed json-repair-0.22.0


In [11]:
pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install langchain-community

Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.schema import SystemMessage
from langchain.text_splitter import TokenTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_community.llms import Ollama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_experimental.graph_transformers.llm import UnstructuredRelation, examples

In [2]:
# Chat model used for extraction: the "llama3" model via ChatOllama.
# temperature=0 asks for the least random sampling the backend offers.
llm = ChatOllama(
    model="llama3",
    temperature=0,
)

In [3]:

# System instructions: the model must return a JSON list of relation objects
# with the keys head / head_type / relation / tail / tail_type.
system_prompt = """
You are a data scientist working for a company that is building a knowledge graph database. 
Your task is to extract information from data and convert it into a knowledge graph database.
Provide a set of Nodes in the form [head, head_type, relation, tail, tail_type].
It is important that the head and tail exists as nodes that are related by the relation. If you can't pair a relationship with a pair of nodes don't add it.
When you find a node or relationship you want to add try to create a generic TYPE for it that describes the entity you can also think of it as a label.
You must generate the output in a JSON format containing a list with JSON objects. Each object should have the keys: "head", "head_type", "relation", "tail", and "tail_type".
"""

system_message = SystemMessage(content=system_prompt)

# In this cell the parser is only used to produce the format instructions that
# are injected into the human prompt below.
parser = JsonOutputParser(pydantic_object=UnstructuredRelation)

# Human turn: few-shot examples, format instructions, then the text to process.
# `input` is the only variable supplied at call time; the other placeholders are
# pre-filled. The partial variables list exactly the placeholders that occur in
# the template (the former `node_labels` / `rel_types` entries had no matching
# placeholder and were always None, so they have been dropped).
human_prompt = PromptTemplate(
    template="""
Examples:
{examples}

For the following text, extract entities and relations as in the provided example.
{format_instructions}\nText: {input}""",
    input_variables=["input"],
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
        "examples": examples,
    },
)

human_message_prompt = HumanMessagePromptTemplate(prompt=human_prompt)

# Final chat prompt: fixed system message followed by the templated human turn.
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message, human_message_prompt]
)

In [4]:
# Graph extractor wired to the chat model and the custom extraction prompt.
llm_transformer = LLMGraphTransformer(
    llm=llm,
    prompt=chat_prompt,
)

In [5]:
# Sample free-text record used as the extraction input for the rest of the
# notebook. Kept as a single string; it is split into chunks in a later cell.
text = """[US DEPARTMENT OF COMMERCE] Bureau of Industry and Security - Appropriate Federal Register Citations: 73 F.R. 63678 10/27/08. Name - AHMED, YASMIN 612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired. [US STATE DEPARTMENT] Directorate of Defence Trade Controls list of Parties Debarred for Arms Export Control Act convictions. 69 FR 17468, 04/02/2004. PRIMARY NAME - Ahmed, Yasmin (a.k.a. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order expired. [BIOGRAPHY] To be determined. [IDENTIFICATION] Naturalized US citizen. Worked in United Arab Emirates (reported 2003). Tariq Ahmed (spouse)."""

# Echo the input so the reader can compare it with the extracted graph.
print(text)

[US DEPARTMENT OF COMMERCE] Bureau of Industry and Security - Appropriate Federal Register Citations: 73 F.R. 63678 10/27/08. Name - AHMED, YASMIN 612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired. [US STATE DEPARTMENT] Directorate of Defence Trade Controls list of Parties Debarred for Arms Export Control Act convictions. 69 FR 17468, 04/02/2004. PRIMARY NAME - Ahmed, Yasmin (a.k.a. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order expired. [BIOGRAPHY] To be determined. [IDENTIFICATION] Naturalized US citizen. Worked in United Arab Emirates (reported 2003). Tariq Ahmed (spouse).


In [6]:
# Sanity check: the splitter in the next cell is fed this value via split_text.
type(text)

str

In [7]:
# Split the record with chunk_size=100 and chunk_overlap=24 (token-based
# splitter), so neighbouring chunks share some text at their boundaries.
# TokenTextSplitter is imported in the notebook's import cell.
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=24)
splits = text_splitter.split_text(text)

In [8]:
# Sanity check on the container returned by split_text.
type(splits)

list

In [9]:
# Inspect the chunks; note that chunk boundaries can fall mid-sentence.
splits

['[US DEPARTMENT OF COMMERCE] Bureau of Industry and Security - Appropriate Federal Register Citations: 73 F.R. 63678 10/27/08. Name - AHMED, YASMIN 612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired',
 'I, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired. [US STATE DEPARTMENT] Directorate of Defence Trade Controls list of Parties Debarred for Arms Export Control Act convictions. 69 FR 17468, 04/02/2004. PRIMARY NAME - Ahmed, Yasmin (a.k.a. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order',
 '. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order expired. [BIOGRAPHY] To be determined. [IDENTIFICATION] Naturalized US citizen. Worked in United Arab Emirates (reported 2003). Tariq Ahmed (spouse).']

In [10]:
# Wrap each text chunk in a Document, the input type passed to the graph
# transformer below.
docs = [Document(page_content=chunk) for chunk in splits]

docs

[Document(page_content='[US DEPARTMENT OF COMMERCE] Bureau of Industry and Security - Appropriate Federal Register Citations: 73 F.R. 63678 10/27/08. Name - AHMED, YASMIN 612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired'),
 Document(page_content='I, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired. [US STATE DEPARTMENT] Directorate of Defence Trade Controls list of Parties Debarred for Arms Export Control Act convictions. 69 FR 17468, 04/02/2004. PRIMARY NAME - Ahmed, Yasmin (a.k.a. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order'),
 Document(page_content='. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order expired. [BIOGRAPHY] To be determined. [IDENTIFICATION] Naturalized US citizen. Worked in United Arab Emirates (reported 2003). Tariq Ahmed (spouse).')]

In [11]:
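# Alternative kept for reference (not executed): wrap the whole text in a
# single Document instead of chunking it.
# documents = Document(page_content=text, metadata={
#                 "source": "userinput"
#             })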

In [12]:
# Run the LLM extraction over the 100-token chunks. The result is indexed by
# chunk position in the cells that follow.
graph_doc = llm_transformer.convert_to_graph_documents(docs)

In [16]:
# List the entities extracted from the chunk at index 1.
chunk_nodes = graph_doc[1].nodes
for entity in chunk_nodes:
    print(entity)

id='Ahmed, Yasmin' type='Person'
id='Yasmin Tariq; Fatimah Mohammad' type='Alternate Name'
id='Parties Debarred for Arms Export Control Act convictions' type='List'


In [17]:
# List the relations extracted from the chunk at index 1.
chunk_relationships = graph_doc[1].relationships
for relationship in chunk_relationships:
    print(relationship)

source=Node(id='Ahmed, Yasmin', type='Person') target=Node(id='Parties Debarred for Arms Export Control Act convictions', type='List') type='IS_DEBARRED'
source=Node(id='Ahmed, Yasmin', type='Person') target=Node(id='Yasmin Tariq; Fatimah Mohammad', type='Alternate Name') type='HAS_PRIMARY_NAME'


### Extraction with a 512-token chunk size (the whole text fits in one chunk)

In [18]:
# Re-split the same text with chunk_size=512. TokenTextSplitter is already
# imported, so it is not imported again here.
# NOTE: this rebinds `text_splitter` and `splits`, replacing the 100-token
# versions created earlier in the notebook.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
splits = text_splitter.split_text(text)

In [19]:
# Inspect the result of the 512-token split.
splits

['[US DEPARTMENT OF COMMERCE] Bureau of Industry and Security - Appropriate Federal Register Citations: 73 F.R. 63678 10/27/08. Name - AHMED, YASMIN 612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired. [US STATE DEPARTMENT] Directorate of Defence Trade Controls list of Parties Debarred for Arms Export Control Act convictions. 69 FR 17468, 04/02/2004. PRIMARY NAME - Ahmed, Yasmin (a.k.a. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order expired. [BIOGRAPHY] To be determined. [IDENTIFICATION] Naturalized US citizen. Worked in United Arab Emirates (reported 2003). Tariq Ahmed (spouse).']

In [20]:
# Wrap each chunk of the 512-token split in a Document for the transformer.
docs1 = [Document(page_content=chunk) for chunk in splits]

docs1

[Document(page_content='[US DEPARTMENT OF COMMERCE] Bureau of Industry and Security - Appropriate Federal Register Citations: 73 F.R. 63678 10/27/08. Name - AHMED, YASMIN 612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK. Denied Persons List effective term 16 Oct 2008 - 16 Oct 2015. Oct 2015 - denial order expired. [US STATE DEPARTMENT] Directorate of Defence Trade Controls list of Parties Debarred for Arms Export Control Act convictions. 69 FR 17468, 04/02/2004. PRIMARY NAME - Ahmed, Yasmin (a.k.a. Yasmin Tariq; Fatimah Mohammad). [REGULATIONS HISTORY] Oct 2015 - denial order expired. [BIOGRAPHY] To be determined. [IDENTIFICATION] Naturalized US citizen. Worked in United Arab Emirates (reported 2003). Tariq Ahmed (spouse).')]

In [21]:
# Run the same extraction on the 512-token documents.
# NOTE: this rebinds `graph_doc`, so the result of the earlier 100-token run is
# no longer reachable under that name after this cell executes.
graph_doc = llm_transformer.convert_to_graph_documents(docs1)

In [22]:
# List the entities extracted from the chunk at index 0.
chunk_nodes = graph_doc[0].nodes
for entity in chunk_nodes:
    print(entity)

id='Naturalized US citizen' type='Identity'
id='Denied Persons List' type='List'
id='YASMIN AHMED' type='Person'
id='Directorate of Defence Trade Controls' type='Department'
id='612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK.' type='Location'
id='United Arab Emirates' type='Country'
id='73 F.R. 63678' type='Publication'
id='US STATE DEPARTMENT' type='Organization'
id='Parties Debarred for Arms Export Control Act convictions' type='List'
id='US DEPARTMENT OF COMMERCE' type='Organization'


In [24]:
# List the relations extracted from the chunk at index 0.
chunk_relationships = graph_doc[0].relationships
for relationship in chunk_relationships:
    print(relationship)

source=Node(id='US DEPARTMENT OF COMMERCE', type='Organization') target=Node(id='73 F.R. 63678', type='Publication') type='PUBLISHED_IN'
source=Node(id='YASMIN AHMED', type='Person') target=Node(id='612 BUSINESS CENTRE, MUMTAZ HASAN ROAD OFF I.I. CHUNDRIGAR ROAD, KARACHI, PK.', type='Location') type='LIVES_AT'
source=Node(id='YASMIN AHMED', type='Person') target=Node(id='Denied Persons List', type='List') type='LISTED_AS'
source=Node(id='US STATE DEPARTMENT', type='Organization') target=Node(id='Directorate of Defence Trade Controls', type='Department') type='MAINTAINS'
source=Node(id='YASMIN AHMED', type='Person') target=Node(id='Parties Debarred for Arms Export Control Act convictions', type='List') type='LISTED_AS'
source=Node(id='YASMIN AHMED', type='Person') target=Node(id='Naturalized US citizen', type='Identity') type='IDENTIFIED_AS'
source=Node(id='YASMIN AHMED', type='Person') target=Node(id='United Arab Emirates', type='Country') type='WORKED_AT'
