In [44]:
from dotenv import load_dotenv
import os
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_ollama.llms import OllamaLLM
import asyncio
from langchain_openai import ChatOpenAI

In [45]:
# Load environment variables
load_dotenv()

NEO4J_URI = "neo4j://127.0.0.1:7687"
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

In [None]:
# Define the LLM
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0, # 
)

In [47]:
graph_transformer = LLMGraphTransformer(llm=llm)

In [51]:
# Read the document
with open("/Users/siddharthdileep/extracter/01471587/shortened-filings-text/AA_MzE1ODIxMjk1NmFkaXF6a2N4.txt") as f:
    text = f.read()
text

'VODAFONE LIMITED\nCompany No: 01471587\nANNUAL REPORT AND FINANCIAL STATEMENTS\nFOR THE YEAR ENDED 31 MARCH 2016\nA5FWQ1PA*\nTUESDAY\nA25\n20/09/2016\n#92\nCOMPANIES HOUSE\n\n--- End of Page 1 ---\n\nVODAFONE LIMITED\nCompany No: 01471587\nSTRATEGIC REPORT (continued)\nStrategic report for the year ended 31 March 2016\nThe directors present their strategic report for the year ended 31 March 2016.\nReview of the business and future developments\nThe overall strategy of Vodafone Limited ("the Company") is aligned to that of Vodafone Group Plc ("the\nGroup"), which is outlined in the Group Strategy Review in the Group\'s Annual Report and Financial Statements\nfor the year ended 31 March 2016, which does not form part of this report.\nThe Company has transitioned to FRS 101 - Reduced Disclosure Framework for the preparation of these\nfinancial statements. Comparative information has been restated as appropriate; see note 1 to the financial\nstatements for the impact of the adoption and t

In [52]:
pages = text.split("--- End of Page")
documents = [Document(page_content=page.strip()) for page in pages if page.strip()]
documents = documents[:2]  # Limit to first 2 pages for testing

In [53]:
graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)

In [54]:
print("NODES", graph_documents[0].nodes)
print("RELATIONSHIPS", graph_documents[0].relationships)

NODES [Node(id='Vodafone Limited', type='Company', properties={}), Node(id='01471587', type='Company number', properties={}), Node(id='31 March 2016', type='Date', properties={}), Node(id='20/09/2016', type='Date', properties={}), Node(id='Annual Report And Financial Statements', type='Document', properties={}), Node(id='Companies House', type='Organization', properties={})]
RELATIONSHIPS [Relationship(source=Node(id='Vodafone Limited', type='Company', properties={}), target=Node(id='01471587', type='Company number', properties={}), type='HAS_COMPANY_NUMBER', properties={}), Relationship(source=Node(id='Vodafone Limited', type='Company', properties={}), target=Node(id='Annual Report And Financial Statements', type='Document', properties={}), type='HAS_DOCUMENT', properties={}), Relationship(source=Node(id='Annual Report And Financial Statements', type='Document', properties={}), target=Node(id='31 March 2016', type='Date', properties={}), type='COVERS_YEAR_ENDED', properties={}), Relat

In [55]:
for i, gd in enumerate(graph_documents):
    print(f"--- Document {i+1} ---")
    print("NODES:")
    for node in gd.nodes:
        print(node)
    print("RELATIONSHIPS:")
    for rel in gd.relationships:
        print(rel)
    print()

--- Document 1 ---
NODES:
id='Vodafone Limited' type='Company' properties={}
id='01471587' type='Company number' properties={}
id='31 March 2016' type='Date' properties={}
id='20/09/2016' type='Date' properties={}
id='Annual Report And Financial Statements' type='Document' properties={}
id='Companies House' type='Organization' properties={}
RELATIONSHIPS:
source=Node(id='Vodafone Limited', type='Company', properties={}) target=Node(id='01471587', type='Company number', properties={}) type='HAS_COMPANY_NUMBER' properties={}
source=Node(id='Vodafone Limited', type='Company', properties={}) target=Node(id='Annual Report And Financial Statements', type='Document', properties={}) type='HAS_DOCUMENT' properties={}
source=Node(id='Annual Report And Financial Statements', type='Document', properties={}) target=Node(id='31 March 2016', type='Date', properties={}) type='COVERS_YEAR_ENDED' properties={}
source=Node(id='Annual Report And Financial Statements', type='Document', properties={}) targe

In [56]:
from langchain.callbacks import get_openai_callback

# Prepare the document
documents = [Document(page_content=text.strip())]

with get_openai_callback() as cb:
    graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)

    # After this point, you can inspect the callback
    print(f"Prompt tokens used: {cb.prompt_tokens}")
    print(f"Completion tokens used: {cb.completion_tokens}")
    print(f"Total tokens used: {cb.total_tokens}")
    print(f"Total cost (USD): ${cb.total_cost:.6f}")


Prompt tokens used: 23019
Completion tokens used: 509
Total tokens used: 23528
Total cost (USD): $0.003758


In [57]:
print("Graph Documents:")
for i, gd in enumerate(graph_documents):
    print(f"--- Document {i+1} ---")
    print("NODES:")
    for node in gd.nodes:
        print(node)
    print("RELATIONSHIPS:")
    for rel in gd.relationships:
        print(rel)
    print()

Graph Documents:
--- Document 1 ---
NODES:
id='Vodafone Limited' type='Company' properties={}
id='Vodafone Group Plc' type='Company' properties={}
id='H Lamprell' type='Person' properties={}
id='P Riviere' type='Person' properties={}
id='N Jeffery' type='Person' properties={}
id='D Galli' type='Person' properties={}
id='Hj Hoencamp' type='Person' properties={}
id='Pricewaterhousecoopers Llp' type='Company' properties={}
id='Vodafone House' type='Location' properties={}
id='Newbury' type='Location' properties={}
id='Berkshire' type='Location' properties={}
id='England And Wales' type='Location' properties={}
RELATIONSHIPS:
source=Node(id='Vodafone Limited', type='Company', properties={}) target=Node(id='Vodafone Group Plc', type='Company', properties={}) type='SUBSIDIARY_OF' properties={}
source=Node(id='Vodafone Limited', type='Company', properties={}) target=Node(id='H Lamprell', type='Person', properties={}) type='DIRECTOR' properties={}
source=Node(id='Vodafone Limited', type='Compa

In [58]:
# store it in Neo4j
# https://python.langchain.com/docs/how_to/graph_constructing/
from langchain_neo4j import Neo4jGraph
graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username=os.getenv("NEO4J_USER"),
    password=os.getenv("NEO4J_PASSWORD"),
    refresh_schema=False
)
graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

In [59]:
graph2_transformer = LLMGraphTransformer(
    llm=llm,
    node_properties=True,
)
documents = [Document(page_content=text.strip())]
graph_documents_props = await graph2_transformer.aconvert_to_graph_documents(documents)


In [60]:
# print node, properties
print("Graph Documents with Properties:")
for i, gd in enumerate(graph_documents_props):
    print(f"--- Document {i+1} ---")
    print("NODES:")
    for node in gd.nodes:
        print(node)
    print("RELATIONSHIPS:")
    for rel in gd.relationships:
        print(rel)
    print()

Graph Documents with Properties:
--- Document 1 ---
NODES:
id='Vodafone Limited' type='Company' properties={'companyNo': '01471587', 'yearEnded': '2016-03-31'}
id='Vodafone Group Plc' type='Company' properties={}
id='H Lamprell' type='Person' properties={}
id='P Riviere' type='Person' properties={'appointmentDate': '2016-03-11'}
id='N Jeffery' type='Person' properties={'appointmentDate': '2016-09-01'}
id='D Galli' type='Person' properties={'resignationDate': '2016-05-01'}
id='Hj Hoencamp' type='Person' properties={'resignationDate': '2016-09-01'}
id='Vodafone House' type='Location' properties={'address': 'The Connection, Newbury, Berkshire, RG14 2FN, England'}
RELATIONSHIPS:
source=Node(id='Vodafone Limited', type='Company', properties={}) target=Node(id='Vodafone Group Plc', type='Company', properties={}) type='SUBSIDIARY_OF' properties={}
source=Node(id='Vodafone Limited', type='Company', properties={}) target=Node(id='H Lamprell', type='Person', properties={}) type='DIRECTOR' proper

In [61]:
graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

In [64]:
company_number = "01471587"

files = os.listdir(f"{company_number}/shortened-filings-text")
files = [f for f in files if f.endswith(".txt")]
print(f"Files in {company_number}-filings-txt: {files}")
print(len(files))

Files in 01471587-filings-txt: ['SH20_MzQ2NDAzMzQ1OWFkaXF6a2N4.txt', 'MISC_MzExNTgxNzg2NmFkaXF6a2N4.txt', 'AP01_MzE0Njk3MTQ2NWFkaXF6a2N4.txt', 'PSC02_MzQzNzU2Nzc0NGFkaXF6a2N4.txt', '288a_MTYzNzQ2MzA0YWRpcXprY3g.txt', 'NEWINC_MzM2NjY0MjlhZGlxemtjeA.txt', '363x_NzU3MDgxNDZhZGlxemtjeA.txt', '288_MzM1NTg5MjRhZGlxemtjeA.txt', 'PSC07_MzQzNzU2Nzc0M2FkaXF6a2N4.txt', 'CS01_MzI1NzQwODQxOWFkaXF6a2N4.txt', 'CAP-SS_MzA2NDIxNDMwMmFkaXF6a2N4.txt', '287_MTQwMTYwODM4YWRpcXprY3g.txt', 'AA_MzE1ODIxMjk1NmFkaXF6a2N4.txt', 'TM01_MzA0NDgyMjA1M2FkaXF6a2N4.txt', '288b_MTg4NDQ4ODNhZGlxemtjeA.txt', 'CERTNM_MzA5MDcyNTQ4NmFkaXF6a2N4.txt', 'CH01_MzAwMTI4NDUxNGFkaXF6a2N4.txt', 'PSC05_MzQ3MDA2NzAwMWFkaXF6a2N4.txt', '123_MTM4MzYzMjUxYWRpcXprY3g.txt', 'AR01_MzA5NjI4OTA1MWFkaXF6a2N4.txt', 'CC04_MzAwNDQ4ODU0NmFkaXF6a2N4.txt', 'MR04_MzIxOTAwNTU1MmFkaXF6a2N4.txt', 'SH01_MzA3OTc4ODU5MmFkaXF6a2N4.txt', 'AP03_MzMxNDg4MTk1NGFkaXF6a2N4.txt', 'RP04TM01_MzQ0NjUwOTQxOGFkaXF6a2N4.txt', '363a_ODcyNjYyNTRhZGlxemtjeA.txt', 'SH19_MzA2N

In [66]:
text = ""
for file in files:
    with open(f"{company_number}/shortened-filings-text/{file}", "r") as f:
        data = f.read()
        text += data.strip() + "\n"
print("Total length of text:", len(text))        

Total length of text: 238516


In [67]:
# looping over each file and runnig the graph transformer
for file in files:
    with open(f"{company_number}/shortened-filings-text/{file}", "r") as f:
        data = f.read()
        documents = [Document(page_content=data.strip())]

        with get_openai_callback() as cb:
            graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)

            # After this point, you can inspect the callback
            print(f"Prompt tokens used: {cb.prompt_tokens}")
            print(f"Completion tokens used: {cb.completion_tokens}")
            print(f"Total tokens used: {cb.total_tokens}")
            print(f"Total cost (USD): ${cb.total_cost:.6f}")

        graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)


Prompt tokens used: 1309
Completion tokens used: 378
Total tokens used: 1687
Total cost (USD): $0.000423
Prompt tokens used: 5217
Completion tokens used: 354
Total tokens used: 5571
Total cost (USD): $0.000995
Prompt tokens used: 1183
Completion tokens used: 121
Total tokens used: 1304
Total cost (USD): $0.000250
Prompt tokens used: 1303
Completion tokens used: 174
Total tokens used: 1477
Total cost (USD): $0.000300
Prompt tokens used: 1171
Completion tokens used: 154
Total tokens used: 1325
Total cost (USD): $0.000268
Prompt tokens used: 6987
Completion tokens used: 713
Total tokens used: 7700
Total cost (USD): $0.001476
Prompt tokens used: 3027
Completion tokens used: 671
Total tokens used: 3698
Total cost (USD): $0.000857
Prompt tokens used: 1755
Completion tokens used: 490
Total tokens used: 2245
Total cost (USD): $0.000557
Prompt tokens used: 1123
Completion tokens used: 229
Total tokens used: 1352
Total cost (USD): $0.000306
Prompt tokens used: 1165
Completion tokens used: 247
To