In [129]:
## 1. Setup & Imports
import os
import asyncio
from rdflib import Graph as RDFGraph
from neo4j import GraphDatabase

from neo4j_graphrag.indexes import create_vector_index
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.llm.openai_llm import OpenAILLM
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.experimental.components.resolver import SinglePropertyExactMatchResolver
from neo4j_graphrag.retrievers import VectorRetriever, Text2CypherRetriever
from neo4j_graphrag.generation import GraphRAG

from src.utils import getSchemaFromOnto, getNLOntology, getPKs

from dotenv import load_dotenv

In [130]:
## 2. Load Environment Variables
# load_dotenv() (python-dotenv) populates os.environ from a local .env file, so
# credentials stay out of the notebook source.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
uri = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

# Fail fast when a setting is missing: os.getenv() returns None for unset
# variables, which would otherwise only surface later as an obscure driver or
# API error. Only variable NAMES are reported, never their values.
_required_settings = {
    "OPENAI_API_KEY": api_key,
    "NEO4J_URI": uri,
    "NEO4J_USERNAME": user,
    "NEO4J_PASSWORD": password,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# (username, password) tuple passed as `auth` when creating the Neo4j driver.
AUTH = (user, password)

In [131]:
# Sanity-check the loaded configuration WITHOUT echoing secrets: cell outputs are
# saved inside the notebook file, so printing the API key or password leaks them
# to anyone who can read or clone the notebook.
print("OPENAI_API_KEY set:", bool(api_key))
print(uri)
print(user)
print("NEO4J_PASSWORD set:", bool(password))

[REDACTED: an OpenAI API key was printed here; the key must be rotated]
bolt://localhost:7687
neo4j
[REDACTED]


In [132]:
# Connect to Neo4j
driver = GraphDatabase.driver(uri, auth=AUTH)

# Creating the driver does not by itself prove the server is reachable or that
# the credentials are valid; verify now so a bad URI/password fails in this cell
# instead of inside the first query issued by a later cell.
driver.verify_connectivity()

In [133]:
## 2. Create Vector Index
INDEX_NAME = "chunk-index"
# Vector size stored in the index; it has to equal the output size of the
# embedding model used later (presumably text-embedding-3-large — TODO confirm).
DIMENSION = 3072

# Index settings collected in one place: embeddings are read from the
# `embedding` property of `Chunk` nodes and compared with cosine similarity.
vector_index_settings = dict(
    label="Chunk",
    embedding_property="embedding",
    dimensions=DIMENSION,
    similarity_fn="cosine",
)
create_vector_index(driver, INDEX_NAME, **vector_index_settings)

In [134]:
from langchain.document_loaders import PyMuPDFLoader  

# Creating a function to read Multiple PDF files
def process_pdfs_in_directory(directory_path, loader_cls=None):
    """Load every PDF file found directly inside ``directory_path``.

    Args:
        directory_path: Directory to scan (not recursive).
        loader_cls: Optional loader class. It is called as
            ``loader_cls(file_path=...)`` and must expose ``load()``.
            Defaults to ``PyMuPDFLoader``.

    Returns:
        A list with one entry per PDF file, in sorted filename order. Each
        entry is whatever ``load()`` returned for that file, so the result is
        a list of per-file results that the caller flattens afterwards.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist (from
            ``os.listdir``).
    """
    if loader_cls is None:
        loader_cls = PyMuPDFLoader

    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and everything derived from it) reproducible across runs.
    for filename in sorted(os.listdir(directory_path)):
        # Case-insensitive match so files named "*.PDF" are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(directory_path, filename)
        # Ignore directories that merely happen to be named "*.pdf".
        if not os.path.isfile(file_path):
            continue

        document = loader_cls(file_path=file_path).load()
        print(f"File loading done for: {filename}")
        documents.append(document)

    return documents

In [135]:
# Load all PDFs from the local "data/" directory (path is relative to the
# notebook's working directory). The result holds one entry per PDF file.
all_docs = process_pdfs_in_directory("data/")  # Specify the directory containing your PDF files

File loading done for: 2022 Q3 AAPL.pdf


In [136]:
# Flatten the per-file lists in all_docs into one flat list of documents,
# keeping the original order (files first, then documents within each file).
base_docs = [doc for doc_list in all_docs for doc in doc_list]

print(f"Length of basedocs is now {len(base_docs)}")

Length of basedocs is now 28


In [137]:
import pickle
file_path = 'data/Pickle_File/base_docs.pkl'

# open(..., 'wb') does not create missing parent directories, so on a fresh
# checkout this cell would fail with FileNotFoundError. Create the cache
# directory first; exist_ok=True makes the cell safe to re-run.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Serialize the list of Document objects and save it to a file
with open(file_path, 'wb') as file:
    pickle.dump(base_docs, file)

print(f"List of documents saved to {file_path}")

List of documents saved to data/Pickle_File/base_docs.pkl


In [138]:
import pickle

# Location of the cached document list written by the pickle.dump cell
file_path = 'data/Pickle_File/base_docs.pkl'

# Restore the list of Document objects from the cache file.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load caches that this notebook produced itself, never an untrusted file.
with open(file_path, 'rb') as pickle_in:
    base_docs = pickle.load(pickle_in)

print(f"List of documents loaded successfully: {len(base_docs)}")

List of documents loaded successfully: 28


In [139]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: chunks of at most 1500 characters, with 300 characters
# shared between neighbouring chunks so context is not cut at chunk borders.
splitter_options = {"chunk_size": 1500, "chunk_overlap": 300}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every loaded document into chunks; the last expression displays how
# many chunks were produced.
chunks = text_splitter.split_documents(base_docs)
len(chunks)

67

In [140]:
# Inspect the first chunk (its metadata and page_content) as produced by the splitter.
chunks[0]

Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2022-07-29T06:03:21-04:00', 'source': 'data/2022 Q3 AAPL.pdf', 'file_path': 'data/2022 Q3 AAPL.pdf', 'total_pages': 28, 'format': 'PDF 1.4', 'title': '0000320193-22-000070', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-Q filed on 2022-07-29 for the period ending 2022-06-25', 'keywords': '0000320193-22-000070; ; 10-Q', 'moddate': '2022-07-29T06:03:28-04:00', 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', 'modDate': "D:20220729060328-04'00'", 'creationDate': "D:20220729060321-04'00'", 'page': 0}, page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-Q\n(Mark One)\n☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the quarterly period ended June\xa025, 2022\nor\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF TH

In [141]:
# Replace the path-bearing metadata keys ('source', 'file_path') with a single
# 'file_name' key holding just the base name of the file.
#
# pop(..., None) is used instead of `del` so the cell is idempotent: re-running
# it on already-cleaned chunks is a no-op rather than a KeyError. The loop also
# uses its own variable name so it no longer overwrites the module-level
# `file_path` that the pickle cells use for the cache location.
for doc in chunks:
    metadata = doc.metadata
    metadata.pop('source', None)
    source_path = metadata.pop('file_path', None)
    if source_path is not None:
        metadata['file_name'] = os.path.basename(source_path)

In [142]:
from pprint import pprint
# Print the first chunk again to check the metadata after the cleanup loop:
# 'source' and 'file_path' should be gone and 'file_name' should be present.
pprint(chunks[0])

Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2022-07-29T06:03:21-04:00', 'total_pages': 28, 'format': 'PDF 1.4', 'title': '0000320193-22-000070', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-Q filed on 2022-07-29 for the period ending 2022-06-25', 'keywords': '0000320193-22-000070; ; 10-Q', 'moddate': '2022-07-29T06:03:28-04:00', 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', 'modDate': "D:20220729060328-04'00'", 'creationDate': "D:20220729060321-04'00'", 'page': 0, 'file_name': '2022 Q3 AAPL.pdf'}, page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-Q\n(Mark One)\n☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the quarterly period ended June\xa025, 2022\nor\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor t

In [None]:
# Preview the raw Turtle ontology text.
# NOTE(review): this cell reads "onto/improved_financial_ontology.ttl", while the
# schema-loading cell parses "onto/financial_report_ontology.ttl", so the text
# shown here is not the ontology the schema is built from. Confirm which file
# is intended.
#
# Turtle documents are UTF-8; pass the encoding explicitly so the read does not
# depend on the platform's default locale encoding.
with open("onto/improved_financial_ontology.ttl", "r", encoding="utf-8") as f:
    onto = f.read()

print(onto)

@prefix ex: <http://example.org/financial/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ex:disclosesRisk a <http://www.w3.org/2002/07/owl#ObjectProperty> ;
    rdfs:label "Discloses Risk" ;
    rdfs:comment "A report that describes risk factors." ;
    rdfs:domain ex:FinancialReport ;
    rdfs:range ex:RiskFactor .

ex:dividendDeclared a <http://www.w3.org/2002/07/owl#DatatypeProperty> ;
    rdfs:label "Dividend Declared" ;
    rdfs:comment "Reported dividend for the time period." ;
    rdfs:domain ex:FinancialReport .

ex:hasStockInfo a <http://www.w3.org/2002/07/owl#ObjectProperty> ;
    rdfs:label "Has Stock Info" ;
    rdfs:comment "The company has associated stock information." ;
    rdfs:domain ex:Company ;
    rdfs:range ex:StockInformation .

ex:includes a <http://www.w3.org/2002/07/owl#ObjectProperty> ;
    rdfs:label "Includes" ;
    rdfs:comment "A report includes an income statement." ;
    rdfs:domain ex:FinancialReport ;
    rdfs:range ex:IncomeStatement .


In [None]:
## 3. Load Ontology and Schema
from IPython.display import display

# Parse the ontology file into an rdflib graph.
ontology_path = "onto/financial_report_ontology.ttl"
g = RDFGraph()
g.parse(ontology_path)

# Build the schema object from the ontology graph using the project helper
# from src.utils, then render it.
neo4j_schema = getSchemaFromOnto(g)
display(neo4j_schema)

SchemaConfig(entities={'FinancialReport': {'label': 'FinancialReport', 'description': 'A 10-Q or 10-K financial disclosure submitted by a public company.', 'properties': [{'name': 'reportNumber', 'type': 'STRING', 'description': 'The SEC form number such as 10-Q or 10-K.'}]}, 'Company': {'label': 'Company', 'description': 'A publicly traded corporation that submits financial reports.', 'properties': []}, 'FinancialStatement': {'label': 'FinancialStatement', 'description': 'A general class for financial statements.', 'properties': []}, 'IncomeStatement': {'label': 'IncomeStatement', 'description': 'An income statement detailing revenue, expenses, and net income.', 'properties': []}, 'BalanceSheet': {'label': 'BalanceSheet', 'description': 'A balance sheet showing assets, liabilities, and equity.', 'properties': []}, 'CashFlowStatement': {'label': 'CashFlowStatement', 'description': 'A statement of cash inflows and outflows.', 'properties': []}, 'ShareholdersEquityStatement': {'label': '

In [None]:
# Entity definitions from the schema (one per node label), shown for inspection.
entities = neo4j_schema.entities.values()
entities

dict_values([{'label': 'FinancialReport', 'description': 'A 10-Q or 10-K financial disclosure submitted by a public company.', 'properties': [{'name': 'reportNumber', 'type': 'STRING', 'description': 'The SEC form number such as 10-Q or 10-K.'}]}, {'label': 'Company', 'description': 'A publicly traded corporation that submits financial reports.', 'properties': []}, {'label': 'FinancialStatement', 'description': 'A general class for financial statements.', 'properties': []}, {'label': 'IncomeStatement', 'description': 'An income statement detailing revenue, expenses, and net income.', 'properties': []}, {'label': 'BalanceSheet', 'description': 'A balance sheet showing assets, liabilities, and equity.', 'properties': []}, {'label': 'CashFlowStatement', 'description': 'A statement of cash inflows and outflows.', 'properties': []}, {'label': 'ShareholdersEquityStatement', 'description': 'Tracks changes in equity of shareholders over a period.', 'properties': []}, {'label': 'FinancialMetric

In [None]:
# Relationship definitions from the schema, shown for inspection.
relations = neo4j_schema.relations.values()
relations

dict_values([{'label': 'hasFinancialStatement', 'description': 'Links a report to the financial statements it contains.', 'properties': []}, {'label': 'hasMetric', 'description': 'Connects a statement to its metrics like revenue or net income.', 'properties': []}, {'label': 'hasRiskFactor', 'description': 'Connects a report to its described risk factors.', 'properties': []}, {'label': 'hasLegalProceeding', 'description': 'Legal proceedings discussed in the report.', 'properties': []}, {'label': 'hasStockInfo', 'description': 'Connects the report to stock/share-related data.', 'properties': []}, {'label': 'relatedTo', 'description': 'General relationship to a company or segment.', 'properties': []}])

In [None]:
# Build the natural-language description of the ontology from the graph
# (project helper from src.utils), then display it.
nl_ontology = getNLOntology(g)
display(nl_ontology)

'\nNode Labels:\nFinancialReport: A 10-Q or 10-K financial disclosure submitted by a public company.\nCompany: A publicly traded corporation that submits financial reports.\nFinancialStatement: A general class for financial statements.\nIncomeStatement: An income statement detailing revenue, expenses, and net income.\nBalanceSheet: A balance sheet showing assets, liabilities, and equity.\nCashFlowStatement: A statement of cash inflows and outflows.\nShareholdersEquityStatement: Tracks changes in equity of shareholders over a period.\nFinancialMetric: A quantitative value in a financial report, like Net Income or Revenue.\nStockInformation: Details about company shares, dividends, etc.\nRiskFactor: Qualitative risks disclosed by a company.\nLegalProceeding: A legal action or case referenced in the financial report.\nMarketDisclosure: Narrative discussion on business, market conditions, and trends.\n\nNode Properties:\nreportNumber: Attribute that applies to entities of type FinancialRep

In [None]:
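# NOTE: disabled alternative model setup via LangChain, kept for reference only.
# If re-enabled, be aware that langchain_openai.OpenAIEmbeddings would shadow the
# neo4j_graphrag OpenAIEmbeddings imported at the top of the notebook.
# from langchain_openai import ChatOpenAI
# from langchain_ollama import ChatOllama  # required by llm_llama31_8b below
# from langchain_openai import OpenAIEmbeddings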

# llm_gpt_4o_mini = ChatOpenAI(api_key=api_key, temperature=0, model="gpt-4o-mini")

# llm_embedding_large_3 = OpenAIEmbeddings(api_key=api_key, model="text-embedding-3-large")

# llm_llama31_8b = ChatOllama(
#     model="llama3.1:8b",
#     temperature=0,
#     base_url = "http://10.5.61.140:5000/"
# )