# SEC 10-Q Knowledge Graph Construction

This notebook demonstrates constructing a knowledge graph from SEC 10-Q filings using LangChain. The approach uses LLM-based extraction to identify entities and relationships without pre-defining a schema.

In [None]:
# Install required packages
%pip install --quiet langchain langchain-neo4j langchain-openai neo4j python-dotenv networkx matplotlib

In [4]:
import os
from pathlib import Path
from dotenv import load_dotenv
import neo4j
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
import networkx as nx
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Initialize OpenAI client.
# SECURITY: the API key must never be committed in the notebook. It is read from the
# OPENAI_API_KEY environment variable, which load_dotenv() can populate from a local,
# git-ignored .env file. Any key that was previously hard-coded here should be treated
# as compromised and revoked.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to a local .env file before running this cell."
    )

# temperature=0 keeps the extraction as repeatable as the model allows.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4o",
    api_key=openai_api_key,
)

# Transformer that uses the LLM to turn documents into graph documents (nodes + relationships).
llm_transformer = LLMGraphTransformer(llm=llm)

In [None]:
import os
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import TokenTextSplitter
from langchain.vectorstores.utils import filter_complex_metadata

# Collect token-sized chunks from every AAPL filing PDF into `documents`.
documents = []

# Directory containing the filing PDFs (relative to this notebook).
docs_path = "../../data/sec-10-q/docs"

# The splitter configuration does not depend on the file, so build it once
# instead of re-creating it on every loop iteration.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)

# os.listdir returns entries in arbitrary order; sorting makes the chunk order
# (and therefore everything derived from it) reproducible across runs.
for filename in sorted(os.listdir(docs_path)):
    # Only AAPL PDFs are processed.
    if not filename.endswith("AAPL.pdf"):
        continue

    file_path = os.path.join(docs_path, filename)

    try:
        # Load the PDF, split it into chunks, then filter the chunk metadata.
        raw_documents = PyPDFLoader(file_path=file_path).load()
        split_documents = text_splitter.split_documents(raw_documents)
        processed_documents = filter_complex_metadata(split_documents)

        documents.extend(processed_documents)

        print(f"Processed: {filename}")
    except Exception as e:
        # Best effort: report the failing file and continue with the remaining ones.
        print(f"Error processing {filename}: {str(e)}")

# `documents` now contains the processed chunks from all AAPL PDFs that loaded successfully.
print(f"Total documents processed: {len(documents)}")

In [None]:
# Run the LLM-based graph extraction over every chunk; wrapping the input in tqdm
# shows progress while the chunks are consumed.
docs_with_progress = tqdm(documents)
graph_documents = llm_transformer.convert_to_graph_documents(docs_with_progress)

In [5]:
# Persist the extracted graph documents to disk so the extraction cell does not
# have to be re-run in later sessions.
import pickle

with open("graph_documents_appl.pkl", mode="wb") as pickle_file:
    pickle_file.write(pickle.dumps(graph_documents))

In [6]:
# Restore the graph documents written by the save cell.
# NOTE: unpickling can execute arbitrary code; only load a pickle file you created yourself.
import pickle

with open("graph_documents_appl.pkl", mode="rb") as pickle_file:
    graph_documents = pickle.loads(pickle_file.read())

In [9]:
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph

# Start from an empty entity graph and populate it from the extracted graph documents.
graph = NetworkxEntityGraph()

# First pass: register every extracted node by its id.
extracted_nodes = (
    extracted_node
    for graph_doc in graph_documents
    for extracted_node in graph_doc.nodes
)
for extracted_node in extracted_nodes:
    graph.add_node(extracted_node.id)

# Second pass: add one edge per relationship (source id, target id), storing the
# relationship type in the edge's "relation" attribute. Edges go directly onto the
# wrapped graph object (`graph._graph`, a private attribute of the wrapper).
extracted_relationships = (
    relationship
    for graph_doc in graph_documents
    for relationship in graph_doc.relationships
)
for relationship in extracted_relationships:
    graph._graph.add_edge(
        relationship.source.id,
        relationship.target.id,
        relation=relationship.type,
    )

In [None]:
# Inspect the triples held by the graph. A graph built from whole filings can hold
# a very large number of triples, so report the total and display only a bounded
# preview instead of dumping the full list into the notebook output.
triples = graph.get_triples()
print(f"Total triples: {len(triples)}")
triples[:20]

In [None]:
# Draw the whole entity graph with node labels on a 10 x 10 figure.
# `graph._graph` is the graph object wrapped by `graph` (the same private attribute
# the edges were added to).
# NOTE(review): no `ax` is passed to nx.draw, so it presumably draws into the current
# pyplot figure created on the line above; keep these two statements together and in
# this order.
plt.figure(figsize=(10, 10))
nx.draw(graph._graph, with_labels=True)

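The knowledge graph has been constructed as an in-memory NetworkX graph (`NetworkxEntityGraph`); this notebook does not write it to Neo4j. You can now query it with `GraphQAChain`, as shown below, or use it for other downstream tasks like question answering.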

In [10]:
from langchain.chains import GraphQAChain

# Build the question-answering chain over the entity graph, reusing the same LLM,
# with verbose output enabled.
graph_chain = GraphQAChain.from_llm(llm, graph=graph, verbose=True)

In [None]:
# Sample question: place of incorporation.
graph_chain.invoke("Where was Apple Inc. Incorporated?")

In [None]:
# Sample question: beginning cash balance on a specific date.
graph_chain.invoke(" On April 1, 2023, what was the Amount of CASH_BEGINNING_BALANCE?")

In [None]:
# Sample question: assets linked to Apple Inc.
graph_chain.invoke("What assets does Apple Inc. have?")

In [None]:
# Sample question: cash used in investing activities for a given quarter.
graph_chain.invoke("Apple inc. What was the amount for Cash Used In Investing Activities in 2023 Q3?")

In [None]:
# Sample question: Products gross margin percentage for a given quarter.
graph_chain.invoke("What was Apple Inc's Products gross margin percentage for the third quarter of 2022? Provide the percentage rounded to one decimal place.")

In [None]:
# Evaluate the graph QA chain on a small sample of AAPL questions.
# pandas is not imported in the cells visible above, so import it here to keep this
# cell runnable on a fresh kernel.
import pandas as pd

# Load the CSV file
df = pd.read_csv("../../data/sec-10-q/synthetic_qna_data_7_gpt4o.csv")

# Keep rows whose 'Source Docs' value mentions AAPL (substring match; rows with a
# missing value are excluded via na=False), then take the first 10 as the sample.
apple_df = df[df['Source Docs'].str.contains('AAPL', na=False)].head(10)

# Evaluate the model
correct = 0
for _, row in apple_df.iterrows():
    question = row["New Question"]
    answer = row["New Answer"]
    print(f"\nQuestion: {question}")
    print(f"Expected Answer: {answer}")
    response = graph_chain.invoke(input=question)
    print(f"Model Response: {response}")

    # invoke() returns a dict rather than a bare string (the answer text is expected
    # under the "result" key), so comparing the whole response to the expected string
    # could never match. Extract the answer text and compare it after trimming
    # whitespace and ignoring case.
    predicted = response.get("result", "") if isinstance(response, dict) else response
    if str(predicted).strip().casefold() == str(answer).strip().casefold():
        correct += 1

# Divide by the number of questions actually evaluated; fewer than 10 rows may match.
total = len(apple_df)
if total:
    print(f"\nAccuracy: {correct / total}")
else:
    print("\nAccuracy: n/a (no AAPL rows found)")