# SEC 10-Q Knowledge Graph Construction

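This notebook demonstrates how to construct a knowledge graph from SEC 10-Q filings using LangChain. An LLM extracts entities and relationships directly from the filings, without a pre-defined schema. The resulting graph is then queried with a beam-search-based KG-RAG pipeline and evaluated on a question-answering dataset.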

In [1]:
import os
from pathlib import Path
from typing import List, Dict, Any, Set, Tuple
import networkx as nx
from dataclasses import dataclass, field
import numpy as np
from Levenshtein import distance
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import json
from tqdm.auto import tqdm
import heapq

@dataclass(order=True)
class EntityMatch:
    """A candidate pairing of a query entity with a graph entity.

    Instances compare and order by ``similarity`` alone: both name fields are
    declared with ``compare=False`` and so are excluded from the comparison
    methods generated by ``order=True``.
    """
    # Similarity score of the pairing; the only field used for comparisons.
    similarity: float
    # Graph-side entity of the pairing.
    graph_entity: str = field(compare=False)
    # Query-side entity of the pairing.
    query_entity: str = field(compare=False)

@dataclass
class KnowledgeChain:
    """A chain of entities and relationships in the graph, with its score."""
    path: List[str]  # Alternating entities and relationships
    # Score attached to ``path`` by the code that built the chain.
    score: float
    # Node at which the chain ends (BeamSearchExplorer.explore sets this to
    # the last element of ``path``).
    terminal_node: str

class ChainExtractor:
    """Extracts candidate entity-relationship chains from a question using an LLM.

    The LLM is asked to reply with a JSON object of the form
    ``{"chains": [[entity, relation, entity, ...], ...]}``.
    """

    def __init__(self, num_chains: int = 1, verbose: bool = False):
        """Create the extractor.

        Args:
            num_chains: Number of candidate chains requested from the LLM.
            verbose: When true, print diagnostics about parsing problems.
        """
        self.llm = ChatOpenAI(
            model="gpt-4o",
            temperature=0,
        )
        # Ask the API for a JSON object so the reply can be parsed directly.
        self.llm = self.llm.bind(response_format={"type": "json_object"})
        self.num_chains = num_chains
        self.verbose = verbose

    def extract_chains(self, query: str) -> List[List[str]]:
        """Extract candidate entity-relationship chains from ``query``.

        Args:
            query: Natural-language question to analyse.

        Returns:
            The chains found in the LLM reply, each a non-empty list of
            strings. Malformed entries (non-list, empty, or containing
            non-string items) are dropped. An empty list is returned when the
            reply cannot be parsed or has no usable ``"chains"`` array.
        """
        prompt = f"""Given this question, identify {self.num_chains} possible chain(s) of entities and relationships that would answer it.
        Express each chain as: entity1 -> relation1 -> entity2 -> relation2 -> ...
        Return JSON object with key "chains" containing array of chains.
        
        Question: {query}
        
        Example output:
        {{"chains": [
            ["Apple Inc", "REPORTED", "Revenue", "VALUE_ON", "Q3 2022"],
            ["Apple Inc", "HAS", "Products", "MARGIN_PERCENTAGE", "Q3 2022"]
        ]}}
        """

        response = self.llm.invoke(prompt)
        try:
            chains = json.loads(response.content)["chains"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # Only parsing/shape problems are treated as "no chains"; anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            if self.verbose:
                print("Failed to parse LLM output as JSON")
            return []

        if not isinstance(chains, list):
            if self.verbose:
                print("Failed to parse LLM output as JSON")
            return []

        # Callers index chain[0] and join/lower-case the items, so keep only
        # non-empty lists made of strings.
        valid_chains = [
            chain for chain in chains
            if isinstance(chain, list)
            and chain
            and all(isinstance(item, str) for item in chain)
        ]
        if self.verbose and len(valid_chains) != len(chains):
            print(f"Dropped {len(chains) - len(valid_chains)} malformed chain(s) from LLM output")
        return valid_chains

class EntityMatcher:
    """Matches query entities to graph entities using Levenshtein distance."""

    def __init__(self, verbose: bool = False):
        """Create the matcher.

        Args:
            verbose: When true, print each query entity and its top matches.
        """
        self.verbose = verbose

    def get_matches(self, query_entities: List[str], graph_entities: List[str],
                   beam_width: int) -> Dict[str, List[EntityMatch]]:
        """Find the best graph matches for each query entity.

        Similarity is ``1 - distance / max_len`` computed on the lower-cased
        names, where ``max_len`` is the longer of the two lower-cased lengths.
        Two empty names are treated as identical (similarity 1.0).

        Args:
            query_entities: Entity names taken from the query.
            graph_entities: Candidate graph nodes; non-string nodes are
                compared through their ``str()`` form.
            beam_width: Maximum number of matches kept per query entity.

        Returns:
            A dict mapping each query entity to at most ``beam_width``
            matches, ordered from most to least similar.
        """
        matches = {}

        for query_entity in query_entities:
            if self.verbose:
                print(f"\nMatching entity: {query_entity}")

            query_text = str(query_entity).lower()
            candidates = []
            for graph_entity in graph_entities:
                graph_text = str(graph_entity).lower()
                max_len = max(len(query_text), len(graph_text))
                if max_len:
                    similarity = 1 - (distance(query_text, graph_text) / max_len)
                else:
                    # Both names are empty: avoid dividing by zero.
                    similarity = 1.0
                candidates.append(EntityMatch(
                    graph_entity=graph_entity,
                    query_entity=query_entity,
                    similarity=similarity
                ))

            # nlargest returns the matches already sorted best-first.
            best = heapq.nlargest(beam_width, candidates, key=lambda m: m.similarity)

            if self.verbose:
                print("Top matches:")
                for match in best:
                    print(f"  {match.graph_entity} (similarity: {match.similarity:.3f})")
            matches[query_entity] = best

        return matches

class BeamSearchExplorer:
    """Explores graph using beam search to find relevant knowledge chains."""

    def __init__(self, graph: nx.DiGraph, beam_width: int = 3, max_depth: int = 5, verbose: bool = False):
        """Create the explorer.

        Args:
            graph: Graph to explore; edges are expected to carry a
                ``relation`` attribute (a missing one is read as ``''``).
            beam_width: Number of candidate paths kept after each depth.
            max_depth: Maximum number of expansion rounds.
            verbose: When true, print a trace of the search.
        """
        self.graph = graph
        self.beam_width = beam_width
        self.max_depth = max_depth
        self.verbose = verbose

    def explore(self, start_entities: List[str], target_chain: List[str]) -> List[KnowledgeChain]:
        """Perform beam search from start entities following target chain pattern.

        At every depth each path in the beam is extended along every edge
        returned by ``graph.edges(current, data=True)``. Every extended path
        is recorded as a result, and the ``beam_width`` best-scoring ones are
        expanded at the next depth. No cycle detection is performed.

        Args:
            start_entities: Graph nodes to start from.
            target_chain: Alternating entity/relation pattern to score against.

        Returns:
            Every chain generated during the search (not only those in the
            final beam), in generation order.
        """
        if not isinstance(self.beam_width, int):
            if self.verbose:
                print("Warning: Invalid beam width, using default of 3")
            self.beam_width = 3

        # Beam entries are (score, path, current_node).
        beam = [(0, [entity], entity) for entity in start_entities]
        chains = []

        if self.verbose:
            print(f"\nStarting beam search (width={self.beam_width}, max_depth={self.max_depth})")
            print(f"Target pattern: {' -> '.join(map(str, target_chain))}")

        for depth in range(self.max_depth):
            if not beam:
                if self.verbose:
                    print(f"  Depth {depth}: No more candidates to explore")
                break

            if self.verbose:
                print(f"\n  Depth {depth}:")

            candidates = []
            for _, path, current in beam:
                if self.verbose:
                    print(f"    Exploring from: {current}")
                edges = list(self.graph.edges(current, data=True))
                if self.verbose:
                    print(f"    Found {len(edges)} outgoing edges")

                for _, neighbor, rel_data in edges:
                    relation = rel_data.get('relation', '')
                    new_path = path + [relation, neighbor]
                    chain_score = self._score_chain_match(new_path, target_chain)

                    if self.verbose and chain_score > 0.5:
                        print(f"      {' -> '.join(map(str, new_path))} (score: {chain_score:.3f})")

                    candidates.append((chain_score, new_path, neighbor))
                    chains.append(KnowledgeChain(path=new_path, score=chain_score, terminal_node=neighbor))

            beam = heapq.nlargest(self.beam_width, candidates, key=lambda x: x[0])
            if self.verbose:
                print(f"\n    Selected top {len(beam)} candidates for next iteration")

        if self.verbose:
            print(f"\nExploration complete. Found {len(chains)} total chains")
        return chains

    def _score_chain_match(self, path: List[str], target: List[str]) -> float:
        """Score how well ``path`` matches ``target`` position by position.

        Each aligned pair contributes ``1 - distance / max_len`` on the
        lower-cased items (1.0 when both items are empty). The sum is divided
        by the length of the longer sequence, so unmatched trailing items
        lower the score.

        Returns:
            The normalised score, or 0.0 when both sequences are empty.
        """
        longest = max(len(path), len(target))
        if longest == 0:
            return 0.0

        score = 0.0
        for path_raw, target_raw in zip(path, target):
            # str() guards against non-string node ids or relation labels.
            path_item = str(path_raw).lower()
            target_item = str(target_raw).lower()
            max_len = max(len(path_item), len(target_item))
            if max_len:
                score += 1 - (distance(path_item, target_item) / max_len)
            else:
                # Two empty items (e.g. an edge without a relation label
                # aligned with an empty target item) are an exact match.
                score += 1.0
        return score / longest

class EnhancedKGRAG:
    """Main class implementing the enhanced KG-RAG system.

    A query goes through four stages: an LLM proposes entity/relation chain
    patterns, the first entity of each pattern is matched to graph nodes,
    beam search collects matching paths, and a second LLM call answers the
    question from the best paths.
    """

    def __init__(self,
                 graph: nx.DiGraph,
                 llm: ChatOpenAI,
                 beam_width: int = 3,
                 max_depth: int = 5,
                 top_k: int = 3,
                 num_chains: int = 1,
                 verbose: bool = False
                 ):
        """Wire up the pipeline components.

        Args:
            graph: Knowledge graph to search.
            llm: Chat model used to generate the final answer; its reply must
                contain a JSON object.
            beam_width: Beam width, also used as the number of start-entity
                matches kept per pattern.
            max_depth: Maximum beam-search depth.
            top_k: Number of best paths kept per chain pattern.
            num_chains: Number of chain patterns requested from the extractor.
            verbose: When true, print a trace of every stage.
        """
        self.graph = graph
        self.chain_extractor = ChainExtractor(num_chains, verbose)
        self.entity_matcher = EntityMatcher(verbose)
        self.explorer = BeamSearchExplorer(graph, beam_width, max_depth, verbose)
        self.llm = llm
        self.top_k = top_k
        self.beam_width = beam_width
        self.verbose = verbose

    def query(self, question: str) -> dict:
        """Process query and return structured response with reasoning.

        Args:
            question: Natural-language question to answer.

        Returns:
            The JSON object parsed from the answering LLM's reply.
        """
        if self.verbose:
            print("\n" + "="*80)
            print(f"Processing query: {question}")
            print("="*80)
            print("\n📋 Extracting candidate chains...")

        candidate_chains = self.chain_extractor.extract_chains(question)
        if self.verbose:
            print("Candidate chains:")
            for chain in candidate_chains:
                print(f"  {' -> '.join(map(str, chain))}")

        # The node list is the same for every pattern, so build it once.
        graph_entities = list(self.graph.nodes())

        all_knowledge_chains = []
        for chain_pattern in candidate_chains:
            if not chain_pattern:
                # An empty pattern has no start entity to match.
                if self.verbose:
                    print("\nSkipping empty chain pattern")
                continue

            start_name = chain_pattern[0]
            if self.verbose:
                print(f"\nProcessing chain pattern: {' -> '.join(map(str, chain_pattern))}")
                print(f"\n🔍 Matching initial entity: {start_name}")

            matches = self.entity_matcher.get_matches([start_name], graph_entities, self.beam_width)

            if self.verbose:
                print("\n🔎 Exploring graph...")

            start_entities = [match.graph_entity for match in matches.get(start_name, [])]
            if self.verbose:
                print(f"Starting from entities: {start_entities}")

            chain_results = self.explorer.explore(start_entities, chain_pattern)
            top_chains = sorted(chain_results, key=lambda x: x.score, reverse=True)[:self.top_k]

            if self.verbose:
                print(f"\nTop {self.top_k} paths for this pattern:")
                for result in top_chains:
                    print(f"  Score {result.score:.3f}: {' -> '.join(map(str, result.path))}")

            all_knowledge_chains.extend(top_chains)

        knowledge = self._format_knowledge(all_knowledge_chains)
        if self.verbose:
            print("\n📚 Knowledge context for LLM:")
            print(knowledge)
            print("\n💭 Generating answer...")

        response = self._generate_answer(question, knowledge)
        if self.verbose:
            print("\n🎯 Final response:")
            print(json.dumps(response, indent=2))
            print("\n" + "="*80)

        return response

    def _format_knowledge(self, chains: List[KnowledgeChain]) -> str:
        """Render chains as one ``Knowledge trace (score=...)`` line each."""
        # str() keeps the join working when node ids are not strings.
        return "\n".join(
            f"Knowledge trace (score={chain.score:.2f}): {' -> '.join(map(str, chain.path))}"
            for chain in chains
        )

    def _generate_answer(self, question: str, knowledge: str) -> dict:
        """Generate structured answer with reasoning and final value.

        Args:
            question: The question being answered.
            knowledge: Knowledge traces, as produced by ``_format_knowledge``.

        Returns:
            The JSON object parsed from the LLM reply.

        Raises:
            json.JSONDecodeError: If no JSON object can be recovered from the
                reply.
        """
        prompt = f"""Based on the following knowledge traces from a graph database, answer this question: {question}

                    Knowledge:
                    {knowledge}

                    You must provide your response as a JSON object with this structure:
                    {{
                        "reasoning": "Your detailed step-by-step analysis showing:
                            1. What specific information you found in the knowledge traces
                            2. How you verified and cross-referenced the information
                            3. Any calculations performed
                            4. How you validated the final answer",
                        "answer": "final numerical value only, properly formatted with no units unless question specifies otherwise"
                    }}

                    Important Rules:
                    - Base your answer ONLY on the provided knowledge traces
                    - Numbers must be whole integers without comma separators, unless specified
                    - Percentages must be whole numbers without % sign
                    - The answer field must contain ONLY the numerical value, no text or units
                    - Your entire response must be valid JSON
                    """

        response = self.llm.invoke(prompt)
        content = response.content
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # A model that is not bound to JSON mode may wrap the object in a
            # markdown fence or add surrounding text; retry on the outermost
            # {...} span before giving up.
            start = content.find("{")
            end = content.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(content[start:end + 1])

In [None]:
import os
import pickle
from tqdm import tqdm
import networkx as nx
from langchain_openai import ChatOpenAI
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Chat model handed to the graph transformer. Reading the key through
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
llm = ChatOpenAI(temperature=0, 
                 model_name="gpt-4o", 
                 api_key=os.environ["OPENAI_API_KEY"]
                 )

llm_transformer = LLMGraphTransformer(llm=llm)


# Reuse previously extracted graph documents when a cache file is present in
# the working directory; otherwise run the LLM extraction and cache the result.
# NOTE(review): pickle.load can execute arbitrary code -- only load a
# graph_documents.pkl that this notebook itself produced.
if os.path.exists("graph_documents.pkl"):
    # Load graph_documents from the file
    with open("graph_documents.pkl", "rb") as f:
        graph_documents = pickle.load(f)
    print("Loaded graph_documents from graph_documents.pkl")
else:
    # Convert documents to graph documents
    # NOTE(review): `documents` is not defined anywhere in the visible cells;
    # presumably an earlier cell loads the filings -- confirm before running
    # without a cache file.
    graph_documents = llm_transformer.convert_to_graph_documents(tqdm(documents))
    
    # Save graph_documents to the file
    with open("graph_documents.pkl", "wb") as f:
        pickle.dump(graph_documents, f)
    print("Converted documents to graph_documents and saved to graph_documents.pkl")



graph = NetworkxEntityGraph()

# Add nodes to the graph
# All nodes are added in a first pass, before any edge is inserted.
for doc in graph_documents:
    for node in doc.nodes:
        graph.add_node(node.id)

# Edges are added directly on the wrapped graph object (graph._graph) so the
# relationship type can be stored in the `relation` edge attribute, which is
# the attribute BeamSearchExplorer.explore reads.
for doc in graph_documents:
    for edge in doc.relationships:
        graph._graph.add_edge(
            edge.source.id,
            edge.target.id,
            relation=edge.type,
        )

In [None]:
# Example usage:
if __name__ == "__main__":
    # Initialize components
    # This rebinds the module-level `llm` to a model that is asked to reply
    # with a JSON object, which EnhancedKGRAG._generate_answer parses.
    llm = ChatOpenAI(temperature=0,
                     model_name="gpt-4o", 
                     api_key=os.environ["OPENAI_API_KEY"],
                     )
    llm = llm.bind(response_format={"type": "json_object"})
        
    # Initialize system
    # graph._graph is the object the edges were added to when the graph was built.
    kg_rag = EnhancedKGRAG(graph._graph, llm, beam_width=10, max_depth=8, top_k=100, num_chains=2, verbose=True)
    # NOTE: the commented-out call below passes `top_k_per_chain`, which
    # EnhancedKGRAG.__init__ does not accept (the parameter is `top_k`).
    # kg_rag = EnhancedKGRAG(graph._graph, llm, beam_width=10, max_depth=10, num_chains=1, top_k_per_chain=10)
    
    # Example query
    question = "What was the Products gross margin percentage for Apple for the quarter ended July 1, 2023, as reported in their Q3 2023 10-Q? Provide the answer rounded to one decimal place."
    # With verbose=True the query prints its own trace and final response.
    answer = kg_rag.query(question)
    # print(f"Question: {question}")
    # print(f"Answer: {answer}")

In [None]:
import datetime
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
from tqdm.auto import tqdm
from typing import Optional
import json
import networkx as nx

def evaluate_kg_rag(
    rag_system,
    data_path: str,
    eval_dir: Optional[Path] = None,
) -> None:
    """Run evaluation of KG-RAG system on test dataset.

    Every row of the CSV at ``data_path`` is sent to ``rag_system.query``.
    The returned ``"answer"`` is converted to ``float`` and compared with the
    expected answer. A text report and a detailed CSV are written to
    ``eval_dir``.

    Args:
        rag_system: Object with a ``query(question)`` method returning a
            mapping with ``"answer"`` and ``"reasoning"`` keys.
        data_path: CSV file with ``"New Question"`` and ``"New Answer"``
            columns.
        eval_dir: Output directory (``str`` or ``Path``); defaults to
            ``evaluation_results_kg``. Created, with parents, if missing.
    """
    # Setup directories and load data
    eval_dir = Path(eval_dir) if eval_dir else Path("evaluation_results_kg")
    eval_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = eval_dir / f"evaluation_results_{timestamp}.txt"

    df = pd.read_csv(data_path)

    # Prepare results storage
    result_columns = ['question_id', 'question', 'expected', 'answer', 'reasoning', 'correct']
    results = []
    correct = 0
    total = len(df)

    # utf-8 so non-ASCII text in questions or reasoning cannot break the
    # report on platforms whose default encoding is narrower.
    with open(output_file, 'w', encoding="utf-8") as f:
        # Write header
        f.write("KG-RAG System Evaluation Results\n")
        f.write(f"Evaluation Date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total Questions: {total}\n")
        f.write("=" * 80 + "\n\n")

        # Evaluate each question. Questions are numbered by position, not by
        # DataFrame index label, so the numbering stays 1..total for any index.
        rows = tqdm(df.iterrows(), total=total, desc="Evaluating questions")
        for question_id, (_, row) in enumerate(rows, start=1):
            question = row["New Question"]
            expected_answer = row["New Answer"]

            try:
                # Get model response
                response = rag_system.query(question)
                model_answer = response["answer"]
                model_reasoning = response["reasoning"]

                # Validate answer format
                try:
                    model_answer = float(model_answer)
                    is_correct = model_answer == float(expected_answer)
                    if is_correct:
                        correct += 1
                except ValueError:
                    model_answer = "N/A (ValueError)"
                    is_correct = False
                except Exception as e:
                    model_answer = f"N/A ({str(e)})"
                    is_correct = False

                # Write question details
                f.write(f"Question {question_id}/{total}:\n")
                f.write(f"Question: {question}\n")
                f.write(f"Expected Answer: {expected_answer}\n")
                f.write(f"Model Answer: {model_answer}\n")
                f.write(f"Reasoning:\n{model_reasoning}\n")
                f.write(f"Correct: {is_correct}\n")
                f.write("-" * 80 + "\n\n")

                # Store result
                results.append({
                    'question_id': question_id,
                    'question': question,
                    'expected': expected_answer,
                    'answer': model_answer,
                    'reasoning': model_reasoning,
                    'correct': is_correct
                })

            except Exception as e:
                # One failing question must not abort the whole run.
                f.write(f"Error processing question {question_id}: {str(e)}\n")
                f.write("-" * 80 + "\n\n")

                results.append({
                    'question_id': question_id,
                    'question': question,
                    'expected': expected_answer,
                    'answer': f"ERROR: {str(e)}",
                    'reasoning': "Error occurred during processing",
                    'correct': False
                })

        # Write summary (an empty dataset reports 0% instead of dividing by zero)
        accuracy = correct / total if total else 0.0
        f.write("\nEvaluation Summary\n")
        f.write("=" * 80 + "\n")
        f.write(f"Total Questions: {total}\n")
        f.write(f"Correct Answers: {correct}\n")
        f.write(f"Accuracy: {accuracy:.2%}\n")

    # Save detailed results; explicit columns keep the header for empty runs.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df.to_csv(eval_dir / f"evaluation_detailed_results_{timestamp}.csv", index=False)

    print(f"Evaluation complete. Results saved to {output_file}")
    print(f"Detailed results saved to {eval_dir}/evaluation_detailed_results_{timestamp}.csv")
    print(f"\nFinal Accuracy: {accuracy:.2%}")

if __name__ == "__main__":
    load_dotenv()

    # Initialize KG-RAG system
    # graph = nx.DiGraph()  # Load your graph here
    # The answering model is bound to JSON mode, as in the example-usage cell:
    # EnhancedKGRAG._generate_answer feeds the reply straight to json.loads,
    # so an unbound model that wraps its JSON in extra text would turn
    # questions into errors.
    rag_system = EnhancedKGRAG(
        graph=graph._graph,
        llm=ChatOpenAI(temperature=0, model="gpt-4o").bind(
            response_format={"type": "json_object"}
        ),
        beam_width=3,
        max_depth=5,
        top_k=3,
        verbose=False
    )

    # Run evaluation
    evaluate_kg_rag(
        rag_system=rag_system,
        data_path="../../data/sec-10-q/synthetic_qna_data_7_gpt4o_v2_mod1_50.csv",
        eval_dir=Path("evaluation_results_kg")
    )

In [5]:
import pandas as pd
from pathlib import Path
import datetime
from tqdm.auto import tqdm
import json

def run_evaluation(kg_rag, df, eval_dir, experiment_name):
    """Run evaluation for a specific hyperparameter configuration.

    Each row of ``df`` is sent to ``kg_rag.query``. The returned ``"answer"``
    is converted to ``float`` and compared with the expected answer. A text
    report and a detailed CSV, both tagged with ``experiment_name`` and a
    timestamp, are written to ``eval_dir``.

    Args:
        kg_rag: Object with a ``query(question)`` method returning a mapping
            with ``"answer"`` and ``"reasoning"`` keys.
        df: DataFrame with ``"New Question"`` and ``"New Answer"`` columns.
        eval_dir: Output directory (``str`` or ``Path``); created if missing.
        experiment_name: Label used in file names and report headers.

    Returns:
        Fraction of questions answered correctly (0.0 for an empty ``df``).
    """
    eval_dir = Path(eval_dir)
    eval_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = eval_dir / f"evaluation_results_{experiment_name}_{timestamp}.txt"

    result_columns = ['question_id', 'question', 'expected', 'answer', 'reasoning', 'correct']
    results = []
    correct = 0
    total = len(df)

    # utf-8 so non-ASCII text in questions or reasoning cannot break the report.
    with open(output_file, 'w', encoding="utf-8") as f:
        f.write(f"KG-RAG System Evaluation Results - {experiment_name}\n")
        f.write(f"Evaluation Date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Parameters: {experiment_name}\n")
        f.write(f"Total Questions: {total}\n")
        f.write("=" * 80 + "\n\n")

        # Number questions by position rather than by index label, so a
        # sampled or filtered DataFrame is still reported as 1..total.
        rows = tqdm(df.iterrows(), total=total, desc=f"Evaluating {experiment_name}")
        for question_id, (_, row) in enumerate(rows, start=1):
            question = row["New Question"]
            expected_answer = row["New Answer"]

            try:
                response = kg_rag.query(question)
                model_answer = response["answer"]
                model_reasoning = response["reasoning"]

                try:
                    model_answer = float(model_answer)
                    is_correct = model_answer == float(expected_answer)
                    if is_correct:
                        correct += 1
                except ValueError:
                    model_answer = "N/A (ValueError)"
                    is_correct = False
                except Exception as e:
                    model_answer = f"N/A ({str(e)})"
                    is_correct = False

            except Exception as e:
                # One failing question must not abort the experiment.
                model_answer = f"ERROR: {str(e)}"
                model_reasoning = "Error occurred during processing"
                is_correct = False

            f.write(f"Question {question_id}/{total}:\n")
            f.write(f"Question: {question}\n")
            f.write(f"Expected Answer: {expected_answer}\n")
            f.write(f"Model Answer: {model_answer}\n")
            f.write(f"Reasoning:\n{model_reasoning}\n")
            f.write(f"Correct: {is_correct}\n")
            f.write("-" * 80 + "\n\n")

            results.append({
                'question_id': question_id,
                'question': question,
                'expected': expected_answer,
                'answer': model_answer,
                'reasoning': model_reasoning,
                'correct': is_correct
            })

        # An empty dataset reports 0% instead of raising ZeroDivisionError.
        accuracy = correct / total if total else 0.0
        f.write("\nEvaluation Summary\n")
        f.write("=" * 80 + "\n")
        f.write(f"Total Questions: {total}\n")
        f.write(f"Correct Answers: {correct}\n")
        f.write(f"Accuracy: {accuracy:.2%}\n")

    # Explicit columns keep the CSV header even when there are no rows.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df.to_csv(eval_dir / f"evaluation_detailed_results_{experiment_name}_{timestamp}.csv", index=False)

    return accuracy

def hyperparameter_search(graph, llm, df, experiments=None, eval_dir=None):
    """Run hyperparameter search experiments.

    For every experiment an ``EnhancedKGRAG`` is built with the experiment's
    parameters and evaluated with ``run_evaluation``. A JSON summary of all
    experiments is written to ``eval_dir``.

    Args:
        graph: Graph passed to ``EnhancedKGRAG``.
        llm: Chat model passed to ``EnhancedKGRAG``.
        df: Evaluation questions, forwarded to ``run_evaluation``.
        experiments: Optional list of ``{"name": str, "params": dict}``
            entries, where ``params`` holds ``EnhancedKGRAG`` keyword
            arguments. Defaults to the built-in ten-configuration grid.
        eval_dir: Optional output directory (``str`` or ``Path``); defaults
            to ``evaluation_results_kg_hyperparam``.

    Returns:
        Dict mapping experiment name to ``{"params": ..., "accuracy": ...}``.
    """
    eval_dir = Path(eval_dir) if eval_dir is not None else Path("evaluation_results_kg_hyperparam")
    eval_dir.mkdir(parents=True, exist_ok=True)

    if experiments is None:
        # Default grid: a baseline plus variations of one parameter at a
        # time, and two combined extremes.
        experiments = [
            {"name": "baseline", "params": {"beam_width": 10, "max_depth": 8, "top_k": 100, "num_chains": 2}},
            {"name": "wide_beam", "params": {"beam_width": 25, "max_depth": 8, "top_k": 100, "num_chains": 2}},
            {"name": "narrow_beam", "params": {"beam_width": 5, "max_depth": 8, "top_k": 100, "num_chains": 2}},
            {"name": "deep_search", "params": {"beam_width": 10, "max_depth": 12, "top_k": 100, "num_chains": 2}},
            {"name": "shallow_search", "params": {"beam_width": 10, "max_depth": 5, "top_k": 100, "num_chains": 2}},
            {"name": "high_topk", "params": {"beam_width": 10, "max_depth": 8, "top_k": 200, "num_chains": 2}},
            {"name": "low_topk", "params": {"beam_width": 10, "max_depth": 8, "top_k": 50, "num_chains": 2}},
            {"name": "multi_chain", "params": {"beam_width": 10, "max_depth": 8, "top_k": 100, "num_chains": 5}},
            {"name": "aggressive", "params": {"beam_width": 25, "max_depth": 12, "top_k": 200, "num_chains": 5}},
            {"name": "conservative", "params": {"beam_width": 5, "max_depth": 5, "top_k": 50, "num_chains": 1}},
        ]

    results = {}
    for exp in experiments:
        print(f"\nRunning experiment: {exp['name']}")
        print(f"Parameters: {exp['params']}")

        kg_rag = EnhancedKGRAG(
            graph=graph,
            llm=llm,
            verbose=False,
            **exp['params']
        )

        accuracy = run_evaluation(kg_rag, df, eval_dir, exp['name'])
        results[exp['name']] = {
            'params': exp['params'],
            'accuracy': accuracy
        }

        print(f"Accuracy: {accuracy:.2%}")

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    with open(eval_dir / f"hyperparameter_search_results_{timestamp}.json", 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    return results

In [None]:
# Load the CSV file
df = pd.read_csv("../../data/sec-10-q/synthetic_qna_data_7_gpt4o_v2_mod1_50.csv")
# Keep only the first 10 rows; every experiment runs the full pipeline once
# per remaining row.
df = df.head(10)

# Run hyperparameter search
# NOTE(review): `llm` is whatever an earlier cell last bound to that name;
# the example-usage cell rebinds it to a JSON-mode model -- confirm that cell
# has been run, since the answer step parses the reply as JSON.
results = hyperparameter_search(graph._graph, llm, df)