# Subgraph enhance Text Retrieval

## Environment Setting

In [2]:
import os

os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY" 
os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1"

## Load PDF and Retriever

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.chains import LLMChain

### load ESRS pdf

In [35]:
loader = PyPDFLoader("data/ESRS E1 Climate Change November 2022.pdf")
pages = loader.load_and_split()

### RecursiveCharacterTextSplitter

In [36]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(pages)

### OpenAI Embedding

In [11]:
import chromadb
from langchain_openai import OpenAIEmbeddings

embedding_function = OpenAIEmbeddings(model="text-embedding-3-small")
persist_directory = './chromadb'
vectordb = Chroma.from_documents(documents=pages, embedding=embedding_function, persist_directory=persist_directory)


  warn_deprecated(


In [8]:
retriever = vectordb.as_retriever(search_kwargs={'k': 5})

## Extract triples from graph

In [13]:
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import NamespaceManager, RDF, RDFS, SKOS, OWL

# Define a function to extract and format triples from a Turtle file
def extract_and_format_triples(file_path):
    # Create an RDF graph
    graph = Graph()
    
    # Read the Turtle file
    with open(file_path, 'r') as file:
        turtle_data = file.read()
    
    # Parse the Turtle data
    graph.parse(data=turtle_data, format='ttl')
    
    # Add common namespaces
    ns_manager = NamespaceManager(graph)
    ns_manager.bind('rdf', RDF)
    ns_manager.bind('rdfs', RDFS)
    ns_manager.bind('skos', SKOS)
    ns_manager.bind('owl', OWL)
    
    formatted_triples = []
    
    # Iterate through all triples in the graph
    for subj, pred, obj in graph:
        # Format each part of the triple
        subj_str = format_entity(subj, ns_manager)
        pred_str = format_entity(pred, ns_manager)
        obj_str = format_entity(obj, ns_manager, is_object=True)
        if pred_str == "prefLabel":
            indicator_str = obj_str
        
        # Append the formatted triple to the list
        formatted_triples.append((subj_str, pred_str, obj_str))

    # Replace subjects in formatted_triples with indicator_str
    if indicator_str:
        indicator_str_clean = indicator_str.strip('"')  # Remove quotes if indicator_str is a literal
        formatted_triples = [(indicator_str_clean, pred, obj) for _, pred, obj in formatted_triples]
    
    # Format the triples as strings for final output
    formatted_triples = [f"({subj}, {pred}, {obj})" for subj, pred, obj in formatted_triples]
    
    return formatted_triples

# Helper function to format entities and remove specified prefixes
def format_entity(entity, ns_manager, is_object=False):
    if isinstance(entity, URIRef):
        entity_str = None
        
        # Get a namespace prefix if available
        for prefix, uri in ns_manager.namespaces():
            if str(entity).startswith(str(uri)):
                entity_str = str(entity).replace(str(uri), "")
                break

        if not entity_str:
            # Fallback to last part of the URI
            entity_str = entity.split("/")[-1] if "/" in entity else str(entity)

        # Remove specified prefixes
        entity_str = remove_prefixes(entity_str)

    elif isinstance(entity, Literal):
        # Format literal values with quotes
        entity_str = f'"{entity}"'
    else:
        entity_str = str(entity)

    return entity_str

# Function to remove specified prefixes
def remove_prefixes(entity_str):
    prefixes_to_remove = ["rso:", "rdf:", "rdfs:", "skos:", "owl:"]
    for prefix in prefixes_to_remove:
        if entity_str.startswith(prefix):
            entity_str = entity_str[len(prefix):]
            break
    return entity_str

# Specify the path to your RDF Turtle file
rdf_file_path = 'Data/Indicator3.ttl'

# Extract and format triples from the Turtle file
formatted_triples = extract_and_format_triples(rdf_file_path)

# Print the formatted triples
for triple in formatted_triples:
    print(triple)


(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., type, AbsoluteIndicator)
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., measuresGRITopic, GRITopic3_305)
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., hasUnit, TON_Metric)
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., type, Indicator)
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., identifier, "305-1-a")
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., type, NamedIndividual)
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., hasApplicability, mandatory)
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., prefLabel, "Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent.")
(Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent., inDisclosure, Disclosure9_305-1)
(Gross direct (Scope 1) GHG emissions in metric tons o

## Text Understanding and Information Retrieval

### Prompt for triple to text

In [14]:
from langchain.prompts import PromptTemplate

triple_to_text_prompt = PromptTemplate(
    input_variables=["triples"],
    template=(
        "You are a text generation assistant that helps in transforming RDF triples into natural language sentences. "
        "An RDF triple consists of a subject, a predicate, and an object. Your task is to construct clear and concise sentences from the given triples. "
        "Follow the format: subject, predicate, object. Use appropriate connecting words and verbs to form meaningful sentences. "
        "Here are some definitions and guidelines:\n"
        
        "Definition: A sustainability reporting indicator is defined as 'a single measurement from which quantitative conclusions on the environmental phenomenon can be inferred'.\n"
        
        "There are 4 kinds of predicates that can be used for extracting triples:\n"
        "1. Has quantity kind: Relates the indicator to the measurement quantity kind.\n"
        "2. Has measurement phenomenon: Relates the indicator to the environmental phenomenon measured.\n"
        "3. Has unit: Relates the indicator to the required unit derived from sustainability standards.\n"
        "4. Has applicability: Relates the indicator to its applicability status, which can be either optional (indicated by 'if applicable') or mandatory (indicated by 'shall').\n"
        
        "Make sure to explain the relationships in a way that a layperson can understand. Here are some examples:\n"
        
        "Example 1:\n"
        "Input Triple: (CoreTopic4_Wst, type, CoreTopic)\n"
        "Output Sentence: 'CoreTopic4_Wst is a type of CoreTopic.'\n\n"
        
        "Example 2:\n"
        "Input Triple: (Total fuel consumption within the organization from non-renewable sources, in joules or multiples, and including fuel types used, inStandard, Standard3_GRI305)\n"
        "Output Sentence: 'Total fuel consumption within the organization from non-renewable sources, in joules or multiples, and including fuel types used, is included in the Standard3_GRI305.'\n\n"

        "Now, given the following {triples}, construct text sentences:\n\n"
        "Output Sentence:\n"
    
    )
)



In [15]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-4-turbo", temperature=0) # gpt-3.5-turbo-16k, gpt-4-turbo

triple2text_chain = LLMChain(llm=llm, prompt=triple_to_text_prompt)

triples2text_list = []
# Process each triple and generate text
for triple in formatted_triples:
    input_data = {"triples": triple}
    output = triple2text_chain.invoke(input_data)
    triples2text_list.append(output)


  warn_deprecated(


In [16]:
# Initialize the concatenated text
total_text = ""

# Iterate over the list and concatenate the text values
for triples2text in triples2text_list:
    total_text += triples2text['text'] + " "

# Print the concatenated text
print("Concatenated Text:\n", total_text)

Concatenated Text:
 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent is a type of AbsoluteIndicator.' 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent measures the GRITopic3_305.' 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent are measured in metric tons.' 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent is a type of Indicator.' 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent is identified by the code "305-1-a".' 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent is a type of NamedIndividual.' 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent are mandatory to report.' 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent' is labeled as "Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent." 'Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent are disclosed in Disclosure9_305-1.' 'Gro

In [17]:
# Define the prompt template for summarizing sentences
summarize_sentences_prompt = PromptTemplate(
    input_variables=["sentences"],
    template=(
        "You are a text generation assistant that helps summarize information. Your task is to summarize the following sentences into a concise paragraph and DON'T miss any informantion:\n\n"
        "{sentences}\n\n"
        "Summarize the sentences in a concise paragraph and provide only the summarized text."
    )
)

In [18]:
text_summarize = LLMChain(llm=llm, prompt=summarize_sentences_prompt)
final_text = text_summarize.invoke({"sentences", total_text})

In [19]:
# Output the summarized text
print(final_text['text'])

Gross direct (Scope 1) GHG emissions, measured in metric tons of CO2 equivalent, are a mandatory reporting metric defined by GRI and identified by the code "305-1-a." This metric is classified as an AbsoluteIndicator, Indicator, and NamedIndividual, and it measures various aspects including the GRITopic3_305, CoreTopic3_Emi, GrossDirectScope1GHGEmissions, and the environmental dimension. It is quantified in terms of mass and is included in Disclosure9_305-1 and Standard3_GRI305. The emissions are specifically labeled as "Gross direct (Scope 1) GHG emissions in metric tons of CO2 equivalent."


### Retrieval with text

In [21]:
results = retriever.invoke(final_text['text'])

In [22]:
import pdfplumber

# Function to extract and display pages number
def extract_page_numbers(documents):
    page_numbers = []
    for document in documents:
        metadata = document.metadata
        page_number = metadata.get('page')
        if page_number is not None:
            page_numbers.append(page_number+1)
    return page_numbers


extract_page_numbers(results)


[12, 34, 36, 32, 33, 17, 31, 11, 35, 18, 19, 37, 38, 13, 4]