In [2]:
from neo4j import GraphDatabase

uri = "bolt://localhost:7687"
user = "neo4j"
password = "neo4j123"   # <-- put your password

driver = GraphDatabase.driver(uri, auth=(user, password))

with driver.session() as session:
    result = session.run("RETURN 'Neo4j is connected!' AS message")
    for record in result:
        print(record["message"])

driver.close()


Neo4j is connected!


In [4]:
from core.db.graph_db import EsgGraphDB

db = EsgGraphDB()

triples = [
    ("Company A", "must_follow", "GDPR"),
    ("GDPR", "requires", "Data Encryption Policy")
]

db.insert_triples(triples)

result = db.query("MATCH (n)-[r]->(m) RETURN n.name, r.type, m.name")
print(result)

db.close()

[{'n.name': 'Company A', 'r.type': 'must_follow', 'm.name': 'GDPR'}, {'n.name': 'GDPR', 'r.type': 'requires', 'm.name': 'Data Encryption Policy'}]


In [5]:
import json

with open("databases/esg_db_mapping.json", "r") as f:
    chunks = json.load(f)

In [9]:
chunks

{'b7d7c917d5f04bb0-0f30': {'document': 'GRI 301: Materials 2016\n301\nEFFECTIVE DATE: 1 JULY 2018\nTOPIC STANDARD\nGRI 301: Materials 2016\nTopic Standard\nEffective Date\nThis Standard is effective for reports or other materials published on or after 1 July 2018\nResponsibility\nThis Standard is issued by the Global Sustainability Standards Board (GSSB). Any feedback on the GRI Standards\ncan be submitted to gssbsecretariat@globalreporting.org for the consideration of the GSSB.\nDue Process\nThis Standard was developed in the public interest and in accordance with the requirements of the GSSB Due\nProcess Protocol. It has been developed using multi-stakeholder expertise, and with regard to authoritative\nintergovernmental instruments and widely held expectations of organizations relating to social, environmental, and\neconomic responsibilities.\nLegal Liability\nThis document, designed to promote sustainability reporting, has been developed by the Global Sustainability\nStandards Boar

In [11]:
from llms.rag import RAG
from core.utils.documents_processor import DocumentProcessor
from llms.esg import ESG

INFO:faiss.loader:Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.
INFO:faiss:Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes.


In [12]:
rag_system = RAG(config_path="config/config.yaml")

INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:core.db.vector_db:Imported FAISS index and document mapping.


In [25]:
from llms.esg import ESG

In [26]:
folder_path = "output"

In [27]:
rag_system.index_documents(folder=folder_path, use_esg=True)

INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:core.llm.base:System prompt loaded from config/prompts/document_processor.txt


In [28]:
query = "What are the requirements of 301-2"
result = rag_system.answer_query(query, use_esg=True, k=3)

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"


In [29]:
text = result['context']

In [30]:
def extract_triples_with_ollama(text):
    prompt = f"""
Extract Subject, Predicate, and Object triples from the following text:

\"\"\"
{text}
\"\"\"

Format the output as JSON:
[
  {{"subject": "..." , "predicate": "...", "object": "..."}},
  ...
]
Only output valid JSON.
"""

    response = ollama.chat(
        model='llama3',  # or 'mistral', 'mixtral', 'gemma' etc.
        messages=[{'role': 'user', 'content': prompt}]
    )

    content = response['message']['content']

    # Parse JSON
    import json
    try:
        triples = json.loads(content)
        return triples
    except json.JSONDecodeError:
        print("Failed to parse triples. Raw output:")
        print(content)
        return []


In [35]:
from llms.esg import ESG

In [36]:
esg_model = ESG()

query = "What are the requirements of 301-2?"
triples = esg_model.extract_triples(query)

for t in triples:
    print(f"Subject: {t['subject']} → Predicate: {t['predicate']} → Object: {t['object']}")


INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:core.llm.base:System prompt loaded from config/prompts/esg.txt
INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:core.db.vector_db:Imported FAISS index and document mapping.
INFO:llms.esg:ESG initialized.


AttributeError: 'ESG' object has no attribute 'extract_triples'

In [51]:
from core.llm.base import BaseLLM
from llms.rag import RAG
import logging

logger = logging.getLogger(__name__)

class ESG(BaseLLM):
    """
    ESG is an ESG-focused LLM that inherits from BaseLLM.
    """
    
    def __init__(self, config_path: str = "config/config.yaml"):
        super().__init__(config_path=config_path)
        self.system_prompt = self._get_system_prompt("config/prompts/esg.txt")
        self.rag = RAG(config_path=config_path)
        logger.info("ESG initialized.")
        
    def answer_query(self, query: str) -> str:
        """
        Process a user's ESG query
        """
        logger.info("Received query: %s", query)
        if len(query.strip().split()) < 3:
            clarification = ("Your query seems a bit brief. "
                            "Could you please provide more details regarding your ESG standards question?")
            logger.info("Query too brief; returning clarification prompt.")
            return clarification
        result = self.rag.answer_query(query, use_esg=True, k=5)
        context = result.get("context", "")
        final_prompt = (
            f"{self.system_prompt}\n\n"
            f"Context:\n{context}\n\n"
            f"User Query: {query}\n\n"
            "Provide a detailed answer regarding ESG standards. "
            "If more information is needed, ask follow-up questions."
        )
        logger.debug("Constructed final prompt: %s", final_prompt)
        answer = self.generate(final_prompt)
        # logger.info("Generated answer: %s", answer)
        return answer
    
    def ask_followup(self, followup: str) -> str:
        """
        Process a follow-up question.
        """
        logger.info("Received follow-up question: %s", followup)
        result = self.rag.answer_query(followup, use_esg=True, k=5)
        answer = result.get("answer", "I'm sorry, I couldn't retrieve ESG information for your follow-up query.")
        logger.info("Follow-up answer: %s", answer)
        return answer

    def summarize_text(self, text: str) -> str:
        """
        Generate a summary for a given text chunk.
        """
        final_prompt = f"{self.system_prompt}\n\nSummarize the following text:\n\n{text}"
        return self.generate(final_prompt)
    
    def mini_report(self, text: str) -> str:
        """
        Generate a  mini-report for a given text chunk.
        """
        final_prompt = f"{self.system_prompt}\n\nGenerate mini-report for the following text:\n\n{text}"
        return self.generate(final_prompt)
    
    def _validate_triples(self, triples: list) -> list:
        """
        Validate and clean triples.
        Keep only triples that have subject, predicate, and object.
        """
        valid_triples = []
        for t in triples:
            if (
                isinstance(t, dict)
                and t.get('subject') 
                and t.get('predicate') 
                and t.get('object')
            ):
                valid_triples.append((t['subject'], t['predicate'], t['object']))
            else:
                print(f"⚠️ Skipping invalid triple: {t}")
        return valid_triples
    
    def extract_triples(self, query: str) -> list:
        """
        Extract Subject, Predicate, Object triples for a given ESG query.
        """
        logger.info("Received query for triple extraction: %s", query)
        result = self.rag.answer_query(query, use_esg=True, k=5)
        context = result.get("context", "")
        final_prompt = (
            "Extract all Subject, Predicate, Object triples from the following text.\n\n"
            "ONLY OUTPUT PURE JSON ARRAY. NO EXPLANATIONS. NO COMMENTS.\n"
            "Format:\n"
            "[\n"
            "  {\"subject\": \"...\", \"predicate\": \"...\", \"object\": \"...\"}\n"
            "]\n\n"
            f"Context:\n{context}\n\n"
        )
        logger.debug("Constructed final triple extraction prompt: %s", final_prompt)
        triples_json = self.generate(final_prompt)

        # 🔥 Print raw output for debugging
        print("---- RAW LLM OUTPUT ----")
        print(triples_json)
        print("------------------------")

        import json
        import re

        # Clean the output
        raw_output = triples_json.strip()
        cleaned_output = re.search(r'\[.*\]', raw_output, re.DOTALL)
        if cleaned_output:
            cleaned_output = cleaned_output.group(0)
        else:
            cleaned_output = raw_output  # fallback
        try:
            triples = json.loads(cleaned_output)
        except json.JSONDecodeError:
            logger.error("Failed to parse triples JSON output.")
            triples = []

        # ✅ Validate and clean the extracted triples
        triples = self._validate_triples(triples)

        return triples



In [54]:
# Instantiate your ESG model
esg_model = ESG()

# Your ESG-related query
query = "What are the requirements of 301-2?"

# Extract triples from the query
triples = esg_model.extract_triples(query)

# Print the extracted triples
for subject, predicate, object_ in triples:
    print(f"Subject: {subject} → Predicate: {predicate} → Object: {object_}")

INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:core.llm.base:System prompt loaded from config/prompts/esg.txt
INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:core.db.vector_db:Imported FAISS index and document mapping.
INFO:__main__:ESG initialized.
INFO:__main__:Received query for triple extraction: What are the requirements of 301-2?
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
INFO:core.llm.base:Generating response with params: {'model': 'llama3:8b', 'temperature': 0.7, 'max_tokens': 4096}
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


---- RAW LLM OUTPUT ----
[
  {"subject": "The reporting organization", "predicate": "shall report", "object": "the following information"},
  {"subject": "Requirements", "predicate": "are presented in bold font", "object": "and indicated by the word 'shall'"},
  {"subject": "Requirements", "predicate": "may be accompanied by guidance", "object": ""},
  {"subject": "Guidance", "predicate": "includes background information, explanations, and examples", "object": "to help the organization better understand the requirements"},
  {"subject": "The Standards", "predicate": "may also include recommendations", "object": ""},
  {"subject": "Requirements", "predicate": "are presented in bold font", "object": "and indicated by the word 'shall'"},
  {"subject": "Recommendations", "predicate": "indicate a particular course of action", "object": "is encouraged but not required"},
  {"subject": "Defined terms", "predicate": "are underlined in the text", "object": "of the GRI Standards and linked to th

In [56]:
triples[0]

('The reporting organization', 'shall report', 'the following information')

In [57]:
from core.db.graph_db import EsgGraphDB

db = EsgGraphDB()

db.insert_triples(triples)

result = db.query("MATCH (n)-[r]->(m) RETURN n.name, r.type, m.name")
print(result)

db.close()

INFO:core.db.graph_db:Connected to Neo4j at bolt://localhost:7687 as user neo4j
INFO:core.db.graph_db:Inserted 15 triples into the graph database.
INFO:core.db.graph_db:Query executed: MATCH (n)-[r]->(m) RETURN n.name, r.type, m.name with parameters: {}
INFO:core.db.graph_db:Neo4j driver closed


[{'n.name': 'Company A', 'r.type': 'must_follow', 'm.name': 'GDPR'}, {'n.name': 'GDPR', 'r.type': 'requires', 'm.name': 'Data Encryption Policy'}, {'n.name': 'The reporting organization', 'r.type': 'shall report', 'm.name': 'the following information'}, {'n.name': 'The reporting organization', 'r.type': 'shall report', 'm.name': 'how it manages each of its material topics'}, {'n.name': 'The reporting organization', 'r.type': 'shall', 'm.name': 'use the total weight or volume of materials used as specified in Disclosure 301-1'}, {'n.name': 'The reporting organization', 'r.type': 'shall calculate', 'm.name': 'the percentage of recycled input materials used by applying the following formula'}, {'n.name': 'The reporting organization', 'r.type': 'should', 'm.name': 'report the methods used, if estimation is required'}, {'n.name': 'Requirements', 'r.type': 'are presented in bold font', 'm.name': "and indicated by the word 'shall'"}, {'n.name': 'Guidance', 'r.type': 'includes background infor