In [5]:
import pandas as pd
import numpy as np
import iris
import time
import os

### Setup Table

In [7]:
# Vector Database Setup
username = "demo"
password = "demo"
hostname = os.getenv("IRIS_HOSTNAME", "localhost")
port = "1972"
namespace = "USER"
CONNECTION_STRING = f"{hostname}:{port}/{namespace}"
print(CONNECTION_STRING)

conn = iris.connect(CONNECTION_STRING, username, password)
cursor = conn.cursor()

# Create table
tableName = "BASLABS.ClimateReports"
tableDefinition = "(company_name VARCHAR(255), report_url VARCHAR(1000), year INT)"

try:
    cursor.execute(f"DROP TABLE {tableName}")
except:
    pass
cursor.execute(f"CREATE TABLE {tableName} {tableDefinition}")

localhost:1972/USER


0

### Langchain Embeddings

In [116]:
embedTableName = "BASLABS.ClimateReportsEmbed"
tableDefinition = """
(
company_name VARCHAR(255), 
source_url VARCHAR(1000),
year INT,
page INT,
total_pages INT,
content VARCHAR(5000), 
content_embedding VECTOR(DOUBLE, 2048)
)
"""

try:
    cursor.execute(f"DROP TABLE {embedTableName}")
except:
    pass
cursor.execute(f"CREATE TABLE {embedTableName} {tableDefinition}")

0

In [104]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access variables
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")

client = NVIDIAEmbeddings(
    model="nvidia/llama-3.2-nv-embedqa-1b-v2",
    api_key=NVIDIA_API_KEY,
    truncate="NONE",
)

In [131]:
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_iris import IRISVector
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import PyMuPDFLoader

import requests
from io import BytesIO
import concurrent.futures
import time
import multiprocessing

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "? ", "! "],
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

In [132]:
def run_with_timeout(func, timeout, *args, **kwargs):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(func, *args, **kwargs)
        try:
            return future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            return None  # Or handle as needed

def load_pdf(url):
    loader = PyPDFLoader(url)
    return loader


def get_docs(url):
    try:
        start = time.time()
        loader = run_with_timeout(load_pdf, 2, url)
        end = time.time()
        print(f"Loading PDF took {(end - start):.2f} s")
        if loader == None:
            return None

        start = time.time()
        documents = loader.load()
        docs = text_splitter.split_documents(documents)
        end = time.time()
        print(f"Loading docs took {(end - start):.2f} s")
        return docs
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return None

def get_df(docs, company_name, year, pdf_url):
    data = [
        {
            "page": doc.metadata["page"],
            "total_pages": doc.metadata["total_pages"],
            "content": doc.page_content,
        }
        for doc in docs
    ]

    doc_df = pd.DataFrame(data)
    doc_df["company_name"] = company_name
    doc_df["year"] = year
    doc_df["source_url"] = pdf_url
    doc_embeddings = client.embed_documents(doc_df["content"].tolist())
    doc_df["content_embedding"] = doc_embeddings
    return doc_df

def insert_df(doc_df):
    sql = f"""
    INSERT INTO {embedTableName}
    (company_name, source_url, year, page, total_pages, content, content_embedding) 
    VALUES (?, ?, ?, ?, ?, ?, TO_VECTOR(?))
    """

    data = [
        (
            row["company_name"],
            row["source_url"],
            row["year"],
            row["page"],
            row["total_pages"],
            row["content"],
            str(row["content_embedding"]),
        )
        for index, row in doc_df.iterrows()
    ]
    results = cursor.executemany(sql, data)

In [133]:
link_df = pd.read_csv("data/tech_reports.csv")
link_df.head(2)

i = 0
for index, row in link_df.iterrows():

    company_name = row["company_name"]
    year = row["year"]
    source_url = row["report_url"]
    print("Processing", company_name)

    if i == 0 or i == 1:
        i += 1
        continue

    docs = get_docs(source_url)
    if docs == None:
        continue
    print(f"Found {len(docs)} docs")
    doc_df = get_df(docs, company_name, year, source_url)
    insert_df(doc_df)
    i += 1
    if i == 10:
        break

Processing Accenture
Processing Adobe Inc.
Processing Advanced Micro Devices


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Cellar/python@3.9/3.9.20/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/homebrew/Cellar/python@3.9/3.9.20/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'create_loader' on <module '__main__' (built-in)>


KeyboardInterrupt: 

### Similarity Search

In [None]:
numberOfResults = 5
query = "AI-driven precision farming optimizes water and fertilizer use, while monitoring soil health and carbon capture enhances sustainability efforts. Automated detection of deforestation and land use changes further strengthens environmental responsibility. These initiatives lead to higher crop yields (5-20% increase), reduced input costs, and potential revenue through carbon credits."
searchVector = client.encode_query(query)
sql = f"""
    SELECT TOP ? company_name, content
    FROM {embedTableName}
    ORDER BY VECTOR_DOT_PRODUCT(content_embedding, TO_VECTOR(?)) DESC
"""
cursor.execute(sql, [numberOfResults, str(searchVector)])
results = cursor.fetchall()
results

In [None]:
# fine-tune a bert model to classify paragraphs as climate actions or not
# Return on investment


In [None]:
## note the additional description_vector VECTOR(DOUBLE, 384) field to store embeddings


doc = docs[0]
page = doc.metadata["page"]
total_pages = doc.metadata["total_pages"]
doc_embedding = client.embed_documents

start_time = time.time()
data = [
    (
        company_name,
        pdf_url,
        year,
        page,
        total_pages,
        doc.page_content
        str(row["description_vector"]),
    )
]
cursor.execute(sql, data)

In [None]:
page_contents = [d.page_content for d in docs]
embeddings = client.embed_documents(page_contents)

[[-0.001018524169921875,
  -0.0197906494140625,
  0.032318115234375,
  0.01464080810546875,
  0.0075225830078125,
  -0.01035308837890625,
  0.01186370849609375,
  -0.005489349365234375,
  -0.00447845458984375,
  0.0241546630859375,
  -0.01593017578125,
  0.0246734619140625,
  -0.02325439453125,
  -0.01476287841796875,
  -0.016815185546875,
  0.03155517578125,
  -0.018524169921875,
  -0.0270233154296875,
  0.0225830078125,
  0.0308837890625,
  -0.0340576171875,
  0.04205322265625,
  0.0030307769775390625,
  0.014251708984375,
  0.046173095703125,
  -0.01511383056640625,
  -0.01091766357421875,
  0.027374267578125,
  -0.00797271728515625,
  0.006130218505859375,
  -0.032928466796875,
  0.0059051513671875,
  0.00199127197265625,
  0.0162353515625,
  0.005260467529296875,
  0.00341796875,
  -0.004802703857421875,
  0.0259552001953125,
  -0.0012874603271484375,
  -0.040771484375,
  0.01305389404296875,
  -0.01025390625,
  0.017181396484375,
  -0.0008878707885742188,
  -0.028839111328125,
  

In [56]:
client.embed_query(docs[10].page_content)

[0.0063934326171875,
 -0.00926971435546875,
 0.03472900390625,
 -0.00555419921875,
 -0.0165557861328125,
 0.01019287109375,
 -0.00659942626953125,
 0.055816650390625,
 0.0214691162109375,
 0.0025005340576171875,
 0.019073486328125,
 0.024078369140625,
 0.0169830322265625,
 0.0003733634948730469,
 0.007686614990234375,
 0.0169219970703125,
 0.006038665771484375,
 -0.0307769775390625,
 -0.0209197998046875,
 -0.0009760856628417969,
 -0.00846099853515625,
 0.00911712646484375,
 0.00862884521484375,
 -0.0219879150390625,
 0.026092529296875,
 -0.0025386810302734375,
 -0.0027675628662109375,
 0.01220703125,
 0.017333984375,
 0.02667236328125,
 -0.040802001953125,
 0.0085296630859375,
 -0.0165557861328125,
 0.017303466796875,
 -0.01084136962890625,
 -0.033477783203125,
 0.0028400421142578125,
 -0.0267181396484375,
 0.045074462890625,
 -0.0203399658203125,
 -0.017333984375,
 -0.029022216796875,
 -0.0255126953125,
 0.01168060302734375,
 0.00576019287109375,
 -0.0036449432373046875,
 0.0054435729

In [12]:
from langchain_community.document_loaders import PyPDFLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict
import re

class DocumentProcessor:
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ". ", "? ", "! "],
            chunk_size=2000,
            chunk_overlap=0,
            length_function=len,
            is_separator_regex=False,
        )
        # self.embeddings = OpenAIEmbeddings()

    def clean_text(self, text: str) -> str:
        text = re.sub(r"\s+", " ", text)
        text = text.strip()
        return text
    
    def process_report(self, file_path: str) -> List[Dict]:
        


   

NameError: name 'List' is not defined

In [11]:
df = pd.read_csv("data/reports_links.csv")
df.head(2)

Unnamed: 0,company_name,report_url,year
0,A. P. Møller-Mærsk A/S,https://www.maersk.com/~/media_sc9/maersk/corp...,2021
1,AAK AB,https://www.aak.com/reports/sustainability_rep...,2021


In [None]:
import requests
import fitz  # PyMuPDF
from langchain.document_loaders import PyMuPDFLoader
from io import BytesIO