# AIM Hackathon: Sample code
19.10.2024

In [37]:
import os
import requests
import PyPDF2
import tiktoken
import pandas as pd
import pickle
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings.base import OpenAIEmbeddings

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS

# Load the OpenAI API key from a local .env file.
# load_dotenv() is falsy when it could not set anything, in which case the
# OpenAI clients created further down would have no key to work with.
if not load_dotenv():
    raise Exception('Error loading .env file. Make sure to place a valid OPENAI_API_KEY in the .env file.')

In [38]:
# Directory the downloaded report PDFs are written to and later read from.
REPORTS_SAVE_PATH = 'data/sample_reports'
# File the serialized FAISS vector store is saved to and loaded from.
DB_PATH = "data/db/sample.db"

# Chat model passed to ChatOpenAI when the RAG chain is built.
# See https://openai.com/api/pricing/
MODEL = "gpt-3.5-turbo"

In [39]:
# Load the report index; the `pdf_url` column of each row points at one report PDF.
df = pd.read_json('data/reports.json')
df

Unnamed: 0,company_name,year,dataset,pdf_url
0,Walmart,2023,handcrafted,https://corporate.walmart.com/content/dam/corp...
1,Walmart,2021,handcrafted,https://corporate.walmart.com/content/dam/corp...
2,Walmart,2019,handcrafted,https://corporate.walmart.com/content/dam/corp...
3,Amazon,2023,handcrafted,https://sustainability.aboutamazon.com/content...
4,Amazon,2021,handcrafted,https://sustainability.aboutamazon.com/content...
...,...,...,...,...
141,tarkett,2020,scraped,https://www.tarkett.com/sites/default/files/20...
142,trivium-packaging,2021,scraped,https://www.triviumpackaging.com/media/13fl4q3...
143,trivium-packaging,2020,scraped,https://triviumpackaging.com/sustainability/re...
144,trust,2023,scraped,https://dezlwerqy1h00.cloudfront.net/images/co...


## Download some reports

In [40]:
# EXAMPLE: select apple reports
# Exact, case-sensitive match on the company_name column.
df_sample = df[df['company_name'] == 'Apple']

In [41]:
def download_files(df: pd.DataFrame, save_dir: str, timeout: float = 60):
    """Download every PDF listed in ``df['pdf_url']`` into ``save_dir``.

    Args:
        df: Frame with a ``pdf_url`` column holding one URL per report.
        save_dir: Target directory; created if it does not exist yet.
        timeout: Seconds to wait on the server before a request is abandoned.

    Raises:
        requests.HTTPError: If a server answers with an error status (4xx/5xx).
    """
    from urllib.parse import urlsplit

    os.makedirs(save_dir, exist_ok=True)
    for url in df['pdf_url']:
        # Name the file after the URL *path* only, so a query string or fragment
        # (e.g. "report.pdf?download=1") cannot end up in the file extension.
        pdf_filename = os.path.basename(urlsplit(url).path)
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of saving an HTML error page under a .pdf name.
        response.raise_for_status()
        with open(os.path.join(save_dir, pdf_filename), 'wb') as file:
            file.write(response.content)
    print("Success.")

In [42]:
# Fetch the selected reports over HTTP into REPORTS_SAVE_PATH.
download_files(df_sample, REPORTS_SAVE_PATH)

Success.


## Create simple vector database

In [43]:
def get_documents_from_path(files_path: str) -> list[Document]:
    """Read every PDF in ``files_path`` into one ``Document`` per file.

    The text of all pages is concatenated, each page followed by a newline, and
    the file name is stored under the ``"source"`` metadata key.

    Args:
        files_path: Directory that contains the PDF files.

    Returns:
        One ``Document`` per PDF that yielded text. PDFs without any extractable
        text are skipped after printing a warning.

    Raises:
        Exception: If the directory contains an entry whose extension is not ``.pdf``.
    """
    documents = []

    # Sorted so the document order does not depend on the OS listing order.
    for file in sorted(os.listdir(files_path)):
        _, file_extension = os.path.splitext(file)

        if file_extension.lower() == ".pdf":
            with open(os.path.join(files_path, file), 'rb') as f:
                reader = PyPDF2.PdfReader(f, strict=False)
                # `or ""` is defensive: a page without an extraction result must
                # not break the concatenation.
                text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

            # Strip before testing: the per-page newlines alone would make a PDF
            # without any extractable text look non-empty.
            if text.strip():
                documents.append(Document(page_content=text, metadata={"source": file}))
            else:
                print(f"WARNING: No text extracted from {file}")
        else:
            # TODO: can add support for other file types here
            raise Exception(f"Unsupported file extension: {file_extension}")

    return documents

In [44]:
# Parse the downloaded PDFs into one Document per file.
documents = get_documents_from_path(REPORTS_SAVE_PATH)

In [45]:
# TODO could also just provide a dummy retriever to not spoil too much
class DummyRetriever:
    """Placeholder retriever that ignores the query and returns random texts."""

    def __init__(self, texts):
        """Remember the pool of texts that ``dummy_retriever`` samples from."""
        self.texts = texts

    def dummy_retriever(self, query, k=3):
        """Return up to ``k`` distinct texts picked at random.

        Args:
            query: Accepted for interface compatibility; not used.
            k: Maximum number of texts to return (default 3).

        Returns:
            A list of at most ``k`` texts. When the pool holds fewer than ``k``
            entries, all of them are returned (in random order) instead of raising.
        """
        import random

        pool = list(self.texts)
        # random.sample raises ValueError when asked for more items than exist.
        return random.sample(pool, k=min(k, len(pool)))

In [46]:
# Build the vector store
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_overlap=300,
    chunk_size=3000,
)

# Chunk the loaded documents, then embed the chunks into a FAISS index
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()  # https://platform.openai.com/docs/guides/embeddings/embedding-models
db = FAISS.from_documents(texts, embeddings)

# Total number of cl100k_base tokens across all chunks that were embedded
tokenizer = tiktoken.get_encoding("cl100k_base")
build_token_count = sum(len(tokenizer.encode(chunk.page_content)) for chunk in texts)
print(f"Token count: {build_token_count}")

Token count: 202599


In [47]:
# Store the database
# open() does not create missing parent directories, so make sure the target
# directory exists first ("." covers a DB_PATH without a directory part).
os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)

# The serialized index is wrapped in a pickle because the loading cell reads it
# back with pickle.load.
with open(DB_PATH, "wb") as f:
    pickle.dump(db.serialize_to_bytes(), f)

## Create simple RAG

In [48]:
# Load the database
# DB_PATH is taken from the configuration cell at the top of the notebook, so the
# file that was stored is always the file that is loaded.
#
# SECURITY: pickle.load and allow_dangerous_deserialization=True can execute
# arbitrary code embedded in the file. Only load a database file you created
# yourself or otherwise fully trust.
with open(DB_PATH, "rb") as f:
    db_bytes = pickle.load(f)
    db = FAISS.deserialize_from_bytes(db_bytes, OpenAIEmbeddings(), allow_dangerous_deserialization=True)

In [49]:
# Load the LLM
llm = ChatOpenAI(model_name=MODEL, temperature=0)  # for deterministic outputs

# Prompt text with two placeholders, {context} and {question}; both are declared
# as input_variables of the PromptTemplate below.
system_prompt = """
You are an expert assistant. Use only the following retrieved context to answer the question accurately and concisely. 
If nothing is mentioned in the context, say "I don't know".
Context: {context}
Question: {question}
"""

# Wrap the raw text so the chain can fill in the placeholders.
prompt_template = PromptTemplate(
    input_variables=["context", "question"], 
    template=system_prompt
)

# Question-answering chain that retrieves from the FAISS store and answers with
# the prompt above. The answer is read from the "result" key of its response;
# return_source_documents=True presumably adds the retrieved documents to that
# response as well (confirm against the RetrievalQA documentation).
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt_template}
)

In [50]:
def ask_question(query: str) -> dict:
    """Send ``query`` through the retrieval chain, print the answer and return the response.

    Args:
        query: Natural-language question to answer from the indexed reports.

    Returns:
        The chain's response mapping; the answer text is under the ``"result"`` key.
    """
    # Calling the chain object directly is deprecated and emits a warning;
    # .invoke() is the supported way to run it.
    response = retrieval_chain.invoke({"query": query})
    print(f"Question: {query}\nAnswer: {response['result']}")
    return response

In [51]:
# NOTE(review): "Apply" in the query looks like a typo for "Apple"; the recorded
# output below was produced with this exact wording. Confirm before changing it.
response = ask_question("When does Apply try to achieve carbon neutrality?")

  response = retrieval_chain({"query": query})


Question: When does Apply try to achieve carbon neutrality?
Answer: Apple aims to achieve carbon neutrality for its entire carbon footprint, including products, by 2030.


In [52]:
# Ask for a compliance rating; the 0-100 scoring rubric is spelled out inside the query text itself.
res = ask_question("Does Apple comply with EU Climate Law: Binding goal for zero emissions by 2050. Please rate using the following criteria: Ideal Output (70-100 Best ESG Compliance) is Full transition to renewable energy; zero-emission operations by 2050. Medium Output (40-60 Moderate ESG Compliance) is Significant emission reductions but reliance on offset schemes. Worst Output (0-30 Poor ESG Compliance) is Limited action with minimal emission reductions")

Question: Does Apple comply with EU Climate Law: Binding goal for zero emissions by 2050. Please rate using the following criteria: Ideal Output (70-100 Best ESG Compliance) is Full transition to renewable energy; zero-emission operations by 2050. Medium Output (40-60 Moderate ESG Compliance) is Significant emission reductions but reliance on offset schemes. Worst Output (0-30 Poor ESG Compliance) is Limited action with minimal emission reductions
Answer: Apple's goal to achieve carbon neutrality for its entire carbon footprint, including products, by 2030, reducing related emissions by 75 percent compared with 2015, aligns with the EU Climate Law's binding goal for zero emissions by 2050. This indicates a high level of ESG compliance, with an Ideal Output rating of 70-100.
