In [248]:
import json
import os
import pandas as pd

In [301]:
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_community.document_loaders import Docx2txtLoader, TextLoader
from langchain_core.runnables.base import RunnableSequence

In [261]:
# Labelled example sentences; later cells read the 'green', 'grey' and 'other' keys.
# The context manager closes the file handle, and the explicit UTF-8 encoding keeps
# decoding independent of the platform's locale.
with open('../data/RAG_resources/examples.json', 'r', encoding='utf-8') as examples_file:
    examples = json.load(examples_file)

In [270]:
# Second example set with the same 'green' / 'grey' / 'other' keys
# (presumably the French counterparts, judging by the file name).
# UTF-8 is pinned so accented characters decode the same on every platform,
# and the context manager closes the file handle.
with open('../data/RAG_resources/examples_fr.json', 'r', encoding='utf-8') as examples_fr_file:
    examples_fr = json.load(examples_fr_file)

In [2]:
DATA_PATH = '../data'

# Corpus CSV files to read; only their `lang` and `sentence` columns are used here.
files = ['letters', 'reports', 'hearings']
docs_fr = []
docs_en = []
for f in files:
    # Build the path from DATA_PATH so the data location is configured in one place.
    df = pd.read_csv(os.path.join(DATA_PATH, f'{f}.csv'))
    docs_fr.extend(df[df.lang == 'fr'].sentence.tolist())
    docs_en.extend(df[df.lang == 'en'].sentence.tolist())
print(len(docs_fr))
print(len(docs_en))

1537
168


In [302]:
# Read a local .env file into the process environment; the OpenAI key is looked up
# from the environment in the next cell.
load_dotenv()

True

In [309]:
# Fail early with an actionable message when the key is missing.
# Copying os.getenv(...) back into os.environ is a no-op when the key exists and
# raises an opaque TypeError when it does not (os.environ only accepts strings),
# so the check is made explicit and the key is never bound to a notebook variable.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [310]:
# Load the plain-text abstract as LangChain document(s).
# NOTE(review): `abstract` is not referenced by any later cell visible in this notebook.
loader = TextLoader("../data/RAG_resources/abstract.txt")
abstract = loader.load()

In [311]:
# Load the "green and grey" article from its .docx file; this is the text that gets
# chunked and indexed for retrieval below. Note that `loader` is rebound here.
loader = Docx2txtLoader("../data/RAG_resources/green_gray.docx")
gg_article = loader.load()

In [312]:
# Splitter that cuts documents into overlapping chunks
# (sizes are presumably counted in characters, the splitter's default - TODO confirm)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Chunk the green/grey article loaded from the .docx file
splits = text_splitter.split_documents(gg_article)

In [313]:
# Peek at the first two chunks to sanity-check the split
splits[:2]

[Document(page_content='Green and grey: New ideologies of nature in urban sustainability policy\n\nSecond revision\n\n\n\n\n\nDavid Wachsmuth* and Hillary Angelo†\n\n*School of Urban Planning, McGill University \n\n†Department of Sociology, University of California–Santa Cruz', metadata={'source': '../data/RAG_resources/green_gray.docx'}),
 Document(page_content='Abstract: In the past two decades, “urban sustainability” has become a new policy common sense. This paper argues that contemporary urban sustainability thought and practice is co-constituted by two distinct representational forms, which we call green urban nature and grey urban nature. Green urban nature is the return of nature to the city in its most verdant form, signified by street trees, urban gardens, and the greening of post-industrial landscapes. Grey urban nature is the concept of social, technological urban space as already inherently sustainable, signified by dense urban cores, high-speed public transit, and energy-

In [314]:
# Embed every article chunk with OpenAI embeddings and index the vectors in Chroma.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

In [315]:
# Wrap the vector store in a retriever so it can be piped into the chains below;
# each chain feeds it the sentence being annotated as the query.
retriever = vectorstore.as_retriever()

In [316]:
# Temperature 0 keeps the single-word labels as repeatable as the API allows;
# sampling at a higher temperature only adds run-to-run variation that an
# annotation task does not need.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

In [317]:
# Single-step prompt: label a sentence 'green', 'grey', 'both' or 'other'.
# A trailing backslash joins the next source line onto the current one, so each
# continued line ends with a space before the backslash to keep sentences apart.
# No whitespace may follow the backslash: "\ " is an invalid escape that leaves a
# literal backslash and a line break in the prompt.
template = """
You are an annotator of bilingual texts in French and English. \
Using the provided context, determine if the provided sentence is about sustainability. \
If it is about sustainability, further determine if it is about 'green' or 'grey' sustainability, \
as defined in the article. Label the sentence as 'green', 'grey', or 'both'. \
If the sentence isn't about sustainability, label it 'other'. \
Never respond with more than one word, which should be the correct label.\
\nSentence: {sentence} \nContext: {context} \nLabel:"""

# NOTE(review): `gg_prompt` is reassigned further down (from `gg_template`) before any
# chain consumes it, so this single-step prompt appears to be unused.
gg_prompt = PromptTemplate(
    template=template,
    input_variables=['sentence', 'context'],
)

In [318]:
def join_docs(docs:list):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by single spaces ("" for no documents).
    """
    contents = [doc.page_content for doc in docs]
    return " ".join(contents)

## RAG + CoT

In [319]:
# Step 1 prompt: is the sentence about sustainability ('yes' / 'no')?
# Continued lines end with a space before the backslash; without it the joined
# sentences run together in the prompt ("English.Using the provided context...").
sus_template = """
You are an annotator of bilingual texts in French and English. \
Using the provided context, determine if the provided sentence is about sustainability. \
Answer with the label 'yes' or 'no'. \
Never respond with more than one word, which should be the correct label.\
\n
<sentence>
{sentence}
</sentence>
\n
<context>
{context}
</context>

\nLabel:"""

In [320]:
# Prompt for the sustainability yes/no step; filled with the sentence and the retrieved context.
sus_prompt = PromptTemplate(
    template=sus_template,
    input_variables=['sentence', 'context'],
)

In [321]:
# Sustainability chain. The input sentence is used twice: as the retrieval query
# (retrieved chunks are joined into the context) and, passed through unchanged, as
# the sentence to label. The model reply is returned as a plain string.
sus_chain = (
    {"context": retriever | join_docs, "sentence": RunnablePassthrough()}
    | sus_prompt
    | llm
    | StrOutputParser()
)

In [322]:
# Step 2 prompt: for a sentence already judged to be about sustainability, choose 'green' or 'grey'.
# Continued lines end with a space before the backslash; without it the joined
# sentences run together in the prompt ("sustainability.Using the provided context...").
gg_template = """
You are an annotator of bilingual texts in French and English. \
This sentence is about sustainability. \
Using the provided context, determine if it is about 'green' or 'grey' sustainability. \
Answer with the label 'green' or 'grey'. \
Never respond with more than one word, which should be the correct label.\
\n
<sentence>
{sentence}
</sentence>
\n
<context>
{context}
</context>

\nLabel:"""

In [323]:
# Prompt for the green/grey step; this is the `gg_prompt` that `gg_chain` uses.
gg_prompt = PromptTemplate(
    template=gg_template,
    input_variables=['sentence', 'context'],
)

In [324]:
# Green/grey chain: same retrieval-then-prompt shape as the sustainability chain,
# but with the green/grey prompt. Takes a plain sentence string, returns the reply text.
gg_chain = (
    {"context": retriever | join_docs, "sentence": RunnablePassthrough()}
    | gg_prompt
    | llm
    | StrOutputParser()
)

In [325]:
# Independent prompt: is the sentence specifically about climate change ('yes' / 'no')?
# Continued lines end with a space before the backslash so the joined sentences
# stay separated in the prompt.
climate_template = """
You are an annotator of bilingual texts in French and English. \
Using the provided context, determine if the sentence is about 'climate change'. \
Make sure the sentence is about climate change in particular, not just the 'environment'. \
Answer with the label 'yes' or 'no'. \
Never respond with more than one word, which should be the correct label.\
\n
<sentence>
{sentence}
</sentence>
\n
<context>
{context}
</context>

\nLabel:"""

In [326]:
# Prompt for the climate-change yes/no step.
climate_prompt = PromptTemplate(
    template=climate_template,
    input_variables=['sentence', 'context'],
)

In [327]:
# Climate-change chain: retrieval-augmented like the others, with the climate prompt.
# Takes a plain sentence string, returns the reply text.
climate_chain = (
    {"context": retriever | join_docs, "sentence": RunnablePassthrough()}
    | climate_prompt
    | llm
    | StrOutputParser()
)

In [328]:
from typing import Tuple

In [329]:
def pipe(sentence:str, 
         sus_chain:RunnableSequence=sus_chain, 
         gg_chain:RunnableSequence=gg_chain,
         climate_chain:RunnableSequence=climate_chain) -> list:
    """Annotate one sentence with a sustainability label and a climate label.

    The sustainability and climate chains are always invoked; the green/grey
    chain is only invoked when the sustainability answer contains "yes".

    Args:
        sentence: The sentence to annotate.
        sus_chain: Chain answering whether the sentence is about sustainability.
        gg_chain: Chain choosing between 'green' and 'grey'.
        climate_chain: Chain answering whether the sentence is about climate change.

    Returns:
        A two-item list ``[first_label, climate_label]``, both lower-cased and
        stripped. ``first_label`` is the green/grey answer for sustainability
        sentences, otherwise the sustainability answer itself (e.g. 'no').
    """
    # Normalise each reply once instead of repeating .lower().strip() per branch.
    sus_label = sus_chain.invoke(sentence).lower().strip()
    climate_label = climate_chain.invoke(sentence).lower().strip()

    if "yes" in sus_label:
        first_label = gg_chain.invoke(sentence).lower().strip()
    else:
        first_label = sus_label
    return [first_label, climate_label]

In [330]:
# Hand-written probe sentence, named for the label it is meant to elicit; used in the `pipe(grey_q)` spot check below.
grey_q = "Denser buildings mean lower per capita energy use, which will lower carbon emissions and fight climate change"

In [331]:
# Hand-written probe sentence.
# NOTE(review): not used by any visible cell; the same text is defined again later as `query_green`.
green_q = "New solutions for public transit will make cities key players in securing a sustainable future for the planet."

In [332]:
# Run the two-step pipeline on every sentence under examples['green'];
# each printed pair is [sustainability or green/grey label, climate label].
for e in examples['green']:
    print(pipe(e))

['green', 'no']
['green', 'no']
['green', 'no']
['green', 'no']


In [333]:
# Run the two-step pipeline on every sentence under examples['grey'];
# each printed pair is [sustainability or green/grey label, climate label].
for e in examples['grey']:
    print(pipe(e))

['grey', 'yes']
['green', 'no']
['grey', 'no']


In [334]:
# Run the two-step pipeline on every sentence under examples['other'];
# a first label of 'no' means the sustainability step rejected the sentence.
for e in examples['other']:
    print(pipe(e))

['no', 'no']
['grey', 'no']
['no', 'no']


In [335]:
# Same check on the sentences under examples_fr['green'];
# each printed pair is [sustainability or green/grey label, climate label].
for e in examples_fr['green']:
    print(pipe(e))

['green', 'no']
['green', 'no']
['green', 'no']
['green', 'no']


In [336]:
# Same check on the sentences under examples_fr['grey'];
# each printed pair is [sustainability or green/grey label, climate label].
for e in examples_fr['grey']:
    print(pipe(e))

['grey', 'yes']
['green', 'no']
['grey', 'no']


In [337]:
# Same check on the sentences under examples_fr['other'];
# a first label of 'no' means the sustainability step rejected the sentence.
for e in examples_fr['other']:
    print(pipe(e))

['no', 'no']
['grey', 'no']
['grey', 'no']


In [338]:
# Spot check on the hand-written `grey_q` sentence
pipe(grey_q)

['grey', 'yes']

In [339]:
# Spot check with a sentence unrelated to sustainability or climate
pipe("The construction outside of my window is noisy.")

['no', 'no']

# All in one JSON

In [340]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [341]:
# All-in-one prompt: the three answers (sustainability, green/grey, climate change)
# are requested in a single JSON reply. This rebinds the earlier `template` name.
# Continued lines end with a space before the backslash so the joined sentences
# stay separated in the prompt.
template = """
You are an annotator of bilingual texts in French and English. \
Using the provided context, determine if the provided sentence is about sustainability. \
Answer with the label 'yes' or 'no'. \
If the answer is 'yes', further use the provided context to determine if it's 'grey' or 'green' sustainability. \
For this answer, use the label 'green' or 'grey'. \
Finally, independently from the context and from the previous answers, determine if the sentence is about 'climate change'. \
For this answer, again use the labels 'yes' or 'no'. \
Your answer should always consist of a JSON object.
\nSentence: {sentence} \nContext: {context}"""

In [342]:
# Schema passed to JsonOutputParser; its field descriptions feed the format
# instructions that are injected into the prompt.
# NOTE(review): documented with `#` comments on purpose - a class docstring would
# presumably be emitted as the schema description and so end up in the prompt;
# confirm before adding one.
class Sentence(BaseModel):
    # 'yes' / 'no' answer requested by the prompt template
    sus: str = Field(description="Is the sentence about sustainability?")
    # 'green' / 'grey'; the recorded run returns '' here when `sus` is 'no'
    gg: str = Field(description="If the sentence is about sustainability, is it 'grey' or 'green'?")
    # 'yes' / 'no' answer requested by the prompt template
    climate: str = Field(description="Is the sentence about climate change?")

In [343]:
# Build the parser first: its format instructions are injected into the prompt below.
parser = JsonOutputParser(pydantic_object=Sentence)

prompt = PromptTemplate(
    template=template + ".\n{format_instructions}\n",
    input_variables=["sentence", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Retrieval-augmented chain whose last step parses the model reply as JSON.
json_chain = (
    {"context": retriever | join_docs, "sentence": RunnablePassthrough()}
    | prompt
    | llm
    | parser
)

# Smoke test with an off-topic sentence.
json_chain.invoke("My tummy hurts")

{'sus': 'no', 'gg': '', 'climate': 'no'}

In [344]:
def route(sus):
    """Choose the next step from the sustainability verdict.

    Args:
        sus: Mapping with a "label" entry (the sustainability answer) and a
            "sentence" entry (the sentence being annotated).

    Returns:
        When the label contains "yes": a runnable that extracts the sentence
        from the mapping and feeds it to ``gg_chain``. Otherwise: the
        lower-cased, stripped label itself, matching what ``pipe`` returns for
        sentences that are not about sustainability.
    """
    from operator import itemgetter

    label = sus["label"].lower().strip()
    if "yes" in label:
        # gg_chain is built to receive the bare sentence string, not the whole mapping.
        return itemgetter("sentence") | gg_chain
    return label

In [345]:
from langchain_core.runnables import RunnableLambda

# Takes a plain sentence string, like the other chains in this notebook:
# sus_chain produces the label while RunnablePassthrough forwards the sentence
# unchanged, so `route` receives {"label": ..., "sentence": ...}.
# Indexing the input with ["sentence"] raised on the string input that sus_chain
# is invoked with everywhere else.
full_chain = {"label": sus_chain, "sentence": RunnablePassthrough()} | RunnableLambda(
    route
)

In [346]:
#full_chain.invoke("Subways will make cities more sustainable")

In [347]:
# Probe sentence for the sustainability step (same text as `green_q` defined earlier).
query_green = "New solutions for public transit will make cities key players in securing a sustainable future for the planet."

In [348]:
# French version of `query_green`, used to check the chain on French input.
query_green_fr = "Les nouvelles solutions de transport en commun feront des villes des acteurs clés pour assurer un avenir durable à la planète."

In [349]:
# Sustainability step only, on the English probe sentence
answer = sus_chain.invoke(query_green)
answer

'yes'

In [350]:
# Sustainability step only, on the French probe sentence
answer = sus_chain.invoke(query_green_fr)
answer

'yes'

In [351]:
# Probe sentence for the sustainability step (same text as `grey_q` defined earlier).
query_gray = "Denser buildings mean lower per capita energy use, which will lower carbon emissions and fight climate change"

In [352]:
#query_gray = "This sentence is about grey sustainability."

In [353]:
# Sustainability step only, on the `query_gray` probe sentence
answer = sus_chain.invoke(query_gray)
answer

'yes'

In [354]:
# Off-topic probe sentence; the sustainability step is expected to reject it.
query_not = "The construction here is far too noisy."

In [355]:
# Sustainability step only, on the off-topic sentence.
# The raw reply is shown without normalisation, so its casing may vary ('No' vs 'no').
answer = sus_chain.invoke(query_not)
answer

'No'