In [1]:
import json
import numpy as np
import os
import pandas as pd
import random

In [2]:
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import Docx2txtLoader, TextLoader
from langchain_core.runnables.base import RunnableSequence

In [3]:
from tqdm import tqdm
from typing import Tuple

In [4]:
# Seed NumPy's global RNG. This must be a call: assigning to `np.random.seed`
# would replace the seeding function with an int and leave the RNG unseeded.
np.random.seed(42)

In [5]:
# Labelled example sentences (indexed later by 'green', 'grey' and 'other').
# The context manager closes the file handle, and UTF-8 is pinned so decoding
# does not depend on the platform's default encoding.
with open('../data/RAG_resources/examples.json', 'r', encoding='utf-8') as examples_file:
    examples = json.load(examples_file)

In [6]:
# French example sentences (indexed later by 'green', 'grey' and 'other').
# The context manager closes the file handle, and UTF-8 is pinned so accented
# characters decode the same way on every platform.
with open('../data/RAG_resources/examples_fr.json', 'r', encoding='utf-8') as examples_fr_file:
    examples_fr = json.load(examples_fr_file)

In [7]:
DATA_PATH = '../data/'

# Base names of the CSV files read from DATA_PATH.
files = ['letters', 'reports', 'hearings']
docs_fr = []
docs_en = []
for f in files:
    # `df` stays bound after the loop (the next cell displays it); at that point
    # it holds only the last file read, 'hearings'.
    df = pd.read_csv(f'{DATA_PATH}{f}.csv')
    # Split the sentences of each file by their `lang` tag.
    docs_fr.extend(df[df.lang == 'fr'].sentence.tolist())
    docs_en.extend(df[df.lang == 'en'].sentence.tolist())
print(len(docs_fr))
print(len(docs_en))

1537
168


In [8]:
# `df` is the variable left over from the loading loop, i.e. the last CSV read
# there ('hearings') — not the concatenation of all three files.
# NOTE(review): some rows tagged lang == 'en' in the output below are French
# sentences, so the `lang` column looks unreliable — confirm upstream.
df[df.lang == 'en']

Unnamed: 0.1,Unnamed: 0,speaker,sentence,lang
265,265,M. THOMAS BOUSHEL,Puis mon français est loin d’être parfait.,en
266,266,M. THOMAS BOUSHEL,"Aujourd’hui, monsieur Coderre a dit theFrancis...",en
267,267,M. THOMAS BOUSHEL,"Well, obviously what we saw tonight here, from...",en
268,268,M. THOMAS BOUSHEL,And my question to you is that over the past d...,en
269,269,M. THOMAS BOUSHEL,Peter McGill has no parks per se.,en
...,...,...,...,...
868,868,M. ROBERT HAJALY,"If I understoodcorrectly, the 1.8 million doll...",en
869,869,M. ROBERT HAJALY,"Now, I want toemphasize that outside of Peter ...",en
870,870,M. ROBERT HAJALY,InPeter McGill there is virtually none despite...,en
871,871,M. ROBERT HAJALY,I want to know if the City is prepared to comm...,en


In [9]:
load_dotenv()

True

In [10]:
# load_dotenv() above has already placed the .env entries in os.environ, so
# re-assigning the key is unnecessary. Fail early with a readable message when it
# is absent (os.environ[...] = None would raise an opaque TypeError instead).
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [11]:
# Load abstract.txt through TextLoader.
# NOTE(review): `abstract` is not referenced again in the cells visible here —
# confirm whether it is still needed.
loader = TextLoader("../data/RAG_resources/abstract.txt")
abstract = loader.load()

In [12]:
# Load the article (.docx) that serves as the retrieval corpus: it is split into
# chunks and embedded in the cells that follow. `loader` is rebound here.
loader = Docx2txtLoader("../data/RAG_resources/green_gray.docx")
gg_article = loader.load()

In [13]:
# Splitter that cuts the loaded article into chunks of at most 1000 characters,
# with 200 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Split the article documents (gg_article, loaded from the .docx) into chunks
splits = text_splitter.split_documents(gg_article)

In [14]:
splits[:2]

[Document(page_content='Green and grey: New ideologies of nature in urban sustainability policy\n\nSecond revision\n\n\n\n\n\nDavid Wachsmuth* and Hillary Angelo†\n\n*School of Urban Planning, McGill University \n\n†Department of Sociology, University of California–Santa Cruz', metadata={'source': '../data/RAG_resources/green_gray.docx'}),
 Document(page_content='Abstract: In the past two decades, “urban sustainability” has become a new policy common sense. This paper argues that contemporary urban sustainability thought and practice is co-constituted by two distinct representational forms, which we call green urban nature and grey urban nature. Green urban nature is the return of nature to the city in its most verdant form, signified by street trees, urban gardens, and the greening of post-industrial landscapes. Grey urban nature is the concept of social, technological urban space as already inherently sustainable, signified by dense urban cores, high-speed public transit, and energy-

In [15]:
# Initialize Chroma vectorstore with documents as splits and using OpenAIEmbeddings
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [16]:
# Setup vectorstore as retriever
retriever = vectorstore.as_retriever()

In [17]:
# Chat model shared by the chains built below. logprobs=True is bound so each
# response carries log-probabilities in its response metadata, which
# get_ai_message_prob reads.
# NOTE(review): temperature=0.7 means the label is sampled, so repeated runs may
# label the same sentence differently — confirm this is intended for annotation.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.7).bind(logprobs=True)

In [18]:
def join_docs(docs:list):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string, such as
            the documents produced by the retriever.

    Returns:
        The ``page_content`` values, in input order, joined by single spaces
        (an empty string when ``docs`` is empty).
    """
    contents = [document.page_content for document in docs]
    return " ".join(contents)

## RAG + CoT

In [19]:
# Prompt template for the sustainability yes/no step. UTF-8 is pinned so the
# template decodes the same way regardless of the platform's default encoding.
with open(f'{DATA_PATH}RAG_resources/sus_template.txt', 'r', encoding='utf-8') as f:
    sus_template = f.read()

In [20]:
# Prompt template for the green/grey step. UTF-8 is pinned so the template
# decodes the same way regardless of the platform's default encoding.
with open(f'{DATA_PATH}RAG_resources/gg_template.txt', 'r', encoding='utf-8') as f:
    gg_template = f.read()

In [21]:
# Prompt template for the climate-change yes/no step. UTF-8 is pinned so the
# template decodes the same way regardless of the platform's default encoding.
with open(f'{DATA_PATH}RAG_resources/climate_template.txt', 'r', encoding='utf-8') as f:
    climate_template = f.read()

In [22]:
sus_template

"You are an annotator of bilingual texts in French and English.\nUsing the provided context, determine if the provided sentence is about sustainability.\nAnswer with the label 'yes' or 'no'.\nNever respond with more than one word, which should be the correct label.\n\n<sentence>\n{sentence} \n</sentence>\n\n<context>\n{context}\n</context>\n\nLabel:"

In [23]:
# NOTE(review): the text shown below ends with a stray `"""` after "Label:". It
# looks like a leftover from a Python string literal and is part of the prompt
# as loaded — consider removing it from gg_template.txt.
gg_template

'You are an annotator of bilingual texts in French and English.\nThis sentence is about sustainability.\nUsing the provided context, determine if it is about \'green\' or \'grey\' sustainability.\nAnswer with the label \'green\' or \'grey\'.\nNever respond with more than one word, which should be the correct label.\n\n<sentence>\n{sentence} \n</sentence>\n\n<context>\n{context}\n</context>\n\nLabel:"""'

In [24]:
# NOTE(review): the text shown below contains the typo 'climage change'
# (presumably 'climate change') and ends with a stray `"""` after "Label:".
# Both are part of the prompt as loaded — consider fixing climate_template.txt.
climate_template

'You are an annotator of bilingual texts in French and English.\nUsing the provided context, determine if the sentence is about \'climage change\'.\nMake sure the sentence is about climate change in particular, not just the \'environment\'.\nAnswer with the label \'yes\' or \'no\'.\nNever respond with more than one word, which should be the correct label.\n\n<sentence>\n{sentence} \n</sentence>\n\nLabel:"""'

In [25]:
def return_RAG_chain(template, retriever, llm):
    """Assemble a retrieval-augmented chain around one prompt template.

    The chain's input is piped through ``retriever`` and ``join_docs`` to fill
    ``{context}``, and forwarded unchanged to fill ``{sentence}``; the rendered
    prompt is then piped into ``llm``.

    Args:
        template: Prompt text with ``{sentence}`` and ``{context}`` placeholders.
        retriever: Runnable that returns documents for the input.
        llm: Runnable that receives the rendered prompt.

    Returns:
        The composed chain ``inputs | prompt | llm``.
    """
    context_branch = retriever | join_docs
    inputs = {"context": context_branch, "sentence": RunnablePassthrough()}
    prompt = PromptTemplate(template=template, input_variables=['sentence', 'context'])
    return inputs | prompt | llm

In [26]:
sus_chain = return_RAG_chain(sus_template, retriever, llm)

In [27]:
gg_chain = return_RAG_chain(gg_template, retriever, llm)

In [28]:
def return_chain(template, llm):
    """Build a context-free chain: prompt rendered from ``template``, then ``llm``.

    Args:
        template: Prompt text with a ``{sentence}`` placeholder. This argument is
            what the prompt is built from; the module-level ``climate_template``
            is no longer read here, so any template can be passed.
        llm: Runnable that receives the rendered prompt.

    Returns:
        The composed chain ``prompt | llm``.
    """
    prompt = PromptTemplate(input_variables=['sentence'], template=template)
    return prompt | llm

In [29]:
climate_chain = return_chain(climate_template, llm)

In [30]:
def get_ai_message_prob(message):
    """Convert the first log-probability of a response into a probability.

    Reads ``message.response_metadata["logprobs"]["content"]``, takes the
    ``'logprob'`` of its first entry (presumably the first generated token —
    later entries are ignored) and returns ``exp`` of that value.
    """
    token_entries = message.response_metadata["logprobs"]["content"]
    first_logprob = token_entries[0]['logprob']
    return np.exp(first_logprob)

def get_ai_message_str(message):
    """Return ``message.content`` lower-cased with surrounding whitespace removed."""
    lowered = message.content.lower()
    return lowered.strip()

In [31]:
def pipe(sentence:str,
         sus_chain:RunnableSequence=sus_chain,
         gg_chain:RunnableSequence=gg_chain,
         climate_chain:RunnableSequence=climate_chain) -> list:
    """Label one sentence for sustainability type and for climate change.

    ``sus_chain`` and ``climate_chain`` are always invoked. ``gg_chain`` is
    invoked only when the sustainability answer contains ``"yes"``.

    Args:
        sentence: Text to classify.
        sus_chain: Chain answering whether the sentence is about sustainability.
        gg_chain: Chain answering 'green' or 'grey'.
        climate_chain: Chain answering whether the sentence is about climate change.

    Returns:
        A two-element list ``[(label, prob), (climate_label, climate_prob)]``.
        ``label`` is the ``gg_chain`` answer when the sustainability answer
        contains "yes"; otherwise it is the sustainability answer itself.
        Labels are lower-cased and stripped; each probability comes from
        ``get_ai_message_prob`` on the corresponding response.
    """
    def _label_and_prob(message):
        # One place for the (normalised label, probability) extraction.
        return get_ai_message_str(message), get_ai_message_prob(message)

    sus_result = _label_and_prob(sus_chain.invoke(sentence))
    climate_result = _label_and_prob(climate_chain.invoke(sentence))

    # Substring match on the normalised label, so "yes." still counts as yes.
    if "yes" in sus_result[0]:
        return [_label_and_prob(gg_chain.invoke(sentence)), climate_result]
    return [sus_result, climate_result]

In [34]:
grey_q = "Denser buildings mean lower per capita energy use, which will lower carbon emissions and fight climate change"

In [35]:
green_q = "New solutions for public transit will make cities key players in securing a sustainable future for the planet."

In [36]:
for e in examples['green']:
    print(pipe(e))

[('green', 0.999288060049595), ('no', 0.991326232567116)]
[('green', 0.9980710846529887), ('no', 0.7308887423963716)]
[('green', 0.9913830589981061), ('no', 0.9044383913090571)]
[('green', 0.9932990114294211), ('no', 0.9858102589889465)]


In [37]:
for e in examples['grey']:
    print(pipe(e))

[('grey', 0.999694378111887), ('yes', 0.5608936295963909)]
[('green', 0.8518158023083355), ('no', 0.32074620730968334)]
[('grey', 0.9998026776706121), ('no', 0.6790459097085093)]


In [38]:
for e in examples['other']:
    print(pipe(e))

[('no', 0.9606779498501362), ('no', 0.924055143507971)]
[('grey', 0.998954042595181), ('no', 0.8804934676351257)]
[('no', 0.941609085576197), ('no', 0.6223652873038552)]


In [39]:
for e in examples_fr['green']:
    print(pipe(e))

[('green', 0.9914080059795866), ('no', 0.8173274899065028)]
[('green', 0.9975163743144668), ('no', 0.6759537651680623)]
[('green', 0.9990827697430746), ('no', 0.5616402829181874)]
[('green', 0.9819689443754485), ('no', 0.32061668407552896)]


In [40]:
for e in examples_fr['grey']:
    print(pipe(e))

[('grey', 0.9703511894790794), ('yes', 0.5619944806748091)]
[('green', 0.8798819044535885), ('no', 0.5592428576468129)]
[('grey', 0.9975884933651923), ('no', 0.9817018763089634)]


In [41]:
for e in examples_fr['other']:
    print(pipe(e))

[('no', 0.9817446195574484), ('no', 0.561859533904657)]
[('grey', 0.9984576411598366), ('no', 0.32074729784864214)]
[('grey', 0.9322985760801751), ('no', 0.7307427765652157)]


In [42]:
pipe(grey_q)

[('grey', 0.9980241896880838), ('yes', 0.6224430131340336)]

In [43]:
pipe("The construction outside of my window is noisy.")

[('no', 0.8175065532785368), ('no', 0.9913827045787258)]

# All in one JSON

In [340]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

In [341]:
# Single prompt asking for all three labels at once. Each instruction sits on its
# own line: trailing-backslash continuations would remove the newline and glue
# consecutive sentences together with no separating space.
template = """
You are an annotator of bilingual texts in French and English.
Using the provided context, determine if the provided sentence is about sustainability.
Answer with the label 'yes' or 'no'.
If the answer is 'yes', further use the provided context to determine if it's 'grey' or 'green' sustainability.
For this answer, use the label 'green' or 'grey'.
Finally, independently from the context and from the previous answers, determine if the sentence is about 'climate change'.
For this answer, again use the labels 'yes' or 'no'.
Your answer should always consist of a JSON object.

Sentence: {sentence}
Context: {context}"""

In [342]:
# Schema of the JSON object the combined prompt asks the model to return; it is
# passed to JsonOutputParser, which generates the format instructions from it.
# Documented with `#` comments on purpose rather than a class docstring.
class Sentence(BaseModel):
    # Sustainability answer.
    sus: str = Field(description="Is the sentence about sustainability?")
    # Green/grey answer; the sample output shows '' when `sus` is 'no'.
    gg: str = Field(description="If the sentence is about sustainability, is it 'grey' or 'green'?")
    # Climate-change answer.
    climate: str = Field(description="Is the sentence about climate change?")

In [343]:
# Set up a JSON parser driven by the Sentence schema and inject its format
# instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=Sentence)

# The format instructions are appended after the template text (preceded by a
# "." and a newline) and pre-filled as a partial variable, so only `sentence`
# and `context` remain to be supplied at invoke time.
prompt = PromptTemplate(
    template=template+".\n{format_instructions}\n",
    input_variables=["sentence","context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Retrieval variant of `prompt | llm | parser`: the input fills {sentence} directly
# and, via retriever + join_docs, {context}; the parser turns the reply into a dict.
json_chain = ({"context": retriever | join_docs, "sentence": RunnablePassthrough()}
             | prompt 
             | llm 
             | parser)

# Smoke test with an unrelated sentence.
json_chain.invoke("My tummy hurts")

{'sus': 'no', 'gg': '', 'climate': 'no'}

In [344]:
def route(sus):
    """Pick the follow-up chain from the sustainability answer.

    Args:
        sus: Mapping whose ``"label"`` entry is the sustainability answer text.

    Returns:
        ``gg_chain`` in every case; "this works" is printed when the label
        contains "yes" (case-insensitive).
    """
    answered_yes = "yes" in sus["label"].lower()
    if answered_yes:
        print("this works")
    # NOTE(review): both outcomes route to gg_chain, so the check above only
    # controls the debug print — confirm what a non-"yes" answer should return.
    return gg_chain

In [345]:
from langchain_core.runnables import RunnableLambda

# Routing experiment: compute the sustainability label, then let `route` choose
# the follow-up chain.
# NOTE(review): the "sentence" entry indexes the input with ["sentence"], so this
# chain expects a dict input, while sus_chain is invoked elsewhere in this
# notebook with a plain string — confirm which input shape is intended.
full_chain = {"label": sus_chain, "sentence": lambda x: x["sentence"]} | RunnableLambda(
    route
)

In [346]:
#full_chain.invoke("Subways will make cities more sustainable")

In [347]:
query_green = "New solutions for public transit will make cities key players in securing a sustainable future for the planet."

In [348]:
query_green_fr = "Les nouvelles solutions de transport en commun feront des villes des acteurs clés pour assurer un avenir durable à la planète."

In [349]:
answer = sus_chain.invoke(query_green)
answer

'yes'

In [350]:
answer = sus_chain.invoke(query_green_fr)
answer

'yes'

In [351]:
query_gray = "Denser buildings mean lower per capita energy use, which will lower carbon emissions and fight climate change"

In [352]:
#query_gray = "This sentence is about grey sustainability."

In [353]:
answer = sus_chain.invoke(query_gray)
answer

'yes'

In [354]:
query_not = "The construction here is far too noisy."

In [355]:
answer = sus_chain.invoke(query_not)
answer

'No'

# Sample Sentences

In [44]:
# NOTE: this rebinds DATA_PATH (previously '../data/'); cells above that build
# paths from DATA_PATH must not be re-run after this one without resetting it.
DATA_PATH = '../output/sample_output'

# Read every regular file in the directory, in file-name order. os.scandir
# yields entries in arbitrary order, so sorting keeps the row order of the
# concatenated frame (and anything sampled from it) stable across runs.
sample_entries = sorted(
    (entry for entry in os.scandir(DATA_PATH) if entry.is_file()),
    key=lambda entry: entry.name,
)
dfs = [pd.read_csv(entry.path) for entry in sample_entries]

In [45]:
all_sents = pd.concat(dfs, axis=0, ignore_index=True).drop(columns=['Unnamed: 0'])

In [46]:
# Keep only sentences with at least MIN_SENTENCE_CHARS characters.
MIN_SENTENCE_CHARS = 300
mask = all_sents['sentence'].str.len() >= MIN_SENTENCE_CHARS
# Shuffle the long sentences. random_state pins the shuffle so the leading
# subset taken from `long_sents` later is the same on every run.
long_sents = all_sents[mask].sample(frac=1, random_state=42).reset_index(drop=True)


In [56]:
# Placeholder (NaN) columns for the labels and probabilities, added in this order.
for placeholder in ("gg", "climate", "gg_prob", "climate_prob"):
    long_sents[placeholder] = np.nan

In [73]:
long_sents.shape

(769, 8)

In [57]:
long_sents_100 = long_sents[:100].copy().reset_index(drop=True)

In [58]:
long_sents_100.head(5)

Unnamed: 0,file,speaker,sentence,lang,gg,climate,gg_prob,climate_prob
0,citizen_sub_8a6,LETTER,"est apparent dans le sud-ouest, la proximité d...",fr,,,,
1,citizen_sub_8b3,LETTER,"Les logements familiaux, comme les logements b...",fr,,,,
2,citizen_sub_8a28,LETTER,Le corridor Dalhousie qui vise à faciliter la ...,fr,,,,
3,citizen_sub_8a3,LETTER,The democratic Process I believe in democracy ...,en,,,,
4,report_report,LETTER,"Lors de cette même audience, plusieurs interve...",fr,,,,


In [59]:
# Run the labelling pipeline on each sampled sentence. Results are appended one
# by one, so whatever was collected stays in `llm_outputs` if a call fails midway.
llm_outputs = []

for sentence in tqdm(long_sents_100.sentence):
    llm_outputs.append(pipe(sentence))

100%|████████████████████████████████████████████████████████████████████████████| 100/100 [02:27<00:00,  1.48s/it]


In [67]:
# Each pipe() result is [(label, prob), (climate_label, climate_prob)]; split the
# two pairs apart, then spread labels and probabilities over their columns.
label_pairs = [output[0] for output in llm_outputs]
climate_pairs = [output[1] for output in llm_outputs]

long_sents_100["gg"] = [label for label, _ in label_pairs]
long_sents_100["climate"] = [label for label, _ in climate_pairs]

long_sents_100["gg_prob"] = [prob for _, prob in label_pairs]
long_sents_100["climate_prob"] = [prob for _, prob in climate_pairs]

In [68]:
long_sents_100.to_csv("../output/test100.csv", encoding='utf-8')

In [69]:
# https://stackoverflow.com/questions/41004941/python-replace-french-letters-with-english

# Accented letter -> unaccented counterpart, position by position. The uppercase
# forms are derived from the same two strings so both cases stay in sync.
_accented = "éàèùâêîôûçëïüÿ"
_unaccented = "eaeuaeiouceiuy"
translation_table = str.maketrans(
    _accented + _accented.upper(),
    _unaccented + _unaccented.upper(),
)
# Ligatures expand to two letters, so they need the dict form of maketrans.
translation_table.update(str.maketrans({"œ": "oe", "Œ": "OE", "æ": "ae", "Æ": "AE"}))

test = "Héllô Càèùverâêt Jîôûç"
test = test.translate(translation_table)
print(test)

Hello Caeuveraet Jiouc


In [70]:
long_sents_100['sentence_no_fr'] = [s.translate(translation_table) for s in long_sents_100.sentence]

In [71]:
long_sents_100.to_csv("../output/test100.csv", encoding='utf-8')

In [95]:
#long_sents_100['climate'] = ['climate' if c=='yes' else 'other' for c in long_sents_100['climate']]
long_sents_100['gg'] = ['other' if gg=='no' else gg for gg in long_sents_100['gg']]

In [85]:
from typing import Dict

In [110]:
def process_item(row:Dict):
    """Convert one labelled row into a JSON-serialisable record.

    Args:
        row: Mapping-like row providing 'sentence', 'gg', 'gg_prob', 'climate',
            'climate_prob', 'file', 'speaker' and 'lang'.

    Returns:
        ``{'text': <sentence>, 'meta': {...}}`` where ``meta`` copies the label,
        probability and provenance fields and adds ``'accept'``: the upper-cased
        'gg' and 'climate' labels, in that order.
    """
    text = row['sentence']
    # NOTE(review): 'accept' is nested under 'meta'; if the consumer of this
    # JSONL expects a top-level 'accept' key, confirm the placement.
    meta = {
        'gg': row['gg'],
        'gg_prob': row['gg_prob'],
        'climate': row['climate'],
        'climate_prob': row['climate_prob'],
        'file': row['file'],
        'speaker': row['speaker'],
        'lang': row['lang'],
        'accept': [row['gg'].upper(), row['climate'].upper()],
    }
    return {'text': text, 'meta': meta}

In [111]:
# One record per labelled row, in frame order.
to_jsonl = [process_item(row) for _, row in long_sents_100.iterrows()]

In [112]:
import json
     
# Write the records as JSON Lines: one JSON document per line.
with open("../output/test100.jsonl", 'w') as f:
    f.writelines(json.dumps(item) + "\n" for item in to_jsonl)