In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain.vectorstores.utils import filter_complex_metadata

In [2]:
from langchain.prompts.chat import (
  ChatPromptTemplate,
  SystemMessagePromptTemplate,
  AIMessagePromptTemplate,
  HumanMessagePromptTemplate,
)

In [3]:
import pandas as pd

In [4]:
class MyChat:

    def __init__(self,model:str,template:str):
        self.vector_store = None
        self.retriever = None
        self.chain = None
        self.model = ChatOllama(model=model)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
        '''self.prompt = PromptTemplate.from_template(
            template
        )'''

    def ingest(self, pdf_file_path: str):
        docs = PyPDFLoader(file_path=pdf_file_path).load()
        chunks = self.text_splitter.split_documents(docs)
        return filter_complex_metadata(chunks)

    def make_vector_store(self, pdf_file_path:str):
        chunks = self.ingest(pdf_file_path)
        embedding_fn = FastEmbedEmbeddings()

        self.vector_store = Chroma.from_documents(documents=chunks, embedding=embedding_fn)

    def make_chain(self,template):
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 3,
                "score_threshold": 0.5,
            },
        )

        sys_message_prompt= SystemMessagePromptTemplate.from_template(template)
        example_human_history = HumanMessagePromptTemplate.from_template("Olá!")
        example_ai_history = AIMessagePromptTemplate.from_template("Oi, como você está hoje?")

        human_template="{input}"
        human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

        chat_prompt = ChatPromptTemplate.from_messages([sys_message_prompt, example_human_history, example_ai_history, human_message_prompt])

        self.chain = chat_prompt | self.model

        '''self.chain = ({"context": self.retriever, "emendas","input": RunnablePassthrough()}
                      | self.prompt
                      | self.model
                      | StrOutputParser())'''

    def ask(self, query: str,emendas:str):
        return self.chain.invoke({"context": self.retriever,"emendas":emendas,"input":query})

    def clear(self):
        self.vector_store = None
        self.retriever = None
        self.chain = None

In [5]:
template="""Tarefa: Agrupamento Semântico de Emendas Parlamentares
Você foi designado para realizar um agrupamento semântico das emendas parlamentares. Cada emenda é identificada por um texto descritivo.
Todas as emendas são referentes ao mesmo projeto de lei, PL.

Texto do PL:
<context>
{context}
</context>

Detalhes da Tarefa:
Você receberá um conjunto de emendas parlamentares, cada uma representada por um texto descritivo. As emendas podem abordar uma variedade de tópicos.
Seu modelo deve atribuir cada emenda a um grupo semântico com base em seus tópicos principais.
Certifique-se de que cada emenda seja atribuída a um único grupo e que todas as emendas sejam atribuídas a um grupo.

Exemplo de Emenda:
AQUI SE INICIA A EMENDA ID 000001 TEXTO

Texto das Emendas:
{emendas}

Question: {input}"""

In [13]:
query=f"""Realize o agrupamento semântico das emendas parlamentares fornecidas, atribuindo cada emenda a um grupo com base em seus tópicos principais."""

In [7]:
emendas_df = pd.read_csv("emendas-786-2020.csv")

In [8]:
inteiro_teor_todas_emendas = mensagem = "".join([f"AQUI SE INICIA A EMENDA ID {numero_emenda}:\n{text_proposto_emenda}\n" for numero_emenda, text_proposto_emenda in zip(emendas_df['NUMEROEMENDA'], emendas_df['TEXTOPROPOSTOEMENDA'])])
inteiro_teor_todas_emendas

'AQUI SE INICIA A EMENDA ID 30870016:\nSuprima-se o texto atual.\nAQUI SE INICIA A EMENDA ID 32280017:\nSuprima-se o texto atual.\nAQUI SE INICIA A EMENDA ID 36110022:\nSuprima-se o texto atual.\nAQUI SE INICIA A EMENDA ID 39160035:\nSuprima-se o texto atual.\nAQUI SE INICIA A EMENDA ID 39540014:\nSuprima-se o texto atual.\nAQUI SE INICIA A EMENDA ID 39840011:\nSuprima-se o texto atual.\nAQUI SE INICIA A EMENDA ID 40700034:\nSuprima-se o texto atual.\nAQUI SE INICIA A EMENDA ID 41300028:\nSuprima-se o texto atual.\n'

In [9]:
emendas = inteiro_teor_todas_emendas

In [10]:
estimativa_tokens = len(inteiro_teor_todas_emendas.split())
estimativa_tokens

88

### PL-786-2020

In [11]:
chat = MyChat("mistral",template)

chat.make_vector_store("PL-786-2020.pdf")

chat.make_chain(template)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 19498.31it/s]


In [14]:
response = chat.ask(query,emendas)
print(response)

AIMessage(content=' Based on the text descriptions provided, it seems that all these emendas share a common theme of proposing to "suprima-se o texto atual." Therefore, I would suggest creating a single semantic group for all these emendas as they all focus on removing or deleting certain existing texts.\n\nSo, the resulting groups could be something like:\n1. Group 1 (Main Topic: Removal of Texts)\n   - Emenda ID 30870016\n   - Emenda ID 32280017\n   - ...\n   - Emenda ID 41300028\n\nThis would result in a single semantic group for all the given emendas as they all propose to modify the text by removing or deleting existing texts.', response_metadata={'model': 'mistral', 'created_at': '2024-05-02T11:35:41.498986225Z', 'message': {'role': 'assistant', 'content': ''}, 'done': True, 'total_duration': 345534563099, 'load_duration': 156474619843, 'prompt_eval_count': 749, 'prompt_eval_duration': 129099999000, 'eval_count': 178, 'eval_duration': 59078086000}, id='run-7473c2ff-9f55-4fd0-8421

AIMessage(content='

Based on the text descriptions provided, it seems that all these emendas share a common theme of proposing to "suprima-se o texto atual." Therefore, I would suggest creating a single semantic group for all these emendas as they all focus on removing or deleting certain existing texts.\n\nSo, the resulting groups could be something like:
Group 1 (Main Topic: Removal of Texts)
   - Emenda ID 30870016   
   - Emenda ID 32280017   
   - ...   
   - Emenda ID 41300028
   
   This would result in a single semantic group for all the given emendas as they all propose to modify the text by removing or deleting existing texts.'
   
   , response_metadata={'model': 'mistral', 'created_at': '2024-05-02T11:35:41.498986225Z', 'message': {'role': 'assistant', 'content': ''}, 'done': True, 'total_duration': 345534563099, 'load_duration': 156474619843, 'prompt_eval_count': 749, 'prompt_eval_duration': 129099999000, 'eval_count': 178, 'eval_duration': 59078086000}, id='run-7473c2ff-9f55-4fd0-8421-b5875a6900ec-0')

### PL-20-2020

In [25]:
emendas_df = pd.read_csv("emendas-20-2020.csv")

In [26]:
inteiro_teor_todas_emendas = mensagem = "".join([f"AQUI SE INICIA A EMENDA ID {numero_emenda}:\n{text_proposto_emenda}\n" for numero_emenda, text_proposto_emenda in zip(emendas_df['NUMEROEMENDA'], emendas_df['TEXTOPROPOSTOEMENDA'])])
emendas = inteiro_teor_todas_emendas

In [27]:
estimativa_tokens = len(inteiro_teor_todas_emendas.split())
estimativa_tokens

4160

In [28]:
chat = MyChat("mistral",template)

chat.make_vector_store("PL-20-2020.pdf")

chat.make_chain(template)

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 7446.98it/s]


In [29]:
response = chat.ask(query,emendas)
print(response)

content=' Group 1: Priorities and Metas of Federal Public Administration in 2020\n\nEmenda ID 50359996: No change (Establishes the general provisions for the 2020 Federal Budget Law)\n\nEmenda ID 50360001, 50360004, 50360009, 71250006, and 71250012: Include specific areas of focus for the federal public administration in 2020, such as education (PNE), equality and violence against women, social housing programs, and fulfilling the goals outlined in the Plano Plurianual 2020-2023.\n\nEmenda ID 50360003: Modifies the primary goal of the federal budget to include the costs related to investments and payments from My House, My Life program and Minha Vida Minha Casa social housing program.\n\nEmenda ID 60000003: Includes education goals in the priorities for the federal public administration in 2012.\n\nGroup 2: Budget Amounts and Reductions\n\nEmenda ID 50359998, 50360007, 60000030, and 71080009: No change (Maintain the current budget amounts or priorities)\n\nEmenda ID 60000031: Modifies 

content=' 

Group 1: Priorities and Metas of Federal Public Administration in 2020

    - Emenda ID 50359996: No change (Establishes the general provisions for the 2020 Federal Budget Law)
    - Emenda ID 50360001, 50360004, 50360009, 71250006, and 71250012: Include specific areas of focus for the federal public administration in 2020, such as education (PNE), equality and violence against women, social housing programs, and fulfilling the goals outlined in the Plano Plurianual 2020-2023.
    - Emenda ID 50360003: Modifies the primary goal of the federal budget to include the costs related to investments and payments from My House, My Life program and Minha Vida Minha Casa social housing program.
    - Emenda ID 60000003: Includes education goals in the priorities for the federal public administration in 2012.

Group 2: Budget Amounts and Reductions

    - Emenda ID 50359998, 50360007, 60000030, and 71080009: No change (Maintain the current budget amounts or priorities)
    - Emenda ID 60000031: Modifies the primary goal of the federal budget to include goals from various areas such as education, equality and violence against women, social housing programs, and goals outlined in the Plano Plurianual 2020-2023.
    - Emenda ID 60000008: Reduces the primary goal of the federal budget to account for the costs related to investments in social housing programs.

Group 3: Specific Programs and Policies

    - Emenda ID 50360011: Includes actions against violence against women as a priority for the federal public administration in 2020.
    - Emenda ID 71250006: Establishes priorities and goals for the federal public administration related to education, security, and social programs (Segurança Pública, Educação, e Programas Sociais) as outlined in the Plano Plurianual 2020-2023.

' response_metadata={'model': 'mistral', 'created_at': '2024-05-02T12:34:10.68895688Z', 'message': {'role': 'assistant', 'content': ''}, 'done': True, 'total_duration': 738894884632, 'load_duration': 15983077884, 'prompt_eval_count': 1566, 'prompt_eval_duration': 484632260000, 'eval_count': 561, 'eval_duration': 236576432000} id='run-016b2de5-ec29-460d-8d2c-2e6968390faf-0'
