In [13]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [16]:
from PyPDF2 import PdfReader

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import faiss
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# from langchain_huggingface import HuggingFaceEmbeddings

from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai.types.safety_types import (
    HarmBlockThreshold,
    HarmCategory,
)

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0,
            max_output_tokens=256,
            top_k=10,
            safety_settings={
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            },
)

I0000 00:00:1721332437.794788  194195 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
I0000 00:00:1721332437.807899  194195 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


In [18]:
# creating custom template to guide llm model
custom_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

In [19]:
def get_pdf_text(pdf_path):
    # Verifica se o caminho é um arquivo, não um diretório
    if not os.path.isfile(pdf_path):
        raise ValueError(f"O caminho fornecido não é um arquivo: {pdf_path}")

    text = ""
    pdf_reader = PdfReader(pdf_path)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

# Caminho para o arquivo PDF
pdf_path = "/home/rafael/Downloads/chat-pdf-gemini/state_of_the_union.pdf"

# Extração do texto do PDF
pdf_text = get_pdf_text(pdf_path)
print(pdf_text)

Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress 
and the Cabinet. Justices of the Supreme Court. My fellow Americans.  
Last year COVID -19 kept us apart. This year we are finally together again. Tonight, we meet as 
Democrats, Republicans, and Independents. But most importantly as Americans. With a duty to one 
another, to the American people, to the Constitution. And with an  unwavering resolve that freedom will 
always triumph over tyranny.  
 
Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world, thinking he could 
make it bend to his menacing ways. But he badly miscalculated. He thought he could roll into Ukraine 
and the world would roll over. Instead, he met a  wall of strength he never imagined. He met the 
Ukrainian people. From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their 
determination, inspires the world.  
 
Groups of citizens blocking tanks with their bodie

In [20]:
# converting text to chunks
def get_chunks(raw_text):
    text_splitter=CharacterTextSplitter(separator="\n",
                                        chunk_size=1000,
                                        chunk_overlap=200,
                                        length_function=len)   
    chunks=text_splitter.split_text(raw_text)
    return chunks

In [21]:
chunks = get_chunks(pdf_text)
chunks

['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress \nand the Cabinet. Justices of the Supreme Court. My fellow Americans.  \nLast year COVID -19 kept us apart. This year we are finally together again. Tonight, we meet as \nDemocrats, Republicans, and Independents. But most importantly as Americans. With a duty to one \nanother, to the American people, to the Constitution. And with an  unwavering resolve that freedom will \nalways triumph over tyranny.  \n \nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world, thinking he could \nmake it bend to his menacing ways. But he badly miscalculated. He thought he could roll into Ukraine \nand the world would roll over. Instead, he met a  wall of strength he never imagined. He met the \nUkrainian people. From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their \ndetermination, inspires the world.',
 'Ukrainian people. From President Zele

In [22]:
# using all-MiniLm embeddings model and faiss to get vectorstore
def get_vectorstore(chunks):
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",model_kwargs={'device':'cpu'})
    vectorstore=faiss.FAISS.from_texts(texts=chunks,embedding=embeddings)
    return vectorstore

In [24]:
vectorstore = get_vectorstore(chunks)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x7fd1e84e6b60>

In [25]:
# generating conversation chain  
def get_conversationchain(vectorstore):
    memory = ConversationBufferMemory(memory_key='chat_history', 
                                        return_messages=True,
                                        output_key='answer') # using conversation buffer memory to hold past information
    conversation_chain = ConversationalRetrievalChain.from_llm(
                                llm=llm,
                                retriever=vectorstore.as_retriever(),
                                condense_question_prompt=CUSTOM_QUESTION_PROMPT,
                                memory=memory)
    return conversation_chain

In [26]:
conversation_chain = get_conversationchain(vectorstore)
conversation_chain

ConversationalRetrievalChain(memory=ConversationBufferMemory(output_key='answer', return_messages=True, memory_key='chat_history'), combine_docs_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatGoogleGenerativeAI(model='models/gemini-1.5-flash', temperature=0.0, top_k=10, max_output_tokens=256, safety_settings={<HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: 10>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HATE_SPEECH: 8>: <HarmBlockThreshold.BLOCK_NONE: 4>, <HarmCategory.HARM_CATEGORY_HARASSMENT: 7>: <Har

In [27]:
conversation_chain.invoke('what sanctions have been placed on Russia')

{'question': 'what sanctions have been placed on Russia',
 'chat_history': [HumanMessage(content='what sanctions have been placed on Russia'),
  AIMessage(content='The text describes several sanctions placed on Russia:\n\n* **Economic sanctions:**\n    * Cutting off Russia\'s largest banks from the international financial system.\n    * Preventing Russia\'s central bank from defending the Russian Ruble, making Putin\'s "war fund" worthless.\n    * Choking off Russia\'s access to technology to weaken its economy and military.\n    * Closing off American airspace to all Russian flights.\n* **Targeting oligarchs:**\n    * Assembling a task force to go after the crimes of Russian oligarchs.\n    * Seizing their yachts, luxury apartments, and private jets.\n\nThe text also mentions the release of 60 million barrels of oil from reserves around the world, including 30 million barrels from the US Strategic Petroleum Reserve, to help blunt gas prices. \n')],
 'answer': 'The text describes sever