In [1]:
from uuid import uuid4
import os
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_groq import ChatGroq
from langchain.vectorstores import Qdrant
from langchain import hub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.callbacks import StdOutCallbackHandler

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct


from crewai import Agent, Task, Crew
from crewai_tools import BaseTool

## Cargo claves

In [2]:
load_dotenv()

GROQ_API_KEY = os.getenv('GROQ_API_KEY')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    url="https://7fb9acbe-a3aa-4e57-94c0-c496d78947cf.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key="Sf0twa2JRMLttZ9Geb-xfgx4Q_xoWqO_V18vdNwzY0WqmY7a3d6jGw",
)

## Carga y split de documentos

In [3]:
def doc_loader(pdf):
    pdf_loader = PyPDFLoader(pdf)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size= 512, chunk_overlap= 50)
    doc = pdf_loader.load_and_split(text_splitter)

    return doc

In [4]:
doc = doc_loader("docs/tfg_paula.pdf")

In [21]:
doc[0].page_content

'Trabajo de Fin de Grado Influencia del microbioma vaginal en el parto pretérmino espontáneo  Influencia del microbioma vaxinal nel parto pretérmino espontáneo  Vaginal microbiome influence in spontaneous pre-term birth                Autora: Paula Sofía Simón Alonso Tutor: Dr. Carlos López Ramón y Cajal Cotutor: Dr. Emilio Couceiro Naveira Departamento: Ginecología y Obstetricia   Junio 2024 Trabajo de Fin de Grao presentado en la Facultade de Medicina y Odontología de la Universidad de Santiago de'

## Almacenamiento en la VDB de Qdrant

In [5]:
# Creo una colección en la base de datos vectorial

client = QdrantClient(url=QDRANT_URL,
                      api_key=QDRANT_API_KEY
                     )


client.create_collection(collection_name="tfg",
                         vectors_config={"content": VectorParams(size=1536, distance=Distance.COSINE)}
                        )

True

In [22]:
# Almaceno documentos en la base de datos vectorial:

embedding = OpenAIEmbeddings()

def chunked_metadata(doc, client=client, collection_name="tfg"):

    chunked_metadata = []

    for item in doc:

      id = str(uuid4())
      content = item.page_content
      source = item.metadata["source"]
      page = item.metadata["page"]

      content_vector = embedding.embed_documents([content])[0]
      vector_dict = {"content": content_vector}

      payload = {"page_content": content,
                 "metadata": {"id": id,
                              "source": source,
                              "page": page,
                             }
                }


      metadata = PointStruct(id=id, vector=vector_dict, payload=payload)
      chunked_metadata.append(metadata)

    client.upsert(collection_name=collection_name,
                  wait=True,
                  points=chunked_metadata)

In [8]:
chunked_metadata(doc)

In [6]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='tfg_paula'), CollectionDescription(name='tfg')])

In [42]:
attention_collection = client.get_collection("tfg")

print(f"Points: {attention_collection.points_count} ")

Points: 113 


In [43]:
# Hagamos una busqueda para ver que documentos son más similares

client.search(
    collection_name="tfg",
    query_vector=("content", embedding.embed_documents(["Agregado molecular"])[0]),
    with_payload=["page_content", "source"],
    limit=5
)

[ScoredPoint(id='df0a7089-d9c2-4c57-bcf4-316ba75da51f', version=0, score=0.86286503, payload={'page_content': 'Los agregados moleculares son estructuras compuestas por moléculas de un\nsurfactante. Cuando se añade un surfactante a un aceite, la parte polar de sus mo-\nléculas se desplazará a la superficie para evitar el contacto con este. Si se aumenta\nsu concentración, las moléculas empezaran a ocupar toda la superficie disponible\nhasta saturarla. Cuando la concentración de surfactante sobrepasa este punto críti-\nco, conocido como concentración micelar crítica (CMC), las moléculas empiezan a'}, vector=None, shard_key=None),
 ScoredPoint(id='8d4f8af8-0c6a-4541-a5ed-bea1e80aaaad', version=0, score=0.84515244, payload={'page_content': 'gar a un intercambio de moléculas de alcohol cargadas, saltando la carga negativa\nentre ellos mientras la positiva queda inmóvil [3]. De esta forma se tendrían dos\nestructuras cargadas que podrían conducir corriente eléctrica.\nFigura 4: Proceso de fo

## Inicializar la base de datos vectorial

In [9]:
# Initialize vector store
client = QdrantClient(url=QDRANT_URL,
                      api_key=QDRANT_API_KEY
                     )

embedding = OpenAIEmbeddings()

vectorstore = Qdrant(client=client,
                     collection_name="tfg",
                     embeddings=embedding,
                     vector_name="content")

In [49]:
# Downlad prompt template

rag_prompt = hub.pull("rlm/rag-prompt")

In [64]:
rag_prompt.messages[0].prompt.template

"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use four sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"

In [6]:
template = '''
You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Question: {question} 
Context: {context}
Answer:
'''

## RAG Pipeline

In [46]:
rag_prompt = ChatPromptTemplate.from_template(template)

In [10]:
retriever = vectorstore.as_retriever()

In [11]:
handler = StdOutCallbackHandler()

In [12]:
llm = ChatGroq(model="llama3-70b-8192",
               groq_api_key= GROQ_API_KEY)

# Retrieval qa chain
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       callbacks=[handler],
                                       retriever=retriever)

In [11]:
# Invoke qa_chain gpt-3.5-turbo

qa_chain.invoke({"query": "Que es la conductividad electrica"})["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'De acuerdo al texto, la conductividad eléctrica es una propiedad intensiva que representa la capacidad que tiene un material para conducir la corriente eléctrica.'

In [73]:
qa_chain.invoke({"query": "Que resultados se obtienen de la conductividad electrica"})["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Según el texto, los resultados obtenidos de la conductividad eléctrica son:\n\n* La conductividad eléctrica aumenta con la concentración de alcohol en la mezcla.\n* La longitud de la cadena del alcohol juega un papel fundamental en la estabilidad de la mezcla, y se obtienen mejores resultados con alcoholes de cadena larga.\n* La conductividad eléctrica es mayor en la mezcla de 1-Nonanol que en otras mezclas.\n* La medida de la conductividad eléctrica puede ser poco certera debido a la inestabilidad de los agregados que se forman en la disolución.\n\nNo se mencionan otros resultados específicos de la conductividad eléctrica, pero se habla de la importancia de la longitud de la cadena del alcohol y la concentración de alcohol en la mezcla.'

In [16]:
qa_chain.invoke("Can you analyze the research paper?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Can you analyze the research paper?',
 'result': "I'd be happy to try! However, I need more information about what you would like me to analyze. \n\nThe references provided appear to be a list of citations from a research paper, but without the actual paper, it's challenging to provide a comprehensive analysis.\n\nCould you please provide more context or clarify what specifically you would like me to analyze? For example, are you looking for an analysis of the methodology, results, conclusions, or something else? Additionally, do you have access to the full paper, and if so, can you share it with me?"}

In [7]:
qa_chain.combine_documents_chain.llm_chain.prompt.messages[1].prompt

PromptTemplate(input_variables=['question'], template='{question}')

In [13]:
# Create custom tool to use the vector store

class MyCustomTool(BaseTool):
    name: str = "Research Paper"
    description: str = "Research Paper"

    def _run(self, argument: str):
        return qa_chain.invoke("Can you analyze with detail the research paper?")

tool_instance = MyCustomTool()

In [14]:
### RESEARCHER ###

# Agent
reasarcher = Agent(
    role="Physicist Researcher",
    goal='Understanding the published research paper and extract the main insights',
    backstory="""You work at a Physics Science school.
                  A new paper has been published and you need to extract
                  the main information and provide the main insights to
                  the students.You have a knack for dissecting complex
                  research papers""",
    verbose=True,
    llm= llm,
    tools=[tool_instance],
    allow_delegation=False
    )


# Tasks
task_research = Task(
    description="""Conduct an in-depth analysis of the published research paper.
                    Identify the key components and try to understand how each works.""",
    expected_output="""Your final answer must be a full analysis report of the research paper""",
    agent=reasarcher,
    output_file='researcher_analysis.md' 

)


### WRITER ###

# Agent
writer = Agent(
    role="Physicist Writer",
    goal= 'Craft easy to understand content for physics students',
    backstory="""You are working on writing a summary of a research paper.
                  Use the content of the Physicist Researcher to develop
                  a short comprehensive summary of the research paper""",
    verbose=True,
    llm= llm,
    allow_delegation=False,
)

# Tasks
task_writer = Task(
    description="""Extract the most important topics and key components of the report
                    developed by the Physicist Researcher.""",
    expected_output="""Your final answer must be a short comprehensive summary 
                        to understand report for physics students""",
    agent=writer,
    output_file='writer_summary.md' 

)

# CREWAI KICKOFF

Crew = Crew(agents=[reasarcher, writer],
            tasks=[task_research, task_writer],
            verbose=99)

In [15]:
result = Crew.kickoff()

[1m[95m [DEBUG]: == Working Agent: Medical Researcher[00m
[1m[95m [INFO]: == Starting Task: Conduct an in-depth analysis of the published research paper.
                    Identify the key components and try to understand how each works.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: I need to carefully read and understand the research paper to extract the main insights.

Action: Research Paper
Action Input: {"argument": "string"}[0m

[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
[95m 

{'query': 'Can you analyze with detail the research paper?', 'result': "I can try to provide some general insights and information about the research paper based on the provided context. However, please note that I don't have access to the full paper, and my analysis will be limited to the snippets of information provided.\n\n**Research Topic:** The research paper appears to be focused on the microbiota associated with Preterm Premature R