In [1]:
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
def load_pdf(data):
    """Load the PDF files of a directory into documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob pattern.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Read every PDF under the local "data/" folder.
extracted_data = load_pdf(data="data/")

In [5]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size given to the splitter. Defaults to
            500, the value this notebook previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter. Defaults to 20, the previously hard-coded value.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [6]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data=extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 6863


In [None]:
def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # NOTE(review): the Pinecone index in this notebook is created with
    # dimension=384; presumably that matches this model's output size --
    # confirm before swapping in a different model.
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


In [None]:
# Instantiate the embedding model once; later cells reuse `embeddings` when
# writing to and reading from the vector store.
embeddings = download_hugging_face_embeddings()

In [None]:
import getpass
import os

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# SECURITY: the API key must never be written into the notebook. It is read
# from the PINECONE_API_KEY environment variable (optionally populated from a
# local .env file) and requested interactively only if it is missing.
# Any key that was ever committed in this file must be revoked and rotated.
load_dotenv()
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Pinecone API key: ")

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "test"

# Create the serverless index only on first run so re-running the cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): 384 presumably matches the output size of the
        # embedding model used in this notebook -- confirm if the model changes.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)
print("Pinecone index ready ✅")


In [None]:
import getpass
import os

from dotenv import load_dotenv
from langchain_pinecone import PineconeVectorStore

# SECURITY: take the Pinecone key from the environment (or a local .env file)
# instead of embedding it in the notebook; prompt only if it is not set.
# Any key that was ever committed in this file must be revoked and rotated.
load_dotenv()
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Pinecone API key: ")

# Embed every chunk and upload it to the index named by `index_name`.
# NOTE(review): re-running this cell presumably uploads the chunks again --
# confirm whether that creates duplicate vectors before re-executing.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
    pinecone_api_key=os.environ["PINECONE_API_KEY"],
)

In [None]:
import os
from dotenv import load_dotenv
from langchain_pinecone import PineconeVectorStore

# NOTE(review): no API key is passed below, so the Pinecone credentials
# presumably come from the environment populated by load_dotenv() -- confirm.
load_dotenv()

# Attach to the already-populated index instead of re-uploading documents.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)


In [None]:
# Similarity search that requests k=3 results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
import getpass
import os

# SECURITY: the argument to getpass.getpass() is the *prompt text* shown to
# the user, not a default value -- a real key must never be placed there.
# Any key that was ever committed in this file must be revoked and rotated.
# NOTE(review): the LLM configured later in this notebook is
# ChatGoogleGenerativeAI; confirm whether an OpenAI key is needed at all.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [None]:
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI

# NOTE(review): no credentials are passed to the model below, so the Google
# API key presumably comes from the environment populated here -- confirm.
load_dotenv()

# Chat model used to answer questions (chosen by the author as fast & free).
llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-1.5-flash")


In [None]:
# Send a trivial prompt to the model; presumably a quick check that the model
# and its credentials work before the retrieval chain is built.
llm.invoke("hi")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [None]:
# System instructions for the answering model; the "{context}" placeholder is
# a template variable of the prompt built below.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise.\n\n"
    "{context}"
)

# Two-message chat template: the system instructions followed by the user's
# question, supplied through the "{input}" variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Combine the LLM and prompt into a document-answering chain, then wrap it
# with the retriever to form the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask a sample question through the retrieval chain and show only the answer.
result = rag_chain.invoke(
    {"input": "What is Acne?"}
)
print(result["answer"])
