In [None]:
# Show the kernel's current working directory before the next cell changes it.
%pwd

In [None]:
import os

# Remember where the kernel started. A bare relative chdir("../") is not
# idempotent: every re-run of this cell would climb one directory higher.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Work from the parent of the start directory (same target as "../" on the
# first run, and still the same target on every later run).
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Extract text from PDF files
def load_pdf_files(path):
    """Load the PDF files in ``path`` that match the glob ``*.pdf``.

    A ``DirectoryLoader`` is configured to read each matching file with
    ``PyPDFLoader``.

    Args:
        path: Directory handed to ``DirectoryLoader`` as the search root.

    Returns:
        The documents returned by the loader's ``load()`` call.
    """
    pdf_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs from the "data" directory (relative to the working directory).
extracted_data = load_pdf_files("data")

# Preview only the first few documents; echoing the whole list floods the
# notebook output with the full text of every loaded document.
extracted_data[:3]

In [None]:
# Number of documents returned by the PDF loader.
len(extracted_data)

In [None]:
from typing import List
from langchain_core.documents import Document

def filterToMinimalDocs(docs: List[Document]) -> List[Document]:
    """Return new documents that keep only the text and ``source`` metadata.

    Args:
        docs: Documents whose ``page_content`` and ``metadata["source"]``
            are copied over. A missing ``source`` key becomes ``None``.

    Returns:
        A new list with one ``Document`` per input, in the same order, whose
        metadata contains the single key ``"source"``.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]


In [None]:
# Reduce each loaded document to its text plus the "source" metadata key.
minimal_docs = filterToMinimalDocs(extracted_data)

In [None]:
# Preview a few trimmed documents instead of dumping the entire list.
minimal_docs[:3]

In [None]:
# NOTE(review): CHUNK_SIZE is not referenced by any cell visible here; this
# looks like an accidental auto-import. Confirm nothing needs it, then remove.
from langchain_community.document_loaders.base_o365 import CHUNK_SIZE


def textSplit(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)

In [None]:
# Split the trimmed documents into chunks and report how many were produced.
texts_chunk = textSplit(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

def downlaodEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used to vectorise text.

    NOTE(review): the function name is misspelled ("downlaod"); it is kept
    as-is because other cells call it by this name.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model that used to be hard-coded here. The
            Pinecone index in this notebook is created with a fixed
            dimension, so a model with a different output size would need a
            matching index.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Instantiate the embedding model once; later cells reuse this object.
embeddingModel = downlaodEmbeddings()


In [None]:
# Display the embedding model object to inspect its configuration.
embeddingModel

In [None]:
# Embed a sample query as a quick smoke test of the embedding model.
vector = embeddingModel.embed_query("Hello world")

# Show only the first few components rather than the entire vector.
vector[:5]

In [None]:
# Report the dimensionality of the sample embedding.
vector_length = len(vector)
print("Vector length:", vector_length)

In [None]:
from dotenv import load_dotenv
import os
load_dotenv()

def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty. Without this check a
            missing key only surfaces as ``TypeError: str expected, not
            NoneType`` when it is written back into ``os.environ``.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; add it to your .env "
            "file or export it before running this notebook."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
GROQ_API_KEY = _require_env("GROQ_API_KEY")

# Write the keys back into the process environment, as the original cell did.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [None]:
from pinecone import Pinecone
# Create the Pinecone client using the API key read from the environment.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Display the client object to confirm it was constructed.
pc

In [None]:
from pinecone import ServerlessSpec
# Name of the Pinecone index that stores the document embeddings.
index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so re-running is safe.
index_exists = pc.has_index(index_name)
if not index_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # Presumably matches the embedding vector length printed earlier;
        # verify this if the embedding model changes.
        dimension=384,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle to the (new or existing) index.
index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks with the embedding model and store them in the Pinecone
# index named by index_name.
# NOTE(review): this appears to embed and upload every chunk each time the
# cell runs, so re-running may add duplicate vectors — confirm, and consider
# connecting to the already-populated index instead.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embeddingModel,
    index_name=index_name
)