In [4]:
from langchain.prompts import PromptTemplate
from langchain.vectorstores import pinecone
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
import pinecone
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
from langchain_pinecone.vectorstores import PineconeVectorStore
    

In [17]:
import os

from dotenv import load_dotenv

# Merge variables from a local .env file (if present) into the process
# environment so the API key does not have to be written into the notebook.
load_dotenv()

# Subscript access on purpose: if the variable is missing this raises
# KeyError in this cell rather than handing None to the Pinecone client.
pinecone_api_key = os.environ["PINECONE_API_KEY"]

In [18]:
def load_data(data):
    """Load the PDF files in a directory that match the ``*.pdf`` glob.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each of which is parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [19]:
# Read the PDFs under the relative "data/" directory; the cell defining
# load_data must have been run first.
extracted_data = load_data(data="data/")

In [20]:
def chunk_text(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length given to the splitter, as measured
            by the splitter's length function. Defaults to 500, the value
            this notebook originally hard-coded.
        chunk_overlap: Amount of content shared between neighbouring chunks.
            Defaults to 20, the value this notebook originally hard-coded.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [21]:
# Split the loaded documents into chunks; these are what gets embedded and
# written to the vector index further down.
text_chunks = chunk_text(extracted_data)

In [22]:
def embedding_hfe():
    """Build the HuggingFace embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v1`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v1"
    )

In [23]:
# Instantiate the embedding model once and reuse it for every later cell.
embeddings = embedding_hfe()



In [24]:
from pinecone import Pinecone, ServerlessSpec

# NOTE(review): this rebinds the name `pinecone` (imported as a module in the
# first cell) to a client instance. The name is kept unchanged because other
# cells may refer to the client through it.
pinecone = Pinecone(api_key=pinecone_api_key)

index_name = "medicalaiassistant"

# Create the index only when it is not already listed, so re-running this
# cell reuses the existing index instead of attempting to create it again.
existing_index_names = [description.name for description in pinecone.list_indexes()]
if index_name not in existing_index_names:
    pinecone.create_index(
        name=index_name,
        # Must equal the vector width produced by the embedding model;
        # presumably 384 for all-MiniLM-L6-v1 — verify if the model changes.
        dimension=384,
        metric="cosine",
        # NOTE(review): confirm that "us-west-1" is a region Pinecone offers
        # for AWS serverless indexes on this account.
        spec=ServerlessSpec(cloud="aws", region="us-west-1"),
    )

# Handle to the index (pre-existing or just created).
index = pinecone.Index(index_name)


In [25]:
# Bind a LangChain vector store to the Pinecone index named above, using the
# notebook's embedding model for both writes and later queries.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Embed and upsert the chunk texts through the store that was just built.
# The previous code called the `from_texts` classmethod on this instance,
# which constructed a second, throw-away store only to perform the upsert.
# Only page_content is stored, as before; chunk metadata is not uploaded.
# NOTE(review): no explicit ids are passed, so re-running this cell likely
# inserts the same chunks again — confirm, and pass deterministic ids if the
# cell needs to be idempotent.
upserted_ids = vectorstore.add_texts([chunk.page_content for chunk in text_chunks])

# Show a count instead of dumping every generated id into the cell output.
len(upserted_ids)