In [22]:
from langchain_community.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as PineconeVecDb
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from pinecone import Pinecone

from dotenv import load_dotenv
import os

In [18]:
# Run python-dotenv's loader first so the lookups below can see any values it
# adds to the process environment (presumably from a local .env file — confirm).
load_dotenv()

# Pinecone connection settings; each one is None when its variable is unset.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_ENV_NAME = os.environ.get('PINECONE_ENV_NAME')
PINECONE_INDEX = os.environ.get('PINECONE_INDEX')

### Functions

In [8]:
# FUNCTIONS
def load_data(path: str):
    """Load the PDF files found under ``path``.

    Args:
        path: Directory passed to ``DirectoryLoader``; files are selected
            with the glob pattern ``*.pdf``.

    Returns:
        The result of ``DirectoryLoader.load()``, with ``PyPDFLoader`` used
        as the per-file loader class.
    """
    return DirectoryLoader(
        path=path,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    ).load()

def split_text(documents, chunk_size: int = 500, chunk_overlap: int = 20):
    """Split loaded documents into smaller chunks.

    Args:
        documents: Documents to split, e.g. the output of ``load_data``.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook used
            before it became configurable.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, as before.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Exposed as parameters so chunking can be tuned per call without
    # editing this function; the defaults keep existing calls unchanged.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

### Extracting Data

In [9]:
# Read the PDFs from the documents/ directory, then break them into chunks
# that are used further down the notebook.
extracted_data = load_data(path='documents/')
overlap_data = split_text(documents=extracted_data)

In [13]:
# Sentence-transformers model used to embed the document chunks.
EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# Create the Pinecone client using the API key read from the environment.
pinecone_client = Pinecone(
    api_key=PINECONE_API_KEY,
)

In [33]:
# Build the vector store from the chunked documents, using `embeddings` and
# the index named by PINECONE_INDEX.
# NOTE(review): re-running this cell presumably uploads the chunks again —
# confirm whether that duplicates vectors in the index.
docsearch = PineconeVecDb.from_documents(
    documents=overlap_data,
    embedding=embeddings,
    index_name=PINECONE_INDEX,
)