# **The purpose of this notebook is to tokenise/embed our documents and upload them to our Pinecone database. Private information has been removed.**

Install **Dependencies**

In [None]:
! pip install --upgrade langchain openai -q
!pip install sentence_transformers -q
!pip install unstructured -q
!pip install unstructured -q
!pip install unstructured[local-inference] -q
!pip install detectron2@git+https://github.com/facebookresearch/detectron2.git
!apt-get install poppler-utils
!pip install pytesseract
import pytesseract
!sudo apt install tesseract-ocr
# Required for OpenAI embedding
!pip install tiktoken -q
!pip install pinecone-client -q

**Load the documents in the specified directory.**

In [None]:
from langchain.document_loaders import DirectoryLoader

# Relative path of the folder holding the source documents; passed to load_docs below.
directory = 'data'

def load_docs(directory):
  """Load the contents of `directory` through a DirectoryLoader.

  Args:
    directory: Path handed unchanged to `DirectoryLoader`.

  Returns:
    Whatever `DirectoryLoader(directory).load()` produces.
  """
  return DirectoryLoader(directory).load()

# Load everything from `directory`; the trailing expression displays how many
# documents were loaded as the cell's output.
documents = load_docs(directory)
len(documents)

**Splitting the Text into Chunks**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=1421, chunk_overlap=520):
  """Split `documents` into chunks with a RecursiveCharacterTextSplitter.

  Args:
    documents: Documents passed to the splitter's `split_documents`.
    chunk_size: Forwarded unchanged to the splitter.
    chunk_overlap: Forwarded unchanged to the splitter.

  Returns:
    The result of `split_documents(documents)`.
  """
  return RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  ).split_documents(documents)

# Chunk the loaded documents with split_docs' default size/overlap and print
# how many chunks were produced.
docs = split_docs(documents)
print(len(docs))

In [None]:
# Dump the text of every chunk, writing the marker line "New" after each one.
for doc in docs:
  print(doc.page_content, "New", sep="\n")

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings
# Embedding model shared by the sanity check and the Pinecone upload below.
embeddings = SentenceTransformerEmbeddings(
  model_name="all-MiniLM-L6-v2",
)

In [None]:
# Smoke-test the embedding model on a short string; the displayed length is
# presumably the embedding dimension -- confirm it matches the Pinecone index.
query_result = embeddings.embed_query("Hello world")
len(query_result)

**Initialise the Pinecone client and upload the embedded document chunks**

In [20]:
import pinecone
from langchain.vectorstores import Pinecone

import os

# Initialise Pinecone. Credentials and the index name are read from the
# environment so nothing secret is stored in the notebook; a missing variable
# fails immediately with KeyError instead of an opaque authentication error.
# Required variables: PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX_NAME.
pinecone.init(
  api_key=os.environ['PINECONE_API_KEY'],
  environment=os.environ['PINECONE_ENVIRONMENT']
)

index_name = os.environ['PINECONE_INDEX_NAME']
# Hands the chunks in `docs` and the embedding model to LangChain's Pinecone
# wrapper for the index named above.
# NOTE(review): re-running this cell presumably uploads the same chunks again --
# confirm whether duplicate vectors are acceptable before re-executing.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [None]:
def get_similar_docs(query, k=1, score=False):
  """Run a similarity search for `query` against the module-level `index`.

  Args:
    query: Text passed to the search method.
    k: Forwarded as the `k` keyword of the search method (default 1).
    score: When truthy, `index.similarity_search_with_score` is called;
      otherwise `index.similarity_search`.

  Returns:
    Whatever the selected search method returns.
  """
  search = index.similarity_search_with_score if score else index.similarity_search
  return search(query, k=k)

# Example lookup using get_similar_docs' defaults (k=1, no scores): print the
# query text first, then the search result on the following line.
query = "Hi, I graduated from SMU in 2022. I would like to access the LKS library"
similar_docs = get_similar_docs(query)
print(query, similar_docs, sep="\n")