# Uploading document data to PineconeDB as vector embeddings
Read the parsed `.txt` files and split them into meaningfully sized chunks. Create vector embeddings for the chunks and upload them to Pinecone.

## Steps involved:
Step-1: Basic cleaning of the parsed documents (Using Python and manually)

Step-2: Use [LangChain](https://python.langchain.com/v0.1/docs/modules/data_connection/document_transformers/character_text_splitter/) to create the text chunks for each heading/section

Step-3: Initialize the [OpenAI embedding](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) model to create vector embeddings of the chunks created

Step-4: Upload the embeddings to [Pinecone](https://www.pinecone.io/)

## Import required libraries

In [1]:
import os
from dotenv import load_dotenv
import re
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.globals import set_verbose, set_debug

## Set the required variables and file imports

In [2]:
# Load the .env file
load_dotenv()

True

In [3]:
# Turn off LangChain's global verbose and debug output for this notebook.

# Disable verbose logging
set_verbose(False)

# Disable debug logging
set_debug(False)

In [4]:
# Read the OpenAI API key from the process environment (None when it is unset)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [5]:
def get_files_in_directory(path):
    """Return the names of the regular files directly inside ``path``.

    Sub-directories are skipped and the listing is not recursive. The names
    are returned in sorted order so that repeated runs process the files in
    the same sequence (``os.listdir`` itself yields entries in arbitrary
    order).

    Args:
        path: The directory path.

    Returns:
        A sorted list of filenames (bare names, not full paths) in the
        directory.

    Raises:
        OSError: If ``path`` cannot be listed, e.g. because it does not exist.
    """
    return sorted(
        name
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )

In [6]:
# Everything is resolved relative to the directory the notebook runs from
raw_file_dir = os.getcwd()

# The parsed text documents live in the "parsed_docs" sub-folder
input_parsed_file_dir = f"{raw_file_dir}/parsed_docs"

## Document Reading and Cleaning

### Reading parsed documents

In [None]:
# List the files in the parsed-docs directory. get_files_in_directory returns
# every regular file it finds (not only .txt files), and the entries are bare
# filenames rather than full paths.
input_parsed_file_paths = get_files_in_directory(input_parsed_file_dir)

print(f"Number of parsed text documents in the directory: {len(input_parsed_file_paths)}")

In [None]:
# Show the filenames that were found
print(input_parsed_file_paths)

In [10]:
# Accumulator for the concatenated text of all parsed documents
all_content = ""

In [None]:
# Read every parsed document and append its text to all_content, leaving a
# blank line between consecutive documents.
for file_name in input_parsed_file_paths:
    print(f"Currently reading: {file_name}")

    # The context manager closes the file even if read() raises.
    # NOTE(review): the encoding is pinned to UTF-8 instead of the platform
    # default so the result does not depend on the machine's locale — confirm
    # that the parsed files are written as UTF-8.
    with open(os.path.join(input_parsed_file_dir, file_name), "r", encoding="utf-8") as f:
        file_content = f.read()

    all_content += file_content + "\n\n"

In [12]:
# print(all_content)

### Document cleaning

In [None]:
# Split the combined text at top-level Markdown headings, i.e. wherever a
# newline is followed by "# ". The delimiter is consumed by the split, so every
# piece after the first no longer starts with its "# " marker.
all_docs_sep = all_content.split("\n# ")

print(f"Number of chunks/documents: {len(all_docs_sep)}")

In [15]:
# Normalise every section in place: drop each "# " occurrence, collapse double
# newlines into single ones, then put one "# " marker back at the front.
all_docs_sep[:] = [
    "# " + section.replace("# ", "").replace("\n\n", "\n")
    for section in all_docs_sep
]

# Stitch the sections together with single newlines; this one string is what
# gets handed to the chunking step.
all_docs_sep_final = "\n".join(all_docs_sep)

In [None]:
# A string containing k non-overlapping "\n\n" separators splits into k + 1 pieces
dbl_split_content = all_docs_sep_final.count("\n\n") + 1
print(f"Number of chunks/documents on splitting based of double newline: {dbl_split_content}")

## Converting document into chunks

In [17]:
# Configure the splitter and cut the cleaned text into chunks.
# Each question is split into a separate chunk based on the "\n\n" separator
text_splitter = CharacterTextSplitter(
    separator="\n\n",  # treated as a literal string (is_separator_regex=False)
    chunk_size=256,  # sizes are measured with length_function, i.e. len()
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# chunks holds the pieces returned by the splitter for the whole cleaned text
chunks = text_splitter.split_text(all_docs_sep_final)

In [None]:
# Convert the chunks to Document objects (only page_content is set; no
# metadata is attached)
docs = [Document(page_content=chunk) for chunk in chunks]

print(f"Number of documents in the Knowledge Base: {len(docs)}")
# Dump the full list for manual inspection
print(docs)

## Embedding model setup

In [19]:
# Sample query text used to try out the embedding model
query = "Some query question"

In [None]:
# Create the OpenAI embeddings client with its default settings.
# NOTE(review): no API key is passed explicitly — presumably it is picked up
# from the OPENAI_API_KEY environment variable; confirm.
embeddings = OpenAIEmbeddings()

# Convert the query to OpenAI embedding format
embedded_query = embeddings.embed_query(query)

# Check the size and see the embedding (only the first 10 values are printed)
print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:10])

## Uploading the Vectors to PineconeDB

In [None]:
# Target Pinecone index and namespace.
# NOTE(review): these look like placeholder values — set the real names before running.
index_name = "index_name"
namespace = "namespace_name"

# Build the vector store from the chunked documents using the embeddings
# client, writing into the index and namespace named above.
pinecone = PineconeVectorStore.from_documents(
    docs, embeddings, index_name=index_name, namespace=namespace
)