## Text Embedding with OpenAI

In [1]:
%load_ext dotenv
%dotenv

In [2]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np
from langchain_chroma import Chroma

In [3]:
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science.docx")
pages = loader_docx.load()

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on = [("#", "Course Title"), 
                                                                ("##", "Lecture Title")])
pages_md_split = md_splitter.split_text(pages[0].page_content)

for i in range(len(pages_md_split)):
    pages_md_split[i].page_content = " ".join(pages_md_split[i].page_content.split())

char_splitter = CharacterTextSplitter(separator = ".",
                                      chunk_size = 500,
                                      chunk_overlap = 50)
pages_char_split = char_splitter.split_documents(pages_md_split)

In [4]:
print(f"DEBUG: Final document chunks to embed: {len(pages_char_split)}")
if not pages_char_split:
    print("FATAL: Document list is empty. Debug splitting logic.")
    # Stop here if the list is empty

# 2. Print the persistence path
PERSIST_DIR = "./vector-store"
print(f"DEBUG: Persistence directory set to: {PERSIST_DIR}")

DEBUG: Final document chunks to embed: 20
DEBUG: Persistence directory set to: ./vector-store


In [5]:
print(pages_char_split[3], end="\n\n")
print(pages_char_split[5], end="\n\n")
print(pages_char_split[18])

page_content='Analytics is essentially the application of logical and computational reasoning to the component parts obtained in an analysis. And in doing this you are looking for patterns and exploring what you could do with them in the future. Here, analytics branches off into two areas: qualitative analytics – this is using your intuition and experience in conjunction with the analysis to plan your next business move' metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}

page_content='You may use this intuition to decide on which styles of clothing to start selling. This would be qualitative analytics. But you might not know when to introduce the new collection. In that case, relying on past sales data and user experience data, you could predict in which month it would be best to do that. This is an example of using quantitative analytics' metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Anal

In [6]:
embedding = OpenAIEmbeddings(model = "text-embedding-ada-002")

In [7]:
#vector1 = embedding.embed_query(pages_char_split[3].page_content)
#vector2 = embedding.embed_query(pages_char_split[5].page_content)
#vector3 = embedding.embed_query(pages_char_split[18].page_content)

In [8]:
# Grab the content of the first chunk
test_text = pages_char_split[0].page_content

try:
    print("DEBUG: Attempting to embed a single chunk...")
    # .embed_query() is for single text input, perfect for testing
    test_vector = embedding.embed_query(test_text)
    
    # If successful, print confirmation
    print(f"SUCCESS: Embedding worked! Vector dimension: {len(test_vector)}")
    print("DEBUG: Proceeding to Chroma creation...")

except Exception as e:
    # If it fails, the error will be printed, likely revealing an API issue
    print(f"FATAL: Embedding failed. Check OpenAI API Key/Network. Error: {e}")
    # Stop here if the embedding fails

DEBUG: Attempting to embed a single chunk...
SUCCESS: Embedding worked! Vector dimension: 1536
DEBUG: Proceeding to Chroma creation...


## Creating a Chroma Vector Store

In [9]:
vectorstore = Chroma.from_documents(documents = pages_char_split,
                                    embedding = embedding,
                                   persist_directory = "./vector-store" )
# --- Verification Step ---
# To verify the contents, you must reload the vector store from the disk.
# NOTE: You must provide the same embedding function to load the store.
reloaded_vectorstore = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embedding
)

# Use the internal _collection object to get the count
count = reloaded_vectorstore._collection.count()
print(f"VERIFICATION: Reloaded Chroma store count: {count} documents.")

if count == 0:
    print("FATAL: Chroma persistence failed or documents were not added.")
    print(f"ACTION: Check folder '{PERSIST_DIR}' for files to confirm disk write attempt.")
else:
    print("SUCCESS: Documents were saved and can be reloaded.")

VERIFICATION: Reloaded Chroma store count: 140 documents.
SUCCESS: Documents were saved and can be reloaded.


In [10]:
vectorstore_from_directory = Chroma(persist_directory = "./vector-store",
                                   embedding_function = embedding)

## Manage Documents within the Vector Store

In [13]:
vectorstore_from_directory.get(ids = "a4e8e867-e800-4934-837e-27ccb2076f23")

{'ids': ['a4e8e867-e800-4934-837e-27ccb2076f23'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'Analysis vs Analytics',
   'Course Title': 'Introduction to Data and Data Science'}]}