In [29]:
!pip install langchain_ollama

Collecting langchain_ollama
  Downloading langchain_ollama-1.0.0-py3-none-any.whl.metadata (2.1 kB)
Downloading langchain_ollama-1.0.0-py3-none-any.whl (29 kB)
Installing collected packages: langchain_ollama
Successfully installed langchain_ollama-1.0.0


In [None]:
!pip install -U scikit-learn

In [None]:
!pip install -U langchain langchain-core langchain-community langchain-openai langchain-chroma

In [30]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import sys 
sys.path.append("../../llm_engineering")
from api_clients import create_clients
from langchain_ollama import OllamaEmbeddings


In [37]:
# Directory name passed to Chroma as persist_directory in the cells below.
db_name = "vector_db"
# API clients built by the helper imported from the api_clients module.
clients = create_clients()

In [20]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [22]:
# Load every Markdown file under knowledge-base/ and tag each document with the
# name of the sub-folder it came from (stored as metadata["doc_type"]).
# Only directories are kept: the glob also matches stray files sitting directly
# in knowledge-base/, and DirectoryLoader raises when given a non-directory path.
# Sorting makes the load order independent of the filesystem's listing order.
folders = sorted(path for path in glob.glob("knowledge-base/*") if os.path.isdir(path))
text_loader_kwargs = {'encoding': 'utf-8'}
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)  # the folder name doubles as the document type
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()  # all .md files found below this folder
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [24]:
# Split the loaded documents into overlapping chunks (chunk_size=1000,
# chunk_overlap=200). The splitter can still emit a chunk larger than
# chunk_size; it logs a warning when that happens.
text_splitter = CharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunks = text_splitter.split_documents(documents)

# Distinct doc_type tags carried by the chunks, reported as a quick sanity check.
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
print(f"Document types found: {', '.join(doc_types)}")

Created a chunk of size 1088, which is longer than the specified 1000


Document types found: company, products, contracts, employees


In [None]:
# Pull the "nomic-embed-text" model with the ollama CLI; the OllamaEmbeddings
# cell in this notebook refers to the model by this name.
!ollama pull nomic-embed-text

In [32]:
# Embedding function backed by Ollama. Another embedding model
# (e.g. "mxbai-embed-large") can be used by changing the model name.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [38]:
# When a persisted store already exists at db_name, open it and drop its
# collection before the vector store is (re)built in the next cell.
if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

In [45]:
# Embed every chunk with the configured embedding function and store the
# resulting vectors in a Chroma database persisted under db_name.
vectorstore = Chroma.from_documents(
    persist_directory=db_name,
    embedding=embeddings,
    documents=chunks,
)

# Handle to the underlying Chroma collection (a private attribute of the
# LangChain wrapper); it is also used by the visualization cells.
collection = vectorstore._collection
print(f"Vectorstore created with {collection.count()} documents")

# Fetch a single stored record, asking explicitly for its embedding, and take
# the length of that vector to learn how many dimensions the model produces.
sample = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample["embeddings"][0]
dimensions = len(sample_embedding)

print(f"The vectors have {dimensions:,} dimensions")

Vectorstore created with 246 documents
The vectors have 768 dimensions


In [None]:
# Prework: pull the vectors, texts and metadata out of the Chroma collection
# in the shape the plotting cells need.
# NOTE: `documents` and `doc_types` are rebound here (raw texts / one type per
# vector). They replace the values that the loading and splitting cells assigned
# to the same names, so re-run those cells before splitting again.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One colour per document type. A dict lookup with a fallback keeps this cell
# working when the knowledge base contains a type without an assigned colour
# (the previous list.index lookup raised ValueError in that case).
type_colors = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [type_colors.get(t, 'gray') for t in doc_types]

In [47]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)  # fixed seed so the layout is repeatable
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per stored chunk, coloured by document
# type, with the type and the first 100 characters of text shown on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # Axis titles of a 2D (cartesian) figure are set on the layout itself;
    # `scene` only configures 3D subplots, so titles placed there never showed up.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Same idea in three dimensions: project the vectors to 3D with t-SNE.

tsne = TSNE(random_state=42, n_components=3)
reduced_vectors = tsne.fit_transform(vectors)

# One hover label per point: the document type plus the first 100 characters of text.
hover_labels = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]
xs, ys, zs = reduced_vectors[:, 0], reduced_vectors[:, 1], reduced_vectors[:, 2]

# Build the 3D scatter trace, coloured by document type, then wrap it in a figure.
points = go.Scatter3d(
    x=xs,
    y=ys,
    z=zs,
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[points])

# For a 3D figure the axis titles belong to the `scene` object.
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [None]:
# Use the whole documents rather than splitting them into chunks