In [1]:
from pathlib import Path

# Directories of interest
# include folders that you might want the llm to have access to
DIRECTORIES = [r"C:\\Users\\Jamin Carter\\Downloads\\web_archive", r"D:\Project\Web-scraping\TestFolder"]
    
    
files_to_vectorize=[]
for dir in DIRECTORIES:
    files = Path(dir).glob("*")
    for file_path in files:
        if file_path.is_file():
            with open(file_path, 'r', encoding='utf-8') as f:
                files_to_vectorize.append(str(file_path))

In [2]:
print(f"Total files to vectorize: {len(files_to_vectorize)}")

Total files to vectorize: 72


In [3]:
files_to_vectorize[7][55:65]

'_game8_co_'

In [None]:
# DIRECTORY SETUP
import os
if not os.path.exists("./db/sqlite"):
    os.makedirs("./db/sqlite")
    

if not os.path.exists("./db/chroma"):
    os.makedirs("./db/chroma")

FileExistsError: [WinError 183] Cannot create a file when that file already exists: './db/sqlite'

In [8]:
import chromadb
import sqlite3
import hashlib
from tqdm import tqdm
# --- SQLite ---
conn = sqlite3.connect("./db/sqlite/isVectorized.db")
cur = conn.cursor()
cur.execute("PRAGMA journal_mode=WAL;")
cur.execute("""
CREATE TABLE IF NOT EXISTS vectorized_files (
    file_id TEXT PRIMARY KEY,
    hash TEXT
)
""")
conn.commit()

def is_vectorized(file_id, file_hash) -> bool:
    cur.execute(
        "SELECT 1 FROM vectorized_files WHERE file_id=? AND hash=?",
        (file_id, file_hash)
    )
    return cur.fetchone() is not None


# --- Chroma ---
client = chromadb.PersistentClient(path="./db/chroma/")
collection = client.get_or_create_collection("ALL_TEXT_FILES")


def clean_content(text: str) -> str:
    if not text:
        return ""

    text = text.replace("\r\n", "\n").replace("\r", "\n")

    lines = []
    for line in text.split("\n"):
        line = line.strip()
        if line:
            lines.append(line)

    return "\n".join(lines)

def chunking(text: str, chunk_size: int, overlap: int) -> list[str]:
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunk = text[start:end]
        chunks.append(chunk)

        if end == text_length:
            break

        start += chunk_size - overlap

    return chunks

def add_to_chroma(file_path: str):
    with open(file_path, "r", encoding="utf-8") as f:
        raw = f.read()

    content = clean_content(raw)
    file_hash = hashlib.md5(content.encode("utf-8")).hexdigest()

    if is_vectorized(file_path, file_hash):
        #tqdm.write(f"[DEV] Skipping AV:{file_path[50:70]}")
        return

    #tqdm.write(f"[DEV] Attempting AV:{file_path[50:70]}")

    chunks = chunking(content, 1000, 200)

    # 1️⃣ Add to Chroma FIRST
    for i, chunk in enumerate(chunks):
        collection.add(
            documents=[chunk],
            metadatas=[{"file_id": file_path}],
            ids=[f"{file_path}::chunk_{i}"]
        )

    # 2️⃣ Mark vectorized AFTER success
    cur.execute(
        """
        INSERT INTO vectorized_files (file_id, hash)
        VALUES (?, ?)
        ON CONFLICT(file_id) DO UPDATE SET
            hash = excluded.hash
        """,
        (file_path, file_hash)
    )
    conn.commit()


In [9]:
add_to_chroma(files_to_vectorize[12])


In [11]:
result = collection.get(
    limit=1,
    include=["documents", "metadatas"]
)
print(result)


{'ids': ['C:\\Users\\Jamin Carter\\Downloads\\web_archive\\2025-12-18__github_com_open_webui_open_webui.txt::chunk_0'], 'embeddings': None, 'documents': ["Skip to content\nYou signed in with another tab or window. Reload to refresh your session.\nYou signed out in another tab or window. Reload to refresh your session.\nYou switched accounts on another tab or window. Reload to refresh your session.\nDismiss alert\nopen-webui\nPublic\nSponsor\nWatch\nCouldn't load subscription status.\nRetry\nUh oh!\nThere was an error while loading. Please reload this page.\nFork\n16.6k\nFork your own copy of open-webui/open-webui\nForks could not be loaded\nLoading\nUh oh!\nThere was an error while loading. Please reload this page.\nStarred\n118k\nLoading\nUh oh!\nThere was an error while loading. Please reload this page.\nStar\n118k\nLoading\nUh oh!\nThere was an error while loading. Please reload this page.\nWatch536\nSponsor open-webui/open-webui\nSponsor open-webui/open-webui\nGitHub Sponsors\nLear

In [12]:
from tqdm import tqdm

pbar = tqdm(files_to_vectorize, desc="Vectorizing files")

for file_path in pbar:
    pbar.set_postfix(file=file_path[56:])  # show last part only
    add_to_chroma(file_path)


Vectorizing files: 100%|██████████| 72/72 [03:49<00:00,  3.19s/it, file=]                                                                                    


92