In [None]:
!pip install -q gitpython chromadb sentence-transformers langchain langchain-community langchain-text-splitters

In [None]:
import os
import json
import shutil
from pathlib import Path
from datetime import datetime
import git
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

In [None]:
# Remote Git repository whose files are cloned locally and indexed.
REPO_URL = "https://github.com/langchain-ai/langchain.git"  # Change to any repo
# Local working copy of the repository.
LOCAL_REPO_PATH = "./langchain_repo"
# Directory handed to Chroma as its persist directory.
VECTOR_DB_PATH = "./chroma_db"
# JSON file recording the last processed commit hash and the update time.
STATE_FILE = "./sync_state.json"

# Only process these file types (adjust as needed)
ALLOWED_EXTENSIONS = ['.md', '.mdx', '.txt', '.py']

# Specific directories to monitor (leave empty to monitor all).
# Entries are joined onto LOCAL_REPO_PATH, i.e. they are relative to the repo root.
# NOTE(review): the diagnostic output recorded in this notebook reports
# 'docs/docs' as missing from the clone, so nothing gets indexed with this
# value - confirm the path against the repository's current layout.
MONITOR_PATHS = ['docs/docs']  # For LangChain, focus on docs

# Chunking parameters passed to RecursiveCharacterTextSplitter
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

In [None]:
def load_state():
    """Return the persisted sync state.

    Returns:
        dict: the parsed contents of ``STATE_FILE`` when that file exists,
        otherwise a fresh state whose ``last_commit`` and ``last_update``
        are both ``None``.
    """
    if not os.path.exists(STATE_FILE):
        # Nothing has been recorded yet: report an empty state.
        return {"last_commit": None, "last_update": None}

    with open(STATE_FILE, 'r') as handle:
        stored = json.load(handle)
    return stored

def save_state(commit_hash):
    """Record ``commit_hash`` as the last processed commit.

    Overwrites ``STATE_FILE`` with a JSON object holding the commit hash and
    the current local time in ISO format, then prints a short confirmation.

    Args:
        commit_hash: full commit hash string; its first 8 characters are
            shown in the confirmation message.
    """
    payload = dict(
        last_commit=commit_hash,
        last_update=datetime.now().isoformat(),
    )
    serialized = json.dumps(payload, indent=2)
    with open(STATE_FILE, 'w') as handle:
        handle.write(serialized)
    short_hash = commit_hash[:8]
    print(f"✓ State saved: {short_hash}")

In [None]:
def clone_or_update_repo():
    """Clone the repository on first use; otherwise pull the latest commits.

    An existing checkout at ``LOCAL_REPO_PATH`` is updated by pulling, from
    ``origin``, the branch that is currently checked out. When no checkout
    exists, ``REPO_URL`` is cloned with ``depth=1``.

    Returns:
        git.Repo: handle to the local checkout.
    """
    if os.path.exists(LOCAL_REPO_PATH):
        print("📦 Repository exists, fetching updates...")
        repo = git.Repo(LOCAL_REPO_PATH)
        try:
            # Follow whichever branch the clone checked out instead of
            # assuming the remote's default branch is called "master".
            branch = repo.active_branch.name
        except TypeError:
            # GitPython raises TypeError when HEAD is detached; fall back to
            # the branch name this function has always used.
            branch = 'master'
        # `git pull` fetches before merging, so no separate fetch is needed.
        repo.git.pull('origin', branch)
        print("✓ Repository updated")
    else:
        print("📥 Cloning repository...")
        repo = git.Repo.clone_from(REPO_URL, LOCAL_REPO_PATH, depth=1)
        print("✓ Repository cloned")

    return repo

def _all_files_as_added(repo, current_commit):
    """Build a change set that reports every monitored file as newly added."""
    return {
        'added': get_all_monitored_files(repo),
        'modified': [],
        'deleted': [],
        'current_commit': current_commit
    }


def get_changed_files(repo, last_commit):
    """Classify monitored files changed between ``last_commit`` and HEAD.

    Args:
        repo: repository handle exposing ``head.commit.hexsha`` and
            ``git.diff(...)``.
        last_commit: hash of the last processed commit, or ``None`` on the
            first run.

    Returns:
        dict with the keys ``'added'``, ``'modified'``, ``'deleted'`` (lists
        of repository-relative paths) and ``'current_commit'`` (HEAD hash).
        On the first run, or when the diff against ``last_commit`` fails,
        every monitored file is reported as added.

    Status handling for ``git diff --name-status`` lines:
        * ``A`` -> added, ``M`` -> modified, ``D`` -> deleted
        * ``T`` (type change) -> modified
        * ``R###`` (rename) -> old path deleted, new path added
        * ``C###`` (copy) -> new path added
    """
    current_commit = repo.head.commit.hexsha

    if last_commit is None:
        # First run - treat all files as added
        print("🆕 First run - will process all files")
        return _all_files_as_added(repo, current_commit)

    # Get diff between commits
    print(f"🔍 Comparing {last_commit[:8]} → {current_commit[:8]}")

    try:
        diff_output = repo.git.diff('--name-status', f'{last_commit}..{current_commit}')
    except git.exc.GitCommandError:
        print("⚠️ Cannot find previous commit, processing all files")
        return _all_files_as_added(repo, current_commit)

    added = []
    modified = []
    deleted = []

    for line in (diff_output or '').splitlines():
        parts = line.split('\t')
        if len(parts) < 2:
            continue

        # Rename/copy statuses carry a similarity score (e.g. "R100"), so
        # only the first character identifies the kind of change.
        kind = parts[0][:1]

        if kind in ('R', 'C'):
            # These lines have two paths: <status>\t<old>\t<new>
            if len(parts) < 3:
                continue
            old_path, new_path = parts[1], parts[2]
            # A rename removes the old path; a copy leaves it in place.
            if kind == 'R' and should_process_file(old_path):
                deleted.append(old_path)
            if should_process_file(new_path):
                added.append(new_path)
            continue

        file_path = parts[1]

        # Filter by allowed extensions and paths
        if not should_process_file(file_path):
            continue

        if kind == 'A':
            added.append(file_path)
        elif kind in ('M', 'T'):
            modified.append(file_path)
        elif kind == 'D':
            deleted.append(file_path)

    print(f"📊 Changes detected: {len(added)} added, {len(modified)} modified, {len(deleted)} deleted")

    return {
        'added': added,
        'modified': modified,
        'deleted': deleted,
        'current_commit': current_commit
    }

def get_all_monitored_files(repo):
    """List every file in the local checkout that should be indexed.

    Scans each directory in ``MONITOR_PATHS`` (or the whole checkout when
    that list is empty) for files whose names end with one of
    ``ALLOWED_EXTENSIONS``.

    Args:
        repo: unused; kept so existing callers keep working.

    Returns:
        list[str]: repository-relative paths using ``/`` separators (the
        same form ``git diff --name-status`` reports), without duplicates.
    """
    repo_root = Path(LOCAL_REPO_PATH)

    if MONITOR_PATHS:
        search_roots = [(entry, repo_root / entry) for entry in MONITOR_PATHS]
    else:
        search_roots = [(str(repo_root), repo_root)]

    seen = set()
    found = []

    for label, root in search_roots:
        if not root.exists():
            # A missing directory would otherwise yield an empty index
            # without any hint about the cause.
            print(f"⚠️ Monitored path not found: {label}")
            continue

        for ext in ALLOWED_EXTENSIONS:
            for candidate in root.rglob(f'*{ext}'):
                # rglob also matches directories whose name ends with ext.
                if not candidate.is_file():
                    continue
                relative = candidate.relative_to(repo_root)
                # Git metadata is never part of a diff; keep it out here too.
                if '.git' in relative.parts:
                    continue
                # Forward slashes keep stored paths identical to the ones
                # parsed from git output on every platform.
                key = relative.as_posix()
                if key not in seen:
                    seen.add(key)
                    found.append(key)

    return found

def should_process_file(file_path):
    """Decide whether a repository-relative path belongs in the index.

    A path qualifies when it ends with one of ``ALLOWED_EXTENSIONS`` and,
    if ``MONITOR_PATHS`` is non-empty, lies inside one of those directories.

    Args:
        file_path: repository-relative path using ``/`` separators.

    Returns:
        bool: ``True`` when the file should be processed.
    """
    # Check extension
    if not file_path.endswith(tuple(ALLOWED_EXTENSIONS)):
        return False

    # No directory restriction configured: every matching file qualifies.
    if not MONITOR_PATHS:
        return True

    for monitored in MONITOR_PATHS:
        prefix = monitored.rstrip('/')
        if prefix in ('', '.'):
            # The repository root itself is monitored.
            return True
        # Compare on a directory boundary so that 'docs/docs' does not
        # also match siblings such as 'docs/docs_legacy/...'.
        if file_path == prefix or file_path.startswith(prefix + '/'):
            return True

    return False

In [None]:
def initialize_embeddings():
    """Create the embedding model used for indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2`` configured with
        ``device='cpu'``.
    """
    print("🤖 Loading embedding model...")
    model_options = dict(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    embedder = HuggingFaceEmbeddings(**model_options)
    print("✓ Embedding model loaded")
    return embedder

def initialize_vectordb(embeddings):
    """Open the Chroma collection stored under ``VECTOR_DB_PATH``.

    Args:
        embeddings: embedding function handed to Chroma.

    Returns:
        The ``Chroma`` vector store bound to the ``langchain_docs``
        collection.
    """
    print("💾 Initializing Vector Database...")
    store_options = {
        'persist_directory': VECTOR_DB_PATH,
        'embedding_function': embeddings,
        'collection_name': "langchain_docs",
    }
    store = Chroma(**store_options)
    print("✓ Vector Database ready")
    return store

def process_file_to_chunks(file_path):
    """Read one repository file and split it into chunks.

    Args:
        file_path: path relative to ``LOCAL_REPO_PATH``.

    Returns:
        list: the chunks produced by ``RecursiveCharacterTextSplitter``.
        Each one carries ``source_file`` (the relative path) and
        ``file_type`` (the file suffix) in its metadata. An empty list is
        returned when the file does not exist or when loading/splitting
        raises; in the latter case the error is printed.
    """
    absolute_path = os.path.join(LOCAL_REPO_PATH, file_path)

    if os.path.exists(absolute_path):
        try:
            loaded_docs = TextLoader(absolute_path, encoding='utf-8').load()

            # Tag every document so its chunks can later be found by file.
            for document in loaded_docs:
                document.metadata.update({
                    'source_file': file_path,
                    'file_type': Path(file_path).suffix,
                })

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=CHUNK_SIZE,
                chunk_overlap=CHUNK_OVERLAP
            )
            return splitter.split_documents(loaded_docs)
        except Exception as e:
            print(f"⚠️ Error processing {file_path}: {e}")

    return []

def process_additions_and_modifications(vectordb, files, batch_size=1000):
    """Index added and modified files, replacing any chunks they already have.

    For every path the chunks previously stored under the same
    ``source_file`` are deleted first, so re-indexing a modified file (or
    re-running a full scan) does not leave duplicate or stale chunks behind.
    New chunks are then written in batches instead of one unbounded call.

    Args:
        vectordb: vector store exposing ``add_documents`` and a
            ``_collection`` with ``delete(where=...)``.
        files: repository-relative paths to (re)index.
        batch_size: maximum number of chunks per ``add_documents`` call.
    """
    if not files:
        return

    print(f"\n➕ Processing {len(files)} added/modified files...")

    all_chunks = []
    for file_path in files:
        try:
            # Drop the previous version of this file before adding the new one.
            vectordb._collection.delete(where={"source_file": file_path})
        except Exception as e:
            # Best effort, mirroring how deletions are handled elsewhere.
            print(f"  ⚠️ Could not clear old chunks for {file_path}: {e}")

        chunks = process_file_to_chunks(file_path)
        if chunks:
            all_chunks.extend(chunks)
            print(f"  ✓ {file_path}: {len(chunks)} chunks")

    if all_chunks:
        # Guard against zero/negative sizes, which would break range().
        step = max(1, int(batch_size))
        for start in range(0, len(all_chunks), step):
            vectordb.add_documents(all_chunks[start:start + step])
        print(f"✓ Added {len(all_chunks)} chunks to VectorDB")

def process_deletions(vectordb, files):
    """Remove the stored chunks of files that were deleted from the repo.

    Args:
        vectordb: vector store whose ``_collection`` supports
            ``delete(where=...)``.
        files: repository-relative paths whose chunks should be removed.

    A failure for one file is printed and does not stop the remaining
    deletions.
    """
    if files:
        print(f"\n🗑️ Processing {len(files)} deleted files...")

        for removed_path in files:
            # Chunks are located through the source_file metadata field.
            metadata_filter = {"source_file": removed_path}
            try:
                vectordb._collection.delete(where=metadata_filter)
            except Exception as e:
                print(f"  ⚠️ Error deleting {removed_path}: {e}")
            else:
                print(f"  ✓ Removed: {removed_path}")

In [None]:
def run_incremental_update():
    """Run one sync pass: detect repository changes and update the vector store.

    Steps: load the saved state, clone/pull the repository, compute the
    change set since the last processed commit, apply deletions and then
    additions/modifications to the vector store, and record the new commit.

    The state file is NOT written when a first run finds no monitored files
    (for example because a ``MONITOR_PATHS`` entry does not exist in the
    checkout). Recording the commit in that situation would make every later
    run report "no changes" while the index stays empty.

    Returns:
        The vector store handle returned by ``initialize_vectordb``.
    """
    banner = "=" * 70
    print(banner)
    print("🚀 INCREMENTAL VECTORDB UPDATE PIPELINE")
    print(banner)

    # 1. Load previous state
    state = load_state()
    last_commit = state.get('last_commit')
    print(f"\n📋 Last processed commit: {last_commit[:8] if last_commit else 'None (first run)'}")

    # 2. Clone/update repository
    repo = clone_or_update_repo()

    # 3. Detect changes
    changes = get_changed_files(repo, last_commit)

    # 4. Initialize embeddings and vector DB
    embeddings = initialize_embeddings()
    vectordb = initialize_vectordb(embeddings)

    # 5. Process changes
    total_changes = len(changes['added']) + len(changes['modified']) + len(changes['deleted'])
    record_commit = True

    if total_changes == 0:
        if last_commit is None:
            # A first run is a full scan; zero files means the filters match
            # nothing, not that the index is current.
            print("\n⚠️ No monitored files found - check MONITOR_PATHS and ALLOWED_EXTENSIONS")
            print("   State not saved, so the next run will scan again.")
            record_commit = False
        else:
            print("\n✨ No changes detected - VectorDB is up to date!")
    else:
        # Process deletions first
        process_deletions(vectordb, changes['deleted'])

        # Process additions and modifications
        process_additions_and_modifications(
            vectordb,
            changes['added'] + changes['modified']
        )

        # Persist changes. Newer Chroma integrations persist automatically
        # and may not offer persist() at all, so only call it when present.
        persist = getattr(vectordb, 'persist', None)
        if callable(persist):
            persist()
        print("\n💾 VectorDB persisted")

    # 6. Save new state
    if record_commit:
        save_state(changes['current_commit'])

    print("\n" + banner)
    print("✅ UPDATE COMPLETE")
    print(banner)

    return vectordb


In [None]:
def test_query(vectordb, query, k=3):
    """Test the vector database with a query"""
    print(f"\n🔍 Searching for: '{query}'")
    results = vectordb.similarity_search(query, k=k)

    print(f"\n📚 Top {len(results)} results:")
    for i, doc in enumerate(results, 1):
        print(f"\n{i}. Source: {doc.metadata.get('source_file', 'Unknown')}")
        print(f"   Preview: {doc.page_content[:200]}...")

def get_stats(vectordb):
    """Print the chunk count, collection name and storage path.

    Args:
        vectordb: vector store whose ``_collection`` offers ``count()`` and
            ``name``.

    Any exception raised while gathering the numbers is printed instead of
    propagated.
    """
    try:
        coll = vectordb._collection
        total = coll.count()
        print("\n📊 VectorDB Statistics:")
        print(f"   Total chunks: {total}")
        print(f"   Collection: {coll.name}")
        print(f"   Path: {VECTOR_DB_PATH}")
    except Exception as err:
        print(f"⚠️ Could not fetch stats: {err}")

In [None]:

# Run the incremental update and keep the returned vector store for the cells below
vectordb = run_incremental_update()


🚀 INCREMENTAL VECTORDB UPDATE PIPELINE

📋 Last processed commit: None (first run)
📥 Cloning repository...
✓ Repository cloned
🆕 First run - will process all files
🤖 Loading embedding model...


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Embedding model loaded
💾 Initializing Vector Database...


  vectordb = Chroma(


✓ Vector Database ready

✨ No changes detected - VectorDB is up to date!
✓ State saved: 4c38157e

✅ UPDATE COMPLETE


In [None]:
# Get database statistics (chunk count, collection name, storage path)
get_stats(vectordb)


📊 VectorDB Statistics:
   Total chunks: 0
   Collection: langchain_docs
   Path: ./chroma_db


In [None]:
# Test with a query: prints the source file and a 200-character preview of each match
test_query(vectordb, "How do I use LangChain with OpenAI?")


🔍 Searching for: 'How do I use LangChain with OpenAI?'

📚 Top 0 results:


In [None]:
# Second run: a commit is now recorded in the state file, so changes are found by diffing against it
vectordb = run_incremental_update()

🚀 INCREMENTAL VECTORDB UPDATE PIPELINE

📋 Last processed commit: 4c38157e
📦 Repository exists, fetching updates...
✓ Repository updated
🔍 Comparing 4c38157e → 4c38157e
📊 Changes detected: 0 added, 0 modified, 0 deleted
🤖 Loading embedding model...
✓ Embedding model loaded
💾 Initializing Vector Database...
✓ Vector Database ready

✨ No changes detected - VectorDB is up to date!
✓ State saved: 4c38157e

✅ UPDATE COMPLETE


In [None]:
# Incremental VectorDB Updater for GitHub Repositories
# Simple implementation for Google Colab - no complex classes, just functions

# ============================================================================
# STEP 1: Install required packages
# ============================================================================
"""
Run this cell first in Colab:

!pip install -q gitpython chromadb sentence-transformers langchain langchain-community langchain-text-splitters
"""

# ============================================================================
# STEP 2: Imports and Setup
# ============================================================================
import os
import json
import shutil
from pathlib import Path
from datetime import datetime
import git
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# ============================================================================
# CONFIGURATION
# ============================================================================
# Remote Git repository whose files are cloned locally and indexed.
REPO_URL = "https://github.com/langchain-ai/langchain.git"  # Change to any repo
# Local working copy of the repository.
LOCAL_REPO_PATH = "./langchain_repo"
# Directory handed to Chroma as its persist directory.
VECTOR_DB_PATH = "./chroma_db"
# JSON file recording the last processed commit hash and the update time.
STATE_FILE = "./sync_state.json"

# Only process these file types (adjust as needed)
ALLOWED_EXTENSIONS = ['.md', '.mdx', '.txt', '.py']

# Specific directories to monitor (leave empty to monitor all).
# Entries are joined onto LOCAL_REPO_PATH, i.e. they are relative to the repo root.
# NOTE(review): the diagnostic output recorded in this notebook reports
# 'docs/docs' as missing from the clone, so nothing gets indexed with this
# value - confirm the path against the repository's current layout.
MONITOR_PATHS = ['docs/docs']  # For LangChain, focus on docs

# Chunking parameters passed to RecursiveCharacterTextSplitter
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# ============================================================================
# STATE MANAGEMENT
# ============================================================================

def load_state():
    """Return the persisted sync state.

    Returns:
        dict: the parsed contents of ``STATE_FILE`` when that file exists,
        otherwise a fresh state whose ``last_commit`` and ``last_update``
        are both ``None``.
    """
    if not os.path.exists(STATE_FILE):
        # Nothing has been recorded yet: report an empty state.
        return {"last_commit": None, "last_update": None}

    with open(STATE_FILE, 'r') as handle:
        stored = json.load(handle)
    return stored

def save_state(commit_hash):
    """Record ``commit_hash`` as the last processed commit.

    Overwrites ``STATE_FILE`` with a JSON object holding the commit hash and
    the current local time in ISO format, then prints a short confirmation.

    Args:
        commit_hash: full commit hash string; its first 8 characters are
            shown in the confirmation message.
    """
    payload = dict(
        last_commit=commit_hash,
        last_update=datetime.now().isoformat(),
    )
    serialized = json.dumps(payload, indent=2)
    with open(STATE_FILE, 'w') as handle:
        handle.write(serialized)
    short_hash = commit_hash[:8]
    print(f"✓ State saved: {short_hash}")

# ============================================================================
# REPOSITORY OPERATIONS
# ============================================================================

def clone_or_update_repo():
    """Clone the repository on first use; otherwise pull the latest commits.

    An existing checkout at ``LOCAL_REPO_PATH`` is updated by pulling, from
    ``origin``, the branch that is currently checked out. When no checkout
    exists, ``REPO_URL`` is cloned with ``depth=1``.

    Returns:
        git.Repo: handle to the local checkout.
    """
    if os.path.exists(LOCAL_REPO_PATH):
        print("📦 Repository exists, fetching updates...")
        repo = git.Repo(LOCAL_REPO_PATH)
        try:
            # Follow whichever branch the clone checked out instead of
            # assuming the remote's default branch is called "master".
            branch = repo.active_branch.name
        except TypeError:
            # GitPython raises TypeError when HEAD is detached; fall back to
            # the branch name this function has always used.
            branch = 'master'
        # `git pull` fetches before merging, so no separate fetch is needed.
        repo.git.pull('origin', branch)
        print("✓ Repository updated")
    else:
        print("📥 Cloning repository...")
        repo = git.Repo.clone_from(REPO_URL, LOCAL_REPO_PATH, depth=1)
        print("✓ Repository cloned")

    return repo

def _all_files_as_added(repo, current_commit):
    """Build a change set that reports every monitored file as newly added."""
    return {
        'added': get_all_monitored_files(repo),
        'modified': [],
        'deleted': [],
        'current_commit': current_commit
    }


def get_changed_files(repo, last_commit):
    """Classify monitored files changed between ``last_commit`` and HEAD.

    Args:
        repo: repository handle exposing ``head.commit.hexsha`` and
            ``git.diff(...)``.
        last_commit: hash of the last processed commit, or ``None`` on the
            first run.

    Returns:
        dict with the keys ``'added'``, ``'modified'``, ``'deleted'`` (lists
        of repository-relative paths) and ``'current_commit'`` (HEAD hash).
        On the first run, or when the diff against ``last_commit`` fails,
        every monitored file is reported as added.

    Status handling for ``git diff --name-status`` lines:
        * ``A`` -> added, ``M`` -> modified, ``D`` -> deleted
        * ``T`` (type change) -> modified
        * ``R###`` (rename) -> old path deleted, new path added
        * ``C###`` (copy) -> new path added
    """
    current_commit = repo.head.commit.hexsha

    if last_commit is None:
        # First run - treat all files as added
        print("🆕 First run - will process all files")
        return _all_files_as_added(repo, current_commit)

    # Get diff between commits
    print(f"🔍 Comparing {last_commit[:8]} → {current_commit[:8]}")

    try:
        diff_output = repo.git.diff('--name-status', f'{last_commit}..{current_commit}')
    except git.exc.GitCommandError:
        print("⚠️ Cannot find previous commit, processing all files")
        return _all_files_as_added(repo, current_commit)

    added = []
    modified = []
    deleted = []

    for line in (diff_output or '').splitlines():
        parts = line.split('\t')
        if len(parts) < 2:
            continue

        # Rename/copy statuses carry a similarity score (e.g. "R100"), so
        # only the first character identifies the kind of change.
        kind = parts[0][:1]

        if kind in ('R', 'C'):
            # These lines have two paths: <status>\t<old>\t<new>
            if len(parts) < 3:
                continue
            old_path, new_path = parts[1], parts[2]
            # A rename removes the old path; a copy leaves it in place.
            if kind == 'R' and should_process_file(old_path):
                deleted.append(old_path)
            if should_process_file(new_path):
                added.append(new_path)
            continue

        file_path = parts[1]

        # Filter by allowed extensions and paths
        if not should_process_file(file_path):
            continue

        if kind == 'A':
            added.append(file_path)
        elif kind in ('M', 'T'):
            modified.append(file_path)
        elif kind == 'D':
            deleted.append(file_path)

    print(f"📊 Changes detected: {len(added)} added, {len(modified)} modified, {len(deleted)} deleted")

    return {
        'added': added,
        'modified': modified,
        'deleted': deleted,
        'current_commit': current_commit
    }

def get_all_monitored_files(repo):
    """List every file in the local checkout that should be indexed.

    Scans each directory in ``MONITOR_PATHS`` (or the whole checkout when
    that list is empty) for files whose names end with one of
    ``ALLOWED_EXTENSIONS``.

    Args:
        repo: unused; kept so existing callers keep working.

    Returns:
        list[str]: repository-relative paths using ``/`` separators (the
        same form ``git diff --name-status`` reports), without duplicates.
    """
    repo_root = Path(LOCAL_REPO_PATH)

    if MONITOR_PATHS:
        search_roots = [(entry, repo_root / entry) for entry in MONITOR_PATHS]
    else:
        search_roots = [(str(repo_root), repo_root)]

    seen = set()
    found = []

    for label, root in search_roots:
        if not root.exists():
            # A missing directory would otherwise yield an empty index
            # without any hint about the cause.
            print(f"⚠️ Monitored path not found: {label}")
            continue

        for ext in ALLOWED_EXTENSIONS:
            for candidate in root.rglob(f'*{ext}'):
                # rglob also matches directories whose name ends with ext.
                if not candidate.is_file():
                    continue
                relative = candidate.relative_to(repo_root)
                # Git metadata is never part of a diff; keep it out here too.
                if '.git' in relative.parts:
                    continue
                # Forward slashes keep stored paths identical to the ones
                # parsed from git output on every platform.
                key = relative.as_posix()
                if key not in seen:
                    seen.add(key)
                    found.append(key)

    return found

def should_process_file(file_path):
    """Decide whether a repository-relative path belongs in the index.

    A path qualifies when it ends with one of ``ALLOWED_EXTENSIONS`` and,
    if ``MONITOR_PATHS`` is non-empty, lies inside one of those directories.

    Args:
        file_path: repository-relative path using ``/`` separators.

    Returns:
        bool: ``True`` when the file should be processed.
    """
    # Check extension
    if not file_path.endswith(tuple(ALLOWED_EXTENSIONS)):
        return False

    # No directory restriction configured: every matching file qualifies.
    if not MONITOR_PATHS:
        return True

    for monitored in MONITOR_PATHS:
        prefix = monitored.rstrip('/')
        if prefix in ('', '.'):
            # The repository root itself is monitored.
            return True
        # Compare on a directory boundary so that 'docs/docs' does not
        # also match siblings such as 'docs/docs_legacy/...'.
        if file_path == prefix or file_path.startswith(prefix + '/'):
            return True

    return False

# ============================================================================
# EMBEDDING & VECTOR DB OPERATIONS
# ============================================================================

def initialize_embeddings():
    """Create the embedding model used for indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2`` configured with
        ``device='cpu'``.
    """
    print("🤖 Loading embedding model...")
    model_options = dict(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
    )
    embedder = HuggingFaceEmbeddings(**model_options)
    print("✓ Embedding model loaded")
    return embedder

def initialize_vectordb(embeddings):
    """Open the Chroma collection stored under ``VECTOR_DB_PATH``.

    Args:
        embeddings: embedding function handed to Chroma.

    Returns:
        The ``Chroma`` vector store bound to the ``langchain_docs``
        collection.
    """
    print("💾 Initializing Vector Database...")
    store_options = {
        'persist_directory': VECTOR_DB_PATH,
        'embedding_function': embeddings,
        'collection_name': "langchain_docs",
    }
    store = Chroma(**store_options)
    print("✓ Vector Database ready")
    return store

def process_file_to_chunks(file_path):
    """Read one repository file and split it into chunks.

    Args:
        file_path: path relative to ``LOCAL_REPO_PATH``.

    Returns:
        list: the chunks produced by ``RecursiveCharacterTextSplitter``.
        Each one carries ``source_file`` (the relative path) and
        ``file_type`` (the file suffix) in its metadata. An empty list is
        returned when the file does not exist or when loading/splitting
        raises; in the latter case the error is printed.
    """
    absolute_path = os.path.join(LOCAL_REPO_PATH, file_path)

    if os.path.exists(absolute_path):
        try:
            loaded_docs = TextLoader(absolute_path, encoding='utf-8').load()

            # Tag every document so its chunks can later be found by file.
            for document in loaded_docs:
                document.metadata.update({
                    'source_file': file_path,
                    'file_type': Path(file_path).suffix,
                })

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=CHUNK_SIZE,
                chunk_overlap=CHUNK_OVERLAP
            )
            return splitter.split_documents(loaded_docs)
        except Exception as e:
            print(f"⚠️ Error processing {file_path}: {e}")

    return []

def process_additions_and_modifications(vectordb, files, batch_size=1000):
    """Index added and modified files, replacing any chunks they already have.

    For every path the chunks previously stored under the same
    ``source_file`` are deleted first, so re-indexing a modified file (or
    re-running a full scan) does not leave duplicate or stale chunks behind.
    New chunks are then written in batches instead of one unbounded call.

    Args:
        vectordb: vector store exposing ``add_documents`` and a
            ``_collection`` with ``delete(where=...)``.
        files: repository-relative paths to (re)index.
        batch_size: maximum number of chunks per ``add_documents`` call.
    """
    if not files:
        return

    print(f"\n➕ Processing {len(files)} added/modified files...")

    all_chunks = []
    for file_path in files:
        try:
            # Drop the previous version of this file before adding the new one.
            vectordb._collection.delete(where={"source_file": file_path})
        except Exception as e:
            # Best effort, mirroring how deletions are handled elsewhere.
            print(f"  ⚠️ Could not clear old chunks for {file_path}: {e}")

        chunks = process_file_to_chunks(file_path)
        if chunks:
            all_chunks.extend(chunks)
            print(f"  ✓ {file_path}: {len(chunks)} chunks")

    if all_chunks:
        # Guard against zero/negative sizes, which would break range().
        step = max(1, int(batch_size))
        for start in range(0, len(all_chunks), step):
            vectordb.add_documents(all_chunks[start:start + step])
        print(f"✓ Added {len(all_chunks)} chunks to VectorDB")

def process_deletions(vectordb, files):
    """Remove the stored chunks of files that were deleted from the repo.

    Args:
        vectordb: vector store whose ``_collection`` supports
            ``delete(where=...)``.
        files: repository-relative paths whose chunks should be removed.

    A failure for one file is printed and does not stop the remaining
    deletions.
    """
    if files:
        print(f"\n🗑️ Processing {len(files)} deleted files...")

        for removed_path in files:
            # Chunks are located through the source_file metadata field.
            metadata_filter = {"source_file": removed_path}
            try:
                vectordb._collection.delete(where=metadata_filter)
            except Exception as e:
                print(f"  ⚠️ Error deleting {removed_path}: {e}")
            else:
                print(f"  ✓ Removed: {removed_path}")

# ============================================================================
# MAIN PIPELINE
# ============================================================================

def run_incremental_update():
    """Run one sync pass: detect repository changes and update the vector store.

    Steps: load the saved state, clone/pull the repository, compute the
    change set since the last processed commit, apply deletions and then
    additions/modifications to the vector store, and record the new commit.

    The state file is NOT written when a first run finds no monitored files
    (for example because a ``MONITOR_PATHS`` entry does not exist in the
    checkout). Recording the commit in that situation would make every later
    run report "no changes" while the index stays empty.

    Returns:
        The vector store handle returned by ``initialize_vectordb``.
    """
    banner = "=" * 70
    print(banner)
    print("🚀 INCREMENTAL VECTORDB UPDATE PIPELINE")
    print(banner)

    # 1. Load previous state
    state = load_state()
    last_commit = state.get('last_commit')
    print(f"\n📋 Last processed commit: {last_commit[:8] if last_commit else 'None (first run)'}")

    # 2. Clone/update repository
    repo = clone_or_update_repo()

    # 3. Detect changes
    changes = get_changed_files(repo, last_commit)

    # 4. Initialize embeddings and vector DB
    embeddings = initialize_embeddings()
    vectordb = initialize_vectordb(embeddings)

    # 5. Process changes
    total_changes = len(changes['added']) + len(changes['modified']) + len(changes['deleted'])
    record_commit = True

    if total_changes == 0:
        if last_commit is None:
            # A first run is a full scan; zero files means the filters match
            # nothing, not that the index is current.
            print("\n⚠️ No monitored files found - check MONITOR_PATHS and ALLOWED_EXTENSIONS")
            print("   State not saved, so the next run will scan again.")
            record_commit = False
        else:
            print("\n✨ No changes detected - VectorDB is up to date!")
    else:
        # Process deletions first
        process_deletions(vectordb, changes['deleted'])

        # Process additions and modifications
        process_additions_and_modifications(
            vectordb,
            changes['added'] + changes['modified']
        )

        # Persist changes. Newer Chroma integrations persist automatically
        # and may not offer persist() at all, so only call it when present.
        persist = getattr(vectordb, 'persist', None)
        if callable(persist):
            persist()
        print("\n💾 VectorDB persisted")

    # 6. Save new state
    if record_commit:
        save_state(changes['current_commit'])

    print("\n" + banner)
    print("✅ UPDATE COMPLETE")
    print(banner)

    return vectordb

# ============================================================================
# HELPER FUNCTIONS FOR TESTING
# ============================================================================

def test_query(vectordb, query, k=3):
    """Test the vector database with a query"""
    print(f"\n🔍 Searching for: '{query}'")
    results = vectordb.similarity_search(query, k=k)

    print(f"\n📚 Top {len(results)} results:")
    for i, doc in enumerate(results, 1):
        print(f"\n{i}. Source: {doc.metadata.get('source_file', 'Unknown')}")
        print(f"   Preview: {doc.page_content[:200]}...")

def get_stats(vectordb):
    """Print the chunk count, collection name, storage path and sample sources.

    Args:
        vectordb: vector store whose ``_collection`` offers ``count()``,
            ``name`` and ``peek(limit=...)``.

    When the collection is non-empty, the ``source_file`` value of up to
    three peeked entries is listed as well. Any exception raised while
    gathering the numbers is printed instead of propagated.
    """
    try:
        coll = vectordb._collection
        total = coll.count()
        print("\n📊 VectorDB Statistics:")
        print(f"   Total chunks: {total}")
        print(f"   Collection: {coll.name}")
        print(f"   Path: {VECTOR_DB_PATH}")

        # Nothing to sample from an empty collection.
        if not total > 0:
            return

        # Show sample of metadata
        peeked = coll.peek(limit=3)
        has_metadata = peeked and 'metadatas' in peeked and peeked['metadatas']
        if not has_metadata:
            return

        print("\n   Sample sources:")
        for entry in peeked['metadatas'][:3]:
            if not entry:
                continue
            print(f"     - {entry.get('source_file', 'Unknown')}")
    except Exception as err:
        print(f"⚠️ Could not fetch stats: {err}")

def debug_check():
    """Print a diagnostic report on the clone, the vector DB and the state file.

    * Repository: whether ``LOCAL_REPO_PATH`` exists and, per entry of
      ``MONITOR_PATHS``, how many files match each allowed extension.
    * Vector DB: whether ``VECTOR_DB_PATH`` exists; if so the store is
      opened and its statistics are printed.
    * State file: whether ``STATE_FILE`` exists and what it records.
    """
    divider = "=" * 70
    print("\n🔧 DIAGNOSTIC CHECK")
    print(divider)

    # --- Repository -------------------------------------------------------
    if not os.path.exists(LOCAL_REPO_PATH):
        print(f"✗ Repository not found at: {LOCAL_REPO_PATH}")
    else:
        print(f"✓ Repository exists at: {LOCAL_REPO_PATH}")

        if not MONITOR_PATHS:
            print("  Monitoring all files in repo")
        else:
            checkout_root = Path(LOCAL_REPO_PATH)
            monitored_total = 0
            for rel_dir in MONITOR_PATHS:
                target_dir = checkout_root / rel_dir
                if not target_dir.exists():
                    print(f"  ⚠️ Path does not exist: {rel_dir}")
                    continue
                # Count matches per extension inside this directory.
                for suffix in ALLOWED_EXTENSIONS:
                    matches = list(target_dir.rglob(f'*{suffix}'))
                    monitored_total += len(matches)
                    print(f"  Found {len(matches)} {suffix} files in {rel_dir}")
            print(f"  Total monitored files: {monitored_total}")

    # --- Vector DB --------------------------------------------------------
    if not os.path.exists(VECTOR_DB_PATH):
        print(f"\n✗ Vector DB not found at: {VECTOR_DB_PATH}")
    else:
        print(f"\n✓ Vector DB exists at: {VECTOR_DB_PATH}")
        try:
            store = initialize_vectordb(initialize_embeddings())
            get_stats(store)
        except Exception as e:
            print(f"✗ Error loading Vector DB: {e}")

    # --- State file -------------------------------------------------------
    if not os.path.exists(STATE_FILE):
        print("\n✗ State file not found (first run not completed)")
    else:
        print("\n✓ State file exists")
        recorded = load_state()
        print(f"  Last commit: {recorded.get('last_commit', 'None')}")
        print(f"  Last update: {recorded.get('last_update', 'None')}")

    print(divider)

# ============================================================================
# USAGE EXAMPLE
# ============================================================================

"""
# STEP-BY-STEP USAGE FOR TROUBLESHOOTING

# 1. Run diagnostics first
debug_check()

# 2. If repo doesn't exist or DB is empty, run the update
vectordb = run_incremental_update()

# 3. Check stats again
get_stats(vectordb)

# 4. If still 0 chunks, try a smaller test repo first
# Change these at the top of the code:
# REPO_URL = "https://github.com/hwchase17/langchain-hub.git"  # Smaller repo
# MONITOR_PATHS = []  # Monitor all files
# Then run again: vectordb = run_incremental_update()

# 5. Test query
test_query(vectordb, "How do I use LangChain with OpenAI?")

# QUICK FIX: If LangChain repo is too large, try this smaller example
# REPO_URL = "https://github.com/gkamradt/langchain-tutorials.git"
# MONITOR_PATHS = []
# ALLOWED_EXTENSIONS = ['.md', '.txt', '.py']
"""

'\n# STEP-BY-STEP USAGE FOR TROUBLESHOOTING\n\n# 1. Run diagnostics first\ndebug_check()\n\n# 2. If repo doesn\'t exist or DB is empty, run the update\nvectordb = run_incremental_update()\n\n# 3. Check stats again\nget_stats(vectordb)\n\n# 4. If still 0 chunks, try a smaller test repo first\n# Change these at the top of the code:\n# REPO_URL = "https://github.com/hwchase17/langchain-hub.git"  # Smaller repo\n# MONITOR_PATHS = []  # Monitor all files\n# Then run again: vectordb = run_incremental_update()\n\n# 5. Test query\ntest_query(vectordb, "How do I use LangChain with OpenAI?")\n\n# QUICK FIX: If LangChain repo is too large, try this smaller example\n# REPO_URL = "https://github.com/gkamradt/langchain-tutorials.git"\n# MONITOR_PATHS = []\n# ALLOWED_EXTENSIONS = [\'.md\', \'.txt\', \'.py\']\n'

In [None]:
# Print diagnostics for the local clone, the vector DB directory and the state file
debug_check()


🔧 DIAGNOSTIC CHECK
✓ Repository exists at: ./langchain_repo
  ⚠️ Path does not exist: docs/docs
  Total monitored files: 0

✓ Vector DB exists at: ./chroma_db
🤖 Loading embedding model...
✓ Embedding model loaded
💾 Initializing Vector Database...
✓ Vector Database ready

📊 VectorDB Statistics:
   Total chunks: 0
   Collection: langchain_docs
   Path: ./chroma_db

✓ State file exists
  Last commit: 4c38157ee0e655f11e9538627759ae10bfa67fff
  Last update: 2025-10-08T04:27:58.102102
