In [6]:
import os
import git
import ast
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
import sys
# Show which Python interpreter this kernel is running on (environment sanity check).
print(sys.executable)

c:\Users\immah\anaconda3\envs\rag\python.exe


In [8]:
# Remote repository to index, and the local directory it is cloned into.
repo_url = "https://github.com/isaac-sim/IsaacLab.git"
local_path = "./isaaclab_local"


In [9]:
# --- 1. Clone the GitHub repo ---
# An existing path at `local_path` is treated as "already cloned"; otherwise
# the repository at `repo_url` is cloned into it.
if os.path.exists(local_path):
    print("📁 Repo already exists. Skipping clone.")
else:
    git.Repo.clone_from(repo_url, local_path)
    print("✅ Repo cloned.")

✅ Repo cloned.


In [17]:
# --- 2. Set source folder path ---
# Root directory walked during ingestion.
# NOTE(review): this repeats the literal assigned to `local_path`; keep the two
# in sync if either one changes.
SOURCE_PATH = "./isaaclab_local"
# Only files whose extension is in this set are ingested.
VALID_CODE_EXTENSIONS = {'.py', '.json', '.yaml', '.yml', '.toml', '.md'}
# Directory names to leave out of ingestion.
EXCLUDE_DIRS = {"_static"}

In [36]:
# --- 3. AST Chunking for Python files ---
def _segment_with_decorators(code, node):
    """Return the source text of a top-level def/class node, decorators included."""
    decorators = node.decorator_list
    if not decorators:
        return ast.get_source_segment(code, node)
    # Since Python 3.8 a decorated node's own position starts at the
    # `def`/`class` keyword, so asking for the node's segment silently drops
    # the decorator lines. Widen the span to start at the first decorator;
    # column 0 is correct because only top-level nodes are passed in, and
    # their `@` always sits at the start of the line.
    span = ast.Pass(
        lineno=decorators[0].lineno,
        col_offset=0,
        end_lineno=node.end_lineno,
        end_col_offset=node.end_col_offset,
    )
    return ast.get_source_segment(code, span)


def chunk_python_ast(code, file_path):
    """Split Python source into one Document per top-level function or class.

    Only nodes directly in the module body are considered (functions, async
    functions and classes); other module-level statements such as imports or
    assignments produce no chunk. Each chunk contains the full definition,
    including any decorators applied to it.

    Args:
        code: Python source text to split.
        file_path: Path recorded under the "source" metadata key of each chunk.

    Returns:
        A list of Document objects in source order. If parsing or extraction
        fails, the error is printed and the chunks collected so far (possibly
        none) are returned.
    """
    chunks = []
    try:
        tree = ast.parse(code)
        for node in tree.body:
            if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
                text = _segment_with_decorators(code, node)
                if text:
                    chunks.append(Document(page_content=text, metadata={"source": file_path}))
    except Exception as e:
        # Best-effort: one unparsable file must not abort ingestion of the repo.
        print(f"AST parse error in {file_path}: {e}")
    return chunks

In [48]:
# --- 4. Fallback chunker for other files ---
# Shared splitter instance used for every non-Python file. Chunks are capped at
# chunk_size=500 with chunk_overlap=50 repeated between neighbouring chunks
# (units are whatever the splitter's length function counts; characters by
# default per the LangChain docs, confirm if a custom length function is added).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

def chunk_fallback(file_path):
    """Read a text file and split it into Documents with the shared splitter.

    The file is decoded as UTF-8 with undecodable bytes dropped. Every chunk
    carries ``{"source": file_path}`` as metadata.

    Args:
        file_path: Path of the file to read and split.

    Returns:
        The list produced by ``text_splitter.create_documents``; an empty list
        if reading or splitting raised, in which case the error is printed.
    """
    source_meta = {"source": file_path}
    try:
        with open(file_path, mode='r', encoding='utf-8', errors='ignore') as handle:
            contents = handle.read()
        pieces = text_splitter.create_documents([contents], metadatas=[source_meta])
    except Exception as err:
        # Best-effort: report the problem and let the caller carry on.
        print(f"Error chunking {file_path}: {err}")
        pieces = []
    return pieces

In [54]:
# --- 5. Traverse all valid files and chunk ---
def ingest_repo_files():
    """Walk SOURCE_PATH and chunk every file with an accepted extension.

    Directories whose name is in EXCLUDE_DIRS are pruned from the walk, so
    nothing beneath them is visited. Files whose extension is not in
    VALID_CODE_EXTENSIONS are skipped. ``.py`` files go through
    ``chunk_python_ast``; everything else goes through ``chunk_fallback``.

    Returns:
        A flat list of all chunks, in the order the files were visited.
    """
    all_chunks = []
    for root, dirs, files in os.walk(SOURCE_PATH):
        # Prune in place so os.walk never descends into excluded directories.
        # Matching on the exact directory name (rather than a substring of the
        # whole path) keeps look-alikes such as "my_static_files" in scope.
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
        for file in files:
            ext = os.path.splitext(file)[1]
            if ext not in VALID_CODE_EXTENSIONS:
                continue
            file_path = os.path.join(root, file)
            print(f"📄 Ingesting: {file_path}")
            if ext == '.py':
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        code = f.read()
                except OSError as e:
                    # Same best-effort policy as the fallback chunker: report
                    # the unreadable file and move on instead of aborting.
                    print(f"Error reading {file_path}: {e}")
                    continue
                chunks = chunk_python_ast(code, file_path)
                print("Used AST chunker")
            else:
                chunks = chunk_fallback(file_path)
                print("Used fallback chunker")
            all_chunks.extend(chunks)
    return all_chunks

In [55]:
# --- 6. Run it ---
# Chunk the whole repository once and keep the result in memory; the embedding
# cell further down reads `documents`.
documents = ingest_repo_files()
print(f"✅ Total chunks ingested: {len(documents)}")

📄 Ingesting: ./isaaclab_local\.pre-commit-config.yaml
Used fallback chunker
📄 Ingesting: ./isaaclab_local\CONTRIBUTING.md
Used fallback chunker
📄 Ingesting: ./isaaclab_local\CONTRIBUTORS.md
Used fallback chunker
📄 Ingesting: ./isaaclab_local\environment.yml
Used fallback chunker
📄 Ingesting: ./isaaclab_local\pyproject.toml
Used fallback chunker
📄 Ingesting: ./isaaclab_local\README.md
Used fallback chunker
📄 Ingesting: ./isaaclab_local\SECURITY.md
Used fallback chunker
📄 Ingesting: ./isaaclab_local\.aws\mirror-buildspec.yml
Used fallback chunker
📄 Ingesting: ./isaaclab_local\.aws\postmerge-ci-buildspec.yml
Used fallback chunker
📄 Ingesting: ./isaaclab_local\.aws\premerge-ci-buildspec.yml
Used fallback chunker
📄 Ingesting: ./isaaclab_local\.github\PULL_REQUEST_TEMPLATE.md
Used fallback chunker
📄 Ingesting: ./isaaclab_local\.github\stale.yml
Used fallback chunker
📄 Ingesting: ./isaaclab_local\.github\ISSUE_TEMPLATE\bug.md
Used fallback chunker
📄 Ingesting: ./isaaclab_local\.github\ISSUE_T

In [32]:
# NOTE(review): displays the entire chunk list; a slice such as documents[:3]
# would keep the output readable. The saved output below shows metadata={},
# which suggests it predates the current chunkers; re-run to refresh.
documents 

[Document(metadata={}, page_content='repos:\n  - repo: https://github.com/python/black\n    rev: 24.3.0\n    hooks:\n      - id: black\n        args: ["--line-length", "120", "--unstable"]\n  - repo: https://github.com/pycqa/flake8\n    rev: 7.0.0\n    hooks:\n      - id: flake8\n        additional_dependencies: [flake8-simplify, flake8-return]\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.5.0\n    hooks:\n      - id: trailing-whitespace\n      - id: check-symlinks\n      - id: destroyed-symlinks'),
 Document(metadata={}, page_content='- id: destroyed-symlinks\n      - id: check-added-large-files\n        args: ["--maxkb=2000"]  # restrict files more than 2 MB. Should use git-lfs instead.\n      - id: check-yaml\n      - id: check-merge-conflict\n      - id: check-case-conflict\n      - id: check-executables-have-shebangs\n      - id: check-toml\n      - id: end-of-file-fixer\n      - id: check-shebang-scripts-are-executable\n      - id: detect-private-key\n   

In [56]:
# Vector embeddings using sentence transformers
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so a fresh kernel surfaces missing dependencies early.
from sentence_transformers import SentenceTransformer

# Load the embedding model by its identifier.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode the text of every chunk in `documents`; convert_to_numpy=True asks for
# a NumPy array back (presumably one row per chunk, in input order; confirm).
embeddings = model.encode(
    [doc.page_content for doc in documents], 
    show_progress_bar=True,
    convert_to_numpy=True
)
# NOTE(review): this prints the array itself; `embeddings.shape` would be a
# lighter sanity check.
print(embeddings)

Batches: 100%|██████████| 105/105 [01:44<00:00,  1.00it/s]

[[-0.0555299   0.00410058  0.00766323 ... -0.04854472  0.1010896
  -0.07439523]
 [ 0.05165645 -0.03046053 -0.00414181 ... -0.04216816  0.06255618
  -0.01578519]
 [-0.03924909 -0.02091024 -0.01859647 ... -0.12762715  0.02407106
  -0.00186439]
 ...
 [-0.000875    0.06802201  0.00077574 ... -0.07184713  0.00267993
  -0.03409529]
 [ 0.06395874  0.03100758 -0.03739776 ... -0.01544104  0.05010741
  -0.05373923]
 [ 0.01896854 -0.07728996  0.01581057 ... -0.00960813 -0.08471295
  -0.04199051]]





### Next: create a Supabase account, store these embeddings in its vector database, and run similarity search