In [1]:
# Install necessary packages
%pip install --upgrade --quiet langchain-openai tiktoken langchain-chroma langchain GitPython pymilvus langchain-community

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Load environment variables
import os
from dotenv import load_dotenv
load_dotenv()
repository_path = os.getenv('REPOSITORY_PATH')
knowledgebase_dir = os.getenv('KNOWLEDGEBASE_DIR')
openai_api_key = os.getenv('OPENAI_API_KEY')
print("Repository Path:", repository_path)
print("KnowledgeBase Path:", knowledgebase_dir)

Repository Path: C:\Users\mfoster\source\repos\langchain
KnowledgeBase Path: C:\Users\mfoster\source\repos\kb_test


In [3]:

# Import GitPython for repository operations
from git import Repo

# Document loading and parsing
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language
from pathlib import Path


In [11]:
# Load the ignore list from ignore_list.txt in the root project directory
ignore_list_path = os.path.join(repository_path, 'ignore_list.txt')
print("Looking for ignore_list.txt at:", ignore_list_path)
try:
    with open(ignore_list_path, 'r') as file:
        ignore_list = [line.strip() for line in file.readlines()]
except FileNotFoundError:
    print(f"File not found: {ignore_list_path}")
    ignore_list = []

# Include non-UTF-8 encoding exclusion
ignore_list.append('**/non-utf8-encoding.py')

print("Ignore List:", ignore_list)

# Construct the path to the directory the loader will search
load_directory = os.path.join(knowledgebase_dir, "libs/core/langchain_core")
print("Directory to load files from:", load_directory)


# Configure the loader to handle all file types
loader = GenericLoader.from_filesystem(
    knowledgebase_dir,
    glob="**/*",
    suffixes=[],
    exclude=ignore_list,
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
    show_progress=True,
)
documents = loader.load()
print("Documents Loaded:", len(documents))
for root, dirs, files in os.walk(knowledgebase_dir):
    for name in files:
        full_path = Path(root, name)
        relative_path = full_path.relative_to(knowledgebase_dir)
        if any(full_path.match(pattern) for pattern in ignore_list):
            print("Ignored:", f"\033[91m{relative_path}\033[0m")
        else:
            print(f"\033[92m{relative_path}\033[0m") 

Looking for ignore_list.txt at: C:\Users\mfoster\source\repos\langchain\ignore_list.txt
Ignore List: ['**/node_modules/**', '.git', '.env', 'build', 'temp', '*.log', '*__pycache__', '**/non-utf8-encoding.py']
Directory to load files from: C:\Users\mfoster\source\repos\kb_test\libs/core/langchain_core


100%|██████████| 2/2 [00:00<00:00, 234.16it/s]

Documents Loaded: 2
[92mtestdoc.txt[0m
[92mkb_subfolder\subfile.txt[0m
Ignored: [91mkb_subfolder\node_modules\node_moduletest.txt[0m
Ignored: [91mnode_modules\nodemodulefile.txt[0m





In [None]:

# Split documents for processing
from langchain_text_splitters import RecursiveCharacterTextSplitter
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=2000, chunk_overlap=200
)
texts = python_splitter.split_documents(documents)
print("Texts processed:", len(texts))


In [None]:

# Setup Milvus
from pymilvus import connections, CollectionSchema, FieldSchema, DataType, Collection
connections.connect("default", host="localhost", port="19530")
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768)  # Assuming embedding size of 768
]
schema = CollectionSchema(fields, description="Document embeddings")
collection = Collection(name="document_embeddings", schema=schema)


In [None]:

# Generate embeddings and insert into Milvus
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
db = Chroma.from_documents(texts, OpenAIEmbeddings(disallowed_special=()))
embeddings = db.get_embeddings(texts)
mr = collection.insert([embeddings])
print("Embeddings inserted into Milvus:", mr.primary_keys)

# Setup retrieval from Milvus
from pymilvus import SearchParams
search_params = SearchParams(metric_type="L2", params={"nprobe": 10})
results = collection.search(embeddings[:1], "embedding", search_params, limit=10)
print("Search results from Milvus:", results)


In [None]:

# Additional code for interacting with the embeddings
from langchain.chains import create_retrieval_chain
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 8})
document_chain = create_retrieval_chain(retriever)

# Example retrieval
question = "What is the purpose of the Runnable class?"
response = document_chain.invoke(question)
print("Retrieval response:", response)