In [1]:
from git import Repo
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language,RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [None]:
repo = Repo.clone_from("https://github.com/RocketChat/Rocket.Chat.git", to_path="Rocket.Chat", branch="develop")

In [3]:
loader = GenericLoader.from_filesystem(
    "Rocket.Chat/",
    glob="**/*",
    suffixes=[".ts",".tsx"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)
documents = loader.load()

In [4]:
documents[0]

Document(page_content="import type { Config } from 'jest';\n\nconst config: Config = {\n\tprojects: [\n\t\t{\n\t\t\tdisplayName: 'client',\n\t\t\ttestEnvironment: 'jsdom',\n\t\t\ttestMatch: [\n\t\t\t\t'<rootDir>/client/**/**.spec.[jt]s?(x)',\n\t\t\t\t'<rootDir>/tests/unit/client/views/**/*.spec.{ts,tsx}',\n\t\t\t\t'<rootDir>/tests/unit/client/providers/**/*.spec.{ts,tsx}',\n\t\t\t],\n\t\t\terrorOnDeprecated: true,\n\n\t\t\tmodulePathIgnorePatterns: ['<rootDir>/dist/'],\n\n\t\t\ttransform: {\n\t\t\t\t'^.+\\\\.(t|j)sx?$': '@swc/jest',\n\t\t\t},\n\n\t\t\tmoduleNameMapper: {\n\t\t\t\t'\\\\.css$': 'identity-obj-proxy',\n\t\t\t\t'^react($|/.+)': '<rootDir>/node_modules/react$1',\n\t\t\t\t'^@tanstack/(.+)': '<rootDir>/node_modules/@tanstack/$1',\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tdisplayName: 'server',\n\t\t\ttestEnvironment: 'node',\n\t\t\ttestMatch: ['<rootDir>/ee/app/authorization/server/validateUserRoles.spec.ts'],\n\t\t\ttransformIgnorePatterns: ['!/node_modules/jose'],\n\t\t\terrorOnDeprec

In [5]:
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=2000, chunk_overlap=200
)
chunks = python_splitter.split_documents(documents)

In [6]:
chunks[0].metadata

{'source': 'Rocket.Chat\\apps\\meteor\\jest.config.ts',
 'language': <Language.PYTHON: 'python'>}

In [7]:
embedding=OllamaEmbeddings(model="nomic-embed-text")

In [None]:
# vectorDb=Chroma.from_documents(
#     documents=chunks,
#     embedding=embedding,
#     persist_directory="./chroma_db"
# )
# vectorDb.persist()

In [13]:
vectorDb=Chroma.from_documents(
    documents=[chunks[0]],
    embedding=embedding,
    persist_directory="./chroma_db"
)
vectorDb.persist()

In [None]:
for i, chunk in enumerate(chunks):
    vectorDb.add_documents(documents=[chunk])