# Document Loader

## Git

In [1]:
from langchain_community.document_loaders import GitLoader

In [2]:
def load_documents(path, branch):
    loader = GitLoader(
        repo_path=path,
        branch=branch
    )
    data = loader.load()
    return data

In [3]:
data = load_documents('/Users/pandoks/Projects/spotlight', 'setup')

In [29]:
for document in data:
    document = document.dict()
    print(document["metadata"]["file_path"])

.gitignore
LangChain.ipynb
TODO.md
embeddings.py
test.py
lua/spotlight/init.lua
retrieval/embeddings/langchain/ollama.py
retrieval/loaders/langchain/git.py
retrieval/retrievers/langchain/parent.py
retrieval/retrievers/langchain/vectorstore.py
retrieval/splitters/langchain/code.py
retrieval/splitters/langchain/summary.py
retrieval/stores/langchain/chroma.py
retrieval/stores/langchain/keyvaluedocstore.py


## Directory

In [3]:
from langchain_community.document_loaders import DirectoryLoader

In [4]:
data = DirectoryLoader('./retrieval').load()

In [5]:
print(data)

[Document(page_content='from typing import List, TypedDict from langchain_core.documents import Document from langchain_core.vectorstores import VectorStore\n\nclass VectorStoreRetrieveConfig(TypedDict):\n\ndatabase: VectorStore\n\nsearch_type: str\n\nprompt: str\n\ndef retrieve(config: VectorStoreRetrieveConfig)\n\n> List[Document]:\n\ndatabase = config["database"]\n\nretriever = database.as_retriever(search_type=config["search_type"])\n\ndocuments = retriever.invoke(config["prompt"])\n\nreturn documents', metadata={'source': 'retrieval/retrievers/langchain/vectorstore.py'}), Document(page_content='from typing import Optional, TypedDict from langchain_core.documents import Document from langchain_core.stores import BaseStore from langchain_core.vectorstores import VectorStore from langchain_text_splitters import TextSplitter from langchain.retrievers import ParentDocumentRetriever\n\nclass ParentRetrieveConfig(TypedDict):\n\ndatabase: VectorStore\n\ndocument_database: BaseStore\n\npro

# Document Splitter

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
def split_text(documents, language, chunk_size, chunk_overlap):
    splitter = RecursiveCharacterTextSplitter.from_language(language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = splitter.split_documents(documents)
    return docs

In [7]:
split = split_text(data, "python", 1000, 0)

In [8]:
print(len(split))

1812


In [9]:
texts = [doc.page_content for doc in split]
metadatas = [doc.metadata for doc in split]

# Embeddings

In [6]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

In [7]:
def embed_documents(texts, model):
    embedder = OllamaEmbeddings(model=model)
    embeddings = embedder.embed_documents(texts)
    return embeddings

In [12]:
# embeddings = embed_documents(texts, "llama3")

# Summary Generator

In [8]:
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import Ollama
from langchain_core.documents import Document

In [9]:
summary_prompt = PromptTemplate(
    input_variables=["document_content", "filepath"],
    template="""Summarize this file so that the contents of the files can be queried with a 
    vector database without confusion later on. This is the document ({filepath}): {document_content}
    """
)

In [10]:
print(summary_prompt.dict())

{'name': None, 'input_variables': ['document_content', 'filepath'], 'input_types': {}, 'output_parser': None, 'partial_variables': {}, 'metadata': None, 'tags': None, 'template': 'Summarize this file so that the contents of the files can be queried with a \n    vector database without confusion later on. This is the document ({filepath}): {document_content}\n    ', 'template_format': 'f-string', 'validate_template': False, '_type': 'prompt'}


In [11]:
def create_summaries(documents, model):
    new_documents = []
    llm_sequence = summary_prompt | Ollama(model=model)
    for document in documents:
        print(document)
        document_dict = document.dict()
        page_content = document_dict["page_content"]
        metadata = document_dict["metadata"]
        summary = llm_sequence.invoke({"document_content": page_content, "filepath": metadata["file_path"]})
        new_document = Document(page_content=summary, metadata=metadata)
        new_documents.append(new_document)
        new_documents.append(document)
    return new_documents

In [19]:
summaries = create_summaries(data, "llama3")

page_content='.nvim*\n.test/\n.ipynb_checkpoints/\n' metadata={'source': '.gitignore', 'file_path': '.gitignore', 'file_name': '.gitignore', 'file_type': ''}


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



page_content='- [ ] Decide if you should use langchain Documents as a common api or just a standard dict\n- [ ] Add requirements.txt for easy pip dependency installs\n- [ ] Create a class for a common api to interact with\n' metadata={'source': 'TODO.md', 'file_path': 'TODO.md', 'file_name': 'TODO.md', 'file_type': '.md'}
page_content='import sys\nimport chromadb\nimport argparse\nimport uuid\nimport json\nfrom chromadb.utils import embedding_functions\n\nchromadb_client = None\nembedding_function = None\ncollection = None\n\n\ndef main():\n    parser = argparse.ArgumentParser()\n    subparsers = parser.add_subparsers(dest="command")\n\n    store_parser = subparsers.add_parser("store")\n    store_parser.add_argument("--collection-name", type=str, required=True)\n    store_parser.add_argument("--model", type=str, required=True)\n    store_parser.add_argument("--db-location", type=str)\n    store_parser.add_argument("--file-location", type=str, required=True)\n\n    retrieve_parser = sub

In [20]:
print(summaries)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Vector Store

In [12]:
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from hashlib import sha256

In [13]:
def setup_database(collection_name, embedding_function, persist_directory):
    database = Chroma(
        collection_name=collection_name,
        embedding_function=embedding_function,
        persist_directory=persist_directory
    )
    return database

In [14]:
database = setup_database("spotlight", OllamaEmbeddings(model="llama3"), "/Users/pandoks/Projects/spotlight/.test")

In [19]:
def add(db, documents):
    documents_with_hashes = []
    for document in documents:
        document = document.dict()
        page_content = document["page_content"]
        hashed_page_content = sha256(page_content.encode("utf-8")).hexdigest()

        new_metadata = document["metadata"]
        new_metadata["hash"] = hashed_page_content

        query_filter = {'source': new_metadata['source']}
        # {
        #     "$and":
        #         [
        #             {'source': new_metadata["source"]},
        #             {'file_path': new_metadata["file_path"]},
        #             {'file_name': new_metadata["file_name"]},
        #             {'file_type': new_metadata["file_type"]},
        #             {'hash': new_metadata["hash"]}
        #         ]
            
        # }
        existing_documents = db.get(where=query_filter)
        if not len(existing_documents["ids"]):
            documents_with_hashes.append(Document(page_content, metadata=new_metadata))
        
    if not len(documents_with_hashes):
        return
    return db.add_documents(documents_with_hashes)

In [16]:
def delete(db, metadatas = None):
    if not metadatas:
        to_be_deleted_documents = db.get()
        to_be_deleted_ids = to_be_deleted_documents["ids"]
        db.delete(ids=to_be_deleted_ids)
        return to_be_deleted_ids

    for metadata in metadatas:
        query_filter_list = []
        for key, value in metadata.items():
            query_filter_list.append({key: value})
        query_filter = None
        if len(query_filter_list) == 1:
            query_filter = query_filter_list[0]
        else:
            query_filter = {
                "$and": query_filter_list
            }
        to_be_deleted_documents = db.get(where=query_filter)
        to_be_deleted_ids = to_be_deleted_documents["ids"]
        if not len(to_be_deleted_ids):
            return
        db.delete(ids=to_be_deleted_ids)
        return to_be_deleted_ids

In [17]:
def update(db, documents):
    metadatas = []
    for document in documents:
        metadata = document.dict()["metadata"]
        if metadata not in metadatas:
            metadatas.append(metadata)
    print(metadatas)
    delete(db, metadatas)
    return add(db, documents)

In [20]:
added_ids = add(database, data)

In [17]:
added_ids = add(database, data)

f7fa030f57c31438cd8128eb92a54913b3d8d167a4a1b9f0a7e989a23431f680
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
6190951b776b06e428703c833147274d10a633df505fc593f3848aa330961389
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
d1ac797952b4a123164e726d80c9c44a9b42883e667222bbb7bcd01bd8d63f7b
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
3b538e0345625508259fdb62b7281178e1ccfc6486e150dadaae15d95236780f
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
bc0088e8eab712e8264107308f61f5f88d165097e1181e35fd6b22a12ea5425f
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
a3767afb4d72de63992e4164f0226697d2a42427fe6c7dccfa0cf985e9023520
{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None}
2747a2435c46dd6443eea7c36a2e6d2db7c5410dfdc204

In [171]:
deleted_ids = delete(database, [{'file_type': '.py'}])

In [33]:
deleted_ids = delete(database)

In [38]:
updated_ids = update(database, split)

[{'source': '.gitignore', 'file_path': '.gitignore', 'file_name': '.gitignore', 'file_type': ''}, {'source': 'embeddings.py', 'file_path': 'embeddings.py', 'file_name': 'embeddings.py', 'file_type': '.py'}, {'source': 'lua/spotlight/init.lua', 'file_path': 'lua/spotlight/init.lua', 'file_name': 'init.lua', 'file_type': '.lua'}, {'source': 'retrieval/embeddings/langchain/ollama.py', 'file_path': 'retrieval/embeddings/langchain/ollama.py', 'file_name': 'ollama.py', 'file_type': '.py'}, {'source': 'retrieval/loaders/langchain/git.py', 'file_path': 'retrieval/loaders/langchain/git.py', 'file_name': 'git.py', 'file_type': '.py'}, {'source': 'retrieval/splitters/langchain/code.py', 'file_path': 'retrieval/splitters/langchain/code.py', 'file_name': 'code.py', 'file_type': '.py'}, {'source': 'retrieval/stores/langchain/chroma.py', 'file_path': 'retrieval/stores/langchain/chroma.py', 'file_name': 'chroma.py', 'file_type': '.py'}]
c9205effb86dc516823576184de6289b22fe2bcabec9821c8d289f424055dbea


In [21]:
print(added_ids)

['e1fc5530-68cc-4dd7-9b53-94b9e63d4ea5', '5285ee2d-56a8-4e6d-9df0-4359d1cdd735', '76a12d4f-fd04-486e-957b-85d260d2e01d', '66f632ca-1514-42db-88d8-7d782f7af7ad', 'e70844e1-75f7-4454-a373-04687da6e9dd', '8816f08d-8ab1-4d28-bf2e-fd0635c561fe', '53bc90b3-e3b3-4aa0-a7ab-fdb07df3dcb9', 'd0b184c4-ad25-4ee1-aa63-12a94610195c']


In [37]:
print(deleted_ids)

['08365a87-9f71-4e7e-b792-4ca549369adf', '09f14b7b-7038-4974-976c-803cc8f38f66', '1d635304-cdf9-4754-8489-fec66d644ca6', '310f655d-9cca-447e-8366-6ceb3b4ba532', '397c135a-d6af-436d-a244-6526347af5ce', '40871d88-5173-4496-8a98-509c407552c7', '43697fd5-578d-4438-a047-0fec8b57ba02', '4a31038d-e723-4e79-9f5f-4bb38bdd026f', '6e4d0eda-b42b-4760-89c8-99f42e2b7bc0', '7586f2cf-8b9d-4706-91cc-905956cbe363', '7dc2f585-e03c-4eda-84a7-dc5b5dda2c0a', '9922c3be-ec1d-4a2f-a982-d8199fdd5fa9', '99cf8e66-dcbe-4129-8aa4-be23d129183e', '9a5ed6a7-de6a-4eaa-916a-218ad5627d84', 'b691ec21-a2b0-481f-9f20-e26bc86c09dc', 'c2cff058-0e4b-41c3-85e3-0c74289a377a', 'dd847b73-d167-4e7e-b89d-06972c2bbbd0', 'e7eae5dc-4296-4a8f-aa36-88c89243f849', 'e9b1cc97-80e1-4d4d-ae8e-b45945f76ebd']


In [38]:
print(len(database.get()["ids"]))

14


In [39]:
storage = database.get()
metadatas = storage["metadatas"]
seenfiles = set()
for metadata in metadatas:
    filepath = metadata["file_path"]
    if filepath not in seenfiles:
        print(filepath)
        seenfiles.add(filepath)

test.py
embeddings.py
lua/spotlight/init.lua
.gitignore
TODO.md
retrieval/splitters/langchain/code.py
LangChain.ipynb
retrieval/loaders/langchain/git.py
retrieval/stores/langchain/chroma.py
retrieval/stores/langchain/keyvaluedocstore.py
retrieval/retrievers/langchain/vectorstore.py
retrieval/splitters/langchain/summary.py
retrieval/retrievers/langchain/parent.py
retrieval/embeddings/langchain/ollama.py


# Retrieval

In [22]:
prompt = '''
ollama
'''

### Vector store-backed retriever

In [23]:
retriever = database.as_retriever(search_type="mmr")

In [24]:
documents = retriever.invoke(prompt)

Number of requested results 20 is greater than number of elements in index 8, updating n_results = 8


### Native database query

In [59]:
documents = database.similarity_search(prompt)

### Parent Document Retriever

In [27]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

#### Full file retrieval

In [30]:
store = InMemoryStore() # ideally make this persistent
retriever = ParentDocumentRetriever(
    vectorstore=database,
    docstore=store,
    child_splitter=RecursiveCharacterTextSplitter.from_language("python", chunk_size=10000, chunk_overlap=0),
)

In [36]:
retriever.add_documents(data, ids=None)

In [35]:
print(data)

[Document(page_content='from typing import List, TypedDict from langchain_core.documents import Document from langchain_core.vectorstores import VectorStore\n\nclass VectorStoreRetrieveConfig(TypedDict):\n\ndatabase: VectorStore\n\nsearch_type: str\n\nprompt: str\n\ndef retrieve(config: VectorStoreRetrieveConfig)\n\n> List[Document]:\n\ndatabase = config["database"]\n\nretriever = database.as_retriever(search_type=config["search_type"])\n\ndocuments = retriever.invoke(config["prompt"])\n\nreturn documents', metadata={'source': 'retrieval/retrievers/langchain/vectorstore.py'}), Document(page_content='from typing import Optional, TypedDict from langchain_core.documents import Document from langchain_core.stores import BaseStore from langchain_core.vectorstores import VectorStore from langchain_text_splitters import TextSplitter from langchain.retrievers import ParentDocumentRetriever\n\nclass ParentRetrieveConfig(TypedDict):\n\ndatabase: VectorStore\n\ndocument_database: BaseStore\n\npro

In [38]:
list(store.yield_keys())

['be23010d-c039-4f2d-89f4-84b4440d9306',
 'd6feba98-3740-454d-a628-e6916de74f24',
 'b191156f-3308-4aac-9309-1bd8a20bf4a3',
 '5a4a2a16-8f8b-4802-9151-c30015b8cc27',
 '4e7b1f29-eda7-440c-a11b-1ceb61e12d92',
 '606846e4-93f5-4251-b14d-9e5d9064395c',
 'dd21e8e8-aeaa-4bdc-8444-1926bfd7a3dc',
 '7db297dc-3070-4ee6-9975-581843a71ef0']

In [42]:
store.mget(store.yield_keys())

[Document(page_content='from typing import List, TypedDict from langchain_core.documents import Document from langchain_core.vectorstores import VectorStore\n\nclass VectorStoreRetrieveConfig(TypedDict):\n\ndatabase: VectorStore\n\nsearch_type: str\n\nprompt: str\n\ndef retrieve(config: VectorStoreRetrieveConfig)\n\n> List[Document]:\n\ndatabase = config["database"]\n\nretriever = database.as_retriever(search_type=config["search_type"])\n\ndocuments = retriever.invoke(config["prompt"])\n\nreturn documents', metadata={'source': 'retrieval/retrievers/langchain/vectorstore.py'}),
 Document(page_content='from typing import Optional, TypedDict from langchain_core.documents import Document from langchain_core.stores import BaseStore from langchain_core.vectorstores import VectorStore from langchain_text_splitters import TextSplitter from langchain.retrievers import ParentDocumentRetriever\n\nclass ParentRetrieveConfig(TypedDict):\n\ndatabase: VectorStore\n\ndocument_database: BaseStore\n\npr

In [34]:
small_documents = database.similarity_search(prompt)
print("Small page content:")
print(small_documents[0].page_content)
documents = retriever.invoke(prompt)

Small page content:
from typing import List, TypedDict from langchain_core.language_models.llms import LLM from langchain_core.prompts import PromptTemplate from langchain_core.documents import Document

class SummarySplitDocumentConfig(TypedDict):

document: Document

prompt: PromptTemplate

llm: LLM

# template variables -> "{variable}" # 2 possible variables: "{document_content}" & "{file_path}" # TODO: support more variables class SummaryPromptConfig(TypedDict): template: str

def create_summary_prompt(config: SummaryPromptConfig)

> PromptTemplate:

return PromptTemplate(

input_variables=["document_content", "filepath"], template=config["template"]

)

def create_summary(config: SummarySplitDocumentConfig) -> List[Document]: llm_sequence = config["prompt"] | config["llm"] document = config["document"].dict() page_content = document["page_content"] metadata = document["metadata"] summary = llm_sequence.invoke( {"document_content": page_content, "filepath": metadata["file_path"]} )

#### Partial file retrieval (if files are way way too big)

In [61]:
store = InMemoryStore() # ideally make this persistent
retriever = ParentDocumentRetriever(
    vectorstore=database,
    docstore=store,
    child_splitter=RecursiveCharacterTextSplitter.from_language("python", chunk_size=50, chunk_overlap=10),
    parent_splitter=RecursiveCharacterTextSplitter.from_language("python", chunk_size=500, chunk_overlap=10)
)

In [71]:
print(len(data))
retriever.add_documents(data, ids=None)

14


KeyboardInterrupt: 

In [70]:
import pprint
pprint.pprint(store.)

<langchain_core.stores.InMemoryBaseStore object at 0x307d0d850>


In [67]:
list(store.yield_keys())

['3c4839d3-bdbf-458a-85ad-800b5668f680']

In [112]:
small_documents = database.similarity_search(prompt)
print("Small page content:")
print(small_documents[0].page_content)
documents = retriever.invoke(prompt)
print(documents)

Small page content:
.. " --collection-name spotlight --model "
[Document(page_content='local file = io.open(filePath, "r")\n\t\t\t\tif not file then\n\t\t\t\t\tgoto continue\n\t\t\t\tend\n\t\t\t\tlocal content = file:read("*all")\n\t\t\t\tfile:close()\n\t\t\t\tlocal insertCommand = opts.pythonPath\n\t\t\t\t\t.. " "\n\t\t\t\t\t.. embeddings\n\t\t\t\t\t.. " store --db-location "\n\t\t\t\t\t.. directoryLocation\n\t\t\t\t\t.. "/"\n\t\t\t\t\t.. opts.databaseDirectory\n\t\t\t\t\t.. " --collection-name spotlight --model "\n\t\t\t\t\t.. opts.ollamaModel\n\t\t\t\t\t.. " --file-location "\n\t\t\t\t\t.. filePath\n\t\t\t\tlocal handle = io.popen(insertCommand, "w")', metadata={'source': 'lua/spotlight/init.lua', 'file_path': 'lua/spotlight/init.lua', 'file_name': 'init.lua', 'file_type': '.lua'}), Document(page_content='class CodeSplitTextConfig(TypedDict):\n    text: str\n    language: Language\n    chunk_size: Optional[int]\n    chunk_overlap: Optional[int]\n\n\n# https://python.langchain.com/v0

### Multi Query Retriever

In [44]:
from langchain_community.llms import Ollama
import logging
from langchain_core.prompts import PromptTemplate

In [45]:
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [46]:
def parse_questions(questions):
    lines = questions.strip().split("\n")
    filtered_lines = [line for line in lines if line]
    print(filtered_lines)
    return filtered_lines

# only if you ONLY want the file paths and nothing else
# you won't be able to get the partial contexts unless you feed in the whole thing with the file path
def consolidate_files(documents):
    filepaths = set()
    filtered_documents = []
    for document in documents:
        document_dict = document.dict()
        filepath = document_dict["metadata"]["file_path"]
        if filepath not in filepaths:
            filtered_documents.append(document)
            filepaths.add(filepath)
    return filtered_documents

def run_prompts(prompts):
    documents = []
    for prompt in prompts:
        documents.extend(retriever.invoke(prompt))
    return documents

In [47]:
query_prompt = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate 5
    different versions of the given user question to retrieve relevant coding files
    from a vector database and only those questions. By generating multiple perspectives on the user question,
    your goal is to help the user overcome some of the limitations of the distance-based
    similarity search. You want these questions to help the vector database find the most relevant files
    that relate to the user's question kind of like a search engine.
    Do not include give me anything else but the questions in plain text. 
    Do not include an introduction like "Here are 5 of my generated questions".
    Also don't enumerate or bullet each question, so don't give me "1. question1\n2. question2" or "- question1\n- question2".
    Only Provide these alternative questions separated by newlines so that
    it will find code files that are most relevant to this question: {question}"""
)

In [48]:
llm_sequence = query_prompt | Ollama(model="codellama") | parse_questions | run_prompts
documents = llm_sequence.invoke(prompt)

['ollama', 'what is the difference between a ollama and a llama?', 'ollama vs. alpaca: Which is better for meat?', 'llama or llama-like animal?', 'What is the name of an llama?', 'Ollama or Llama: Which one is more popular?']


### Metadata Retriever

In [33]:
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [34]:
metadata_field_info = [
    AttributeInfo(
        name="file_name",
        description="The name of the file.",
        type="string"
    ),
    AttributeInfo(
        name="file_path",
        description="The relevant file path of the file.",
        type="string"
    ),
    AttributeInfo(
        name="hash",
        description="The unique hash of contents that are being stored.",
        type="string"
    ),
    AttributeInfo(
        name="source",
        description="Absolute path of the file.",
        type="string"
    )
]
document_content_description = "Contents of a file for code."

In [35]:
retriever = SelfQueryRetriever.from_llm(
    Ollama(model="llama3"),
    database,
    document_content_description,
    metadata_field_info
)

In [36]:
documents = retriever.invoke(prompt)

### Custom composite

In [38]:
from langchain.retrievers import EnsembleRetriever

In [None]:
retrievers = []
retriever = EnsembleRetriever()

### Print Results

In [26]:
for document in documents:
    document = document.dict()
    print(document["metadata"]["source"])

retrieval/splitters/langchain/summary.py
retrieval/stores/langchain/chroma.py
retrieval/retrievers/langchain/vectorstore.py
retrieval/stores/langchain/keyvaluedocstore.py


In [23]:
for document in data:
    document = document.dict()
    print(document["page_content"])

.nvim*
.test/

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "87d5f168-d778-4df6-9fb9-b7429857f4f1",
   "metadata": {},
   "source": [
    "# Document Loader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "3f209f37-e612-4e79-8099-d5fe2dd87e12",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import GitLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ef23b2ab-4960-48f4-8337-06ec684d3555",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_documents(path, branch):\n",
    "    loader = GitLoader(\n",
    "        repo_path=path,\n",
    "        branch=branch\n",
    "    )\n",
    "    data = loader.load()\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2e4f351b-eae2-4e5f-9c70-167dcfa6ef1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = load_documents('/Users/pandoks/Projects/spotlight