In [None]:
import getpass
import os

# Prompt (without echo) for the OpenAI API key only when OPENAI_API_KEY is
# missing or empty in the environment; the answer is stored back into it.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt="Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

# Embedding model shared by both vector stores created in this notebook.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

# First vector store: held in memory only, backed by the `embeddings` model.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [None]:
# GitHub access token handed to the loader below.
# Reuse GITHUB_PERSONAL_ACCESS_TOKEN from the environment when it is set and
# non-empty, so the notebook can run top to bottom without stopping at a prompt;
# otherwise ask for the token interactively (input is not echoed).
import os
from getpass import getpass

GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_PERSONAL_ACCESS_TOKEN") or getpass(
    "Enter GitHub access token: "
)

In [None]:
from langchain_community.document_loaders import GitHubIssuesLoader, GithubFileLoader

In [None]:
# loader = GitHubIssuesLoader(
#     repo="TorchJD/torchjd",
#     access_token=GITHUB_ACCESS_TOKEN,  # delete/comment out this argument if you've set the access token as an env var.
#     state="all",
#     # page=2,
# )

# Loader restricted to the Python sources of TorchJD/torchjd on the main branch:
# the filter keeps only paths that end in ".py".
python_code_loader = GithubFileLoader(
    repo="TorchJD/torchjd",
    branch="main",
    github_api_url="https://api.github.com",
    access_token=GITHUB_ACCESS_TOKEN,
    file_filter=lambda file_path: file_path.endswith(".py"),
)

# Run the loader; the cells that follow inspect and split its result.
docs = python_code_loader.load()

In [None]:
# Sanity check: display the last loaded document.
docs[-1]

In [None]:
# Number of documents returned by the loader.
len(docs)

In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [None]:
# Splitter configured for Python sources, with chunk_size=4000 and
# chunk_overlap=1000 so neighbouring chunks share some context.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=4000,
    chunk_overlap=1000,
)

In [None]:
# Break every loaded document into chunks with the Python splitter.
chunks = python_splitter.split_documents(documents=docs)

In [None]:
# str_separation = "\n"*8 + "-"*180 + "\n"*8
# print(str_separation.join([document.page_content for document in chunks]))

In [None]:
# Add all chunks to `vector_store` (the in-memory store on a top-to-bottom run)
# and keep the value returned by add_documents.
document_ids = vector_store.add_documents(documents=chunks)

In [None]:
# Ask the vector store for the chunks most similar to a natural-language question.
retrieved_docs = vector_store.similarity_search(
    query="how to project gradients onto the non-conflicting cone of other gradients",
)

In [None]:
# Divider printed after every document: blank lines, a 180-dash rule, blank lines.
str_separation = "\n" * 8 + "-" * 180 + "\n" * 8


def print_docs(documents):
    """Print the origin and the text of each document, followed by a divider.

    Args:
        documents: Iterable of objects exposing a ``metadata`` mapping and a
            ``page_content`` attribute.

    The origin is taken from ``metadata["path"]``. When that key is absent the
    ``"source"`` and then ``"url"`` entries are tried, and a placeholder is
    printed if none of them exists, so documents that carry no path do not
    raise ``KeyError``.
    """
    for doc in documents:
        # First metadata key that is present wins; "path" keeps priority.
        origin = next(
            (doc.metadata[key] for key in ("path", "source", "url") if key in doc.metadata),
            "<unknown source>",
        )
        print(origin)
        print(doc.page_content)
        print(str_separation)

In [None]:
# Print each retrieved chunk with print_docs.
print_docs(retrieved_docs)

In [None]:
from langchain_chroma import Chroma

# Second vector store: a Chroma collection persisted under ./chroma_langchain_db.
# NOTE: this rebinds `vector_store`, so every cell executed after this one talks
# to Chroma instead of the in-memory store created earlier.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
)


def _stable_chunk_ids(documents):
    """Return one deterministic id per chunk, "<path>#<n>", n counting chunks per path."""
    next_index_by_path = {}
    ids = []
    for document in documents:
        path = document.metadata.get("path", "")
        index = next_index_by_path.get(path, 0)
        next_index_by_path[path] = index + 1
        ids.append(f"{path}#{index}")
    return ids


# The collection survives kernel restarts, so adding the chunks without ids
# would store a fresh copy of the whole corpus on every run of this cell.
# Explicit, reproducible ids make a re-run overwrite the existing entries.
# NOTE(review): entries for chunks that no longer exist (a file shrank or was
# removed upstream) are not cleaned up by this — confirm whether that matters.
document_ids = vector_store.add_documents(documents=chunks, ids=_stable_chunk_ids(chunks))

In [None]:
# Same question as before, asked of whatever `vector_store` currently refers to
# (the Chroma collection on a top-to-bottom run).
retrieved_docs = vector_store.similarity_search(
    query="how to project gradients onto the non-conflicting cone of other gradients",
)

In [None]:
# Print each chunk returned by the latest search with print_docs.
print_docs(retrieved_docs)