Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 34 additions & 19 deletions backend/danswer/connectors/github_files/connector.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
"""GitHub Files connector — indexes files (default: JSON) sitting at a fixed
depth under a configurable path prefix.
"""GitHub Files connector — indexes files matching a given extension under
a configurable path prefix in a repository.

Matches the layout
Two modes:

<path_prefix>/<single_dir>/<file><extension>
- Fixed-depth (default): matches `<path_prefix>/<single_dir>/<file><extension>`
— i.e. exactly one folder under the prefix, file directly inside. Default
settings target a service-catalog layout:
service-catalog/products/<product>/<file>.json
Anything deeper or shallower is skipped.

i.e. exactly one folder under the prefix, file directly inside that folder.
Default settings target a service-catalog layout:
- Recursive (`recursive=True`): walks every folder under `path_prefix`
(the whole repo if the prefix is empty) and matches by extension at any
depth. Useful for "index all .md files in the repo" style configurations.

service-catalog/products/<product>/<file>.json

Anything deeper or shallower is skipped, as are files at intermediate
directories. The connector reuses the existing GitHub access token credential
shape, so users don't need to re-enter their PAT.
The connector reuses the existing GitHub access token credential shape, so
users don't need to re-enter their PAT.
"""
import time
from datetime import datetime
Expand Down Expand Up @@ -76,6 +78,7 @@ def __init__(
path_prefix: str = _DEFAULT_PATH_PREFIX,
file_extension: str = _DEFAULT_FILE_EXTENSION,
branch: str = "",
recursive: bool = False,
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
self.repo_owner = repo_owner
Expand All @@ -86,6 +89,7 @@ def __init__(
file_extension if file_extension.startswith(".") else f".{file_extension}"
).lower()
self.branch = branch or "" # empty -> use repo's default branch
self.recursive = recursive
self.batch_size = batch_size
self.github_client: Github | None = None

Expand All @@ -112,7 +116,9 @@ def _resolve_branch(self, repo) -> str:

def _list_matching_paths(self, repo, branch: str) -> list[tuple[str, str]]:
"""Walk the git tree once, returning (path, blob_sha) pairs for files
matching `<prefix>/<single_dir>/<file><extension>`."""
matching the configured extension. In fixed-depth mode, only files at
`<prefix>/<single_dir>/<file><extension>` match; in recursive mode,
any file under `<prefix>` (or the repo root) at any depth matches."""
branch_obj = _retry_on_rate_limit(self.github_client, repo.get_branch, branch)
head_sha = branch_obj.commit.sha
tree = _retry_on_rate_limit(
Expand All @@ -129,9 +135,10 @@ def _list_matching_paths(self, repo, branch: str) -> list[tuple[str, str]]:
path = element.path
if prefix and not path.startswith(prefix + "/"):
continue
parts = path.split("/")
if len(parts) != expected_depth:
continue
if not self.recursive:
parts = path.split("/")
if len(parts) != expected_depth:
continue
if not path.lower().endswith(self.file_extension):
continue
results.append((path, element.sha))
Expand Down Expand Up @@ -177,13 +184,20 @@ def _convert_to_document(
# while unchanged files stay deduped across runs.
doc_id = f"{html_url}@{sha}"

# In recursive mode files at different depths can share a filename, so
# use the full repo-relative path as the semantic identifier.
if self.recursive:
semantic_identifier = path
else:
semantic_identifier = (
f"{product_dir}/{filename}" if product_dir else filename
)

return Document(
id=doc_id,
sections=[Section(link=html_url, text=text)],
source=DocumentSource.GITHUB_FILES,
semantic_identifier=f"{product_dir}/{filename}"
if product_dir
else filename,
semantic_identifier=semantic_identifier,
doc_updated_at=doc_updated_at,
metadata={
"repo": repo.full_name,
Expand Down Expand Up @@ -211,7 +225,7 @@ def _fetch_documents(
try:
commits_iter = repo.get_commits(
sha=branch,
path=self.path_prefix or None,
**({"path": self.path_prefix} if self.path_prefix else {}),
**({"since": start} if start else {}),
**({"until": end} if end else {}),
)
Expand Down Expand Up @@ -283,6 +297,7 @@ def poll_source(
path_prefix=os.environ.get("PATH_PREFIX", _DEFAULT_PATH_PREFIX),
file_extension=os.environ.get("FILE_EXTENSION", _DEFAULT_FILE_EXTENSION),
branch=os.environ.get("BRANCH", ""),
recursive=os.environ.get("RECURSIVE", "").lower() in ("1", "true", "yes"),
)
connector.load_credentials(
{"github_access_token": os.environ["GITHUB_ACCESS_TOKEN"]}
Expand Down
45 changes: 37 additions & 8 deletions web/src/app/admin/connectors/github-files/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import * as Yup from "yup";
import { useState } from "react";
import { EditIcon, GithubIcon, TrashIcon } from "@/components/icons/icons";
import { TextFormField } from "@/components/admin/connectors/Field";
import {
BooleanFormField,
TextFormField,
} from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import useSWR, { useSWRConfig } from "swr";
import { errorHandlingFetcher } from "@/lib/fetcher";
Expand Down Expand Up @@ -226,7 +229,11 @@ const Main = () => {
const c = ccPairStatus.connector.connector_specific_config;
const ext = c.file_extension || ".json";
const branch = c.branch ? `@${c.branch}` : "";
return `${c.path_prefix}/<dir>/*${ext}${branch}`;
const root = c.path_prefix || "<repo root>";
const pattern = c.recursive
? `${root}/**/*${ext}`
: `${root}/<dir>/*${ext}`;
return `${pattern}${branch}`;
},
},
]}
Expand All @@ -247,6 +254,9 @@ const Main = () => {
— i.e. exactly one folder under the prefix, file directly inside.
Defaults target a{" "}
<code>service-catalog/products/&lt;product&gt;/*.json</code> layout.
Enable <em>Recursive</em> to walk every folder under the prefix
(or the whole repo if the prefix is blank) and match by extension
at any depth — e.g. <code>*.md</code> across the entire repo.
</Text>

<ConnectorForm<GithubFilesConfig>
Expand All @@ -267,21 +277,29 @@ const Main = () => {
label="Path Prefix:"
subtext={
<>
The folder containing per-product subfolders. Files are
indexed at exactly one level deeper.
The folder to scan. In the default mode, files are
indexed exactly one level deeper (e.g.{" "}
<code>&lt;prefix&gt;/&lt;dir&gt;/*&lt;ext&gt;</code>). In
recursive mode this is the root of the walk; leave blank
to scan the whole repository.
</>
}
/>
<TextFormField
name="file_extension"
label="File Extension:"
subtext="e.g. .json — only files with this extension at the matching depth are indexed."
subtext="e.g. .md — only files with this extension are indexed."
/>
<TextFormField
name="branch"
label="Branch (optional):"
subtext="Leave blank to use the repository's default branch."
/>
<BooleanFormField
name="recursive"
label="Recursive"
subtext="Walk all subfolders under the path prefix and match files by extension at any depth. Leave off for the fixed depth (<prefix>/<dir>/<file>) layout."
/>
</>
}
validationSchema={Yup.object().shape({
Expand All @@ -291,20 +309,31 @@ const Main = () => {
repo_name: Yup.string().required(
"Please enter the name of the repository"
),
path_prefix: Yup.string().required(
"Please enter the path prefix to scan"
),
path_prefix: Yup.string()
.defined()
.test(
"required-unless-recursive",
"Please enter the path prefix to scan",
function (value) {
return (
Boolean(this.parent.recursive) ||
(typeof value === "string" && value.length > 0)
);
}
),
file_extension: Yup.string().required(
"Please enter the file extension to filter on"
),
branch: Yup.string(),
recursive: Yup.boolean(),
})}
initialValues={{
repo_owner: "",
repo_name: "",
path_prefix: "service-catalog/products",
file_extension: ".json",
branch: "",
recursive: false,
}}
refreshFreq={10 * 60}
credentialId={githubCredential.id}
Expand Down
1 change: 1 addition & 0 deletions web/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ export interface GithubFilesConfig {
path_prefix: string;
file_extension: string;
branch?: string;
recursive?: boolean;
}

export interface GitlabConfig {
Expand Down
Loading