In [1]:
import logging
import os
import re
import shutil
import sys

from dotenv import load_dotenv
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import GithubFileLoader
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [5]:
# Root logger: show INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [33]:
def load_documents(repository: str) -> list[Document]:
    """Load the Markdown developer docs from the nudgenow/nudge-devdocs GitHub repo.

    Every file on the ``prod_main`` branch whose path starts with ``docs/`` and
    ends with ``.md`` is requested through ``GithubFileLoader``.

    Args:
        repository: Label written to the log line only. The GitHub repo, branch
            and directory that are actually fetched are fixed inside this function.

    Returns:
        The documents produced by ``GithubFileLoader.load()``.

    Raises:
        RuntimeError: If ``GITHUB_PERSONAL_ACCESS_TOKEN`` is not set.
    """
    logging.info(f"Loading documents from repository: {repository}")

    # The GitHub token is read from the environment (optionally populated from
    # a local .env file) so that no secret is stored in the notebook itself.
    load_dotenv()
    access_token = os.environ.get("GITHUB_PERSONAL_ACCESS_TOKEN")
    if not access_token:
        raise RuntimeError(
            "GITHUB_PERSONAL_ACCESS_TOKEN is not set; export it or add it to a "
            ".env file before loading documents."
        )

    loader = GithubFileLoader(
        repo="nudgenow/nudge-devdocs",
        branch="prod_main",
        # Only Markdown files that live under the docs/ directory.
        file_filter=lambda file_path: file_path.endswith(".md")
        and file_path.startswith("docs/"),
        directory=["docs/"],
        access_token=access_token,
    )

    return loader.load()


documents: list[Document] = load_documents("docs")

In [34]:
import json

In [35]:
# Sanity check: pretty-print the metadata of the first five loaded documents.
for sample in documents[:5]:
    pretty_metadata = json.dumps(sample.metadata, indent=2)
    print(pretty_metadata)

{
  "path": "docs/01-getting-started/01-account-setup.md",
  "sha": "3fff097133588d5aca032d0d6a2a2a31ed785aa9",
  "source": "https://api.github.com/nudgenow/nudge-devdocs/blob/prod_main/docs/01-getting-started/01-account-setup.md"
}
{
  "path": "docs/01-getting-started/02-invite-a-team-member.md",
  "sha": "af74f74b000454e87bb71b2d09dc151ddc8171d0",
  "source": "https://api.github.com/nudgenow/nudge-devdocs/blob/prod_main/docs/01-getting-started/02-invite-a-team-member.md"
}
{
  "path": "docs/01-getting-started/03-finding-your-api-keys.md",
  "sha": "172982dfe38301a35e4e785ec5abb1ddbf90d3af",
  "source": "https://api.github.com/nudgenow/nudge-devdocs/blob/prod_main/docs/01-getting-started/03-finding-your-api-keys.md"
}
{
  "path": "docs/02-core-concepts/01-events.md",
  "sha": "166e8a49469abeaa724d39a2f2c805e970948302",
  "source": "https://api.github.com/nudgenow/nudge-devdocs/blob/prod_main/docs/02-core-concepts/01-events.md"
}
{
  "path": "docs/02-core-concepts/02-users.md",
  "sha"

In [31]:
# Root of the published docs site. Page URLs are built by plain string
# concatenation (f"{BASE_URL}{web_path}"), so the trailing slash is required.
BASE_URL = "https://docs.nudgenow.com/"

In [None]:
# Preview pass: derive each document's docs-site URL from its repo path and
# store it under the separate 'path_' key ('path' and 'source' are not modified).
for doc in documents:
    repo_path = doc.metadata['path']
    if not repo_path.startswith("docs/"):
        # 'path' is not a repo-relative docs path (for example, another cell has
        # already replaced it with a web URL). Slicing off five characters
        # blindly would produce a garbage URL, so skip the document instead.
        logging.warning(f"Skipping document with unexpected path: {repo_path}")
        continue

    original_path = repo_path.removeprefix("docs/")
    print(f"Original path: {original_path}")

    # Split the path into components
    path_parts = original_path.split('/')

    # Clean each component by removing the numbering prefix ("01-", "02.")
    cleaned_parts = [re.sub(r'^\d+[\-\.]', '', part) for part in path_parts]

    # Reassemble the path
    cleaned_path = '/'.join(cleaned_parts)

    # Percent-encode spaces so the result is usable as a URL
    cleaned_path = cleaned_path.replace(' ', '%20')

    # Create web URL (remove .md extension)
    web_path = os.path.splitext(cleaned_path)[0]
    web_url = f"{BASE_URL}{web_path}"

    print(f"Cleaned path: {cleaned_path}")
    print(f"Web URL: {web_url}\n")

    # Record the web URL alongside the existing metadata
    doc.metadata['path_'] = web_url

Original path: 01-getting-started/01-account-setup.md
Cleaned path: getting-started/account-setup.md
Web URL: https://docs.nudgenow.com/getting-started/account-setup

Original path: 01-getting-started/02-invite-a-team-member.md
Cleaned path: getting-started/invite-a-team-member.md
Web URL: https://docs.nudgenow.com/getting-started/invite-a-team-member

Original path: 01-getting-started/03-finding-your-api-keys.md
Cleaned path: getting-started/finding-your-api-keys.md
Web URL: https://docs.nudgenow.com/getting-started/finding-your-api-keys

Original path: 02-core-concepts/01-events.md
Cleaned path: core-concepts/events.md
Web URL: https://docs.nudgenow.com/core-concepts/events

Original path: 02-core-concepts/02-users.md
Cleaned path: core-concepts/users.md
Web URL: https://docs.nudgenow.com/core-concepts/users

Original path: 02-core-concepts/03-cohorts.md
Cleaned path: core-concepts/cohorts.md
Web URL: https://docs.nudgenow.com/core-concepts/cohorts

Original path: 02-core-concepts/04

In [36]:
# Replace the GitHub API link stored in each document's metadata with the URL
# of the corresponding page on the docs site.
for doc in documents:
    source = doc.metadata['source']
    if source.startswith(BASE_URL):
        # Already rewritten by an earlier run of this cell. Converting again
        # would prepend BASE_URL to a URL that is already complete.
        continue

    # Keep everything after the first "/docs/" segment. Anchoring on the first
    # occurrence (with the leading slash) preserves nested "docs/" folders and
    # is not confused by a repo name that merely ends in "docs".
    _, docs_marker, original_path = source.partition("/docs/")
    if not docs_marker:
        logging.warning(f"Skipping document with unexpected source: {source}")
        continue
    print(f"Original path: {original_path}")

    # Split the path into components
    path_parts = original_path.split('/')

    # Clean each component by removing the numbering prefix ("01-", "02.")
    cleaned_parts = [re.sub(r'^\d+[\-\.]', '', part) for part in path_parts]

    # Reassemble the path
    cleaned_path = '/'.join(cleaned_parts)

    # Percent-encode spaces so the result is usable as a URL
    cleaned_path = cleaned_path.replace(' ', '%20')

    # Create web URL (remove .md extension)
    web_path = os.path.splitext(cleaned_path)[0]
    web_url = f"{BASE_URL}{web_path}"

    print(f"Cleaned path: {cleaned_path}")
    print(f"Web URL: {web_url}\n")

    # Update the document metadata with the new web URL
    doc.metadata['source'] = web_url
    doc.metadata['path'] = web_url

Original path: 01-getting-started/01-account-setup.md
Cleaned path: getting-started/account-setup.md
Web URL: https://docs.nudgenow.com/getting-started/account-setup

Original path: 01-getting-started/02-invite-a-team-member.md
Cleaned path: getting-started/invite-a-team-member.md
Web URL: https://docs.nudgenow.com/getting-started/invite-a-team-member

Original path: 01-getting-started/03-finding-your-api-keys.md
Cleaned path: getting-started/finding-your-api-keys.md
Web URL: https://docs.nudgenow.com/getting-started/finding-your-api-keys

Original path: 02-core-concepts/01-events.md
Cleaned path: core-concepts/events.md
Web URL: https://docs.nudgenow.com/core-concepts/events

Original path: 02-core-concepts/02-users.md
Cleaned path: core-concepts/users.md
Web URL: https://docs.nudgenow.com/core-concepts/users

Original path: 02-core-concepts/03-cohorts.md
Cleaned path: core-concepts/cohorts.md
Web URL: https://docs.nudgenow.com/core-concepts/cohorts

Original path: 02-core-concepts/04

In [38]:
# Preview a human-readable title for the first five documents.
for doc in documents[:5]:
    if "title" in doc.metadata:
        # A title is already present in the metadata; show it as-is instead of
        # leaving clean_title unset (or stale from the previous iteration).
        clean_title = doc.metadata["title"]
    else:
        # Derive the title from the last path segment: drop a leading "NN-"
        # prefix and any extension, then turn hyphens into spaces and title-case.
        filename = os.path.basename(doc.metadata["source"])
        clean_filename = re.sub(r'^\d+\-', '', filename)
        clean_title = os.path.splitext(clean_filename)[0].replace('-', ' ').title()

    print(f"Cleaned title: {clean_title}")
    # doc.metadata["title"] = clean_title

Cleaned title: Account Setup
Cleaned title: Invite A Team Member
Cleaned title: Finding Your Api Keys
Cleaned title: Events
Cleaned title: Users
