# In [1]:
from pathlib import Path

# Directory (relative to the current working directory) that receives the scaffold.
ROOT = Path("rag-pipeline-aws-pinecone-langchain")

# Scaffold layout grouped by concern. Every value is a list of file paths
# relative to ROOT; the groups are flattened into FILES in declaration order.
_SCAFFOLD_SECTIONS = {
    "root": [
        "README.md",
        "LICENSE",
        ".gitignore",
        ".env.example",
        "requirements.txt",
        "pyproject.toml",
        "setup.cfg",
        "Makefile",
    ],
    "project package init": [
        "rag_pipeline/__init__.py",
    ],
    "config": [
        "config/__init__.py",
        "config/settings.py",
        "config/config.yaml",
        "config/logging.yaml",
        "config/secrets.example.yaml",
    ],
    # Local dev layout; prod maps to S3 / Pinecone.
    "data layout": [
        "data/raw/.gitkeep",                # raw PDFs, HTML, DOCX (local)
        "data/processed/.gitkeep",          # cleaned text/json
        "data/chunks/.gitkeep",             # chunked segments
        "data/embeddings_cache/.gitkeep",   # optional local embedding cache
        "data/tmp/.gitkeep",
    ],
    # Step 1: connect sources.
    "ingestion": [
        "rag_pipeline/ingestion/__init__.py",
        "rag_pipeline/ingestion/step1_connect_sources.py",
        "rag_pipeline/ingestion/s3_loader.py",
        "rag_pipeline/ingestion/web_loader.py",
        "rag_pipeline/ingestion/local_loader.py",
        "rag_pipeline/ingestion/confluence_loader.py",
    ],
    # Step 2: extract / parse.
    "parsing": [
        "rag_pipeline/parsing/__init__.py",
        "rag_pipeline/parsing/step2_extract_and_parse.py",
        "rag_pipeline/parsing/pdf_extractor.py",
        "rag_pipeline/parsing/html_extractor.py",
        "rag_pipeline/parsing/docx_extractor.py",
        "rag_pipeline/parsing/text_normalization.py",
    ],
    # Step 3.
    "chunking": [
        "rag_pipeline/chunking/__init__.py",
        "rag_pipeline/chunking/step3_chunk_text.py",
        "rag_pipeline/chunking/chunk_strategies.py",
        "rag_pipeline/chunking/token_counters.py",
    ],
    # Step 4: embeddings / formatting.
    "embedding": [
        "rag_pipeline/embedding/__init__.py",
        "rag_pipeline/embedding/step4_embed_and_format.py",
        "rag_pipeline/embedding/embeddings_pinecone.py",
        "rag_pipeline/embedding/embeddings_aws_bedrock.py",
        "rag_pipeline/embedding/embedding_utils.py",
    ],
    # Step 5: vector + metadata + raw-in-S3.
    "storage": [
        "rag_pipeline/storage/__init__.py",
        "rag_pipeline/storage/step5_store_vectorstore.py",
        "rag_pipeline/storage/pinecone_client.py",
        "rag_pipeline/storage/metadata_store_dynamodb.py",
        "rag_pipeline/storage/s3_raw_store.py",       # sync local PDFs -> S3 and back
    ],
    "retrieval / generation pipeline": [
        "rag_pipeline/retrieval/__init__.py",
        "rag_pipeline/retrieval/retriever.py",
        "rag_pipeline/retrieval/retriever_langgraph.py",
        "rag_pipeline/generation/__init__.py",
        "rag_pipeline/generation/rag_chain.py",
        "rag_pipeline/generation/query_to_response.py",
        "rag_pipeline/generation/prompts.py",
        "rag_pipeline/generation/answer_postprocessing.py",
    ],
    # LangGraph / higher-level orchestration.
    "workflows": [
        "rag_pipeline/workflows/__init__.py",
        "rag_pipeline/workflows/ingest_flow.py",
        "rag_pipeline/workflows/query_flow.py",
    ],
    "api and interfaces": [
        "api/__init__.py",
        "api/fastapi_app.py",
        "api/lambda_handler.py",
        "api/cli_entrypoint.py",
        "api/schemas.py",
    ],
    "scripts for dev / ops": [
        "scripts/bootstrap_dev_env.sh",
        "scripts/run_local_ingest.sh",
        "scripts/run_local_api.sh",
        "scripts/deploy_lambda.sh",
        "scripts/sync_data_s3.sh",          # sync data/ <-> S3 buckets for PDFs/chunks
    ],
    "infra: cdk": [
        "infra/cdk/app.py",
        "infra/cdk/cdk.json",
        "infra/cdk/requirements.txt",
        "infra/cdk/stacks/__init__.py",
        "infra/cdk/stacks/vpc_stack.py",
        "infra/cdk/stacks/rag_lambda_stack.py",
        "infra/cdk/stacks/rag_stepfunctions_stack.py",
        "infra/cdk/stacks/rag_api_stack.py",
        "infra/cdk/stacks/dynamodb_pinecone_stack.py",
        "infra/cdk/stacks/s3_buckets_stack.py",     # buckets: rag-raw-docs, rag-processed-docs, rag-mlflow-artifacts
    ],
    # Optional overrides.
    "infra: cloudformation": [
        "infra/cloudformation/s3_rag_buckets.yml",
        "infra/cloudformation/iam_roles.yml",
        "infra/cloudformation/lambda_rag_api.yml",
    ],
    "orchestration definitions": [
        "orchestration/step_functions/rag_workflow.asl.json",
        "orchestration/step_functions/ingest_workflow.asl.json",
        "orchestration/eventbridge/rules.json",
    ],
    "observability": [
        "observability/logging/structlog_config.py",
        "observability/dashboards/cloudwatch_dashboard.json",
        "observability/alerts/alarms.json",
        "observability/tracing/xray_config.md",
    ],
    # MLflow, pipelines, model registry-like flow for RAG configs.
    "mlops": [
        "mlops/__init__.py",
        "mlops/mlflow_config.py",
        "mlops/mlflow_tracking_server.md",
        "mlops/pipelines/rag_experiment_pipeline.py",
        "mlops/pipelines/eval_pipeline.py",
        "mlops/metrics/rag_eval_metrics.py",
        "mlops/model_registry/rag_config_registry.py",
    ],
    # Gitignored except for the folder marker.
    "mlflow local store": [
        "mlruns/.gitkeep",
    ],
    "docker and containerization": [
        "docker/Dockerfile.api",
        "docker/Dockerfile.worker",
        "docker/docker-compose.yml",
        "docker/nginx.conf",
        "docker/README.md",
    ],
    # Manifests for ArgoCD/Flux style deployment.
    "gitops": [
        "gitops/argocd/app-rag-pipeline.yaml",
        "gitops/kustomize/base/deployment.yaml",
        "gitops/kustomize/base/service.yaml",
        "gitops/kustomize/base/ingress.yaml",
        "gitops/kustomize/overlays/dev/kustomization.yaml",
        "gitops/kustomize/overlays/prod/kustomization.yaml",
        "gitops/README.md",
    ],
    # Cluster-agnostic k8s / helm.
    "deploy": [
        "deploy/kubernetes/rag-api-deployment.yaml",
        "deploy/kubernetes/rag-api-service.yaml",
        "deploy/kubernetes/rag-worker-deployment.yaml",
        "deploy/helm/Chart.yaml",
        "deploy/helm/values.yaml",
        "deploy/helm/templates/deployment.yaml",
        "deploy/helm/templates/service.yaml",
    ],
    "docs": [
        "docs/index.md",
        "docs/architecture-diagram.drawio",
        "docs/rag_flow.md",
        "docs/aws_integration.md",
        "docs/pinecone_setup.md",
        "docs/langchain_patterns.md",
        "docs/langgraph_patterns.md",
        "docs/deployment_guide.md",
        "docs/local_dev_guide.md",
        "docs/testing_strategy.md",
        "docs/data_architecture.md",        # where PDFs, chunks, embeddings, metadata live
        "docs/mlops_gitops.md",
        "docs/docker_mlflow.md",
    ],
    "samples": [
        "samples/input_pdfs/README.md",
        "samples/queries/example_queries.json",
        "samples/example_env/.env.local.example",
    ],
    "tests": [
        "tests/__init__.py",
        "tests/conftest.py",
        "tests/test_ingestion.py",
        "tests/test_parsing.py",
        "tests/test_chunking.py",
        "tests/test_embedding.py",
        "tests/test_storage.py",
        "tests/test_retrieval_generation.py",
        "tests/test_api.py",
        "tests/test_mlflow_integration.py",
        "tests/test_langgraph_flows.py",
    ],
    "dev tooling": [
        ".pre-commit-config.yaml",
        ".flake8",
        ".editorconfig",
    ],
    # GitHub Actions for GitOps / tests / docker builds.
    "ci": [
        ".github/workflows/ci.yml",
        ".github/workflows/cd_gitops.yml",
    ],
}

# Flat list of every scaffold file, in the order the sections are declared above.
FILES = [path for group in _SCAFFOLD_SECTIONS.values() for path in group]


def _seed_if_blank(path, content):
    """Write *content* to *path* when the file is missing or whitespace-only."""
    try:
        # Compare raw bytes so an existing file in any encoding is never
        # decoded (and never overwritten unless it is genuinely blank).
        blank = not path.read_bytes().strip()
    except FileNotFoundError:
        blank = True
    if blank:
        path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8 keeps the output independent of the host locale.
        path.write_text(content, encoding="utf-8")


def main(root: "Path | str | None" = None, files: "list[str] | None" = None) -> None:
    """Create the project scaffold and seed a few starter files.

    Every path in *files* is created (with its parent directories) as an empty
    file under *root* unless it already exists. ``README.md``,
    ``requirements.txt`` and ``.gitignore`` are then filled with starter
    content, but only when they are missing or blank, so re-running the
    function never overwrites edits.

    Args:
        root: Directory that receives the scaffold. Defaults to the
            module-level ``ROOT``.
        files: Paths relative to *root* to create. Defaults to the
            module-level ``FILES``.
    """
    root = ROOT if root is None else Path(root)
    rel_paths = FILES if files is None else files

    for rel_path in rel_paths:
        file_path = root / rel_path
        file_path.parent.mkdir(parents=True, exist_ok=True)
        # Skip existing files so their modification time is left untouched.
        if not file_path.exists():
            file_path.touch()

    # Minimal README
    _seed_if_blank(
        root / "README.md",
        "# RAG Pipeline with AWS, Pinecone, LangChain & LangGraph\n\n"
        "Production-oriented RAG system with:\n"
        "- Data: local `data/` + S3 buckets for raw PDFs, processed text, and MLflow artifacts.\n"
        "- Vector store: Pinecone for embeddings; DynamoDB/S3 for metadata.\n"
        "- MLOps: MLflow (`mlruns/`), evaluation pipelines, and registry-like configs under `mlops/`.\n"
        "- GitOps: Kubernetes/Helm manifests in `gitops/` and `deploy/`, driven by GitHub Actions.\n",
    )

    # requirements
    # NOTE(review): `pinecone-client` appears to have been renamed to
    # `pinecone` on PyPI -- confirm before pinning versions.
    _seed_if_blank(
        root / "requirements.txt",
        "langchain\n"
        "langgraph\n"
        "pinecone-client\n"
        "boto3\n"
        "fastapi\n"
        "uvicorn\n"
        "pydantic\n"
        "python-dotenv\n"
        "pypdf\n"
        "beautifulsoup4\n"
        "tiktoken\n"
        "structlog\n"
        "pytest\n"
        "mlflow\n",
    )

    # .gitignore basics
    # Ignore the *contents* of the generated-data folders but re-include the
    # `.gitkeep` marker in each. A plain `dir/` or `dir/*` rule would also
    # ignore the marker, so the empty folders would never reach the repo.
    kept_dirs = (
        "mlruns",
        "data/raw",
        "data/processed",
        "data/chunks",
        "data/embeddings_cache",
        "data/tmp",
    )
    _seed_if_blank(
        root / ".gitignore",
        "__pycache__/\n"
        "*.pyc\n"
        ".env\n"
        ".venv/\n"
        "venv/\n"
        ".pytest_cache/\n"
        ".mypy_cache/\n"
        ".DS_Store\n"
        + "".join(f"{d}/*\n!{d}/.gitkeep\n" for d in kept_dirs),
    )

    print(f"RAG project scaffold (with data, MLOps, GitOps, Docker) created under: {root.resolve()}")


# Build the scaffold only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


# Output: RAG project scaffold (with data, MLOps, GitOps, Docker) created under: C:\Users\SuryaDeva\Documents\Certifications_202k\Coding\RAG_Mini\dir_create\rag-pipeline-aws-pinecone-langchain
