From dd34fc24e4cda6c5852ce400704ed71da98f3576 Mon Sep 17 00:00:00 2001 From: SunneV Date: Sat, 30 May 2026 17:42:48 +0200 Subject: [PATCH] init project scriber 2.0 - boost by rust --- .github/workflows/ci.yml | 59 +- .github/workflows/release.yml | 85 ++- CHANGELOG.md | 10 + Cargo.toml | 19 + LICENSE | 2 +- README.md | 473 ++++--------- examples/example_pyproject.toml | 48 ++ pyproject.toml | 121 +++- rust/scriber_native/src/import.rs | 622 ++++++++++++++++ rust/scriber_native/src/io.rs | 48 ++ rust/scriber_native/src/lib.rs | 92 +++ rust/scriber_native/src/render.rs | 42 ++ rust/scriber_native/src/scan.rs | 356 ++++++++++ rust/scriber_native/src/score.rs | 817 ++++++++++++++++++++++ src/run.py | 6 - src/scriber/__init__.py | 14 +- src/scriber/__main__.py | 4 + src/scriber/cache.py | 135 ++++ src/scriber/cli.py | 361 ---------- src/scriber/cli/__init__.py | 3 + src/scriber/cli/main.py | 219 ++++++ src/scriber/config.py | 46 -- src/scriber/core.py | 682 ------------------ src/scriber/core/__init__.py | 64 ++ src/scriber/core/config.py | 447 ++++++++++++ src/scriber/core/errors.py | 2 + src/scriber/core/init_config.py | 57 ++ src/scriber/core/matchers.py | 101 +++ src/scriber/core/models.py | 167 +++++ src/scriber/core/open_file.py | 22 + src/scriber/core/root.py | 67 ++ src/scriber/engine/__init__.py | 3 + src/scriber/engine/scorer.py | 302 ++++++++ src/scriber/graph/__init__.py | 3 + src/scriber/graph/builder.py | 138 ++++ src/scriber/graph/languages/__init__.py | 1 + src/scriber/graph/languages/cpp.py | 51 ++ src/scriber/graph/languages/go.py | 48 ++ src/scriber/graph/languages/javascript.py | 51 ++ src/scriber/graph/languages/python.py | 139 ++++ src/scriber/graph/languages/rust.py | 106 +++ src/scriber/native.py | 45 ++ src/scriber/pack.py | 3 + src/scriber/packer/__init__.py | 3 + src/scriber/packer/pack.py | 365 ++++++++++ src/scriber/render.py | 3 + src/scriber/rendering/__init__.py | 3 + src/scriber/rendering/renderer.py | 278 ++++++++ 
src/scriber/scanner/__init__.py | 21 + src/scriber/scanner/files.py | 142 ++++ src/scriber/scanner/scan.py | 75 ++ src/scriber/scanner/scan_py.py | 79 +++ src/scriber/tokens.py | 14 + tests/test_cache.py | 43 ++ tests/test_config_schema.py | 77 ++ tests/test_init_config.py | 89 +++ tests/test_languages.py | 154 ++++ tests/test_native.py | 422 +++++++++++ tests/test_processing_modes.py | 56 -- tests/test_scriber.py | 222 ++++++ tests/test_suite.py | 457 ------------ tests/test_tokens.py | 36 + 62 files changed, 6603 insertions(+), 2017 deletions(-) create mode 100644 Cargo.toml create mode 100644 examples/example_pyproject.toml create mode 100644 rust/scriber_native/src/import.rs create mode 100644 rust/scriber_native/src/io.rs create mode 100644 rust/scriber_native/src/lib.rs create mode 100644 rust/scriber_native/src/render.rs create mode 100644 rust/scriber_native/src/scan.rs create mode 100644 rust/scriber_native/src/score.rs delete mode 100644 src/run.py create mode 100644 src/scriber/__main__.py create mode 100644 src/scriber/cache.py delete mode 100644 src/scriber/cli.py create mode 100644 src/scriber/cli/__init__.py create mode 100644 src/scriber/cli/main.py delete mode 100644 src/scriber/config.py delete mode 100644 src/scriber/core.py create mode 100644 src/scriber/core/__init__.py create mode 100644 src/scriber/core/config.py create mode 100644 src/scriber/core/errors.py create mode 100644 src/scriber/core/init_config.py create mode 100644 src/scriber/core/matchers.py create mode 100644 src/scriber/core/models.py create mode 100644 src/scriber/core/open_file.py create mode 100644 src/scriber/core/root.py create mode 100644 src/scriber/engine/__init__.py create mode 100644 src/scriber/engine/scorer.py create mode 100644 src/scriber/graph/__init__.py create mode 100644 src/scriber/graph/builder.py create mode 100644 src/scriber/graph/languages/__init__.py create mode 100644 src/scriber/graph/languages/cpp.py create mode 100644 
src/scriber/graph/languages/go.py create mode 100644 src/scriber/graph/languages/javascript.py create mode 100644 src/scriber/graph/languages/python.py create mode 100644 src/scriber/graph/languages/rust.py create mode 100644 src/scriber/native.py create mode 100644 src/scriber/pack.py create mode 100644 src/scriber/packer/__init__.py create mode 100644 src/scriber/packer/pack.py create mode 100644 src/scriber/render.py create mode 100644 src/scriber/rendering/__init__.py create mode 100644 src/scriber/rendering/renderer.py create mode 100644 src/scriber/scanner/__init__.py create mode 100644 src/scriber/scanner/files.py create mode 100644 src/scriber/scanner/scan.py create mode 100644 src/scriber/scanner/scan_py.py create mode 100644 src/scriber/tokens.py create mode 100644 tests/test_cache.py create mode 100644 tests/test_config_schema.py create mode 100644 tests/test_init_config.py create mode 100644 tests/test_languages.py create mode 100644 tests/test_native.py delete mode 100644 tests/test_processing_modes.py create mode 100644 tests/test_scriber.py delete mode 100644 tests/test_suite.py create mode 100644 tests/test_tokens.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 24d6934..d0ffdb8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,28 +1,57 @@ -name: Continuous Integration +name: CI on: push: - branches: - - develop + branches: [main, develop] + pull_request: jobs: - run_tests: - runs-on: ubuntu-latest + test: + name: ${{ matrix.os }} / py${{ matrix.python-version }} + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - - name: Checkout code - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + + - name: Rust cache + 
uses: Swatinem/rust-cache@v2 + env: + TAR: ${{ matrix.os == 'windows-latest' && 'C:\Windows\System32\tar.exe' || 'tar' }} - name: Install uv - run: pipx install uv + uses: astral-sh/setup-uv@v5 + with: + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + run: uv python install ${{ matrix.python-version }} - - name: Install dependencies - run: uv pip install -e .[dev] --system + - name: Sync + run: uv sync --all-extras + + - name: Check native import + run: uv run python -c "import scriber._native; print('native ok')" + + - name: Rust format check + run: cargo fmt --check + + - name: Rust clippy + run: cargo clippy --all-targets -- -D warnings + + - name: Rust tests + run: cargo test - name: Run tests - run: pytest \ No newline at end of file + run: uv run pytest + + - name: CLI smoke + run: uv run scriber . --only-tree --output - \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c94a6a9..2887303 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,26 +1,83 @@ -name: Publish to PyPI +name: Release on: push: tags: - - 'v*' # Triggers on any tag starting with v, like v0.0.3 + - "v*" jobs: - build_and_publish: + build: + name: Build ${{ matrix.os }} + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + command: build + args: --release --locked --out dist --compatibility pypi + manylinux: "2014" + sccache: "true" + + - name: Build sdist + if: matrix.os == 'ubuntu-latest' + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Wheel smoke install + shell: bash + run: | + python -m venv test_venv + if [ "${{ matrix.os }}" = "windows-latest" ]; then + test_venv/Scripts/pip 
install dist/*.whl + test_venv/Scripts/python -c "import scriber._native; print('native ok')" + test_venv/Scripts/scriber . --only-tree --output - + else + test_venv/bin/pip install dist/*.whl + test_venv/bin/python -c "import scriber._native; print('native ok')" + test_venv/bin/scriber . --only-tree --output - + fi + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: wheels-${{ matrix.os }} + path: dist + + publish: + name: Publish + needs: [build] runs-on: ubuntu-latest + permissions: - id-token: write # Required for trusted publishing - contents: read # Required to read the repository content + id-token: write + contents: read steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Install uv - run: pipx install uv + - uses: actions/download-artifact@v4 + with: + path: dist-artifacts + pattern: wheels-* + merge-multiple: true - - name: Build distributions - run: uv build + - name: List artifacts + run: ls -la dist-artifacts - - name: Publish to PyPI - run: uv publish \ No newline at end of file + - name: Publish + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist-artifacts \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cf07d6..bcd7481 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [2.0.0] - 2026-05-30 + +### Added +- **⚑ Native Rust Acceleration (`scriber._native`)**: Full transition of filesystem scanning, high-performance file reading/writing, and binary classification to a compiled Rust extension built using Maturin and PyO3. +- **🌳 Fast Parallel Scanner**: Re-engineered directory scanning utilizing the `WalkBuilder` from the `ignore` crate, fully respecting `.gitignore` rules with blazing fast native execution. 
+- **🧪 Rigorous Verification & Equivalence Testing**: Comprehensive suite of regression and equivalence tests validating 100% exact matching behavior between Rust and Python scanner modules. +- **📦 Multi-Platform Binary Wheels**: CI/CD integration using `PyO3/maturin-action` to compile and distribute native wheels across Linux, macOS, and Windows. + + ## [1.1.2] - 2025-09-30 ### Fixed diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..91e0426 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "project-scriber-native" +version = "2.0.0" +edition = "2021" + +[lib] +name = "_native" +crate-type = ["cdylib"] +path = "rust/scriber_native/src/lib.rs" + +[dependencies] +pyo3 = { version = "0.21", features = ["extension-module", "abi3-py310"] } +ignore = "0.4" +globset = "0.4" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +memchr = "2.7" +regex = "1.10" + diff --git a/LICENSE b/LICENSE index 222cae3..8039da5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 SunneV (Wojciech Mariusz Cichoń) +Copyright (c) 2026 SunneV Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index a9849e2..7dc7d1a 100644 --- a/README.md +++ b/README.md @@ -9,50 +9,44 @@ PyPI Version

-An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for -Large Language Models (LLMs), available as both a powerful CLI and a flexible Python library. +An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for Large Language Models (LLMs). **Version 2.0** brings advanced dependency graph analysis, strict whitelist-based file inclusion, zero-dependency lightweight execution, and progress tracking! ----- ## 📖 Table of Contents -- [🤔 Why ProjectScriber?](#-why-projectscriber) +- [🤔 Why ProjectScriber 2.0?](#-why-projectscriber-20) - [✨ Key Features](#-key-features) - [🚀 Quick Start](#-quick-start) - [💾 Installation](#-installation) - [🖥️ Command-Line Usage](#️-command-line-usage) -- [📚 Library Usage (API)](#-library-usage-api) - [⚙️ Configuration](#️-configuration) - [🤝 Contributing & Development](#-contributing--development) ----- -## 🤔 Why ProjectScriber? +## 🤔 Why ProjectScriber 2.0? -When working with Large Language Models, providing the full context of a codebase is crucial for getting accurate -analysis, documentation, or refactoring suggestions. Manually copying and pasting files is tedious, error-prone, and -unsustainable for projects of any real size. **ProjectScriber automates this entire process.** It intelligently scans -your project, respects your existing -`.gitignore` rules, applies custom filters, and bundles all relevant code into a single, clean, and readable format -perfect for any AI model. +When working with Large Language Models, providing the full context of a codebase is crucial for getting accurate analysis, documentation, or refactoring suggestions. However, blindly pasting an entire project wastes tokens and introduces noise. + +**ProjectScriber 2.0** automates context building using a **Whitelist-First** philosophy and an **Intelligent Scoring Engine**.
It analyzes your codebase's dependency graph (e.g., Python imports), determines which files are most relevant to the code you're working on, and bundles them into a single, clean markdown file, strictly respecting your token budgets and file-type configurations.

- πŸ“ Your Codebase β†’ πŸ“¦ ProjectScriber β†’ πŸ“‹ LLM-Ready Context + πŸ“ Your Codebase β†’ πŸ“¦ ProjectScriber 2.0 β†’ πŸ“‹ LLM-Ready Context

----- ## ✨ Key Features -|Feature |Description | -|:-------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------| -| **🌳 Smart Project Mapping** | Generates a clear and intuitive tree view of your project's structure. | -| **βš™οΈ Intelligent Filtering** | Automatically respects `.gitignore` and supports custom `include`, `exclude`, and `hidden` patterns using `.gitignore`-style syntax for precise control. | -| **πŸ“Š In-depth Code Analysis** | Provides a summary with total file size, estimated token count (using `cl100k_base`), and a language breakdown. | -| **🐍 Flexible Python Library** | Import and use the `Scriber` class directly in your Python projects for full programmatic control. | -| **✨ Interactive CLI** | A simple `scriber init` command walks you through creating a configuration file for your project. | -| **πŸ“‹ Clipboard Integration** | Use the `--copy` or `--copy-only` flags to automatically send the entire output to your clipboard, ready for pasting. | -| **πŸ’¨ Lightweight & Fast** | The default installation is minimal, and file analysis is multi-threaded for improved performance. A single-process mode is available for compatibility. | +| Feature | Description | +|:---|:---| +| **🌳 Smart Project Mapping** | Generates a clear and intuitive tree view of your project's structure. | +| **⚑ Native Rust Acceleration** | Accelerates heavy I/O and directory scanning natively via a high-performance Rust backend. | +| **πŸ›‘οΈ Whitelist Philosophy** | By default, only recognized code and support files are included. Binary and lock files are automatically ignored. | +| **🧠 Intelligent Scoring Engine** | Analyzes import graphs and file proximity to prioritize code modules that are directly related to your provided seed files. | +| **πŸ’° Token Budgets** | Set a hard limit on `--max-tokens`. 
Scriber will fit the most relevant files within your budget to save API costs. | +| **πŸ“Š Live Progress & Stats** | Built-in zero-dependency progress spinner and detailed statistics summary at the end of the run. | ----- @@ -62,72 +56,52 @@ perfect for any AI model. ```shell pip install project-scriber + ``` + +2. **Navigate to your project's root and initialize config:** - ```` + ```shell + scriber --init + ``` + *(This appends a `[tool.scriber]` block to your `pyproject.toml`. Use `--force` to overwrite it.)* -2. **Navigate to your project's root and run:** +3. **Pack your context!** Just point it to a file, folder, or let it scan the whole project: ```shell - scriber + scriber src/main.py --output context.md ``` -3. **That's it\!** A `scriber_output.txt` file is now in your directory. It will look something like this: - - ````text - === - Mapped Folder Structure - === - - ProjectScriber - β”œβ”€β”€ .github - β”‚ └── workflows - β”‚ β”œβ”€β”€ ci.yml - β”‚ └── release.yml - β”œβ”€β”€ README.md - └── src - └── scriber - β”œβ”€β”€ __init__.py - └── core.py - - --- - File: .github/workflows/ci.yml - Size: 512 bytes - --- - ```yaml - name: Continuous Integration - - on: - push: - branches: - - develop - - jobs: - run_tests: - ... - ```` +4. **Review your stats:** + ```text + Scriber build completed. + ---------------------------------------- + Code files included: 15 + Support files included: 4 + Files omitted/skipped: 2 + Estimated tokens: 12500 + ---------------------------------------- + Scriber pack written to: context.md + ``` ----- ## πŸ’Ύ Installation -You have two options for installation. - -#### Standard Installation - -This provides the core functionality with a minimal, text-based interface. +ProjectScriber distributes pre-compiled binary wheels for Linux, macOS, and Windows. 
A simple pip command is all you need: ```shell pip install project-scriber ``` -#### With Rich UI ✨ - -For an enhanced terminal experience with colors, tables, and progress bars, install the `rich` extra: +Or if you use `uv`: ```shell -pip install project-scriber[rich] +uv pip install project-scriber ``` +> [!NOTE] +> If a pre-compiled wheel is not available for your platform/architecture, the package will automatically build from source, which requires a Rust compiler toolchain (Rust 1.70+) installed on your machine. + ----- ## πŸ–₯️ Command-Line Usage @@ -136,315 +110,140 @@ pip install project-scriber[rich] - **Scan the current directory**: ```shell - scriber + scriber . ``` -- **Scan a different directory**: +- **Scan a specific file and its dependencies**: ```shell - scriber /path/to/your/project + scriber src/my_module.py ``` -- **Interactive Setup**: Create a configuration file (`.scriber.json` or `pyproject.toml`) for your project. +- **Interactive Setup**: Create/Append a default configuration in `pyproject.toml` (use `--force` to overwrite it). ```shell - scriber init + scriber --init ``` ### CLI Options -|Option | Alias | Description | -|:------------------|:------|:--------------------------------------------------------------------------------------------------------| -| `root_path` | | The project directory to map. Defaults to the current directory. | -| `--output [file]` | `-o` | Set a custom name for the output file. | -| `--config [path]` | | Path to a custom config file (e.g., a `pyproject.toml` in a monorepo). | -| `--copy` | `-c` | Copy the final output to the clipboard in addition to saving it. | -| `--copy-only` | | Generate the output and copy it to the clipboard without saving to a file. | -| `--tree-only` | | Generate only the file tree structure, without any file content. | -| `--single-process`| | Run file analysis in a single process. Recommended for use in environments like Celery. 
| -| `--version` | `-v` | Show the installed version of ProjectScriber. | -| `--help` | `-h` | Display the help message. | - -### Advanced Example - -Scan another project, save the output to `custom_map.txt`, and copy the result to the clipboard in one go: - -```shell -scriber ../my-other-project --output custom_map.txt --copy -``` - ------ - -## πŸ“š Library Usage (API) - -Use `ProjectScriber` directly in your Python code for maximum flexibility and automation. - -### Basic Example: Get Context as a String - -Initialize `Scriber`, and it will automatically handle mapping and analysis. - -```python -from pathlib import Path -from scriber import Scriber # The class is exposed for direct import - -# 1. Initialize Scriber for the current directory -scriber = Scriber(root_path=Path('.')) - -# 2. Get the complete output directly as a string -project_context = scriber.get_output_as_string() - -# 3. Use the context for your application -print(f"Generated context of {len(project_context)} characters.") - -# 4. Access the calculated statistics -stats = scriber.get_stats() -print(f"Total files mapped: {stats['total_files']}") -print(f"Estimated tokens: {stats['total_tokens']:,}") -``` - -### Advanced Configuration via ScriberConfig - -Bypass all on-disk configuration files by passing a `ScriberConfig` object directly to the constructor. This is perfect -for dynamic or controlled environments. - -```python -from pathlib import Path -from scriber import Scriber, ScriberConfig - -# 1. Create a config object and customize it -config = ScriberConfig() -config.single_process = True -config.exclude.append("tests/") -config.exclude.append("assets/scriber_*") - -# 2. Initialize Scriber with the root path and config object -current_directory = Path('.').resolve() -scriber = Scriber(root_path=current_directory, config=config) - -# 3. 
Get the output -project_context = scriber.get_output_as_string() -print(project_context) -``` - -### Scanning Multiple Directories - -You can pass a list of paths to the `Scriber` constructor to map multiple directories into a single output. The first -path in the list is treated as the "primary root" for loading configurations (`.gitignore`, `pyproject.toml`, etc.). - -```python -from pathlib import Path -from scriber import Scriber - -# Example: Scan both a 'backend' and a 'frontend' directory -backend_path = Path('./my_backend_project') -frontend_path = Path('./my_frontend_project') - -# Create dummy directories and files for the example -backend_path.mkdir(exist_ok=True) -(backend_path / "main.py").write_text("print('hello from backend')") -frontend_path.mkdir(exist_ok=True) -(frontend_path / "app.js").write_text("console.log('hello from frontend')") - -# Initialize with a list of paths. `backend_path` is the primary root. -scriber = Scriber(root_path=[backend_path, frontend_path]) - -# Get the combined context as a single string -combined_context = scriber.get_output_as_string() -print(combined_context) - -# The output will contain two separate trees and file content blocks, -# with file paths prefixed by their root folder's name. -``` - -### Accessing Intermediate Data - -You can also access the generated file tree and the list of mapped files before the final output is compiled. 
- -```python -from pathlib import Path -from scriber import Scriber - -scriber = Scriber(root_path=Path('.')) - -# Get just the formatted file tree -tree_representation = scriber.get_tree() -print("--- Project Tree ---") -print(tree_representation) - -# Get a list of all mapped file paths -print("\n--- Mapped Files ---") -file_paths = scriber.get_mapped_files() -for path in file_paths: - print(path.relative_to(scriber.primary_root)) -``` - -### Practical Example: Preparing Context for an LLM - -Here's a small function demonstrating how you can use ProjectScriber to generate a complete, well-formatted prompt for -an LLM. - -```python -from pathlib import Path -from scriber import Scriber - - -def get_llm_context(project_path: Path, task: str) -> str: - ''' - Generates a complete project context string ready for an LLM. - - Args: - project_path: The root directory of the project. - task: The specific task you want the LLM to perform. - - Returns: - A formatted string to be used as a prompt for an LLM. - ''' - # Initialize Scriber and get the project map - scriber = Scriber(root_path=project_path) - project_map = scriber.get_output_as_string() - - # Get some stats for the context header - stats = scriber.get_stats() - token_count = stats.get("total_tokens", 0) - - # Assemble the final prompt for the LLM - prompt = ( - f"Please perform the following task: {task}\n\n" - f"Here is the full context of the project codebase. " - f"It includes a file tree and the content of all relevant files.\n" - f"Estimated Token Count: {token_count:,}\n\n" - "--- PROJECT CONTEXT BEGINS ---\n" - f"{project_map}" - "--- PROJECT CONTEXT ENDS ---" - ) - - return prompt - - -# --- Usage --- -if __name__ == "__main__": - my_project_path = Path('.') - user_task = "Analyze the code for potential bugs and suggest improvements." - llm_prompt = get_llm_context(my_project_path, user_task) - - print(llm_prompt) - - # Now you can send `llm_prompt` to your favorite LLM API. 
-``` +| Option | Description | +|:---|:---| +| `paths` | Project file/folder paths used as seeds. Defaults to current directory `.`. | +| `--config [path]` | Path to `pyproject.toml`. Its parent directory becomes the project root. | +| `--path-base [base]`| Base for relative paths: `project` (default) or `cwd`. | +| `--format [md, txt]` | Output format. Defaults to `md` (Markdown). | +| `--output [file]` | Output file path. Use `-` for stdout. | +| `--dry-run` | Show pack summary without writing the output file. | +| `--open` | Open the generated file in the default editor. | +| `--validate-config`| Validate the `[tool.scriber]` configuration and exit. | +| `--only-tree` | Render only the scored tree/map, without any file contents. | +| `--[no-]modules` | Enable/Disable automatic related module selection (dependency graph scanning). | +| `--[no-]support` | Enable/Disable support files (like `.env.example`, `.github/workflows`). | +| `--support-content` | Override support file content policy (`full`, `auto`, `tree_only`). | +| `--max-files` | Maximum number of files in the pack. | +| `--max-tokens` | Approximate token budget using char-based estimation. `0` disables budget. | +| `--min-score` | Minimum relevance score (0-100) for non-seed files to be included. | +| `--init` | Append a default `[tool.scriber]` config to `pyproject.toml` and exit. | +| `--force` | Force overwrite of the config block when used with `--init`. | +| `--version` | Show program's version number and exit. | ----- ## βš™οΈ Configuration -ProjectScriber is configured via a file in your project's root. It searches for configurations in the following order of -precedence: +ProjectScriber 2.0 configures itself through the standard `pyproject.toml` using the `[tool.scriber]` table. +Generate the default block using: -1. **Direct `config` object/dictionary** (Library mode only). -2. **`--config [path]` flag** (CLI mode only). -3. **`.scriber.json`** in the project root. -4. 
**`[tool.scriber]`** section in `pyproject.toml`. -5. **Default Behavior**: If no file is found, a default configuration is used, and a `.scriber.json` may be created to - guide you. - -### Configuration Keys - -|Key |Type |Default |Description | -|:----------------|:--------|:-----------------------|:------------------------------------------------------------------------------------------------------------------------------------------------| -| `use_gitignore` | boolean | `true` |If `true`, all patterns in the `.gitignore` file will be used for exclusion. | -| `exclude` |list |See `config.py` |A list of file/folder names or `.gitignore`-style patterns to exclude globally (e.g., `"node_modules"`, `"*.log"`, `build/`). | -| `include` |list |`[]` |If not empty, **only** files matching these `.gitignore`-style patterns will be included. | -| `hidden` |list |`[]` |Files matching these patterns will appear in the tree but their content will be replaced with a placeholder. Useful for large lock files. | -| `exclude_map` |object |`{}` |A dictionary for language-specific and global exclusion patterns. See example below. | -| `output` |string |`"scriber_output.txt"` |The default name for the output file. | -| `single_process`|boolean |`false` |If `true`, runs file analysis in a single process. This is slower but required for environments like Celery that do not allow child processes. | +```shell +scriber --init +``` -### Example `pyproject.toml` Configuration +### Example `pyproject.toml` -Here is an example of a well-configured `[tool.scriber]` section in your `pyproject.toml` file: +> [!NOTE] +> This is a minimal example. Run `scriber --init` to generate the full default configuration. 
```toml [tool.scriber] -# Respect the project's .gitignore file -use_gitignore = true - -# Globally exclude common folders and file types using gitignore-style patterns -exclude = [ - "__pycache__/", - "node_modules/", - "dist/", - "build/", - ".venv/", +format = "md" +max_tokens = 0 # 0 means unlimited +max_files = 0 # 0 means unlimited +only_tree = false # If true, file contents are omitted +allow_external_paths = false + +[tool.scriber.modules] +enabled = true +content_min_score = 50 + +[tool.scriber.tokens] +estimator = "chars" +chars_per_token = 4 + +[tool.scriber.code_files] +# Only files matching these are considered "Code" +patterns = [ + "**/*.py", + "**/*.js", + "**/*.ts", + "**/*.tsx" ] -# Only include files with these extensions -include = [ - "*.py", - "*.js", - "*.css", - "*.md" +[tool.scriber.support_files] +enabled = true +# Only files matching these are considered "Support" +patterns = [ + "pyproject.toml", + "Dockerfile", + "**/*.svg" ] -# Show these files in the tree, but hide their content -hidden = [ - "poetry.lock" +[tool.scriber.support_files.content] +default = "auto" +auto_max_bytes = 10000 +full = [ + "pyproject.toml", + "requirements.txt", + "README.md" +] +tree_only = [ + "**/*.svg" ] -# Run in a single process to prevent issues in certain environments -single_process = false - -# Language-specific and global exclusion rules -[tool.scriber.exclude_map] -# Exclude these patterns from all files -global = ["*.log", "*.tmp"] -# In Python files, exclude tests and setup scripts -python = ["*_test.py", "setup.py"] -# In JavaScript files, exclude spec files -javascript = ["*.spec.js"] +[tool.scriber.hard_ignore] +# Folders ignored entirely during the initial scan +patterns = [ + ".git/**", + "__pycache__/**", + "node_modules/**", + ".venv/**" +] ``` -> **πŸ’‘ Note on Pattern Matching:** The `exclude` and `include` options support `.gitignore`-style pattern matching. 
This -allows for more precise rules, such as matching directories only (e.g., `build/`), root-level files (e.g., -`/config.yaml`), or standard wildcards (`*.log`). +### Whitelist Policy +ProjectScriber 2.0 uses a strict **whitelist** approach: +1. Files must match either a `code_pattern` or a `support_pattern` to be considered. +2. Unrecognized extensions and binary files are automatically excluded, keeping your LLM context safe from binary garbage. +3. Lock files are included in the tree by default, but their contents are omitted to save tokens. +4. Support files can be marked as `tree_only` (e.g., `**/*.svg`), meaning they'll show up in the project map but their contents won't be read. ----- ## 🀝 Contributing & Development -Contributions are welcome\! If you have a suggestion or find a bug, please open an issue to discuss it first. +Contributions are welcome! ### Development Setup -1. **Prerequisites**: - - * Python 3.10 or higher. - -2. **Clone the Repository**: - +1. **Clone the Repository**: ```shell git clone https://github.com/SunneV/ProjectScriber.git + cd ProjectScriber ``` -3. **Navigate to the Project Directory**: - +2. **Install Dependencies & Compile Extension** (using `uv` is recommended): ```shell - cd ProjectScriber + uv sync --all-extras ``` + *(This synchronizes the virtual environment and compiles the native Rust extension automatically!)* -4. **Install Dependencies**: - Choose one of the following methods to install the project in editable mode with all development dependencies. - - * **Using `pip`**: - - ```shell - pip install -e .[dev] - ``` - - * **Using `uv`** (Recommended): - - ```shell - uv pip install -e .[dev] - ``` - -### Running Tests - -Run the test suite using `pytest`: - -```shell -pytest -``` \ No newline at end of file +3. 
**Run Tests**: + ```shell + uv run pytest + ``` \ No newline at end of file diff --git a/examples/example_pyproject.toml b/examples/example_pyproject.toml new file mode 100644 index 0000000..15dc70e --- /dev/null +++ b/examples/example_pyproject.toml @@ -0,0 +1,48 @@ +[tool.scriber] +version = "2" +format = "md" +output = ".scriber/scriber_pack.md" +use_gitignore = true +max_files = 60 +max_tokens = 100000 +min_score = 45 + +[tool.scriber.code_files] +patterns = ["**/*.py", "**/*.pyi", "**/*.rs", "**/*.ts", "**/*.tsx", "**/*.js", "**/*.jsx"] + +[tool.scriber.support_files] +enabled = true +patterns = [ + "pyproject.toml", + "README.md", + "requirements.txt", + "requirements/*.txt", + "poetry.lock", + "uv.lock", + ".env.example", + "Dockerfile", + "docker-compose.yml", + ".github/workflows/*.yml", +] + +[tool.scriber.support_files.content] +default = "auto" +full = ["pyproject.toml", "README.md", "requirements.txt", "requirements/*.txt", ".env.example", "Dockerfile", "docker-compose.yml", ".github/workflows/*.yml"] +tree_only = ["poetry.lock", "uv.lock"] + +[tool.scriber.modules] +enabled = true +depth = 2 +include_direct_dependencies = true +include_reverse_dependencies = true +include_tests = true +include_same_package = true +include_parent_entrypoints = true +include_project_configs = true +content_min_score = 50 +tree_min_score = 30 + +[tool.scriber.python] +source_roots = ["src", "app", "."] +test_roots = ["tests", "test"] +entrypoint_patterns = ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"] diff --git a/pyproject.toml b/pyproject.toml index 86f56ef..6339246 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,53 +1,106 @@ +[build-system] +requires = ["maturin>=1.7,<2"] +build-backend = "maturin" + [project] name = "project-scriber" -version = "1.1.2" -authors = [ - { name="SunneV (Wojciech Mariusz Cichoń)", email="wojciech.m.cichon@gmail.com" }, -] -description = "An intelligent tool to map, analyze, and compile project source code
for LLM context." +version = "2.0.0" +description = "Scriber 2.0: build intelligent code packs from one or more project paths." readme = "README.md" requires-python = ">=3.10" -license = { file="LICENSE" } -keywords = ["llm", "code-analysis", "developer-tools", "context-builder", "source-code"] +license = { text = "MIT" } +authors = [ + { name = "SunneV" } +] +keywords = ["code-context", "llm", "project-map", "developer-tools"] classifiers = [ - "Programming Language :: Python :: 3", + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Documentation", "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Utilities", ] dependencies = [ - "pathspec", - "python-dotenv", - "tiktoken", - "pyperclip", - "tomlkit", - "tomli; python_version < '3.11'", + "tomli>=2.0; python_version < '3.11'", ] -[project.urls] -Homepage = "https://github.com/SunneV/ProjectScriber" -Issues = "https://github.com/SunneV/ProjectScriber/issues" +[project.optional-dependencies] +dev = [ + "pytest>=8", + "maturin>=1.7,<2", +] [project.scripts] scriber = "scriber.cli:main" -[project.optional-dependencies] -rich = ["rich"] -dev = [ - "pytest", - "pytest-mock", - "rich" +[tool.maturin] +python-source = "src" +module-name = "scriber._native" +features = ["pyo3/extension-module", "pyo3/abi3-py310"] + +[tool.pytest.ini_options] +addopts = "-q" +testpaths = ["tests"] + +[tool.scriber] +version = "2" +format = "md" +output = ".scriber/scriber_pack.md" +only_tree = false +use_gitignore = true +max_files = 60 +max_tokens = 100000 +min_score = 45 +path_style = "project-relative" +allow_external_paths = false + 
+[tool.scriber.code_files] +patterns = ["**/*.py", "**/*.pyi", "**/*.rs", "**/*.js", "**/*.jsx", "**/*.ts", "**/*.tsx"] + +[tool.scriber.support_files] +enabled = true +patterns = [ + "**/*.toml", + "**/*.lock", + "pyproject.toml", + "README.md", + "requirements.txt", + "requirements/*.txt", + ".env.example", + "Dockerfile", + "docker-compose.yml", + ".github/workflows/*.yml", + "**/*.svg", ] -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" +[tool.scriber.support_files.content] +default = "auto" +full = ["**/*.toml", "pyproject.toml", "README.md", "requirements.txt", "requirements/*.txt", ".env.example", "Dockerfile", "docker-compose.yml", ".github/workflows/*.yml"] +tree_only = ["**/*.svg", "**/*.lock"] -[tool.hatch.build.targets.wheel] -packages = ["src/scriber"] +[tool.scriber.modules] +enabled = true +depth = 2 +include_direct_dependencies = true +include_reverse_dependencies = true +include_tests = true +include_same_package = true +include_parent_entrypoints = true +include_project_configs = true +content_min_score = 50 +tree_min_score = 30 -[tool.pytest.ini_options] -pythonpath = [ - "." 
-] \ No newline at end of file +[tool.scriber.python] +source_roots = ["src", "app", "."] +test_roots = ["tests", "test"] +entrypoint_patterns = ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"] + +[tool.scriber.tokens] +estimator = "chars" +chars_per_token = 4 diff --git a/rust/scriber_native/src/import.rs b/rust/scriber_native/src/import.rs new file mode 100644 index 0000000..c86a963 --- /dev/null +++ b/rust/scriber_native/src/import.rs @@ -0,0 +1,622 @@ +use crate::scan::NativeFileInfo; +use pyo3::prelude::*; +use regex::Regex; +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +#[pyclass] +#[derive(Clone, Debug)] +pub struct NativeImportEdge { + #[pyo3(get)] + pub from: String, + #[pyo3(get)] + pub to: String, + #[pyo3(get)] + pub kind: String, +} + +fn is_under(relative: &str, root: &str) -> bool { + if root.is_empty() || root == "." { + return true; + } + let rel_parts: Vec<&str> = relative.split('/').collect(); + let root_parts: Vec<&str> = root.split('/').collect(); + if rel_parts.len() < root_parts.len() { + return false; + } + for i in 0..root_parts.len() { + if rel_parts[i] != root_parts[i] { + return false; + } + } + true +} + +fn relative_to_root(relative: &str, root: &str) -> String { + if root.is_empty() || root == "." { + return relative.to_string(); + } + let rel_parts: Vec<&str> = relative.split('/').collect(); + let root_parts: Vec<&str> = root.split('/').collect(); + rel_parts[root_parts.len()..].join("/") +} + +fn module_name_for_file( + relative: &str, + source_roots: &[String], + module_init_files: &[String], +) -> Option { + let mut roots = source_roots.to_vec(); + roots.sort_by_key(|r| if r == "." 
{ 0 } else { r.len() }); + roots.reverse(); + + for r in roots { + if !is_under(relative, &r) { + continue; + } + let under = relative_to_root(relative, &r); + if under.is_empty() { + continue; + } + let p = Path::new(&under); + let file_name = p.file_name()?.to_str()?; + if file_name.ends_with(".py") || file_name.ends_with(".pyi") { + let mut parts: Vec = Vec::new(); + if let Some(parent) = p.parent() { + for c in parent.components() { + parts.push(c.as_os_str().to_string_lossy().to_string()); + } + } + if !module_init_files.contains(&file_name.to_string()) { + if let Some(stem) = p.file_stem() { + parts.push(stem.to_string_lossy().to_string()); + } + } + if parts.is_empty() { + continue; + } + return Some(parts.join(".")); + } + } + None +} + +fn resolve_relative_module( + current_module: &str, + current_is_init: bool, + level: usize, + module: &str, +) -> String { + if level == 0 { + return module.to_string(); + } + let mut parts: Vec<&str> = current_module.split('.').collect(); + if !current_is_init && !parts.is_empty() { + parts.pop(); + } + let up = level.saturating_sub(1); + if up < parts.len() { + parts.truncate(parts.len() - up); + } else { + parts.clear(); + } + if !module.is_empty() { + for part in module.split('.') { + parts.push(part); + } + } + parts.join(".") +} + +fn normalize_posix_path(path: &str) -> String { + let mut parts = Vec::new(); + for part in path.split('/') { + if part.is_empty() || part == "." { + continue; + } + if part == ".." 
{ + parts.pop(); + } else { + parts.push(part); + } + } + parts.join("/") +} + +#[pyfunction] +pub fn build_import_graph( + root: &str, + files: Vec, + python_source_roots: Vec, + python_module_init_files: Vec, +) -> PyResult> { + let mut edges = Vec::new(); + if files.is_empty() { + return Ok(edges); + } + + let absolute_to_file: HashMap = + files.iter().map(|f| (f.relative.clone(), f)).collect(); + + let mut dir_to_files: HashMap> = HashMap::new(); + for file in &files { + let parent = Path::new(&file.relative) + .parent() + .unwrap_or(Path::new("")) + .to_string_lossy() + .replace("\\", "/"); + dir_to_files + .entry(parent) + .or_default() + .push(file.relative.clone()); + } + + // Pre-calculate Python module map + let mut module_to_path: HashMap = HashMap::new(); + let mut path_to_module: HashMap = HashMap::new(); + for file in &files { + if let Some(mod_name) = module_name_for_file( + &file.relative, + &python_source_roots, + &python_module_init_files, + ) { + path_to_module.insert(file.relative.clone(), mod_name.clone()); + module_to_path + .entry(mod_name) + .or_insert_with(|| file.relative.clone()); + } + } + + // Go module resolution + let mut go_module_name = None; + let go_mod_path = Path::new(root).join("go.mod"); + if go_mod_path.exists() { + if let Ok(content) = std::fs::read_to_string(go_mod_path) { + let go_mod_re = Regex::new(r"(?m)^\s*module\s+(\S+)").unwrap(); + if let Some(m) = go_mod_re.captures(&content) { + go_module_name = Some(m.get(1).unwrap().as_str().to_string()); + } + } + } + + // Regex compile + let py_import_re = Regex::new(r"(?m)^\s*import\s+([a-zA-Z0-9_.,\t ]+)").unwrap(); + let py_from_paren_re = + Regex::new(r"(?m)^\s*from\s+(\.+[a-zA-Z0-9_.]*|[a-zA-Z0-9_.]+)\s+import\s+\(([^)]+)\)") + .unwrap(); + let py_from_simple_re = Regex::new( + r"(?m)^\s*from\s+(\.+[a-zA-Z0-9_.]*|[a-zA-Z0-9_.]+)\s+import\s+([a-zA-Z0-9_.,\t ]+)", + ) + .unwrap(); + + let js_import_re = 
Regex::new(r#"(?:import|export)\s+(?:[\w*\s{},]*\s+from\s+)?['"]([^'"]+)['"]|require\s*\(\s*['"]([^'"]+)['"]\s*\)"#).unwrap(); + + let rust_mod_re = Regex::new(r"\bmod\s+(\w+)\s*;").unwrap(); + let rust_use_re = Regex::new(r"\buse\s+([^;]+)\s*;").unwrap(); + + let go_import_single_re = Regex::new(r#"\bimport\s+['"]([^'"]+)['"]"#).unwrap(); + let go_import_block_re = Regex::new(r"(?s)\bimport\s*\(([^)]+)\)").unwrap(); + + let cpp_include_re = Regex::new(r#"#include\s*["<]([^">]+)[">]"#).unwrap(); + + for file in &files { + if file.kind != "code" || file.is_binary { + continue; + } + + let file_abs_path = Path::new(root).join(&file.relative); + let mut source = match std::fs::read(&file_abs_path) { + Ok(bytes) => String::from_utf8_lossy(&bytes).to_string(), + Err(_) => continue, + }; + + if file.language == "python" { + let normalized = source.replace("\r\n", "\n"); + let mut clean = String::new(); + for line in normalized.lines() { + if let Some(idx) = line.find('#') { + clean.push_str(&line[..idx]); + } else { + clean.push_str(line); + } + clean.push('\n'); + } + source = clean.replace("\\\n", " "); + } + + let mut resolved_set = HashSet::new(); + + if file.language == "python" { + if let Some(current_module) = path_to_module.get(&file.relative) { + let current_is_init = file.relative.ends_with("__init__.py"); + + // Parse standard imports + for cap in py_import_re.captures_iter(&source) { + if let Some(m) = cap.get(1) { + for alias in m.as_str().split(',') { + let parts: Vec<&str> = alias.split_whitespace().collect(); + if !parts.is_empty() { + let imported_module = parts[0].to_string(); + resolved_set.insert((imported_module, true, 0, Vec::new())); + } + } + } + } + + // Parse from ... import (...) 
+ for cap in py_from_paren_re.captures_iter(&source) { + let from_module = cap.get(1).unwrap().as_str().trim().to_string(); + let names_str = cap.get(2).unwrap().as_str().trim(); + let mut names = Vec::new(); + for name in names_str.split(',') { + let parts: Vec<&str> = name.split_whitespace().collect(); + if !parts.is_empty() && parts[0] != "*" { + names.push(parts[0].to_string()); + } + } + + let mut level = 0; + let mut module = from_module; + while module.starts_with('.') { + level += 1; + module = module[1..].to_string(); + } + + resolved_set.insert((module, false, level, names)); + } + + // Parse from ... import ... (simple) + for cap in py_from_simple_re.captures_iter(&source) { + let from_module = cap.get(1).unwrap().as_str().trim().to_string(); + let names_str = cap.get(2).unwrap().as_str().trim(); + let mut names = Vec::new(); + for name in names_str.split(',') { + let parts: Vec<&str> = name.split_whitespace().collect(); + if !parts.is_empty() && parts[0] != "*" { + names.push(parts[0].to_string()); + } + } + + let mut level = 0; + let mut module = from_module; + while module.starts_with('.') { + level += 1; + module = module[1..].to_string(); + } + + resolved_set.insert((module, false, level, names)); + } + + // Resolve python imports + for (module, is_import, level, names) in resolved_set { + let mut candidates = Vec::new(); + if is_import { + candidates.push(module); + } else { + let base = if level > 0 { + resolve_relative_module(current_module, current_is_init, level, &module) + } else { + module + }; + for name in &names { + if !base.is_empty() { + candidates.push(format!("{}.{}", base, name)); + } else { + candidates.push(name.clone()); + } + } + if !base.is_empty() { + candidates.push(base); + } + } + + for candidate in candidates { + if candidate.is_empty() { + continue; + } + let parts: Vec<&str> = candidate.split('.').collect(); + for end in (1..=parts.len()).rev() { + let mod_name = parts[..end].join("."); + if let Some(target_path) = 
module_to_path.get(&mod_name) { + if target_path != &file.relative { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target_path.clone(), + kind: "import".to_string(), + }); + break; + } + } + } + } + } + } + } else if file.language == "javascript" || file.language == "typescript" { + let parent = Path::new(&file.relative) + .parent() + .unwrap_or(Path::new("")) + .to_string_lossy() + .replace("\\", "/"); + for cap in js_import_re.captures_iter(&source) { + let spec = cap + .get(1) + .or_else(|| cap.get(2)) + .map(|m| m.as_str()) + .unwrap_or(""); + if !spec.starts_with('.') { + continue; + } + + let raw_base = if parent.is_empty() { + spec.to_string() + } else { + format!("{}/{}", parent, spec) + }; + let base_normalized = normalize_posix_path(&raw_base); + + let mut resolved = false; + let extensions = vec!["", ".ts", ".tsx", ".js", ".jsx", ".d.ts"]; + for ext in extensions { + let cand = if ext.is_empty() { + base_normalized.clone() + } else { + format!("{}{}", base_normalized, ext) + }; + if let Some(target) = absolute_to_file.get(&cand) { + if !target.is_binary && target.relative != file.relative { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target.relative.clone(), + kind: "import".to_string(), + }); + resolved = true; + break; + } + } + } + + if !resolved { + let index_names = vec!["index.ts", "index.tsx", "index.js", "index.jsx"]; + for idx in index_names { + let cand = format!("{}/{}", base_normalized, idx); + if let Some(target) = absolute_to_file.get(&cand) { + if !target.is_binary && target.relative != file.relative { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target.relative.clone(), + kind: "import".to_string(), + }); + break; + } + } + } + } + } + } else if file.language == "rust" { + let parent = Path::new(&file.relative) + .parent() + .unwrap_or(Path::new("")) + .to_string_lossy() + .replace("\\", "/"); + let mut mod_specs = Vec::new(); + + for cap in 
rust_mod_re.captures_iter(&source) { + if let Some(m) = cap.get(1) { + mod_specs.push(("mod".to_string(), m.as_str().to_string())); + } + } + + for cap in rust_use_re.captures_iter(&source) { + if let Some(m) = cap.get(1) { + let spec = m.as_str().trim(); + if spec.contains('{') { + if let Some(idx) = spec.find('{') { + let base = spec[..idx].trim(); + let rest = spec[idx + 1..].replace('}', ""); + for part in rest.split(',') { + let part_trimmed = part.trim(); + if !part_trimmed.is_empty() { + mod_specs.push(( + "use".to_string(), + format!("{}{}", base, part_trimmed), + )); + } + } + } + } else { + mod_specs.push(("use".to_string(), spec.to_string())); + } + } + } + + // Resolve rust + for (kind, spec) in mod_specs { + if kind == "mod" { + let cand1 = if parent.is_empty() { + format!("{}.rs", spec) + } else { + format!("{}/{}.rs", parent, spec) + }; + let cand2 = if parent.is_empty() { + format!("{}/mod.rs", spec) + } else { + format!("{}/{}/mod.rs", parent, spec) + }; + for cand in &[cand1, cand2] { + if let Some(target) = absolute_to_file.get(cand) { + if target.relative != file.relative { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target.relative.clone(), + kind: "mod".to_string(), + }); + break; + } + } + } + } else { + let parts: Vec<&str> = spec.split("::").collect(); + if parts.is_empty() { + continue; + } + + let mut crate_root = "".to_string(); + let mut curr = Path::new(&file.relative).parent(); + while let Some(c) = curr { + let c_str = c.to_string_lossy().replace("\\", "/"); + let cargo_toml = if c_str.is_empty() { + "Cargo.toml".to_string() + } else { + format!("{}/Cargo.toml", c_str) + }; + let src_dir = if c_str.is_empty() { + "src".to_string() + } else { + format!("{}/src", c_str) + }; + + let has_cargo = absolute_to_file.contains_key(&cargo_toml); + let has_src = absolute_to_file + .keys() + .any(|k| k.starts_with(&format!("{}/", src_dir)) || *k == src_dir); + + if has_cargo || has_src { + crate_root = if has_src { 
src_dir } else { c_str }; + break; + } + curr = c.parent(); + } + + if crate_root.is_empty() { + crate_root = parent.clone(); + } + + if parts[0] == "crate" || parts[0] == "super" || parts[0] == "self" { + let base_dir = match parts[0] { + "crate" => crate_root, + "super" => Path::new(&parent) + .parent() + .unwrap_or(Path::new("")) + .to_string_lossy() + .replace("\\", "/"), + _ => parent.clone(), + }; + + let sub_parts = &parts[1..]; + if !sub_parts.is_empty() { + let mut resolved = false; + for end in (1..=sub_parts.len()).rev() { + let sub_path = sub_parts[..end].join("/"); + let path_str = if base_dir.is_empty() { + sub_path + } else { + format!("{}/{}", base_dir, sub_path) + }; + let cand1 = format!("{}.rs", path_str); + let cand2 = format!("{}/mod.rs", path_str); + for cand in &[cand1, cand2] { + if let Some(target) = absolute_to_file.get(cand) { + if target.relative != file.relative { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target.relative.clone(), + kind: "use".to_string(), + }); + resolved = true; + break; + } + } + } + if resolved { + break; + } + } + } + } + } + } + } else if file.language == "go" { + let mut specs = Vec::new(); + for cap in go_import_single_re.captures_iter(&source) { + specs.push(cap.get(1).unwrap().as_str().to_string()); + } + for cap in go_import_block_re.captures_iter(&source) { + let block = cap.get(1).unwrap().as_str(); + for line in block.lines() { + let line_trimmed = line.trim(); + if line_trimmed.starts_with("//") { + continue; + } + if let Some(idx) = line_trimmed.find('"') { + let sub = &line_trimmed[idx + 1..]; + if let Some(end) = sub.find('"') { + specs.push(sub[..end].to_string()); + } + } + } + } + + if let Some(ref mod_name) = go_module_name { + for spec in specs { + if spec.starts_with(mod_name) { + let rel_spec = spec[mod_name.len()..].trim_start_matches('/').to_string(); + if let Some(targets) = dir_to_files.get(&rel_spec) { + for target in targets { + if target.ends_with(".go") && 
target != &file.relative { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target.clone(), + kind: "import".to_string(), + }); + } + } + } + } + } + } + } else if file.language == "c" || file.language == "cpp" { + let parent = Path::new(&file.relative) + .parent() + .unwrap_or(Path::new("")) + .to_string_lossy() + .replace("\\", "/"); + for cap in cpp_include_re.captures_iter(&source) { + let spec = cap.get(1).unwrap().as_str(); + let raw_base = if parent.is_empty() { + spec.to_string() + } else { + format!("{}/{}", parent, spec) + }; + let base_normalized = normalize_posix_path(&raw_base); + + if let Some(target) = absolute_to_file.get(&base_normalized) { + if !target.is_binary && target.relative != file.relative { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target.relative.clone(), + kind: "include".to_string(), + }); + } + } else { + for (rel_path, target) in &absolute_to_file { + if target.is_binary { + continue; + } + if (*rel_path == spec || rel_path.ends_with(&format!("/{}", spec))) + && target.relative != file.relative + { + edges.push(NativeImportEdge { + from: file.relative.clone(), + to: target.relative.clone(), + kind: "include".to_string(), + }); + break; + } + } + } + } + } + } + + Ok(edges) +} diff --git a/rust/scriber_native/src/io.rs b/rust/scriber_native/src/io.rs new file mode 100644 index 0000000..2d4508b --- /dev/null +++ b/rust/scriber_native/src/io.rs @@ -0,0 +1,48 @@ +use pyo3::exceptions::PyOSError; +use pyo3::prelude::*; +use std::fs; +use std::path::Path; + +pub fn io_err(context: &str, path: &str, err: std::io::Error) -> PyErr { + PyOSError::new_err(format!("{}: {}: {}", context, path, err)) +} + +pub fn read_text_lossy_native(path: &str) -> PyResult { + let bytes = fs::read(path).map_err(|e| io_err("Failed to read", path, e))?; + Ok(String::from_utf8_lossy(&bytes).into_owned()) +} + +pub fn write_text_native(path: &str, content: &str) -> PyResult<()> { + let p = Path::new(path); + if let 
Some(parent) = p.parent() { + fs::create_dir_all(parent) + .map_err(|e| io_err("Failed to create parent directory", path, e))?; + } + fs::write(path, content).map_err(|e| io_err("Failed to write", path, e)) +} + +pub fn is_binary_native(path: &str) -> PyResult { + Ok(is_binary(Path::new(path))) +} + +pub fn is_binary(path: &Path) -> bool { + use std::fs::File; + use std::io::Read; + let mut file = match File::open(path) { + Ok(f) => f, + Err(_) => return true, + }; + let mut buf = [0u8; 4096]; + let n = match file.read(&mut buf) { + Ok(n) => n, + Err(_) => return true, + }; + memchr::memchr(0, &buf[..n]).is_some() +} + +pub fn read_many_text_native(paths: Vec) -> PyResult> { + paths + .into_iter() + .map(|path| read_text_lossy_native(&path)) + .collect() +} diff --git a/rust/scriber_native/src/lib.rs b/rust/scriber_native/src/lib.rs new file mode 100644 index 0000000..4b854dd --- /dev/null +++ b/rust/scriber_native/src/lib.rs @@ -0,0 +1,92 @@ +use pyo3::prelude::*; + +mod import; +mod io; +mod render; +mod scan; +mod score; + +#[pyfunction] +#[pyo3(name = "read_text")] +fn read_text(path: &str) -> PyResult { + io::read_text_lossy_native(path) +} + +#[pyfunction] +#[pyo3(name = "write_text")] +fn write_text(path: &str, content: &str) -> PyResult<()> { + io::write_text_native(path, content) +} + +#[pyfunction] +#[pyo3(name = "is_probably_binary")] +fn is_probably_binary(path: &str) -> PyResult { + io::is_binary_native(path) +} + +#[pyfunction] +#[pyo3(name = "read_many_text")] +fn read_many_text(paths: Vec) -> PyResult> { + io::read_many_text_native(paths) +} + +#[pyfunction] +#[pyo3(name = "scan_project")] +#[allow(clippy::too_many_arguments)] +fn scan_project( + root_path: &str, + use_gitignore: bool, + hard_ignore_patterns: Vec, + code_patterns: Vec, + support_patterns: Vec, + support_full_patterns: Vec, + support_tree_only_patterns: Vec, + support_default_policy: String, + support_enabled: bool, +) -> PyResult> { + scan::scan_project_native( + root_path, + 
use_gitignore, + hard_ignore_patterns, + code_patterns, + support_patterns, + support_full_patterns, + support_tree_only_patterns, + support_default_policy, + support_enabled, + ) +} + +#[pyfunction] +fn native_api_version() -> u32 { + 1 +} + +#[pyfunction] +fn build_info() -> PyResult { + Ok(format!( + "scriber-native {} {}", + env!("CARGO_PKG_VERSION"), + std::env::consts::OS + )) +} + +#[pymodule] +#[allow(deprecated)] +fn _native(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_function(wrap_pyfunction!(read_text, m)?)?; + m.add_function(wrap_pyfunction!(write_text, m)?)?; + m.add_function(wrap_pyfunction!(is_probably_binary, m)?)?; + m.add_function(wrap_pyfunction!(read_many_text, m)?)?; + m.add_function(wrap_pyfunction!(scan_project, m)?)?; + m.add_function(wrap_pyfunction!(import::build_import_graph, m)?)?; + m.add_function(wrap_pyfunction!(score::score_candidates_native, m)?)?; + m.add_function(wrap_pyfunction!(render::render_tree, m)?)?; + m.add_function(wrap_pyfunction!(native_api_version, m)?)?; + m.add_function(wrap_pyfunction!(build_info, m)?)?; + Ok(()) +} diff --git a/rust/scriber_native/src/render.rs b/rust/scriber_native/src/render.rs new file mode 100644 index 0000000..c3189c6 --- /dev/null +++ b/rust/scriber_native/src/render.rs @@ -0,0 +1,42 @@ +use pyo3::prelude::*; +use std::collections::BTreeMap; + +#[derive(Default)] +struct TreeNode { + children: BTreeMap, +} + +fn walk(node: &TreeNode, prefix: &str) -> Vec { + let mut lines = Vec::new(); + let items: Vec<(&String, &TreeNode)> = node.children.iter().collect(); + for (index, (name, child)) in items.iter().enumerate() { + let is_last = index == items.len() - 1; + let branch = if is_last { "└── " } else { "β”œβ”€β”€ " }; + lines.push(format!("{}{}{}", prefix, branch, name)); + let extension = if is_last { " " } else { "β”‚ " }; + lines.extend(walk(child, &format!("{}{}", prefix, extension))); + } + lines +} + 
+#[pyfunction] +pub fn render_tree(paths: Vec) -> PyResult { + let mut root = TreeNode::default(); + for path_str in paths { + let mut curr = &mut root; + // Support both backslash and forward slash + let clean_path = path_str.replace("\\", "/"); + for part in clean_path.split('/') { + if part.is_empty() || part == "." { + continue; + } + curr = curr.children.entry(part.to_string()).or_default(); + } + } + + if root.children.is_empty() { + Ok(".".to_string()) + } else { + Ok(format!(".\n{}", walk(&root, "").join("\n"))) + } +} diff --git a/rust/scriber_native/src/scan.rs b/rust/scriber_native/src/scan.rs new file mode 100644 index 0000000..ce43103 --- /dev/null +++ b/rust/scriber_native/src/scan.rs @@ -0,0 +1,356 @@ +use globset::GlobBuilder; +use ignore::WalkBuilder; +use pyo3::prelude::*; +use std::path::Path; + +#[pyclass] +#[derive(Clone)] +pub struct NativeFileInfo { + #[pyo3(get)] + pub relative: String, + #[pyo3(get)] + pub kind: String, + #[pyo3(get)] + pub language: String, + #[pyo3(get)] + pub size_bytes: u64, + #[pyo3(get)] + pub is_binary: bool, + #[pyo3(get)] + pub support_category: Option, + #[pyo3(get)] + pub content_policy: String, + #[pyo3(get)] + pub mtime_ns: u64, +} + +#[derive(Clone)] +pub struct PreparedPattern { + pub normalized_pat: String, + pub prefix_star_star: Option, + pub matcher: globset::GlobMatcher, + pub double_star_short_matcher: Option, +} + +#[derive(Clone)] +pub struct PathMatcher { + patterns: Vec, +} + +impl PathMatcher { + pub fn new(raw_patterns: &[String]) -> Self { + let mut patterns = Vec::new(); + for raw in raw_patterns { + let mut pat = raw.replace("\\", "/").trim().to_string(); + if pat.is_empty() { + continue; + } + if pat.starts_with('/') { + pat = pat[1..].to_string(); + } + if pat.ends_with('/') { + pat = pat[..pat.len() - 1].to_string(); + } + + let mut prefix_star_star = None; + if pat.ends_with("/**") { + let prefix = pat[..pat.len() - 3].trim_matches('/').to_string(); + prefix_star_star = Some(prefix); + } + 
+ let mut double_star_short_glob = None; + if let Some(short) = pat.strip_prefix("**/") { + if let Ok(g) = GlobBuilder::new(short).literal_separator(false).build() { + double_star_short_glob = Some(g); + } + } + + if let Ok(g) = GlobBuilder::new(&pat).literal_separator(false).build() { + let matcher = g.compile_matcher(); + let double_star_short_matcher = double_star_short_glob + .as_ref() + .map(|d_g| d_g.compile_matcher()); + patterns.push(PreparedPattern { + normalized_pat: pat, + prefix_star_star, + matcher, + double_star_short_matcher, + }); + } + } + PathMatcher { patterns } + } + + pub fn matches(&self, rel_path: &str) -> bool { + if self.patterns.is_empty() { + return false; + } + let rel = rel_path.replace("\\", "/").trim_matches('/').to_string(); + for p in &self.patterns { + if rel == p.normalized_pat { + return true; + } + if let Some(ref prefix) = p.prefix_star_star { + if rel == *prefix || rel.starts_with(&format!("{}/", prefix)) { + return true; + } + } + if p.matcher.is_match(&rel) { + return true; + } + if !p.normalized_pat.contains('/') { + if let Some(filename) = rel.rsplit('/').next() { + if p.matcher.is_match(filename) { + return true; + } + } + } + if let Some(ref short_matcher) = p.double_star_short_matcher { + if short_matcher.is_match(&rel) { + return true; + } + if let Some(filename) = rel.rsplit('/').next() { + if short_matcher.is_match(filename) { + return true; + } + } + } + } + false + } +} + +fn to_posix_string(path: &Path) -> String { + path.to_string_lossy().replace("\\", "/") +} + +fn language_for(name: &str) -> String { + if name.starts_with("Dockerfile") { + return "dockerfile".to_string(); + } + let suffix = match name.rfind('.') { + Some(idx) => &name[idx..], + None => "", + }; + let lang = match suffix.to_lowercase().as_str() { + ".py" | ".pyi" => "python", + ".rs" => "rust", + ".js" | ".jsx" => "javascript", + ".ts" | ".tsx" => "typescript", + ".go" => "go", + ".java" => "java", + ".kt" => "kotlin", + ".c" | ".h" => "c", + 
".cpp" | ".hpp" | ".cc" | ".cxx" | ".hh" | ".hxx" => "cpp", + ".toml" => "toml", + ".yaml" | ".yml" => "yaml", + ".json" => "json", + ".md" => "markdown", + ".rst" => "rst", + ".txt" => "text", + ".ini" | ".cfg" => "ini", + ".lock" => "lock", + _ => "text", + }; + lang.to_string() +} + +fn support_category(rel_s: &str, name: &str) -> String { + if name == "pyproject.toml" + || name.ends_with(".toml") + || name == "setup.py" + || name == "setup.cfg" + || name == "tox.ini" + || name == "pytest.ini" + || name == "mypy.ini" + || name == "ruff.toml" + || name == ".ruff.toml" + { + return "project config".to_string(); + } + if name.ends_with(".lock") + || name == "requirements.txt" + || name == "poetry.lock" + || name == "uv.lock" + || name == "Pipfile" + || name == "Pipfile.lock" + || name == "package.json" + || name == "package-lock.json" + || name == "pnpm-lock.yaml" + || name == "yarn.lock" + || name == "Cargo.toml" + || name == "Cargo.lock" + || name == "go.mod" + || name == "go.sum" + || rel_s.starts_with("requirements/") + { + return "dependency file".to_string(); + } + if name.starts_with("README") + || name == "CHANGELOG.md" + || name == "CONTRIBUTING.md" + || rel_s.starts_with("docs/") + { + return "documentation".to_string(); + } + if name.starts_with("Dockerfile") + || name.starts_with("docker-compose") + || name.starts_with("compose") + { + return "runtime support".to_string(); + } + if rel_s.starts_with(".github/workflows/") || name == ".gitlab-ci.yml" { + return "ci support".to_string(); + } + if name.starts_with(".env") || rel_s.starts_with("config/") || rel_s.starts_with("settings/") { + return "runtime config".to_string(); + } + if name == ".pre-commit-config.yaml" + || name == "tsconfig.json" + || name.starts_with("vite.config") + || name.starts_with("webpack.config") + { + return "tooling config".to_string(); + } + "support file".to_string() +} + +#[allow(clippy::too_many_arguments)] +pub fn scan_project_native( + root_path: &str, + use_gitignore: 
bool, + hard_ignore_patterns: Vec, + code_patterns: Vec, + support_patterns: Vec, + support_full_patterns: Vec, + support_tree_only_patterns: Vec, + support_default_policy: String, + support_enabled: bool, +) -> PyResult> { + let root = Path::new(root_path); + let hard_ignore_matcher = PathMatcher::new(&hard_ignore_patterns); + let code_matcher = PathMatcher::new(&code_patterns); + let support_matcher = PathMatcher::new(&support_patterns); + let support_tree_only_matcher = PathMatcher::new(&support_tree_only_patterns); + let support_full_matcher = PathMatcher::new(&support_full_patterns); + + let mut builder = WalkBuilder::new(root); + builder.standard_filters(use_gitignore); + builder.hidden(false); + + let hard_ignore_matcher_clone = hard_ignore_matcher.clone(); + let root_clone = root.to_path_buf(); + builder.filter_entry(move |entry| { + if let Ok(rel) = entry.path().strip_prefix(&root_clone) { + let rel_s = to_posix_string(rel); + if rel_s != "." && !rel_s.is_empty() && hard_ignore_matcher_clone.matches(&rel_s) { + return false; + } + } + true + }); + + let mut file_infos = Vec::new(); + + for result in builder.build() { + let entry = match result { + Ok(e) => e, + Err(_) => continue, + }; + + if !entry.file_type().is_some_and(|ft| ft.is_file()) { + continue; + } + + let path = entry.path(); + let rel = match path.strip_prefix(root) { + Ok(r) => r, + Err(_) => continue, + }; + let rel_s = to_posix_string(rel); + + if rel_s.is_empty() { + continue; + } + + if hard_ignore_matcher.matches(&rel_s) { + continue; + } + + let kind; + let mut category = None; + let mut policy = "auto".to_string(); + + if code_matcher.matches(&rel_s) { + kind = "code"; + } else if support_enabled && support_matcher.matches(&rel_s) { + kind = "support"; + let name = path.file_name().and_then(|n| n.to_str()).unwrap_or(""); + category = Some(support_category(&rel_s, name)); + if support_tree_only_matcher.matches(&rel_s) { + policy = "tree_only".to_string(); + } else if 
support_full_matcher.matches(&rel_s) { + policy = "full".to_string(); + } else { + policy = support_default_policy.clone(); + } + } else { + continue; + } + + let metadata = match entry.metadata() { + Ok(m) => m, + Err(_) => continue, + }; + let size_bytes = metadata.len(); + + let mtime_ns = match metadata.modified() { + Ok(t) => t + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .map_or(0, |d| d.as_nanos() as u64), + Err(_) => 0, + }; + + let is_binary = crate::io::is_binary(path); + + file_infos.push(NativeFileInfo { + relative: rel_s, + kind: kind.to_string(), + language: language_for(path.file_name().and_then(|n| n.to_str()).unwrap_or("")), + size_bytes, + is_binary, + support_category: category, + content_policy: policy, + mtime_ns, + }); + } + + Ok(file_infos) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matches_double_star_suffix() { + let matcher = PathMatcher::new(&["**/*.py".to_string()]); + assert!(matcher.matches("src/main.py")); + assert!(matcher.matches("main.py")); + assert!(!matcher.matches("src/main.rs")); + } + + #[test] + fn matches_dir_prefix() { + let matcher = PathMatcher::new(&["target/**".to_string()]); + assert!(matcher.matches("target/debug/x")); + assert!(!matcher.matches("src/target.rs")); + } + + #[test] + fn matches_basename() { + let matcher = PathMatcher::new(&["Cargo.toml".to_string()]); + assert!(matcher.matches("Cargo.toml")); + assert!(matcher.matches("crates/a/Cargo.toml")); + } +} diff --git a/rust/scriber_native/src/score.rs b/rust/scriber_native/src/score.rs new file mode 100644 index 0000000..a4efbff --- /dev/null +++ b/rust/scriber_native/src/score.rs @@ -0,0 +1,817 @@ +use crate::import::NativeImportEdge; +use crate::scan::NativeFileInfo; +use pyo3::prelude::*; +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +#[pyclass] +#[derive(Clone, Debug)] +pub struct NativeCandidate { + #[pyo3(get)] + pub path: String, + #[pyo3(get)] + pub kind: String, + #[pyo3(get, set)] + pub score: 
i32, + #[pyo3(get, set)] + pub reasons: Vec, + #[pyo3(get, set)] + pub reason_summary: String, + #[pyo3(get, set)] + pub include_content: bool, + #[pyo3(get, set)] + pub omitted_reason: Option, +} + +#[pyclass] +#[derive(Clone, Debug)] +pub struct NativePackOptions { + #[pyo3(get, set)] + pub mode: String, + #[pyo3(get, set)] + pub max_files: usize, + #[pyo3(get, set)] + pub min_score: i32, + #[pyo3(get, set)] + pub tree_min_score: i32, + + // Config scoring values + #[pyo3(get, set)] + pub seed_file_score: i32, + #[pyo3(get, set)] + pub seed_folder_file_score: i32, + #[pyo3(get, set)] + pub direct_dependency_score: i32, + #[pyo3(get, set)] + pub reverse_dependency_score: i32, + #[pyo3(get, set)] + pub same_package_score: i32, + #[pyo3(get, set)] + pub parent_entrypoint_score: i32, + #[pyo3(get, set)] + pub related_test_score: i32, + #[pyo3(get, set)] + pub name_similarity_score: i32, + #[pyo3(get, set)] + pub support_near_seed_score: i32, + #[pyo3(get, set)] + pub project_config_score: i32, + #[pyo3(get, set)] + pub dependency_file_score: i32, + #[pyo3(get, set)] + pub runtime_support_score: i32, + #[pyo3(get, set)] + pub documentation_score: i32, + #[pyo3(get, set)] + pub shared_dependency_bonus: i32, + + // Module flags + #[pyo3(get, set)] + pub modules_enabled: bool, + #[pyo3(get, set)] + pub include_direct_dependencies: bool, + #[pyo3(get, set)] + pub include_reverse_dependencies: bool, + #[pyo3(get, set)] + pub include_same_package: bool, + #[pyo3(get, set)] + pub include_parent_entrypoints: bool, + #[pyo3(get, set)] + pub include_tests: bool, + #[pyo3(get, set)] + pub include_project_configs: bool, + #[pyo3(get, set)] + pub depth: usize, + + // Support file scanning + #[pyo3(get, set)] + pub support_enabled: bool, + + // Python module info + #[pyo3(get, set)] + pub entrypoint_patterns: Vec, + #[pyo3(get, set)] + pub test_roots: Vec, +} + +#[pymethods] +impl NativePackOptions { + #[new] + #[pyo3(signature = ( + mode = "focused".to_string(), + max_files = 0, + 
min_score = 0, + tree_min_score = 0, + seed_file_score = 100, + seed_folder_file_score = 90, + direct_dependency_score = 80, + reverse_dependency_score = 70, + same_package_score = 75, + parent_entrypoint_score = 70, + related_test_score = 85, + name_similarity_score = 65, + support_near_seed_score = 50, + project_config_score = 70, + dependency_file_score = 60, + runtime_support_score = 50, + documentation_score = 45, + shared_dependency_bonus = 10, + modules_enabled = true, + include_direct_dependencies = true, + include_reverse_dependencies = true, + include_same_package = true, + include_parent_entrypoints = true, + include_tests = true, + include_project_configs = true, + depth = 2, + support_enabled = true, + entrypoint_patterns = Vec::new(), + test_roots = Vec::new(), + ))] + #[allow(clippy::too_many_arguments)] + fn new( + mode: String, + max_files: usize, + min_score: i32, + tree_min_score: i32, + seed_file_score: i32, + seed_folder_file_score: i32, + direct_dependency_score: i32, + reverse_dependency_score: i32, + same_package_score: i32, + parent_entrypoint_score: i32, + related_test_score: i32, + name_similarity_score: i32, + support_near_seed_score: i32, + project_config_score: i32, + dependency_file_score: i32, + runtime_support_score: i32, + documentation_score: i32, + shared_dependency_bonus: i32, + modules_enabled: bool, + include_direct_dependencies: bool, + include_reverse_dependencies: bool, + include_same_package: bool, + include_parent_entrypoints: bool, + include_tests: bool, + include_project_configs: bool, + depth: usize, + support_enabled: bool, + entrypoint_patterns: Vec, + test_roots: Vec, + ) -> Self { + NativePackOptions { + mode, + max_files, + min_score, + tree_min_score, + seed_file_score, + seed_folder_file_score, + direct_dependency_score, + reverse_dependency_score, + same_package_score, + parent_entrypoint_score, + related_test_score, + name_similarity_score, + support_near_seed_score, + project_config_score, + 
dependency_file_score, + runtime_support_score, + documentation_score, + shared_dependency_bonus, + modules_enabled, + include_direct_dependencies, + include_reverse_dependencies, + include_same_package, + include_parent_entrypoints, + include_tests, + include_project_configs, + depth, + support_enabled, + entrypoint_patterns, + test_roots, + } + } +} + +// Internal Candidate builder struct to aggregate reasons +struct ScoringCandidate { + info: NativeFileInfo, + score: i32, + reasons: Vec, + reason_counts: HashMap, + reason_examples: HashMap>, + seed_sources: HashSet, +} + +fn add_reason(c: &mut ScoringCandidate, kind: &str, label: &str, example: Option<&str>) { + *c.reason_counts.entry(kind.to_string()).or_default() += 1; + if let Some(ex) = example { + let examples = c.reason_examples.entry(kind.to_string()).or_default(); + if !examples.contains(&ex.to_string()) { + examples.push(ex.to_string()); + } + } + if !c.reasons.contains(&label.to_string()) { + c.reasons.push(label.to_string()); + } +} + +fn build_reason_summary(c: &ScoringCandidate) -> String { + let mut parts = Vec::new(); + let order = vec![ + "seed_file", + "seed_folder_file", + "direct_dependency", + "reverse_dependency", + "related_test", + "same_package", + "parent_entrypoint", + "name_similarity", + "support_near_seed", + "project_support", + "shared_dependency", + "entrypoint", + "test_file", + "code_file", + "other_file", + ]; + + for kind in order { + if let Some(&count) = c.reason_counts.get(kind) { + let examples = c.reason_examples.get(kind); + if kind == "seed_file" { + parts.push("seed file".to_string()); + } else if kind == "seed_folder_file" { + parts.push("seed folder file".to_string()); + } else if kind == "direct_dependency" { + if count > 1 { + parts.push(format!("imports {} included files", count)); + } else if let Some(exs) = examples { + if !exs.is_empty() { + let filename = Path::new(&exs[0]) + .file_name() + .unwrap_or(std::ffi::OsStr::new("")) + .to_string_lossy(); + 
parts.push(format!("imports {}", filename)); + } else { + parts.push("imports seed".to_string()); + } + } else { + parts.push("imports seed".to_string()); + } + } else if kind == "reverse_dependency" { + if count > 1 { + parts.push(format!("imported by {} included files", count)); + } else if let Some(exs) = examples { + if !exs.is_empty() { + let filename = Path::new(&exs[0]) + .file_name() + .unwrap_or(std::ffi::OsStr::new("")) + .to_string_lossy(); + parts.push(format!("imported by {}", filename)); + } else { + parts.push("imported by seed".to_string()); + } + } else { + parts.push("imported by seed".to_string()); + } + } else if kind == "related_test" { + parts.push("related test".to_string()); + } else if kind == "same_package" { + parts.push("same package".to_string()); + } else if kind == "parent_entrypoint" { + parts.push("parent entrypoint".to_string()); + } else if kind == "name_similarity" { + parts.push("name similarity".to_string()); + } else if kind == "support_near_seed" { + parts.push("support file".to_string()); + } else if kind == "project_support" { + parts.push("project support file".to_string()); + } else if kind == "shared_dependency" { + parts.push("shared dependency bonus".to_string()); + } else if kind == "entrypoint" { + parts.push("entrypoint file".to_string()); + } else if kind == "test_file" { + parts.push("test file".to_string()); + } else if kind == "code_file" { + parts.push("code file".to_string()); + } else if kind == "other_file" { + parts.push("other file".to_string()); + } + } + } + parts.join("; ") +} + +fn is_test_file(rel: &str, test_roots: &[String]) -> bool { + let p = Path::new(rel); + let name = p + .file_name() + .unwrap_or(std::ffi::OsStr::new("")) + .to_string_lossy() + .to_lowercase(); + for part in p.components().filter_map(|c| c.as_os_str().to_str()) { + if test_roots.contains(&part.to_string()) { + return true; + } + } + name.starts_with("test_") || name.ends_with("_test.py") || name.ends_with(".test.py") +} + +fn 
name_related(a: &str, b: &str) -> bool { + let a_stem = Path::new(a) + .file_stem() + .unwrap_or(std::ffi::OsStr::new("")) + .to_string_lossy() + .to_lowercase() + .replace("test_", "") + .replace("_test", ""); + let b_stem = Path::new(b) + .file_stem() + .unwrap_or(std::ffi::OsStr::new("")) + .to_string_lossy() + .to_lowercase() + .replace("test_", "") + .replace("_test", ""); + if a_stem.is_empty() || b_stem.is_empty() { + return false; + } + a_stem.contains(&b_stem) || b_stem.contains(&a_stem) +} + +fn is_near_seed(support_file: &str, seed: &str) -> bool { + let sf_parent = Path::new(support_file).parent().unwrap_or(Path::new("")); + if sf_parent == Path::new("") { + return true; + } + let seed_parent = Path::new(seed).parent().unwrap_or(Path::new("")); + sf_parent == seed_parent + || sf_parent.starts_with(seed_parent) + || seed_parent.starts_with(sf_parent) +} + +fn walk_neighbors( + edges: &HashMap>, + start: &str, + depth: usize, +) -> HashMap { + let mut found = HashMap::new(); + let mut frontier = HashSet::new(); + frontier.insert(start.to_string()); + let mut visited = HashSet::new(); + visited.insert(start.to_string()); + + for distance in 1..=depth { + let mut next_frontier = HashSet::new(); + for item in frontier { + if let Some(neighbors) = edges.get(&item) { + for neighbor in neighbors { + if visited.contains(neighbor) { + continue; + } + visited.insert(neighbor.clone()); + found.insert(neighbor.clone(), distance); + next_frontier.insert(neighbor.clone()); + } + } + } + frontier = next_frontier; + if frontier.is_empty() { + break; + } + } + found +} + +fn support_base_score(file: &NativeFileInfo, options: &NativePackOptions) -> i32 { + let cat = file.support_category.as_deref().unwrap_or("support file"); + match cat { + "project config" => options.project_config_score, + "dependency file" => options.dependency_file_score, + "runtime support" | "runtime config" | "ci support" | "tooling config" => { + options.runtime_support_score + } + "documentation" 
=> options.documentation_score, + _ => options.documentation_score, + } +} + +fn matches_entrypoint(rel: &str, entrypoint_patterns: &[String]) -> bool { + let name = Path::new(rel) + .file_name() + .unwrap_or(std::ffi::OsStr::new("")) + .to_string_lossy() + .to_string(); + // Simple glob matcher for entrypoints + for pat in entrypoint_patterns { + let pat_clean = pat.replace("*", ""); + if pat.starts_with('*') && pat.ends_with('*') { + if name.contains(&pat_clean) { + return true; + } + } else if pat.starts_with('*') { + if name.ends_with(&pat_clean) { + return true; + } + } else if pat.ends_with('*') { + if name.starts_with(&pat_clean) { + return true; + } + } else if name == *pat { + return true; + } + } + false +} + +#[pyfunction] +pub fn score_candidates_native( + files: Vec, + seeds_list: Vec, + edges: Vec, + options: NativePackOptions, +) -> PyResult> { + let mut mapped_files = HashMap::new(); + for f in files { + mapped_files.insert( + f.relative.clone(), + ScoringCandidate { + info: f.clone(), + score: 0, + reasons: Vec::new(), + reason_counts: HashMap::new(), + reason_examples: HashMap::new(), + seed_sources: HashSet::new(), + }, + ); + } + + // Build graph edges maps + let mut graph_imports: HashMap> = HashMap::new(); + let mut graph_imported_by: HashMap> = HashMap::new(); + for edge in edges { + graph_imports + .entry(edge.from.clone()) + .or_default() + .insert(edge.to.clone()); + graph_imported_by + .entry(edge.to.clone()) + .or_default() + .insert(edge.from.clone()); + } + + if options.mode == "project_snapshot" { + for (rel, c) in &mut mapped_files { + if c.info.kind == "code" { + if matches_entrypoint(rel, &options.entrypoint_patterns) { + c.score = 90; + add_reason(c, "entrypoint", "entrypoint file", None); + } else if is_test_file(rel, &options.test_roots) { + c.score = 60; + add_reason(c, "test_file", "test file", None); + } else { + c.score = 80; + add_reason(c, "code_file", "code file", None); + } + } else if c.info.kind == "support" && 
options.support_enabled { + let base = support_base_score(&c.info, &options); + let cat = c + .info + .support_category + .clone() + .unwrap_or("support file".to_string()); + c.score = base; + add_reason(c, "project_support", &cat, None); + } + } + } else { + // Focused mode scoring + let mut seed_files = Vec::new(); + for s in &seeds_list { + // Find all files matching or under seed paths + for rel in mapped_files.keys() { + if rel == s || rel.starts_with(&format!("{}/", s)) { + seed_files.push(rel.clone()); + } + } + } + let seed_set: HashSet = seed_files.iter().cloned().collect(); + + // 1. Seed paths scores + for s in &seeds_list { + for rel in &seed_files { + if rel == s || rel.starts_with(&format!("{}/", s)) { + let is_dir = rel != s; + let (score, key, reason) = if is_dir { + ( + options.seed_folder_file_score, + "seed_folder_file", + format!("file inside seed folder `{}`", s), + ) + } else { + ( + options.seed_file_score, + "seed_file", + "seed file".to_string(), + ) + }; + if let Some(c) = mapped_files.get_mut(rel) { + c.score = std::cmp::max(c.score, score); + let r_clone = rel.clone(); + add_reason(c, key, &reason, Some(&r_clone)); + c.seed_sources.insert(r_clone); + } + } + } + } + + // 2. 
Dependencies / Related files scores + if options.modules_enabled { + for seed_rel in &seed_files { + // Direct dependencies + if options.include_direct_dependencies { + for (dep, distance) in walk_neighbors(&graph_imports, seed_rel, options.depth) { + let score = std::cmp::max( + options.tree_min_score, + options.direct_dependency_score - ((distance as i32 - 1) * 10), + ); + if let Some(c) = mapped_files.get_mut(&dep) { + c.score = std::cmp::max(c.score, score); + add_reason( + c, + "direct_dependency", + &format!("direct dependency of `{}`", seed_rel), + Some(seed_rel), + ); + c.seed_sources.insert(seed_rel.clone()); + } + } + } + + // Reverse dependencies + if options.include_reverse_dependencies { + for (dep, distance) in + walk_neighbors(&graph_imported_by, seed_rel, options.depth) + { + let score = std::cmp::max( + options.tree_min_score, + options.reverse_dependency_score - ((distance as i32 - 1) * 10), + ); + if let Some(c) = mapped_files.get_mut(&dep) { + c.score = std::cmp::max(c.score, score); + add_reason( + c, + "reverse_dependency", + &format!("imports seed `{}`", seed_rel), + Some(seed_rel), + ); + c.seed_sources.insert(seed_rel.clone()); + } + } + } + + // Same package + if options.include_same_package { + let seed_parent = Path::new(seed_rel) + .parent() + .unwrap_or(Path::new("")) + .to_string_lossy() + .to_string(); + for (rel, c) in &mut mapped_files { + if c.info.kind == "code" && !seed_set.contains(rel) { + let rel_parent = Path::new(rel) + .parent() + .unwrap_or(Path::new("")) + .to_string_lossy() + .to_string(); + if rel_parent == seed_parent { + c.score = std::cmp::max(c.score, options.same_package_score); + add_reason( + c, + "same_package", + &format!("same package as `{}`", seed_rel), + Some(seed_rel), + ); + c.seed_sources.insert(seed_rel.clone()); + } + } + } + } + + // Parent entrypoints + if options.include_parent_entrypoints { + for (rel, c) in &mut mapped_files { + if c.info.kind == "code" + && matches_entrypoint(rel, 
&options.entrypoint_patterns) + { + let rel_p = Path::new(rel); + let seed_p = Path::new(seed_rel); + let is_parent = rel_p.parent() == Some(Path::new("")) + || seed_p.starts_with(rel_p.parent().unwrap()) + || rel_p.starts_with(seed_p.parent().unwrap()); + if is_parent { + c.score = std::cmp::max(c.score, options.parent_entrypoint_score); + add_reason( + c, + "parent_entrypoint", + &format!("parent/entrypoint near `{}`", seed_rel), + Some(seed_rel), + ); + c.seed_sources.insert(seed_rel.clone()); + } + } + } + } + + // Related tests + if options.include_tests { + for (rel, c) in &mut mapped_files { + if c.info.kind == "code" && is_test_file(rel, &options.test_roots) { + let matches_name = name_related(rel, seed_rel); + let is_dep = graph_imports + .get(rel) + .is_some_and(|deps| deps.contains(seed_rel)); + if matches_name || is_dep { + c.score = std::cmp::max(c.score, options.related_test_score); + add_reason( + c, + "related_test", + &format!("related test for `{}`", seed_rel), + Some(seed_rel), + ); + c.seed_sources.insert(seed_rel.clone()); + } + } + } + } + + // Name similarity + for (rel, c) in &mut mapped_files { + if c.info.kind == "code" + && !seed_set.contains(rel) + && name_related(rel, seed_rel) + { + c.score = std::cmp::max(c.score, options.name_similarity_score); + add_reason( + c, + "name_similarity", + &format!("name similarity with `{}`", seed_rel), + Some(seed_rel), + ); + c.seed_sources.insert(seed_rel.clone()); + } + } + } + + // Support files + if options.support_enabled { + for (rel, c) in &mut mapped_files { + if c.info.kind == "support" { + let base = support_base_score(&c.info, &options); + let cat = c + .info + .support_category + .clone() + .unwrap_or("support file".to_string()); + if rel == "pyproject.toml" { + c.score = std::cmp::max(c.score, options.project_config_score); + add_reason(c, "project_support", "project config/root file", None); + continue; + } + + let mut added = false; + for seed_rel in &seed_files { + if is_near_seed(rel, 
seed_rel) { + c.score = std::cmp::max( + c.score, + std::cmp::max(base, options.support_near_seed_score), + ); + add_reason( + c, + "support_near_seed", + &format!("{} near `{}`", cat, seed_rel), + Some(seed_rel), + ); + c.seed_sources.insert(seed_rel.clone()); + added = true; + } + } + + if !added + && Path::new(rel).parent() == Some(Path::new("")) + && options.include_project_configs + { + c.score = std::cmp::max(c.score, base); + add_reason(c, "project_support", &cat, None); + } + } + } + } + } else { + // Modules disabled fallback + if options.support_enabled { + if let Some(c) = mapped_files.get_mut("pyproject.toml") { + c.score = std::cmp::max(c.score, options.project_config_score); + add_reason(c, "project_support", "project config/root file", None); + } + } + } + + // Shared dependency bonus + for c in mapped_files.values_mut() { + if c.seed_sources.len() > 1 { + c.score = std::cmp::min(100, c.score + options.shared_dependency_bonus); + add_reason( + c, + "shared_dependency", + "shared by multiple seed paths", + None, + ); + } + } + } + + // Build summaries & NativeCandidate objects + let mut candidates = Vec::new(); + let seed_set: HashSet = seeds_list.iter().cloned().collect(); + + for c in mapped_files.values() { + let is_seed = seed_set.contains(&c.info.relative) + || seeds_list + .iter() + .any(|s| c.info.relative.starts_with(&format!("{}/", s))); + let is_valid_score = c.score >= options.min_score || c.score >= options.tree_min_score; + if is_seed || is_valid_score { + let summary = build_reason_summary(c); + candidates.push(NativeCandidate { + path: c.info.relative.clone(), + kind: c.info.kind.clone(), + score: c.score, + reasons: c.reasons.clone(), + reason_summary: summary, + include_content: true, + omitted_reason: None, + }); + } + } + + // Sort by score desc, kind desc (code first), relative path asc + candidates.sort_by(|a, b| { + let score_cmp = b.score.cmp(&a.score); + if score_cmp != std::cmp::Ordering::Equal { + return score_cmp; + } + let 
kind_cmp = (b.kind == "code").cmp(&(a.kind == "code")); + if kind_cmp != std::cmp::Ordering::Equal { + return kind_cmp; + } + a.path.cmp(&b.path) + }); + + // Enforce max files limit + if options.max_files > 0 && candidates.len() > options.max_files { + let is_snap = options.mode == "project_snapshot"; + + let mut seeds_first = Vec::new(); + let mut rest = Vec::new(); + for cand in candidates { + let belongs_in_seeds = if is_snap { + cand.path == "pyproject.toml" || cand.path == "README.md" + } else { + let is_seed = seed_set.contains(&cand.path) + || seeds_list + .iter() + .any(|s| cand.path.starts_with(&format!("{}/", s))); + is_seed || cand.path == "pyproject.toml" || cand.path == "README.md" + }; + if belongs_in_seeds { + seeds_first.push(cand); + } else { + rest.push(cand); + } + } + let remaining = if options.max_files > seeds_first.len() { + options.max_files - seeds_first.len() + } else { + 0 + }; + seeds_first.extend(rest.into_iter().take(remaining)); + candidates = seeds_first; + + // Resort final list + candidates.sort_by(|a, b| { + let score_cmp = b.score.cmp(&a.score); + if score_cmp != std::cmp::Ordering::Equal { + return score_cmp; + } + let kind_cmp = (b.kind == "code").cmp(&(a.kind == "code")); + if kind_cmp != std::cmp::Ordering::Equal { + return kind_cmp; + } + a.path.cmp(&b.path) + }); + } + + Ok(candidates) +} diff --git a/src/run.py b/src/run.py deleted file mode 100644 index 25db586..0000000 --- a/src/run.py +++ /dev/null @@ -1,6 +0,0 @@ -import os -from scriber.cli import main - -os.environ['SCRIBER_EXEC_MODE'] = 'RUN_PY' - -main() \ No newline at end of file diff --git a/src/scriber/__init__.py b/src/scriber/__init__.py index 3c21a13..1aef752 100644 --- a/src/scriber/__init__.py +++ b/src/scriber/__init__.py @@ -1,10 +1,8 @@ -""" -ProjectScriber: A tool for mapping and compiling project source code. +"""ProjectScriber 2.0.""" -This package provides the core functionality and command-line interface for -ProjectScriber. 
The main `Scriber` class can be imported directly for -programmatic use. -""" -from .core import Scriber, ScriberConfig +from .packer.pack import build_pack, build_and_write_pack +from .core.models import ScriberPack -__all__ = ["Scriber", "ScriberConfig"] \ No newline at end of file +__all__ = ["build_pack", "build_and_write_pack", "ScriberPack"] + +__version__ = "2.0.0" diff --git a/src/scriber/__main__.py b/src/scriber/__main__.py new file mode 100644 index 0000000..89b4af8 --- /dev/null +++ b/src/scriber/__main__.py @@ -0,0 +1,4 @@ +from .cli.main import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/scriber/cache.py b/src/scriber/cache.py new file mode 100644 index 0000000..f96f0fc --- /dev/null +++ b/src/scriber/cache.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +import os +import sys +import json +import hashlib +from pathlib import Path +from typing import Any, TYPE_CHECKING + +if TYPE_CHECKING: + from scriber.core.models import ScriberConfig + + +def get_config_hash(config: ScriberConfig) -> str: + from scriber import __version__ + data = { + "code_patterns": config.code_patterns, + "support_patterns": config.support_patterns, + "hard_ignore_patterns": config.hard_ignore_patterns, + "support": config.support, + "support_content_default": config.support_content.default, + "support_content_full": config.support_content.full, + "support_content_tree_only": config.support_content.tree_only, + "support_content_auto_max_bytes": config.support_content.auto_max_bytes, + "use_gitignore": config.use_gitignore, + "python_source_roots": config.python.source_roots, + "python_module_init_files": config.python.module_init_files, + "scriber_version": __version__, + "native_scanner_version": 1, + } + dump = json.dumps(data, sort_keys=True) + return hashlib.sha256(dump.encode("utf-8")).hexdigest() + + +class ScriberCache: + def __init__(self, config: ScriberConfig, project_root: Path): + self.enabled = config.cache.enabled + 
self.cache_dir = project_root / config.cache.dir + self.files_cache_path = self.cache_dir / "files.json" + self.graph_cache_path = self.cache_dir / "import_graph.json" + self.config_hash = get_config_hash(config) + self.python_version = f"{sys.version_info.major}.{sys.version_info.minor}" + + self.files_data: dict[str, dict[str, Any]] = {} + self.graph_data: dict[str, list[str]] = {} + self._load() + + def _load(self) -> None: + if not self.enabled: + return + + try: + if self.files_cache_path.exists(): + with self.files_cache_path.open("r", encoding="utf-8") as f: + self.files_data = json.load(f) + if self.graph_cache_path.exists(): + with self.graph_cache_path.open("r", encoding="utf-8") as f: + self.graph_data = json.load(f) + except Exception: + # Silently fallback to empty cache on read errors + self.files_data = {} + self.graph_data = {} + + def get_file(self, rel_path: Path, mtime_ns: int, size: int) -> dict[str, Any] | None: + if not self.enabled: + return None + + key = rel_path.as_posix() + entry = self.files_data.get(key) + if entry is None: + return None + + if (entry.get("mtime_ns") == mtime_ns and + entry.get("size") == size and + entry.get("python_version") == self.python_version and + entry.get("config_hash") == self.config_hash): + return entry.get("data") + return None + + def set_file(self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any]) -> None: + if not self.enabled: + return + key = rel_path.as_posix() + self.files_data[key] = { + "mtime_ns": mtime_ns, + "size": size, + "python_version": self.python_version, + "config_hash": self.config_hash, + "data": data + } + + def get_imports(self, rel_path: Path) -> set[Path] | None: + if not self.enabled: + return None + key = rel_path.as_posix() + imports = self.graph_data.get(key) + if imports is not None: + return {Path(p) for p in imports} + return None + + def set_imports(self, rel_path: Path, imports: set[Path]) -> None: + if not self.enabled: + return + key = rel_path.as_posix() + 
self.graph_data[key] = [p.as_posix() for p in sorted(imports)] + + def save(self, active_files: set[Path] | None = None) -> None: + if not self.enabled: + return + + try: + self.cache_dir.mkdir(parents=True, exist_ok=True) + + # Simple cleanup mechanism: + # 1. Prune stale cache entries (entries for files no longer in active_files) + if active_files is not None: + active_keys = {p.as_posix() for p in active_files} + self.files_data = {k: v for k, v in self.files_data.items() if k in active_keys} + self.graph_data = {k: v for k, v in self.graph_data.items() if k in active_keys} + + # 2. Enforce absolute limit of max 1000 entries to prevent infinite growth + if len(self.files_data) > 1000: + # Remove oldest keys + sorted_keys = sorted(self.files_data.keys(), key=lambda k: self.files_data[k].get("mtime_ns", 0)) + to_remove = sorted_keys[:len(sorted_keys) - 1000] + for k in to_remove: + self.files_data.pop(k, None) + self.graph_data.pop(k, None) + + with self.files_cache_path.open("w", encoding="utf-8") as f: + json.dump(self.files_data, f, indent=2) + with self.graph_cache_path.open("w", encoding="utf-8") as f: + json.dump(self.graph_data, f, indent=2) + except Exception: + pass # Fail silently on write errors to not interrupt execution diff --git a/src/scriber/cli.py b/src/scriber/cli.py deleted file mode 100644 index e689c64..0000000 --- a/src/scriber/cli.py +++ /dev/null @@ -1,361 +0,0 @@ -import argparse -import io -import json -import os -import re -import sys -from importlib import metadata -from pathlib import Path -from typing import Any - -import pyperclip -import tomlkit -from dotenv import load_dotenv - -from .core import DEFAULT_CONFIG, Scriber - -try: - import rich.box - from rich.console import Console - from rich.panel import Panel - from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn - from rich.prompt import Confirm, Prompt - from rich.table import Table - from rich.text import Text - - RICH_AVAILABLE = True -except ImportError: - 
RICH_AVAILABLE = False - -load_dotenv() - - -class SimpleConsole: - """A fallback console that mimics rich.Console with simple print statements.""" - - def print(self, message: Any = "") -> None: - """Strips rich markup and prints the message, handling potential Unicode errors. - - This method attempts to print the message directly. If a UnicodeEncodeError - occurs, it falls back to encoding the message using the system's stdout - encoding, replacing any unsupported characters to prevent crashes. - - Args: - message: The object or text to print. - """ - message_str = str(message) - cleaned_message = re.sub(r'\[/?[a-zA-Z\s=]+\]', '', message_str) - try: - print(cleaned_message) - except UnicodeEncodeError: - safe_message = cleaned_message.encode( - sys.stdout.encoding, errors='replace' - ).decode(sys.stdout.encoding) - print(safe_message) - - -def format_bytes(byte_count: int) -> str: - """Formats a byte count into a human-readable string (KB, MB). - - Args: - byte_count: The number of bytes. - - Returns: - A formatted string representing the size. - """ - if byte_count > 1024 * 1024: - return f"{byte_count / (1024 * 1024):.2f} MB" - if byte_count > 1024: - return f"{byte_count / 1024:.2f} KB" - return f"{byte_count} Bytes" - - -def save_to_json(console: Any, config: dict[str, Any]): - """Saves configuration to a .scriber.json file. - - Args: - console: The console instance for printing output. - config: The configuration dictionary to save. - """ - config_path = Path.cwd() / ".scriber.json" - try: - with open(config_path, "w", encoding="utf-8") as f: - json.dump(config, f, indent=2) - console.print(f"\nβœ… [bold green]Configuration saved to:[/] {config_path}") - except IOError as e: - console.print(f"\n❌ [bold red]Error saving config file:[/] {e}") - - -def save_to_toml(console: Any, config: dict[str, Any]): - """Saves configuration to the pyproject.toml file. - - Args: - console: The console instance for printing output. 
- config: The configuration dictionary to save. - """ - toml_path = Path.cwd() / "pyproject.toml" - if not toml_path.exists(): - console.print(f"\n❌ [bold red]Error: `pyproject.toml` not found in the current directory.[/]") - return - - try: - with open(toml_path, "r+", encoding="utf-8") as f: - doc = tomlkit.parse(f.read()) - - tool_table = doc.setdefault("tool", tomlkit.table()) - scriber_table = tool_table.setdefault("scriber", tomlkit.table()) - scriber_table.update(config) - - f.seek(0) - f.truncate() - f.write(tomlkit.dumps(doc)) - - console.print(f"\nβœ… [bold green]Configuration saved to:[/] {toml_path}") - except Exception as e: - console.print(f"\n❌ [bold red]Error updating `pyproject.toml`:[/] {e}") - - -def handle_init(args: argparse.Namespace, console: Any, rich_available: bool): - """Handles the interactive initialization of a config file. - - Args: - args: The parsed command-line arguments. - console: The console instance for printing output. - rich_available: A boolean indicating if the 'rich' library is installed. - """ - if rich_available: - console.print(Panel("[bold cyan]Scriber Configuration Setup[/]", expand=False)) - else: - console.print("--- Scriber Configuration Setup ---") - console.print("This utility will help you create a configuration file.\n") - - config: dict[str, Any] = {} - - if rich_available: - config["use_gitignore"] = Confirm.ask("✨ Would you like to respect `.gitignore` rules?", default=True) - default_exclude = ", ".join(DEFAULT_CONFIG.exclude) - exclude_str = Prompt.ask("πŸ“‚ Enter patterns to exclude (comma-separated)", default=default_exclude) - include_str = Prompt.ask("πŸ“„ Enter patterns to include (optional, comma-separated)", default="") - hidden_str = Prompt.ask("πŸ™ˆ Enter patterns to hide content for (e.g., lock files, optional, comma-separated)", - default="") - config["single_process"] = Confirm.ask("βš™οΈ Run in a single process? 
(for Celery or similar environments)", - default=False) - else: - answer = input("✨ Would you like to respect `.gitignore` rules? (Y/n) ").strip().lower() - config["use_gitignore"] = answer not in ['n', 'no'] - default_exclude = ", ".join(DEFAULT_CONFIG.exclude) - exclude_str = input( - f"πŸ“‚ Enter patterns to exclude (comma-separated, default: {default_exclude}): ") or default_exclude - include_str = input("πŸ“„ Enter patterns to include (optional, comma-separated): ") - hidden_str = input("πŸ™ˆ Enter patterns to hide content for (e.g., lock files, optional, comma-separated): ") - answer = input("βš™οΈ Run in a single process? (for Celery or similar environments) (y/N) ").strip().lower() - config["single_process"] = answer in ['y', 'yes'] - - config["exclude"] = [item.strip() for item in exclude_str.split(',') if item.strip()] - include_patterns = [item.strip() for item in include_str.split(',') if item.strip()] - if include_patterns: - config["include"] = include_patterns - hidden_patterns = [item.strip() for item in hidden_str.split(",") if item.strip()] - if hidden_patterns: - config["hidden"] = hidden_patterns - - console.print("\n[bold]Choose a save location:[/bold]") - console.print(" [cyan]1[/]: Save to `.scriber.json` (project-specific override)") - console.print(" [cyan]2[/]: Save to `pyproject.toml` (project default)") - - if rich_available: - save_target = Prompt.ask("Enter your choice", choices=["1", "2"], default="1") - else: - save_target = input("Enter your choice (1/2, default: 1): ") or "1" - - if save_target == '1': - save_to_json(console, config) - elif save_target == '2': - save_to_toml(console, config) - - -def run_scriber(args: argparse.Namespace, console: Any, version: str, rich_available: bool): - """Handles the main logic of mapping and generating the project output. - - Args: - args: The parsed command-line arguments. - console: The console instance for printing output. - version: The current version of the application. 
- rich_available: A boolean indicating if the 'rich' library is installed. - """ - if rich_available: - title_text = Text(f"Scriber v{version}", justify="center", style="bold magenta") - subtitle_text = Text("An intelligent tool to map, analyze, and compile project source code for LLM context.", - justify="center", style="cyan") - console.print(Panel(Text.assemble(title_text, "\n", subtitle_text), expand=False, border_style="blue")) - else: - console.print(f"--- Scriber v{version} ---") - - scriber = Scriber(args.root_path.resolve(), config_path=args.config) - if args.single_process: - scriber.single_process = True - - scriber.map_project() - - progress = None - task_id = None - if rich_available: - progress_manager = Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), - BarColumn(), TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - console=console, transient=True) - total_files = scriber.get_file_count() - if total_files > 0 and not args.tree_only: - task_id = progress_manager.add_task("[green]Processing files...", total=total_files) - progress = progress_manager - else: - console.print("Processing files...") - - output_content = "" - if progress: - with progress: - output_content = scriber.get_output_as_string(tree_only=args.tree_only, progress=progress, task_id=task_id) - else: - output_content = scriber.get_output_as_string(tree_only=args.tree_only) - - stats = scriber.get_stats() - config_file_display = str(scriber.config_path_used) if scriber.config_path_used else "Defaults" - - if rich_available: - summary_table = Table(box=rich.box.ROUNDED, show_header=False, title="[bold]Run Summary[/]", - title_justify="left") - summary_table.add_column("Parameter", style="cyan", no_wrap=True) - summary_table.add_column("Value", style="magenta") - summary_table.add_row("Project Path", str(args.root_path.resolve())) - summary_table.add_row("Config File", config_file_display) - if not args.copy_only: - summary_table.add_row("Output 
File", args.output or scriber.config.output) - console.print(summary_table) - else: - console.print("\n--- Run Summary ---") - console.print(f"Project Path: {str(args.root_path.resolve())}") - console.print(f"Config File: {config_file_display}") - if not args.copy_only: - console.print(f"Output File: {args.output or scriber.config.output}") - - if stats['total_files'] > 0: - if rich_available: - results_table = Table(box=rich.box.ROUNDED, show_header=False, title="[bold]πŸ“Š Analysis Results[/]", - title_justify="left") - results_table.add_column("Metric", style="cyan", no_wrap=True) - results_table.add_column("Value", style="magenta", justify="right") - results_table.add_row("Files Mapped", str(stats['total_files'])) - if stats.get('skipped_binary') > 0: - results_table.add_row("Binary Skipped", str(stats['skipped_binary'])) - results_table.add_section() - results_table.add_row("Total Size", format_bytes(stats['total_size_bytes'])) - results_table.add_row("Est. Tokens (cl100k)", f"{stats['total_tokens']:,}") - results_table.add_section() - results_table.add_row("[bold]Language Breakdown[/]", "") - for lang, count in stats['language_counts'].most_common(): - results_table.add_row(f" {lang.capitalize()}", str(count)) - console.print(results_table) - else: - console.print("\n--- Analysis Results ---") - console.print(f"Files Mapped: {stats['total_files']}") - if stats.get('skipped_binary') > 0: - console.print(f"Binary Skipped: {stats['skipped_binary']}") - console.print(f"Total Size: {format_bytes(stats['total_size_bytes'])}") - console.print(f"Est. 
Tokens (cl100k): {stats['total_tokens']:,}") - console.print("Language Breakdown:") - for lang, count in stats['language_counts'].most_common(): - console.print(f" {lang.capitalize()}: {count}") - else: - if rich_available: - console.print(Panel("[yellow]No files were mapped based on the current configuration.[/]", expand=False)) - else: - console.print("No files were mapped based on the current configuration.") - - if not args.copy_only: - output_filename = args.output or scriber.config.output - output_location = Path(args.root_path).resolve() / output_filename - try: - with open(output_location, 'w', encoding='utf-8') as f: - f.write(output_content) - console.print("\nβœ… [green]Success! Output saved to:[/green]") - console.print(str(output_location)) - except IOError as e: - console.print(f"\n❌ [bold red]Error saving output file:[/] {e}") - - if args.copy or args.copy_only: - try: - pyperclip.copy(output_content) - if args.copy_only: - console.print("\nβœ… [green]Success! Output copied to clipboard.[/green]") - else: - console.print("πŸ“‹ [green]Content copied to clipboard.[/green]") - except Exception as e: - console.print(f"❌ [bold red]Could not copy to clipboard: {e}[/bold red]") - - -def main() -> None: - """Parses arguments and runs the appropriate command.""" - if RICH_AVAILABLE: - # On Windows, the default console (cmd.exe) often doesn't support Unicode - # emojis. We detect this environment and disable emojis to prevent crashes, - # unless we are in a modern terminal like Windows Terminal. 
- is_legacy_windows = ( - sys.platform == "win32" - and not os.environ.get("WT_SESSION") - and not os.environ.get("TERMINUS_SUCKS") - and sys.stdout.encoding != "utf-8" - ) - console = Console(emoji=not is_legacy_windows) - else: - console = SimpleConsole() - - try: - version = metadata.version("project-scriber") - except metadata.PackageNotFoundError: - version = "1.0.0 (local)" - - parser = argparse.ArgumentParser( - description="Scriber: An intelligent tool to map, analyze, and compile project source code for LLM context.") - parser.add_argument("-v", "--version", action="version", version=f"%(prog)s v{version}", - help="Show the version number and exit.") - subparsers = parser.add_subparsers(dest="command", title="Commands") - - init_parser = subparsers.add_parser("init", help="Create a new configuration file interactively.") - init_parser.set_defaults(func=lambda args: handle_init(args, console, RICH_AVAILABLE)) - - run_parser = subparsers.add_parser("run", help="Map the project structure (default command).") - exec_mode = os.environ.get('SCRIBER_EXEC_MODE') - default_path = Path.cwd().parent if exec_mode == 'RUN_PY' else Path.cwd() - if exec_mode == 'RUN_PY': - del os.environ['SCRIBER_EXEC_MODE'] - - run_parser.add_argument("root_path", nargs="?", default=os.environ.get("PROJECT_SCRIBER_ROOT", default_path), - type=Path, - help="The root directory of the project to map. Defaults to the current directory.") - run_parser.add_argument("-o", "--output", help="The name of the output file. 
Overrides config file settings.") - run_parser.add_argument("--config", default=os.environ.get("PROJECT_SCRIBER_CONFIG"), type=Path, - help="Path to a custom configuration file.") - run_parser.add_argument("-c", "--copy", action="store_true", help="Copy the final output to the clipboard.") - run_parser.add_argument("--copy-only", action="store_true", - help="Generate the output and copy it to the clipboard without saving to a file.") - run_parser.add_argument("--tree-only", action="store_true", - help="Generate only the file tree structure without file content.") - run_parser.add_argument("--single-process", action="store_true", - help="Run in a single process to avoid issues in daemonic environments.") - run_parser.set_defaults(func=lambda args: run_scriber(args, console, version, RICH_AVAILABLE)) - - args_to_parse = sys.argv[1:] - global_flags = ['-h', '--help', '-v', '--version'] - - if not args_to_parse or args_to_parse[0] not in list(subparsers.choices) + global_flags: - args_to_parse.insert(0, 'run') - - args = parser.parse_args(args_to_parse) - - if hasattr(args, 'func'): - args.func(args) - else: - parser.print_help() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/src/scriber/cli/__init__.py b/src/scriber/cli/__init__.py new file mode 100644 index 0000000..021eb45 --- /dev/null +++ b/src/scriber/cli/__init__.py @@ -0,0 +1,3 @@ +from .main import main, build_parser + +__all__ = ["main", "build_parser"] diff --git a/src/scriber/cli/main.py b/src/scriber/cli/main.py new file mode 100644 index 0000000..c943226 --- /dev/null +++ b/src/scriber/cli/main.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import Sequence + +from scriber.core.config import DEFAULT_CONFIG_BLOCK, load_raw_pyproject, load_config, validate_config, validate_raw_config +from scriber.core.errors import ScriberError +from scriber.core.init_config import init_project +from 
scriber.core.root import resolve_config_path +from scriber.packer.pack import build_and_write_pack + + + + + +def _progress(msg: str) -> None: + # Use carriage return and padding to avoid external dependencies like rich + sys.stderr.write(f"\r[Scriber] {msg}".ljust(80)) + sys.stderr.flush() + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="scriber", + description="Scriber 2.0: build an intelligent code pack from one or more project paths.", + ) + parser.add_argument("paths", nargs="*", help="Project file/folder paths used as seeds. Defaults to current directory.") + parser.add_argument("--config", help="Path to pyproject.toml. Its parent directory becomes the project root.") + parser.add_argument("--path-base", choices=["project", "cwd"], default="project", help="Base directory for relative paths when --config is used.") + parser.add_argument("--format", choices=["md", "txt"], dest="output_format", help="Output format.") + parser.add_argument("--output", help="Output file path, relative to project root unless absolute. 
Use '-' for stdout.") + parser.add_argument("--only-tree", action="store_true", help="Render only scored tree/map, without file contents.") + parser.add_argument("--modules", dest="modules", action="store_true", help="Enable automatic related module selection.") + parser.add_argument("--no-modules", dest="modules", action="store_false", help="Disable automatic related module selection.") + parser.set_defaults(modules=None) + parser.add_argument("--support", dest="support", action="store_true", help="Enable support files.") + parser.add_argument("--no-support", dest="support", action="store_false", help="Disable support files.") + parser.set_defaults(support=None) + parser.add_argument("--support-content", choices=["full", "auto", "tree_only"], help="Override default support file content policy.") + parser.add_argument("--max-files", type=int, help="Maximum number of files in the pack.") + parser.add_argument("--max-tokens", type=int, help="Approximate token budget for included file contents. 
0 disables budget.") + parser.add_argument("--min-score", type=int, help="Minimum score for non-seed files.") + parser.add_argument("--init", action="store_true", help="Append a default [tool.scriber] config to pyproject.toml and exit.") + parser.add_argument("--force", action="store_true", help="Allow --init to append even if [tool.scriber] already exists.") + parser.add_argument("--project", action="store_true", help="Force project snapshot mode.") + parser.add_argument("--explain-selection", action="store_true", help="Explain reason for file selection in detail.") + parser.add_argument("--validate-config", action="store_true", help="Validate pyproject.toml scriber config.") + parser.add_argument("--dry-run", action="store_true", help="Perform a dry run without saving the pack file.") + parser.add_argument("--open", action="store_true", help="Open the output file automatically after creation.") + parser.add_argument("--timings", action="store_true", help="Show execution timings for each phase.") + parser.add_argument("--version", action="store_true", help="Show version information and exit.") + return parser + + +def main(argv: Sequence[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + try: + if args.version: + from scriber import __version__ + print(f"scriber {__version__}") + from scriber.native import is_native_available, require_native + if is_native_available(): + native = require_native() + if hasattr(native, "build_info"): + print(f"native {native.build_info()}") + return 0 + + if args.validate_config: + config_path = resolve_config_path(args.paths or ["."], args.config) + if not config_path.exists(): + print(f"Error: Config file not found at {config_path}", file=sys.stderr) + return 1 + try: + raw_data = load_raw_pyproject(config_path) + raw_issues = validate_raw_config(raw_data) + if raw_issues: + issues = raw_issues + else: + config = load_config(config_path) + issues = validate_config(config, raw_data, 
config_path) + + if not issues: + print("Scriber config is valid.", file=sys.stderr) + return 0 + else: + errors = 0 + warnings = 0 + for issue in issues: + severity = issue.severity.upper() + if severity == "ERROR": + errors += 1 + else: + warnings += 1 + print(f"[{severity}] {issue.message}", file=sys.stderr) + print(f"\nValidation completed: {errors} error(s), {warnings} warning(s)", file=sys.stderr) + return 1 if errors > 0 else 0 + except Exception as exc: + print(f"Error: Failed to parse pyproject.toml: {exc}", file=sys.stderr) + return 1 + + if args.init: + path = init_project(args.config, args.force) + print(f"Scriber config written to: {path}") + return 0 + + if args.dry_run: + from scriber.packer.pack import build_pack + from scriber.core.config import apply_overrides + pack = build_pack( + args.paths or ["."], + config_path=args.config, + output=args.output, + output_format=args.output_format, + only_tree=True if args.only_tree else None, + modules=args.modules, + support=args.support, + max_files=args.max_files, + max_tokens=args.max_tokens, + min_score=args.min_score, + support_content=args.support_content, + progress_callback=_progress, + project=args.project, + path_base=args.path_base, + ) + sys.stderr.write("\r".ljust(80) + "\r") + sys.stderr.flush() + + code_count = len([c for c in pack.candidates if c.file.kind == "code" and c.include_content]) + support_count = len([c for c in pack.candidates if c.file.kind == "support" and c.include_content]) + total_count = len(pack.candidates) + + print("Scriber dry-run completed.", file=sys.stderr) + print("----------------------------------------", file=sys.stderr) + print(f" Mode: {pack.mode}", file=sys.stderr) + print(f" Code files selected: {code_count}", file=sys.stderr) + print(f" Support files selected: {support_count}", file=sys.stderr) + print(f" Total files in pack: {total_count}", file=sys.stderr) + print(f" Estimated tokens: {pack.total_tokens}", file=sys.stderr) + if args.timings and 
pack.timings: + print("----------------------------------------", file=sys.stderr) + print("Timings:", file=sys.stderr) + for phase, duration in pack.timings.items(): + print(f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s", file=sys.stderr) + print(f" total: {sum(pack.timings.values()):.4f}s", file=sys.stderr) + + config = load_config(pack.config_path) + config = apply_overrides(config, output=args.output) + output_path = config.output + if not output_path.is_absolute(): + output_path = pack.project_root / output_path + print(f" Proposed output path: {output_path}", file=sys.stderr) + print("----------------------------------------", file=sys.stderr) + return 0 + + output, pack = build_and_write_pack( + args.paths or ["."], + config_path=args.config, + output=args.output, + output_format=args.output_format, + only_tree=True if args.only_tree else None, + modules=args.modules, + support=args.support, + max_files=args.max_files, + max_tokens=args.max_tokens, + min_score=args.min_score, + support_content=args.support_content, + progress_callback=_progress, + project=args.project, + explain_selection=args.explain_selection, + path_base=args.path_base, + ) + + sys.stderr.write("\r".ljust(80) + "\r") + sys.stderr.flush() + + code_count = 0 + support_count = 0 + omitted_count = 0 + for cand in pack.candidates: + if cand.include_content: + if cand.file.kind == "code": + code_count += 1 + elif cand.file.kind == "support": + support_count += 1 + else: + omitted_count += 1 + + sys.stderr.write("Scriber build completed.\n") + sys.stderr.write("----------------------------------------\n") + sys.stderr.write(f" Code files included: {code_count}\n") + sys.stderr.write(f" Support files included: {support_count}\n") + sys.stderr.write(f" Files omitted/skipped: {omitted_count}\n") + sys.stderr.write(f" Estimated tokens: {pack.total_tokens}\n") + sys.stderr.write("----------------------------------------\n") + if args.timings and pack.timings: + 
sys.stderr.write("Timings:\n") + for phase, duration in pack.timings.items(): + sys.stderr.write(f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n") + sys.stderr.write(f" - total: {sum(pack.timings.values()):.4f}s\n") + sys.stderr.write("----------------------------------------\n") + + if output is not None: + print(f"Scriber pack written to: {output}") + if args.open: + from scriber.core.open_file import open_path + open_path(output) + return 0 + except ScriberError as exc: + parser.exit(2, f"scriber: error: {exc}\n") + except KeyboardInterrupt: + parser.exit(130, "scriber: interrupted\n") + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/scriber/config.py b/src/scriber/config.py deleted file mode 100644 index 6f205d5..0000000 --- a/src/scriber/config.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -Configuration data structure for the Scriber application. -""" -from dataclasses import asdict, dataclass, field -from typing import Any, Dict, List, Set - -_DEFAULT_OUTPUT_FILENAME = "scriber_output.txt" -_CONFIG_FILE_NAME = ".scriber.json" - - -@dataclass -class ScriberConfig: - """ - A dataclass to hold all configuration settings for Scriber. - - This provides a structured, type-safe way to manage configuration, - replacing the previous dictionary-based approach. It includes methods - for easy conversion to and from dictionaries. 
- """ - use_gitignore: bool = True - exclude: List[str] = field(default_factory=lambda: [ - "LICENSE", - ".git/", - ".idea/", ".vscode/", ".project/", ".settings/", ".classpath/", - "__pycache__/", "*.pyc", ".venv/", "venv/", ".pytest_cache/", "uv.lock", - "node_modules/", "npm-debug.log*", "yarn-error.log", - "build/", "dist/", "target/", "bin/", "obj/", "out/", - "vendor/", "bower_components/", - "*.log", "*.lock", "*.tmp", "temp/", "tmp/", - ".DS_Store", "Thumbs.db", "*~", "*.swp", "*.swo", - _DEFAULT_OUTPUT_FILENAME, _CONFIG_FILE_NAME - ]) - include: List[str] = field(default_factory=list) - hidden: List[str] = field(default_factory=list) - exclude_map: Dict[str, List[str]] = field(default_factory=dict) - output: str = _DEFAULT_OUTPUT_FILENAME - single_process: bool = False - - def to_dict(self) -> Dict[str, Any]: - """ - Converts the configuration dataclass to a dictionary. - - Returns: - A dictionary representation of the configuration settings. - """ - return asdict(self) \ No newline at end of file diff --git a/src/scriber/core.py b/src/scriber/core.py deleted file mode 100644 index 185a5b8..0000000 --- a/src/scriber/core.py +++ /dev/null @@ -1,682 +0,0 @@ -import fnmatch -import io -import json -import multiprocessing -import os -import sys -from collections import Counter -from concurrent.futures import ProcessPoolExecutor, as_completed -from pathlib import Path -from typing import Any, Dict, List, Optional, Set, TextIO, Union - -try: - import tomllib -except ImportError: - import tomli as tomllib - -try: - import pathspec -except ImportError: - pathspec = None - -import tiktoken - -from .config import ScriberConfig - -DEFAULT_CONFIG = ScriberConfig() - - -def _process_file_worker( - file_path: Path, - containing_root: Path, - hidden_patterns: Set[str], - language_map: Dict[str, str], - tokenizer: Optional[Any], -) -> Dict[str, Any]: - """Processes a single file to gather stats; safe for multiprocessing. 
- - This function is defined at the top level to avoid pickling issues with - instance methods that have un-pickleable attributes (like rich.Console). - - Args: - file_path: The path of the file to process. - containing_root: The root directory that contains the file. - hidden_patterns: A set of patterns for files whose content should be hidden. - language_map: A dictionary mapping file extensions to languages. - tokenizer: The tiktoken tokenizer instance. - - Returns: - A dictionary containing the size, token count, and language of the file. - """ - stats: Dict[str, Any] = {"size": 0, "tokens": 0, "lang": "other"} - try: - stats["size"] = file_path.stat().st_size - stats["lang"] = language_map.get(file_path.suffix, language_map.get(file_path.name, "")) or "other" - - is_hidden = False - if hidden_patterns: - relative_path_str = file_path.relative_to(containing_root).as_posix() - if any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in hidden_patterns): - is_hidden = True - - if not is_hidden and tokenizer: - content = file_path.read_text(encoding="utf-8", errors="ignore") - stats["tokens"] = len(tokenizer.encode(content)) - except Exception: - pass - return stats - - -class Scriber: - """ - Maps, analyzes, and compiles a project's source code into a single output. - - This class can be used programmatically to gain fine-grained control over the - project mapping process, access intermediate data like file lists and - statistics, and get the final output as a string for further processing. 
- """ - _CONFIG_FILE_NAME = ".scriber.json" - _LANGUAGE_MAP = { - ".asm": "asm", ".s": "asm", ".html": "html", ".htm": "html", ".css": "css", - ".scss": "scss", ".sass": "sass", ".less": "less", ".js": "javascript", - ".mjs": "javascript", ".cjs": "javascript", ".jsx": "jsx", ".ts": "typescript", - ".tsx": "tsx", ".vue": "vue", ".svelte": "svelte", ".py": "python", ".pyw": "python", - ".rb": "ruby", ".java": "java", ".kt": "kotlin", ".kts": "kotlin", ".scala": "scala", - ".go": "go", ".php": "php", ".c": "c", ".h": "c", ".cpp": "cpp", ".hpp": "cpp", - ".cs": "csharp", ".rs": "rust", ".swift": "swift", ".dart": "dart", ".pl": "perl", - ".pm": "perl", ".hs": "haskell", ".lua": "lua", ".erl": "erlang", ".ex": "elixir", - ".exs": "elixir", ".clj": "clojure", ".lisp": "lisp", ".f": "fortran", - ".f90": "fortran", ".zig": "zig", ".d": "d", ".v": "v", ".cr": "crystal", - ".nim": "nim", ".pas": "pascal", ".ml": "ocaml", ".sh": "bash", ".bash": "bash", - ".zsh": "zsh", ".fish": "fish", ".ps1": "powershell", ".bat": "batch", - ".json": "json", ".jsonc": "jsonc", ".xml": "xml", ".yaml": "yaml", ".yml": "yaml", - ".toml": "toml", ".ini": "ini", ".properties": "properties", ".env": "dotenv", - "Dockerfile": "dockerfile", ".tf": "terraform", ".hcl": "hcl", ".groovy": "groovy", - ".gradle": "groovy", ".cmake": "cmake", "CMakeLists.txt": "cmake", ".md": "markdown", - ".mdx": "mdx", ".rst": "rst", ".tex": "latex", "LICENSE": "text", ".sql": "sql", - ".graphql": "graphql", ".proto": "protobuf", ".glsl": "glsl", ".frag": "glsl", - ".vert": "glsl", ".vb": "vbnet", ".vbs": "vbscript", - } - - def __init__( - self, - root_path: Union[Path, List[Path]], - config: Optional[Union[Dict[str, Any], ScriberConfig]] = None, - config_path: Optional[Path] = None - ): - """Initializes the Scriber instance. - - Args: - root_path: An absolute path or a list of absolute paths to the root - directories of the project(s) to be mapped. 
- config: An optional dictionary or ScriberConfig object of settings. - Takes the highest precedence if provided. - config_path: An optional path to a specific configuration file. - """ - raw_paths = [root_path] if isinstance(root_path, Path) else root_path - self.root_paths: List[Path] = [p.resolve() for p in raw_paths] - self.primary_root: Path = self.root_paths[0] - - self.mapped_files: List[Path] = [] - self._user_config_path = config_path - self._user_config_input = config - self.config: ScriberConfig = ScriberConfig() - self.config_path_used: Optional[Path] = None - self.gitignore_spec: Optional[Any] = None - self.dir_exclude_spec: Optional[Any] = None - self.general_exclude_spec: Optional[Any] = None - self.hidden_patterns: Set[str] = set() - self.include_patterns: List[str] = [] - self.exclude_patterns: List[str] = [] - self.exclude_map: Dict[str, List[str]] = {} - self.single_process: bool = False - - self.stats = {} - self._has_mapped = False - self._reset_stats() - self._load_config() - try: - self._tokenizer = tiktoken.get_encoding("cl100k_base") - except Exception: - self._tokenizer = None - - def _reset_stats(self): - """Resets the statistics and mapped files to their initial state.""" - self.mapped_files = [] - self.stats = { - "total_files": 0, - "total_size_bytes": 0, - "total_tokens": 0, - "language_counts": Counter(), - "skipped_binary": 0, - } - self._has_mapped = False - - def _create_default_config_file(self) -> None: - """Creates a default .scriber.json config file if no other config is found.""" - config_path = self.primary_root / self._CONFIG_FILE_NAME - print(f"✨ No config found. 
Creating default configuration at: {config_path}", file=sys.stderr) - - file_config = { - "use_gitignore": DEFAULT_CONFIG.use_gitignore, - "exclude": DEFAULT_CONFIG.exclude, - "include": DEFAULT_CONFIG.include, - "hidden": DEFAULT_CONFIG.hidden - } - try: - with config_path.open("w", encoding="utf-8") as f: - json.dump(file_config, f, indent=2) - except IOError as e: - print(f"❌ Could not create default config file: {e}", file=sys.stderr) - - def _load_config(self) -> None: - """Loads configuration with a clear precedence: direct config > config_path > local files.""" - config_data = DEFAULT_CONFIG.to_dict() - config_source_loaded = False - - if self._user_config_input: - if isinstance(self._user_config_input, ScriberConfig): - config_data.update(self._user_config_input.to_dict()) - else: - config_data.update(self._user_config_input) - config_source_loaded = True - self.config_path_used = None - else: - config_path_to_use = self._user_config_path - if config_path_to_use: - if not config_path_to_use.is_file(): - print(f"Warning: Config file specified by --config not found at {self._user_config_path}", file=sys.stderr) - config_path_to_use = None - else: - json_path = self.primary_root / self._CONFIG_FILE_NAME - toml_path = self.primary_root / "pyproject.toml" - if json_path.is_file(): - config_path_to_use = json_path - elif toml_path.is_file(): - config_path_to_use = toml_path - - if config_path_to_use: - self.config_path_used = config_path_to_use - try: - if config_path_to_use.suffix == ".toml": - with config_path_to_use.open("rb") as f: - toml_data = tomllib.load(f) - if "tool" in toml_data and "scriber" in toml_data["tool"]: - config_data.update(toml_data["tool"]["scriber"]) - config_source_loaded = True - else: - with config_path_to_use.open("r", encoding="utf-8") as f: - config_data.update(json.load(f)) - config_source_loaded = True - except (json.JSONDecodeError, tomllib.TOMLDecodeError, IOError) as e: - print(f"Error parsing config file 
{self.config_path_used}: {e}", file=sys.stderr) - - if not config_source_loaded and not self._user_config_input and self._user_config_path is None: - self._create_default_config_file() - - self.config = ScriberConfig(**config_data) - self.include_patterns = self.config.include - self.exclude_patterns = self.config.exclude - self.hidden_patterns = set(self.config.hidden) - self.exclude_map = self.config.exclude_map - self.single_process = self.config.single_process - - if not pathspec: - print("Warning: 'pathspec' not installed. .gitignore and advanced exclude patterns will be ignored.", file=sys.stderr) - else: - dir_exclude_patterns = [p for p in self.exclude_patterns if p.endswith('/')] - general_exclude_patterns = [p for p in self.exclude_patterns if not p.endswith('/')] - - self.dir_exclude_spec = pathspec.PathSpec.from_lines("gitwildmatch", dir_exclude_patterns) - self.general_exclude_spec = pathspec.PathSpec.from_lines("gitwildmatch", general_exclude_patterns) - self._load_gitignore(self.config.use_gitignore) - - def _load_gitignore(self, use_gitignore: bool) -> None: - """Loads gitignore patterns from the .gitignore file if enabled. - - Args: - use_gitignore: A boolean indicating whether to use .gitignore rules. - """ - self.gitignore_spec: Optional[pathspec.PathSpec] = None - if not use_gitignore or not pathspec: - return - - gitignore_path = self.primary_root / ".gitignore" - if gitignore_path.is_file(): - try: - with gitignore_path.open("r", encoding="utf-8") as f: - self.gitignore_spec = pathspec.PathSpec.from_lines("gitwildmatch", f) - except IOError: - pass - - def _find_containing_root(self, path: Path) -> Optional[Path]: - """Finds which root directory from self.root_paths contains the given path. - - Args: - path: The path to check. - - Returns: - The containing root path, or None if not found. 
- """ - for r in self.root_paths: - try: - if path.is_relative_to(r): - return r - except ValueError: - continue - return None - - def _is_binary(self, path: Path) -> bool: - """Checks if a file is likely a binary file. - - Args: - path: The path to the file. - - Returns: - True if the file contains null bytes, False otherwise. - """ - try: - with path.open('rb') as f: - return b'\0' in f.read(1024) - except IOError: - return True - - def _is_excluded(self, path: Path) -> bool: - """Determines if a file or directory should be excluded from mapping. - - Args: - path: The path to check. - - Returns: - True if the path should be excluded, False otherwise. - """ - containing_root = self._find_containing_root(path) - if not containing_root: - return True - - # When checking a directory for pruning, its path might not have a trailing - # slash, so we treat it as such for matching. - is_dir = path.is_dir() - - if self.gitignore_spec: - try: - relative_path_for_gitignore = path.relative_to(self.primary_root).as_posix() - if is_dir and not relative_path_for_gitignore.endswith('/'): - relative_path_for_gitignore += '/' - if self.gitignore_spec.match_file(relative_path_for_gitignore): - return True - except ValueError: - pass - - relative_path_str = path.relative_to(containing_root).as_posix() - - if is_dir: - path_for_dir_spec = relative_path_str + '/' - if self.dir_exclude_spec and self.dir_exclude_spec.match_file(path_for_dir_spec): - return True - if self.general_exclude_spec and self.general_exclude_spec.match_file(relative_path_str): - return True - else: # Is a file - if self.general_exclude_spec and self.general_exclude_spec.match_file(relative_path_str): - return True - - if path.is_file(): - global_patterns = self.exclude_map.get("global", []) - if any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in global_patterns): - return True - - lang = self._get_language(path) - if lang and lang in self.exclude_map: - lang_patterns = self.exclude_map.get(lang, []) - 
if any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in lang_patterns): - return True - - if self.include_patterns: - return not any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in self.include_patterns) - - return False - - def _is_hidden(self, path: Path) -> bool: - """Checks if a path matches any of the hidden patterns. - - Args: - path: The path to check. - - Returns: - True if the path matches a hidden pattern, False otherwise. - """ - if not self.hidden_patterns: - return False - containing_root = self._find_containing_root(path) - if not containing_root: - return False - relative_path_str = path.relative_to(containing_root).as_posix() - return any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in self.hidden_patterns) - - def _collect_files(self, perform_binary_check: bool = True) -> None: - """Walks the project directory and collects all non-excluded files. - - Args: - perform_binary_check: If False, skips the check for binary files. - """ - collected = set() - for root_dir in self.root_paths: - for root, dirs, files in os.walk(root_dir, topdown=True): - current_root = Path(root) - dirs[:] = [d for d in dirs if not self._is_excluded(current_root / d)] - for file in files: - file_path = current_root / file - if not self._is_excluded(file_path): - if perform_binary_check and self._is_binary(file_path): - self.stats['skipped_binary'] += 1 - continue - collected.add(file_path) - self.mapped_files = sorted(list(collected)) - - def map_project(self) -> None: - """Maps all relevant project files and gathers statistics.""" - self._reset_stats() - self._collect_files(perform_binary_check=True) - self._gather_stats() - self._has_mapped = True - - def map_tree_only(self) -> None: - """Maps only the project file structure without reading file contents.""" - self._reset_stats() - self._collect_files(perform_binary_check=False) - self.stats['total_files'] = len(self.mapped_files) - self._has_mapped = True - - def _gather_stats(self) -> None: 
- """Gathers statistics about the mapped files.""" - if not self.mapped_files: - return - - self.stats['total_files'] = len(self.mapped_files) - total_size = 0 - total_tokens = 0 - language_counts: Counter = Counter() - - if self.single_process: - for path in self.mapped_files: - containing_root = self._find_containing_root(path) - if containing_root: - try: - file_stats = _process_file_worker( - path, containing_root, self.hidden_patterns, self._LANGUAGE_MAP, self._tokenizer - ) - total_size += file_stats["size"] - total_tokens += file_stats["tokens"] - language_counts[file_stats["lang"]] += 1 - except Exception as exc: - print(f"File processing generated an exception: {exc}", file=sys.stderr) - else: - context = multiprocessing.get_context("spawn") - with ProcessPoolExecutor(mp_context=context) as executor: - futures = [] - for path in self.mapped_files: - containing_root = self._find_containing_root(path) - if containing_root: - futures.append(executor.submit( - _process_file_worker, - path, - containing_root, - self.hidden_patterns, - self._LANGUAGE_MAP, - self._tokenizer, - )) - - for future in as_completed(futures): - try: - file_stats = future.result() - total_size += file_stats["size"] - total_tokens += file_stats["tokens"] - language_counts[file_stats["lang"]] += 1 - except Exception as exc: - print(f"File processing generated an exception: {exc}", file=sys.stderr) - - self.stats['total_size_bytes'] = total_size - self.stats['total_tokens'] = total_tokens - self.stats['language_counts'] = language_counts - - def get_stats(self) -> Dict: - """Returns the collected project statistics. - - If the project has not been mapped yet, `map_project()` will be called first. - - Returns: - A dictionary containing project statistics. - """ - if not self._has_mapped: - self.map_project() - return self.stats - - def get_file_count(self) -> int: - """Returns the number of files that will be mapped. 
- - If the project has not been mapped yet, `map_project()` will be called first. - - Returns: - The total count of mapped files. - """ - if not self._has_mapped: - self.map_project() - return len(self.mapped_files) - - def get_mapped_files(self) -> List[Path]: - """Returns a list of all mapped file paths. - - If the project has not been mapped yet, `map_project()` will be called first. - - Returns: - A sorted list of `pathlib.Path` objects for all included files. - """ - if not self._has_mapped: - self.map_project() - return self.mapped_files - - def get_tree(self) -> str: - """Returns the formatted file tree representation as a string. - - If the project has not been mapped yet, `map_project()` will be called first. - - Returns: - A string containing the formatted file tree. - """ - if not self._has_mapped: - self.map_project() - return self._get_tree_representation() - - def get_output_as_string(self, tree_only: bool = False, progress=None, task_id=None) -> str: - """Generates the consolidated project output and returns it as a string. - - If the project has not been mapped yet, `map_project()` will be called first. - - Args: - tree_only: If True, the string will only contain the file tree. - progress: An optional Rich Progress instance for updating a progress bar. - task_id: An optional ID for the task in the Rich Progress instance. - - Returns: - A string containing the complete project map and file contents. - """ - if not self._has_mapped: - if tree_only: - self.map_tree_only() - else: - self.map_project() - output_buffer = io.StringIO() - self._write_output(output_buffer, tree_only, progress=progress, task_id=task_id) - return output_buffer.getvalue() - - def generate_output_file(self, output_filename: str, tree_only: bool = False, progress=None, task_id=None) -> None: - """Generates the consolidated project structure output file. - - Args: - output_filename: The name for the output file. - tree_only: If True, only the file tree is written. 
- progress: A Rich Progress instance for updating the progress bar. - task_id: The ID of the task in the Rich Progress instance. - """ - if not self._has_mapped: - if tree_only: - self.map_tree_only() - else: - self.map_project() - output_filepath = self.primary_root / output_filename - with output_filepath.open("w", encoding="utf-8") as f: - self._write_output(f, tree_only, progress, task_id) - - def _write_output(self, f: TextIO, tree_only: bool, progress, task_id) -> None: - """Writes the complete project map and file contents to an open file stream. - - Args: - f: The file stream to write to. - tree_only: If True, only write the file tree. - progress: A Rich Progress instance for updating the progress bar. - task_id: The ID of the task in the Rich Progress instance. - """ - f.write("=" * 3 + "\n Mapped Folder Structure\n" + "=" * 3 + "\n\n") - f.write(self._get_tree_representation() + "\n") - - if tree_only: - return - - for file_path in self.mapped_files: - if self._is_hidden(file_path): - self._write_hidden_file_placeholder(f, file_path) - else: - self._write_file_content(f, file_path) - if progress and task_id is not None: - progress.update(task_id, advance=1) - - def _get_display_path(self, file_path: Path) -> str: - """Gets the path to display in the output header. - - Args: - file_path: The absolute path to the file. - - Returns: - A string representing the path for display. - """ - containing_root = self._find_containing_root(file_path) - if not containing_root: - return file_path.name - - relative_path = file_path.relative_to(containing_root) - if len(self.root_paths) > 1: - return (Path(containing_root.name) / relative_path).as_posix() - return relative_path.as_posix() - - def _write_hidden_file_placeholder(self, f: TextIO, file_path: Path) -> None: - """Writes a placeholder for a hidden file's content. - - Args: - f: The file stream to write to. - file_path: The path of the hidden file. 
- """ - try: - display_path = self._get_display_path(file_path) - file_size = file_path.stat().st_size - except (OSError, ValueError): - return - - f.write("\n" + "-" * 3 + "\n") - f.write(f"File: {display_path}\nSize: {file_size} bytes\n" + "-" * 3 + "\n") - f.write("```\n[Content hidden based on configuration]\n```\n") - - def _write_file_content(self, f: TextIO, file_path: Path) -> None: - """Writes a single file's content to the output stream. - - Args: - f: The file stream to write to. - file_path: The path of the file to write. - """ - try: - display_path = self._get_display_path(file_path) - file_size = file_path.stat().st_size - lang = self._get_language(file_path) - content = file_path.read_text(encoding="utf-8", errors="ignore") - except (OSError, ValueError): - return - - f.write("\n" + "-" * 3 + "\n") - f.write(f"File: {display_path}\nSize: {file_size} bytes\n" + "-" * 3 + "\n") - f.write(f"```{lang}\n{content}\n```\n") - - def _get_language(self, file_path: Path) -> str: - """Determines the programming language of a file based on its extension. - - Args: - file_path: The path to the file. - - Returns: - A string representing the language, or an empty string if not found. - """ - return self._LANGUAGE_MAP.get(file_path.suffix, self._LANGUAGE_MAP.get(file_path.name, "")) - - def _get_tree_representation(self) -> str: - """Generates a string representation of the project's file tree. - - Returns: - A formatted string of the file tree. - """ - tree = self._build_file_tree() - if not tree: return "No files or folders to map." 
- - def format_tree(d: Dict, prefix: str = "") -> List[str]: - lines = [] - items = sorted(d.keys()) - for i, key in enumerate(items): - is_last = i == len(items) - 1 - connector = "└── " if is_last else "β”œβ”€β”€ " - lines.append(f"{prefix}{connector}{key}") - if d[key]: - new_prefix = prefix + (" " if is_last else "β”‚ ") - lines.extend(format_tree(d[key], new_prefix)) - return lines - - if len(self.root_paths) == 1: - root_name = list(tree.keys())[0] - output_lines = [root_name] - output_lines.extend(format_tree(tree[root_name])) - else: - output_lines = [] - for root_name, subtree in sorted(tree.items()): - output_lines.append(root_name) - output_lines.extend(format_tree(subtree)) - return "\n".join(output_lines) - - def _build_file_tree(self) -> Dict[str, Any]: - """Builds a nested dictionary representing the file tree structure. - - Returns: - A dictionary representing the project's file hierarchy. - """ - if not self.mapped_files: return {} - - if len(self.root_paths) == 1: - tree = {self.primary_root.name: {}} - project_level = tree[self.primary_root.name] - for path in self.mapped_files: - parts = path.relative_to(self.primary_root).parts - current_level = project_level - for part in parts: - current_level = current_level.setdefault(part, {}) - return tree - else: - tree = {} - for path in self.mapped_files: - containing_root = self._find_containing_root(path) - if not containing_root: - continue - - root_name = containing_root.name - if root_name not in tree: - tree[root_name] = {} - - parts = path.relative_to(containing_root).parts - current_level = tree[root_name] - for part in parts: - current_level = current_level.setdefault(part, {}) - return tree \ No newline at end of file diff --git a/src/scriber/core/__init__.py b/src/scriber/core/__init__.py new file mode 100644 index 0000000..b3b8256 --- /dev/null +++ b/src/scriber/core/__init__.py @@ -0,0 +1,64 @@ +from .errors import ScriberError +from .models import ( + Candidate, + ContentPolicy, + 
FileKind, + FileNode, + ModuleConfig, + ModuleGraph, + OutputFormat, + PythonConfig, + ScriberConfig, + ScriberPack, + SeedPath, + SupportContentConfig, +) +from .matchers import SimpleGitIgnore, match_pattern, matches_any, normalize_rel +from .root import ( + resolve_config_path, + project_root_from_config, + ensure_inside_root, + rel_to_root, +) +from .config import ( + DEFAULT_CODE_PATTERNS, + DEFAULT_SUPPORT_PATTERNS, + DEFAULT_SUPPORT_FULL, + DEFAULT_SUPPORT_TREE_ONLY, + DEFAULT_HARD_IGNORE, + DEFAULT_CONFIG_BLOCK, + load_config, + apply_overrides, +) + +__all__ = [ + "ScriberError", + "Candidate", + "ContentPolicy", + "FileKind", + "FileNode", + "ModuleConfig", + "ModuleGraph", + "OutputFormat", + "PythonConfig", + "ScriberConfig", + "ScriberPack", + "SeedPath", + "SupportContentConfig", + "SimpleGitIgnore", + "match_pattern", + "matches_any", + "normalize_rel", + "resolve_config_path", + "project_root_from_config", + "ensure_inside_root", + "rel_to_root", + "DEFAULT_CODE_PATTERNS", + "DEFAULT_SUPPORT_PATTERNS", + "DEFAULT_SUPPORT_FULL", + "DEFAULT_SUPPORT_TREE_ONLY", + "DEFAULT_HARD_IGNORE", + "DEFAULT_CONFIG_BLOCK", + "load_config", + "apply_overrides", +] diff --git a/src/scriber/core/config.py b/src/scriber/core/config.py new file mode 100644 index 0000000..fed0545 --- /dev/null +++ b/src/scriber/core/config.py @@ -0,0 +1,447 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any +from dataclasses import dataclass + +try: # pragma: no cover - exercised on Python < 3.11 only + import tomllib +except ModuleNotFoundError: # pragma: no cover + import tomli as tomllib # type: ignore[no-redef] + +from .models import CacheConfig, ModuleConfig, PythonConfig, ScriberConfig, SupportContentConfig, TokenConfig + +DEFAULT_CODE_PATTERNS = [ + "**/*.py", + "**/*.pyi", + "**/*.rs", + "**/*.js", + "**/*.jsx", + "**/*.ts", + "**/*.tsx", + "**/*.go", + "**/*.java", + "**/*.kt", + "**/*.c", + "**/*.cpp", + "**/*.h", + "**/*.hpp", +] + 
+DEFAULT_SUPPORT_PATTERNS = [ + "**/*.toml", + "**/*.lock", + "pyproject.toml", + "setup.py", + "setup.cfg", + "requirements.txt", + "requirements/*.txt", + "tox.ini", + "pytest.ini", + "mypy.ini", + "ruff.toml", + ".ruff.toml", + "Pipfile", + "README.md", + "README.rst", + "CHANGELOG.md", + "CONTRIBUTING.md", + "docs/**/*.md", + ".env.example", + ".env.template", + "config/*.toml", + "config/*.yaml", + "config/*.yml", + "config/*.json", + "settings/*.toml", + "settings/*.yaml", + "settings/*.yml", + "Dockerfile", + "Dockerfile.*", + "docker-compose.yml", + "docker-compose.yaml", + "compose.yml", + "compose.yaml", + ".github/workflows/*.yml", + ".github/workflows/*.yaml", + ".gitlab-ci.yml", + ".pre-commit-config.yaml", + "package.json", + "tsconfig.json", + "vite.config.*", + "webpack.config.*", + "Cargo.toml", + "Cargo.lock", + "go.mod", + "go.sum", + "poetry.lock", + "uv.lock", + "Pipfile.lock", + "package-lock.json", + "pnpm-lock.yaml", + "yarn.lock", + "**/*.svg", +] + +DEFAULT_SUPPORT_FULL = [ + "**/*.toml", + "pyproject.toml", + "requirements.txt", + "requirements/*.txt", + "pytest.ini", + "tox.ini", + "mypy.ini", + "ruff.toml", + ".ruff.toml", + ".env.example", + ".env.template", + "Dockerfile", + "Dockerfile.*", + "docker-compose.yml", + "docker-compose.yaml", + ".github/workflows/*.yml", + ".github/workflows/*.yaml", + "README.md", + "Cargo.toml", + "go.mod", +] + +DEFAULT_SUPPORT_TREE_ONLY = [ + "**/*.svg", + "**/*.lock", + "Cargo.lock", + "poetry.lock", + "uv.lock", + "Pipfile.lock", + "package-lock.json", + "pnpm-lock.yaml", + "yarn.lock", + "go.sum", +] + +DEFAULT_HARD_IGNORE = [ + ".git/**", + ".idea/**", + ".hg/**", + ".svn/**", + ".scriber/**", + ".venv/**", + "venv/**", + "env/**", + "__pycache__/**", + ".pytest_cache/**", + ".mypy_cache/**", + ".ruff_cache/**", + "node_modules/**", + "dist/**", + "build/**", + "target/**", + ".next/**", + ".turbo/**", +] + +DEFAULT_CONFIG_BLOCK = """ +[tool.scriber] +version = "2" +format = "md" +output = 
".scriber/scriber_pack.md" +only_tree = false +use_gitignore = true +max_files = 60 +max_tokens = 100000 +min_score = 45 +path_style = "project-relative" +allow_external_paths = false + +[tool.scriber.code_files] +patterns = ["**/*.py", "**/*.pyi", "**/*.rs", "**/*.js", "**/*.jsx", "**/*.ts", "**/*.tsx"] + +[tool.scriber.support_files] +enabled = true +patterns = [ + "**/*.toml", + "**/*.lock", + "pyproject.toml", + "README.md", + "requirements.txt", + "requirements/*.txt", + ".env.example", + "Dockerfile", + "docker-compose.yml", + ".github/workflows/*.yml", + "**/*.svg", +] + +[tool.scriber.support_files.content] +default = "auto" +full = ["**/*.toml", "pyproject.toml", "README.md", "requirements.txt", "requirements/*.txt", ".env.example", "Dockerfile", "docker-compose.yml", ".github/workflows/*.yml"] +tree_only = ["**/*.svg", "**/*.lock"] + +[tool.scriber.modules] +enabled = true +depth = 2 +include_direct_dependencies = true +include_reverse_dependencies = true +include_tests = true +include_same_package = true +include_parent_entrypoints = true +include_project_configs = true +content_min_score = 50 +tree_min_score = 30 + +[tool.scriber.python] +source_roots = ["src", "app", "."] +test_roots = ["tests", "test"] +entrypoint_patterns = ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"] + +[tool.scriber.tokens] +estimator = "chars" +chars_per_token = 4 +""".strip() + + +def load_raw_pyproject(config_path: Path) -> dict[str, Any]: + with config_path.open("rb") as handle: + return tomllib.load(handle) + + +def load_config(config_path: Path) -> ScriberConfig: + raw = load_raw_pyproject(config_path) + tool = raw.get("tool", {}) if isinstance(raw, dict) else {} + data = tool.get("scriber", {}) if isinstance(tool, dict) else {} + + config = ScriberConfig( + code_patterns=list(DEFAULT_CODE_PATTERNS), + support_patterns=list(DEFAULT_SUPPORT_PATTERNS), + hard_ignore_patterns=list(DEFAULT_HARD_IGNORE), + ) + + if not isinstance(data, dict): + return 
config + + config.version = str(data.get("version", config.version)) + config.format = data.get("format", config.format) + config.output = Path(data.get("output", str(config.output))) + config.only_tree = bool(data.get("only_tree", config.only_tree)) + config.use_gitignore = bool(data.get("use_gitignore", config.use_gitignore)) + config.max_files = int(data.get("max_files", config.max_files)) + config.max_tokens = int(data.get("max_tokens", config.max_tokens)) + config.min_score = int(data.get("min_score", config.min_score)) + config.path_style = str(data.get("path_style", config.path_style)) + config.allow_external_paths = bool(data.get("allow_external_paths", config.allow_external_paths)) + + code_files = data.get("code_files", {}) + if isinstance(code_files, dict) and isinstance(code_files.get("patterns"), list): + config.code_patterns = [str(item) for item in code_files["patterns"]] + + support_files = data.get("support_files", {}) + if isinstance(support_files, dict): + config.support = bool(support_files.get("enabled", config.support)) + if isinstance(support_files.get("patterns"), list): + config.support_patterns = [str(item) for item in support_files["patterns"]] + content = support_files.get("content", {}) + if isinstance(content, dict): + config.support_content = SupportContentConfig( + default=content.get("default", config.support_content.default), + full=[str(item) for item in content.get("full", config.support_content.full)], + tree_only=[str(item) for item in content.get("tree_only", config.support_content.tree_only)], + auto_max_bytes=int(content.get("auto_max_bytes", config.support_content.auto_max_bytes)), + ) + if not config.support_content.full: + config.support_content.full = list(DEFAULT_SUPPORT_FULL) + if not config.support_content.tree_only: + config.support_content.tree_only = list(DEFAULT_SUPPORT_TREE_ONLY) + + hard_ignore = data.get("hard_ignore", {}) + if isinstance(hard_ignore, dict) and isinstance(hard_ignore.get("patterns"), list): + 
config.hard_ignore_patterns = [str(item) for item in hard_ignore["patterns"]] + + modules = data.get("modules", {}) + if isinstance(modules, dict): + scoring = dict(config.modules_config.scoring) + raw_scoring = modules.get("scoring", {}) + if isinstance(raw_scoring, dict): + scoring.update({str(key): int(value) for key, value in raw_scoring.items()}) + config.modules_config = ModuleConfig( + enabled=bool(modules.get("enabled", config.modules_config.enabled)), + depth=int(modules.get("depth", config.modules_config.depth)), + include_direct_dependencies=bool(modules.get("include_direct_dependencies", config.modules_config.include_direct_dependencies)), + include_reverse_dependencies=bool(modules.get("include_reverse_dependencies", config.modules_config.include_reverse_dependencies)), + include_tests=bool(modules.get("include_tests", config.modules_config.include_tests)), + include_same_package=bool(modules.get("include_same_package", config.modules_config.include_same_package)), + include_parent_entrypoints=bool(modules.get("include_parent_entrypoints", config.modules_config.include_parent_entrypoints)), + include_project_configs=bool(modules.get("include_project_configs", config.modules_config.include_project_configs)), + content_min_score=int(modules.get("content_min_score", config.modules_config.content_min_score)), + tree_min_score=int(modules.get("tree_min_score", config.modules_config.tree_min_score)), + scoring=scoring, + ) + config.modules = config.modules_config.enabled + + python = data.get("python", {}) + if isinstance(python, dict): + config.python = PythonConfig( + source_roots=[str(item) for item in python.get("source_roots", config.python.source_roots)], + test_roots=[str(item) for item in python.get("test_roots", config.python.test_roots)], + module_init_files=[str(item) for item in python.get("module_init_files", config.python.module_init_files)], + entrypoint_patterns=[str(item) for item in python.get("entrypoint_patterns", 
config.python.entrypoint_patterns)], + ) + + tokens = data.get("tokens", {}) + if isinstance(tokens, dict): + config.tokens = TokenConfig( + estimator=str(tokens.get("estimator", config.tokens.estimator)), + chars_per_token=int(tokens.get("chars_per_token", config.tokens.chars_per_token)), + ) + + cache = data.get("cache", {}) + if isinstance(cache, dict): + config.cache = CacheConfig( + enabled=bool(cache.get("enabled", config.cache.enabled)), + dir=str(cache.get("dir", config.cache.dir)), + ) + + return config + + +def apply_overrides( + config: ScriberConfig, + *, + output: str | None = None, + output_format: str | None = None, + only_tree: bool | None = None, + modules: bool | None = None, + support: bool | None = None, + max_files: int | None = None, + max_tokens: int | None = None, + min_score: int | None = None, + support_content: str | None = None, +) -> ScriberConfig: + if output is not None: + config.output = Path(output) + if output_format is not None: + config.format = output_format # type: ignore[assignment] + if only_tree is not None: + config.only_tree = only_tree + if modules is not None: + config.modules = modules + config.modules_config.enabled = modules + if support is not None: + config.support = support + if max_files is not None: + config.max_files = max_files + if max_tokens is not None: + config.max_tokens = max_tokens + if min_score is not None: + config.min_score = min_score + if support_content is not None: + if support_content not in {"full", "auto", "tree_only"}: + raise ValueError("support_content must be one of: full, auto, tree_only") + config.support_content.default = support_content # type: ignore[assignment] + return config + + +@dataclass(slots=True) +class ConfigIssue: + severity: str # "warning" or "error" + message: str + + +def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]: + issues: list[ConfigIssue] = [] + + # 1. 
check if raw_data contains tool.scriber + tool = raw_data.get("tool", {}) if isinstance(raw_data, dict) else {} + if not isinstance(tool, dict): + issues.append(ConfigIssue("error", "[tool] in pyproject.toml must be a table.")) + return issues + + data = tool.get("scriber", {}) if isinstance(tool, dict) else {} + if not data: + issues.append(ConfigIssue("warning", "[tool.scriber] section is missing or empty.")) + return issues + + if not isinstance(data, dict): + issues.append(ConfigIssue("error", "[tool.scriber] must be a table.")) + return issues + + # 2. check output format + if "format" in data and data["format"] not in {"md", "txt"}: + issues.append(ConfigIssue("error", f"Invalid format: '{data['format']}'. Must be 'md' or 'txt'.")) + + # 4. check support_content default + support_files = data.get("support_files", {}) + if isinstance(support_files, dict): + content = support_files.get("content", {}) + if isinstance(content, dict) and "default" in content: + val = content["default"] + if val not in {"full", "auto", "tree_only"}: + issues.append(ConfigIssue("error", f"Invalid support_files.content.default: '{val}'. Must be 'full', 'auto', or 'tree_only'.")) + + # 5. check numeric values >= 0 + for field in ["max_files", "max_tokens", "min_score"]: + if field in data: + try: + val = int(data[field]) + if val < 0: + issues.append(ConfigIssue("error", f"{field} must be a number >= 0. Got: {val}")) + except (ValueError, TypeError): + issues.append(ConfigIssue("error", f"{field} must be an integer. Got: {data[field]}")) + + # 6. 
check patterns are list of strings + def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None: + if "patterns" in parent_dict: + patterns = parent_dict["patterns"] + if not isinstance(patterns, list): + issues.append(ConfigIssue("error", f"{path_name}.patterns must be a list of strings.")) + else: + for item in patterns: + if not isinstance(item, str): + issues.append(ConfigIssue("error", f"Pattern in {path_name}.patterns must be a string. Got: {item}")) + + code_files = data.get("code_files", {}) + if isinstance(code_files, dict): + check_pattern_list(code_files, "code_files") + elif "code_files" in data: + issues.append(ConfigIssue("error", "code_files must be a table.")) + + if isinstance(support_files, dict): + check_pattern_list(support_files, "support_files") + + # Check support_files.content full and tree_only patterns + content = support_files.get("content", {}) + if isinstance(content, dict): + for field in ["full", "tree_only"]: + if field in content: + patterns = content[field] + if not isinstance(patterns, list): + issues.append(ConfigIssue("error", f"support_files.content.{field} must be a list of strings.")) + else: + for item in patterns: + if not isinstance(item, str): + issues.append(ConfigIssue("error", f"Pattern in support_files.content.{field} must be a string. 
Got: {item}")) + elif "support_files" in data: + issues.append(ConfigIssue("error", "support_files must be a table.")) + + hard_ignore = data.get("hard_ignore", {}) + if isinstance(hard_ignore, dict): + check_pattern_list(hard_ignore, "hard_ignore") + elif "hard_ignore" in data: + issues.append(ConfigIssue("error", "hard_ignore must be a table.")) + + return issues + +def validate_config(config: ScriberConfig, raw_data: dict[str, Any], config_path: Path | None = None) -> list[ConfigIssue]: + issues = validate_raw_config(raw_data) + + # Check output path is not a directory + output_path = config.output + if not output_path.is_absolute() and config_path: + output_path = config_path.parent / output_path + + if output_path.suffix == "" and not str(output_path).endswith("-"): + issues.append(ConfigIssue("warning", f"Output path '{output_path}' has no extension. Is it a directory?")) + if output_path.exists() and output_path.is_dir(): + issues.append(ConfigIssue("error", f"Output path '{output_path}' points to an existing directory.")) + + return issues + diff --git a/src/scriber/core/errors.py b/src/scriber/core/errors.py new file mode 100644 index 0000000..08992d1 --- /dev/null +++ b/src/scriber/core/errors.py @@ -0,0 +1,2 @@ +class ScriberError(Exception): + """Base exception for expected Scriber failures.""" diff --git a/src/scriber/core/init_config.py b/src/scriber/core/init_config.py new file mode 100644 index 0000000..f345fe8 --- /dev/null +++ b/src/scriber/core/init_config.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from pathlib import Path +from scriber.core.errors import ScriberError +from scriber.core.config import DEFAULT_CONFIG_BLOCK + + +def replace_existing_tool_scriber_block(content: str, default_block: str) -> str: + lines = content.splitlines() + new_lines = [] + in_scriber = False + + for line in lines: + stripped = line.strip() + if stripped.startswith("[") and stripped.endswith("]"): + header = stripped[1:-1].strip() + if header == 
"tool.scriber" or header.startswith("tool.scriber."): + in_scriber = True + continue + else: + in_scriber = False + + if not in_scriber: + new_lines.append(line) + + cleaned = "\n".join(new_lines).strip() + if cleaned: + return cleaned + "\n\n" + default_block + "\n" + return default_block + "\n" + + +def init_project(config_path: str | None = None, force: bool = False) -> Path: + path = Path(config_path or "pyproject.toml") + if path.is_dir(): + path = path / "pyproject.toml" + if not path.is_absolute(): + path = Path.cwd() / path + + if path.exists(): + content = path.read_text(encoding="utf-8") + has_scriber = "[tool.scriber]" in content + + if has_scriber and not force: + raise ScriberError(f"Scriber config already exists. Use --force to replace it.") + + if has_scriber: + new_content = replace_existing_tool_scriber_block(content, DEFAULT_CONFIG_BLOCK) + else: + if content and not content.endswith("\n"): + content += "\n" + new_content = content + "\n" + DEFAULT_CONFIG_BLOCK + "\n" + + path.write_text(new_content, encoding="utf-8") + else: + path.write_text(DEFAULT_CONFIG_BLOCK + "\n", encoding="utf-8") + + return path diff --git a/src/scriber/core/matchers.py b/src/scriber/core/matchers.py new file mode 100644 index 0000000..703a8c0 --- /dev/null +++ b/src/scriber/core/matchers.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import fnmatch +from pathlib import PurePosixPath + + +def normalize_rel(value: str) -> str: + return value.replace("\\", "/").strip("/") + + +def match_pattern(path: str | PurePosixPath, pattern: str) -> bool: + """Match a project-relative POSIX path against a pragmatic glob pattern. + + This intentionally stays dependency-free. It is not a full gitwildmatch + implementation, but it handles the common patterns used in pyproject config: + `*.py`, `**/*.py`, `dir/**`, `dir/`, exact file paths and basename patterns. 
+ """ + + rel = normalize_rel(str(path)) + pat = pattern.replace("\\", "/").strip() + if not pat: + return False + if pat.startswith("/"): + pat = pat[1:] + pat = pat.strip("/") if pat.endswith("/") else pat + + if rel == pat: + return True + + if pat.endswith("/**"): + prefix = pat[:-3].strip("/") + return rel == prefix or rel.startswith(prefix + "/") + + if fnmatch.fnmatch(rel, pat): + return True + + name = rel.rsplit("/", 1)[-1] + if "/" not in pat and fnmatch.fnmatch(name, pat): + return True + + if pat.startswith("**/"): + short = pat[3:] + if fnmatch.fnmatch(rel, short) or fnmatch.fnmatch(name, short): + return True + + try: + return PurePosixPath(rel).match(pat) + except ValueError: + return False + + +def matches_any(path: str | PurePosixPath, patterns: list[str]) -> bool: + return any(match_pattern(path, pattern) for pattern in patterns) + + +class SimpleGitIgnore: + """Small .gitignore-style matcher used only for dependency-free defaults.""" + + def __init__(self, patterns: list[tuple[bool, str]]) -> None: + self.patterns = patterns + + @classmethod + def from_file(cls, path): + if not path.exists(): + return cls([]) + parsed: list[tuple[bool, str]] = [] + for raw in path.read_text(encoding="utf-8", errors="ignore").splitlines(): + line = raw.strip() + if not line or line.startswith("#"): + continue + negated = line.startswith("!") + if negated: + line = line[1:].strip() + if line: + parsed.append((negated, line)) + return cls(parsed) + + def ignores(self, rel_path: str, is_dir: bool = False) -> bool: + rel = normalize_rel(rel_path) + ignored = False + for negated, pattern in self.patterns: + if self._matches(rel, pattern, is_dir): + ignored = not negated + return ignored + + def _matches(self, rel: str, pattern: str, is_dir: bool) -> bool: + pat = pattern.replace("\\", "/").strip() + if not pat: + return False + if pat.startswith("/"): + pat = pat[1:] + + if pat.endswith("/"): + prefix = pat.strip("/") + return rel == prefix or rel.startswith(prefix + 
"/") + + if "/" not in pat: + parts = rel.split("/") + return any(fnmatch.fnmatch(part, pat) for part in parts) + + return match_pattern(rel, pat) diff --git a/src/scriber/core/models.py b/src/scriber/core/models.py new file mode 100644 index 0000000..c521c49 --- /dev/null +++ b/src/scriber/core/models.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +FileKind = Literal["code", "support", "other"] +ContentPolicy = Literal["full", "auto", "tree_only"] +OutputFormat = Literal["md", "txt"] +PackMode = Literal["focused", "project_snapshot"] + + + +DEFAULT_SCORING: dict[str, int] = { + "seed_file": 100, + "seed_folder_file": 100, + "direct_dependency": 90, + "reverse_dependency": 85, + "related_test": 80, + "same_package": 65, + "parent_entrypoint": 60, + "support_near_seed": 60, + "project_config": 55, + "dependency_file": 52, + "runtime_support": 50, + "documentation": 45, + "name_similarity": 45, + "shared_dependency_bonus": 10, +} + + +@dataclass(slots=True) +class ModuleConfig: + enabled: bool = True + depth: int = 2 + include_direct_dependencies: bool = True + include_reverse_dependencies: bool = True + include_tests: bool = True + include_same_package: bool = True + include_parent_entrypoints: bool = True + include_project_configs: bool = True + content_min_score: int = 50 + tree_min_score: int = 30 + scoring: dict[str, int] = field(default_factory=lambda: dict(DEFAULT_SCORING)) + + +@dataclass(slots=True) +class PythonConfig: + source_roots: list[str] = field(default_factory=lambda: ["src", "app", "."]) + test_roots: list[str] = field(default_factory=lambda: ["tests", "test"]) + module_init_files: list[str] = field(default_factory=lambda: ["__init__.py"]) + entrypoint_patterns: list[str] = field( + default_factory=lambda: ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"] + ) + + +@dataclass(slots=True) +class SupportContentConfig: + 
default: ContentPolicy = "auto" + full: list[str] = field(default_factory=list) + tree_only: list[str] = field(default_factory=list) + auto_max_bytes: int = 80_000 + + +@dataclass(slots=True) +class TokenConfig: + estimator: str = "chars" + chars_per_token: int = 4 + + +@dataclass(slots=True) +class CacheConfig: + enabled: bool = True + dir: str = ".scriber/cache" + + +@dataclass(slots=True) +class ScriberConfig: + version: str = "2" + format: OutputFormat = "md" + output: Path = Path(".scriber/scriber_pack.md") + only_tree: bool = False + modules: bool = True + support: bool = True + use_gitignore: bool = True + max_files: int = 60 + max_tokens: int = 100_000 + min_score: int = 45 + path_style: str = "project-relative" + allow_external_paths: bool = False + code_patterns: list[str] = field(default_factory=list) + support_patterns: list[str] = field(default_factory=list) + hard_ignore_patterns: list[str] = field(default_factory=list) + support_content: SupportContentConfig = field(default_factory=SupportContentConfig) + modules_config: ModuleConfig = field(default_factory=ModuleConfig) + python: PythonConfig = field(default_factory=PythonConfig) + tokens: TokenConfig = field(default_factory=TokenConfig) + cache: CacheConfig = field(default_factory=CacheConfig) + + +@dataclass(frozen=True, slots=True) +class FileNode: + absolute: Path + relative: Path + kind: FileKind + language: str + size_bytes: int + is_binary: bool = False + support_category: str | None = None + content_policy: ContentPolicy = "auto" + _cached_text: str | None = field(default=None, init=False, repr=False, compare=False, hash=False) + + def read_text(self) -> str: + if self._cached_text is not None: + return self._cached_text + from scriber.native import require_native + text = require_native().read_text(str(self.absolute)) + object.__setattr__(self, "_cached_text", text) + return text + + +@dataclass(slots=True) +class SeedPath: + original: Path + absolute: Path + relative: Path + is_dir: bool + 
def open_path(path: Path) -> None:
    """Open *path* with the platform's default handler, on a best-effort basis.

    Does nothing when the path does not exist. Any failure while launching the
    handler is written to stderr as a warning instead of being raised.
    """
    if path.exists():
        target = str(path.resolve())
        platform = sys.platform
        try:
            if platform == "win32":
                os.startfile(target)
            else:
                # macOS ships `open`; every other platform falls back to `xdg-open`.
                launcher = "open" if platform == "darwin" else "xdg-open"
                subprocess.run([launcher, target], check=True)
        except Exception as exc:
            sys.stderr.write(f"Warning: Failed to open pack file: {exc}\n")
+1,67 @@ +from __future__ import annotations + +from pathlib import Path + +from .errors import ScriberError + + +def resolve_config_path(paths: list[str], explicit_config: str | None = None) -> Path: + if explicit_config: + config = Path(explicit_config).expanduser() + if config.is_dir(): + config = config / "pyproject.toml" + if not config.is_absolute(): + config = Path.cwd() / config + config = config.resolve() + if not config.exists(): + raise ScriberError(f"Config not found: {config}") + if config.name != "pyproject.toml": + raise ScriberError("Scriber 2.0 expects --config to point to pyproject.toml") + return config + + starts: list[Path] = [] + for raw in paths or ["."]: + path = Path(raw).expanduser() + if not path.is_absolute(): + path = Path.cwd() / path + # We allow paths that do not exist to report a better error later, but + # root discovery should still start from the nearest existing parent. + probe = path.resolve(strict=False) + if probe.exists() and probe.is_file(): + probe = probe.parent + elif not probe.exists() and probe.suffix: + probe = probe.parent + starts.append(probe) + starts.append(Path.cwd().resolve()) + + seen: set[Path] = set() + for start in starts: + for parent in [start, *start.parents]: + if parent in seen: + continue + seen.add(parent) + candidate = parent / "pyproject.toml" + if candidate.exists(): + return candidate.resolve() + + raise ScriberError("No pyproject.toml found. 
Run `scriber init` or pass `--config /path/to/pyproject.toml`.") + + +def project_root_from_config(config_path: Path) -> Path: + return config_path.resolve().parent + + +def ensure_inside_root(path: Path, root: Path, allow_external: bool) -> None: + if allow_external: + return + try: + path.resolve().relative_to(root.resolve()) + except ValueError as exc: + raise ScriberError(f"Path is outside project root: {path}") from exc + + +def rel_to_root(path: Path, root: Path) -> Path: + try: + return path.resolve().relative_to(root.resolve()) + except ValueError: + return path.resolve() diff --git a/src/scriber/engine/__init__.py b/src/scriber/engine/__init__.py new file mode 100644 index 0000000..eca1ba7 --- /dev/null +++ b/src/scriber/engine/__init__.py @@ -0,0 +1,3 @@ +from .scorer import score_candidates + +__all__ = ["score_candidates"] diff --git a/src/scriber/engine/scorer.py b/src/scriber/engine/scorer.py new file mode 100644 index 0000000..0dbfb24 --- /dev/null +++ b/src/scriber/engine/scorer.py @@ -0,0 +1,302 @@ +from __future__ import annotations + +from pathlib import Path + +from scriber.core.matchers import match_pattern +from scriber.core.models import Candidate, FileNode, ModuleGraph, ScriberConfig, SeedPath + + +def _score(config: ScriberConfig, key: str) -> int: + return int(config.modules_config.scoring.get(key, 0)) + + +def _add_reason(candidate: Candidate, kind: str, label: str, example: Path | None = None) -> None: + candidate.reason_counts[kind] = candidate.reason_counts.get(kind, 0) + 1 + if example is not None: + if kind not in candidate.reason_examples: + candidate.reason_examples[kind] = [] + if example not in candidate.reason_examples[kind]: + candidate.reason_examples[kind].append(example) + if label not in candidate.reasons: + candidate.reasons.append(label) + + +def _build_reason_summary(candidate: Candidate) -> str: + parts = [] + for kind, count in candidate.reason_counts.items(): + examples = candidate.reason_examples.get(kind, []) + if 
kind == "seed_file": + parts.append("seed file") + elif kind == "seed_folder_file": + parts.append("seed folder file") + elif kind == "direct_dependency": + if count > 1: + parts.append(f"imports {count} included files") + elif examples: + parts.append(f"imports {examples[0].name}") + else: + parts.append("imports seed") + elif kind == "reverse_dependency": + if count > 1: + parts.append(f"imported by {count} included files") + elif examples: + parts.append(f"imported by {examples[0].name}") + else: + parts.append("imported by seed") + elif kind == "related_test": + parts.append("related test") + elif kind == "same_package": + parts.append("same package") + elif kind == "parent_entrypoint": + parts.append("parent entrypoint") + elif kind == "name_similarity": + parts.append("name similarity") + elif kind == "support_near_seed": + parts.append("support file") + elif kind == "project_support": + parts.append("project support file") + elif kind == "shared_dependency": + parts.append("shared dependency bonus") + elif kind == "entrypoint": + parts.append("entrypoint file") + elif kind == "test_file": + parts.append("test file") + elif kind == "code_file": + parts.append("code file") + elif kind == "other_file": + parts.append("other file") + else: + parts.append(kind.replace("_", " ")) + return "; ".join(parts) + + +def _add( + candidates: dict[Path, Candidate], + files: dict[Path, FileNode], + rel: Path, + score: int, + kind: str, + label: str, + *, + seed: Path | None = None, +) -> None: + file = files.get(rel) + if file is None: + return + existing = candidates.get(rel) + if existing is None: + existing = Candidate(file=file, score=score) + candidates[rel] = existing + else: + existing.score = max(existing.score, score) + + _add_reason(existing, kind, label, example=seed) + if seed is not None: + existing.seed_sources.add(seed) + + +def _is_test_file(rel: Path, config: ScriberConfig) -> bool: + parts = rel.parts + name = rel.name.lower() + if any(part in 
set(config.python.test_roots) for part in parts): + return True + return name.startswith("test_") or name.endswith("_test.py") or name.endswith(".test.py") + + +def _name_related(a: Path, b: Path) -> bool: + a_stem = a.stem.lower().replace("test_", "").replace("_test", "") + b_stem = b.stem.lower().replace("test_", "").replace("_test", "") + if not a_stem or not b_stem: + return False + return a_stem in b_stem or b_stem in a_stem + + +def _walk_neighbors(edges: dict[Path, set[Path]], start: Path, depth: int) -> dict[Path, int]: + found: dict[Path, int] = {} + frontier = {start} + visited = {start} + for distance in range(1, max(1, depth) + 1): + next_frontier: set[Path] = set() + for item in frontier: + for neighbor in edges.get(item, set()): + if neighbor in visited: + continue + visited.add(neighbor) + found.setdefault(neighbor, distance) + next_frontier.add(neighbor) + frontier = next_frontier + if not frontier: + break + return found + + +def _support_base_score(file: FileNode, config: ScriberConfig) -> int: + category = file.support_category or "support file" + if category == "project config": + return _score(config, "project_config") + if category == "dependency file": + return _score(config, "dependency_file") + if category in {"runtime support", "runtime config", "ci support", "tooling config"}: + return _score(config, "runtime_support") + if category == "documentation": + return _score(config, "documentation") + return _score(config, "documentation") + + +def _is_near_seed(support_file: Path, seed: Path) -> bool: + if support_file.parent == Path("."): + return True + seed_parent = seed.parent + return support_file.parent == seed_parent or support_file.parent in seed_parent.parents or seed_parent in support_file.parent.parents + + +def _matches_entrypoint(rel: Path, config: ScriberConfig) -> bool: + return any(match_pattern(rel.name, pattern) for pattern in config.python.entrypoint_patterns) + + +def score_candidates_project_snapshot( + *, + files: 
dict[Path, FileNode], + graph: ModuleGraph, + config: ScriberConfig, +) -> list[Candidate]: + candidates: dict[Path, Candidate] = {} + + for rel, file in files.items(): + if file.kind == "code": + if _matches_entrypoint(rel, config): + _add(candidates, files, rel, 90, "entrypoint", "entrypoint file") + elif _is_test_file(rel, config): + _add(candidates, files, rel, 60, "test_file", "test file") + else: + _add(candidates, files, rel, 80, "code_file", "code file") + elif file.kind == "support" and config.support: + base = _support_base_score(file, config) + category = file.support_category or "support file" + _add(candidates, files, rel, base, "project_support", category) + elif file.kind == "other": + _add(candidates, files, rel, 40, "other_file", "other file") + + for candidate in candidates.values(): + candidate.reason_summary = _build_reason_summary(candidate) + + filtered = [ + candidate + for rel, candidate in candidates.items() + if candidate.score >= config.min_score or candidate.score >= config.modules_config.tree_min_score + ] + filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix())) + + if config.max_files > 0 and len(filtered) > config.max_files: + pinned = [c for c in filtered if c.file.relative.name in {"pyproject.toml", "README.md"}] + rest = [c for c in filtered if c.file.relative.name not in {"pyproject.toml", "README.md"}] + remaining = max(0, config.max_files - len(pinned)) + filtered = pinned + rest[:remaining] + filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix())) + + return filtered + + +def score_candidates( + *, + files: dict[Path, FileNode], + seeds: list[SeedPath], + graph: ModuleGraph, + config: ScriberConfig, + mode: str = "focused", +) -> list[Candidate]: + if mode == "project_snapshot": + return score_candidates_project_snapshot(files=files, graph=graph, config=config) + + candidates: dict[Path, Candidate] = {} + scoring = 
config.modules_config + seed_files = [file for seed in seeds for file in seed.expanded_files] + seed_set = set(seed_files) + + for seed in seeds: + for rel in seed.expanded_files: + key = "seed_folder_file" if seed.is_dir else "seed_file" + reason = f"file inside seed folder `{seed.relative.as_posix()}`" if seed.is_dir else "seed file" + _add(candidates, files, rel, _score(config, key), "seed_folder_file" if seed.is_dir else "seed_file", reason, seed=rel) + + if config.modules and scoring.enabled: + for seed_rel in seed_files: + if scoring.include_direct_dependencies: + for dep, distance in _walk_neighbors(graph.imports, seed_rel, scoring.depth).items(): + score = max(scoring.tree_min_score, _score(config, "direct_dependency") - ((distance - 1) * 10)) + _add(candidates, files, dep, score, "direct_dependency", f"direct dependency of `{seed_rel.as_posix()}`", seed=seed_rel) + + if scoring.include_reverse_dependencies: + for dep, distance in _walk_neighbors(graph.imported_by, seed_rel, scoring.depth).items(): + score = max(scoring.tree_min_score, _score(config, "reverse_dependency") - ((distance - 1) * 10)) + _add(candidates, files, dep, score, "reverse_dependency", f"imports seed `{seed_rel.as_posix()}`", seed=seed_rel) + + if scoring.include_same_package: + seed_parent = seed_rel.parent + for rel, file in files.items(): + if file.kind == "code" and rel.parent == seed_parent and rel not in seed_set: + _add(candidates, files, rel, _score(config, "same_package"), "same_package", f"same package as `{seed_rel.as_posix()}`", seed=seed_rel) + + if scoring.include_parent_entrypoints: + for rel, file in files.items(): + if file.kind == "code" and _matches_entrypoint(rel, config): + if rel.parent == Path(".") or rel.parent in seed_rel.parents or seed_rel.parent in rel.parents: + _add(candidates, files, rel, _score(config, "parent_entrypoint"), "parent_entrypoint", f"parent/entrypoint near `{seed_rel.as_posix()}`", seed=seed_rel) + + if scoring.include_tests: + for rel, file 
in files.items(): + if file.kind != "code" or not _is_test_file(rel, config): + continue + if _name_related(rel, seed_rel) or seed_rel in graph.imports.get(rel, set()): + _add(candidates, files, rel, _score(config, "related_test"), "related_test", f"related test for `{seed_rel.as_posix()}`", seed=seed_rel) + + for rel, file in files.items(): + if file.kind == "code" and rel not in seed_set and _name_related(rel, seed_rel): + _add(candidates, files, rel, _score(config, "name_similarity"), "name_similarity", f"name similarity with `{seed_rel.as_posix()}`", seed=seed_rel) + + if config.support: + for rel, file in files.items(): + if file.kind != "support": + continue + base = _support_base_score(file, config) + reason = file.support_category or "support file" + if rel.name == "pyproject.toml": + _add(candidates, files, rel, _score(config, "project_config"), "project_support", "project config/root file") + continue + added = False + for seed_rel in seed_files: + if _is_near_seed(rel, seed_rel): + _add(candidates, files, rel, max(base, _score(config, "support_near_seed")), "support_near_seed", f"{reason} near `{seed_rel.as_posix()}`", seed=seed_rel) + added = True + if not added and file.relative.parent == Path(".") and scoring.include_project_configs: + _add(candidates, files, rel, base, "project_support", reason) + else: + if config.support: + pyproject = files.get(Path("pyproject.toml")) + if pyproject: + _add(candidates, files, Path("pyproject.toml"), _score(config, "project_config"), "project_support", "project config/root file") + + for candidate in candidates.values(): + if len(candidate.seed_sources) > 1: + candidate.score = min(100, candidate.score + _score(config, "shared_dependency_bonus")) + _add_reason(candidate, "shared_dependency", "shared by multiple seed paths") + + for candidate in candidates.values(): + candidate.reason_summary = _build_reason_summary(candidate) + + required = set(seed_files) + filtered = [ + candidate + for rel, candidate in 
def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGraph:
    """Build the import graph for the scanned project files.

    For every non-binary code file in a supported language the imports are
    parsed and resolved to project-relative paths, then recorded in both
    directions (`imports` and `imported_by`). Results are stored in a
    `ScriberCache`; when the cache returns an entry for the file's current
    mtime and size, the cached import set is reused instead of re-parsing.

    Args:
        files: Scanned files keyed by project-relative path.
        config: Active configuration (Python source roots, cache settings).

    Returns:
        A graph in which every key of *files* has an entry (possibly empty)
        in both `imports` and `imported_by`.
    """
    graph = ModuleGraph()
    if not files:
        return graph

    absolute_to_file: dict[Path, FileNode] = {}
    dir_to_files: dict[Path, list[FileNode]] = {}
    for node in files.values():
        absolute_to_file[node.absolute] = node
        dir_to_files.setdefault(node.absolute.parent, []).append(node)

    # Recover the project root by trimming the relative suffix off the
    # absolute path of an arbitrary file.
    sample = next(iter(files.values()))
    root = Path(sample.absolute.as_posix()[:len(sample.absolute.as_posix()) - len(sample.relative.as_posix())]).resolve()

    from scriber.cache import ScriberCache
    cache = ScriberCache(config, root)

    module_to_path, path_to_module = build_module_map(files, config.python)

    # "react" must be listed here: the JavaScript branch below handles it, and
    # leaving it out of this guard made that part of the branch unreachable.
    supported_languages = {"python", "javascript", "typescript", "react", "rust", "go", "c", "cpp"}

    for rel, file in files.items():
        if file.kind != "code" or file.is_binary or file.language not in supported_languages:
            continue

        try:
            stat = file.absolute.stat()
            mtime_ns = stat.st_mtime_ns
            size = stat.st_size
        except OSError:
            continue

        cached_data = cache.get_file(rel, mtime_ns, size)
        if cached_data is not None:
            cached_imports = cache.get_imports(rel)
            if cached_imports is not None:
                for target in cached_imports:
                    # Skip cached targets that are not part of this scan.
                    if target in files:
                        graph.imports.setdefault(rel, set()).add(target)
                        graph.imported_by.setdefault(target, set()).add(rel)
                continue

        resolved_set = set()

        if file.language == "python":
            current_module = path_to_module.get(rel)
            if current_module:
                try:
                    source = file.read_text()
                except OSError:
                    continue
                imports = parse_python_imports(file.absolute, source)
                for record in imports:
                    for target in resolve_import_record(
                        record,
                        current_file=file,
                        current_module=current_module,
                        module_to_path=module_to_path,
                    ):
                        if target == rel:
                            continue
                        resolved_set.add(target)

        elif file.language in {"javascript", "typescript", "react"}:
            from scriber.graph.languages.javascript import parse_javascript_imports, resolve_javascript_import
            try:
                source = file.read_text()
            except OSError:
                continue
            imports = parse_javascript_imports(source)
            for spec in imports:
                for target in resolve_javascript_import(spec, file, absolute_to_file):
                    if target == rel:
                        continue
                    resolved_set.add(target)

        elif file.language == "rust":
            from scriber.graph.languages.rust import parse_rust_imports, resolve_rust_import
            try:
                source = file.read_text()
            except OSError:
                continue
            imports = parse_rust_imports(source)
            for kind, spec in imports:
                for target in resolve_rust_import(kind, spec, file, absolute_to_file):
                    if target == rel:
                        continue
                    resolved_set.add(target)

        elif file.language == "go":
            from scriber.graph.languages.go import parse_go_imports, resolve_go_import
            try:
                source = file.read_text()
            except OSError:
                continue
            imports = parse_go_imports(source)
            for spec in imports:
                for target in resolve_go_import(spec, file, dir_to_files, root):
                    if target == rel:
                        continue
                    resolved_set.add(target)

        elif file.language in {"c", "cpp"}:
            from scriber.graph.languages.cpp import parse_cpp_includes, resolve_cpp_include
            try:
                source = file.read_text()
            except OSError:
                continue
            imports = parse_cpp_includes(source)
            for spec in imports:
                for target in resolve_cpp_include(spec, file, absolute_to_file):
                    if target == rel:
                        continue
                    resolved_set.add(target)

        for target in resolved_set:
            graph.imports.setdefault(rel, set()).add(target)
            graph.imported_by.setdefault(target, set()).add(rel)

        cache.set_imports(rel, resolved_set)

    # Guarantee an entry for every scanned file, including ones with no edges.
    for rel in files:
        graph.imports.setdefault(rel, set())
        graph.imported_by.setdefault(rel, set())

    cache.save(set(files.keys()))
    return graph
# Match `#include "header.h"` or `#include <header.h>`. Spaces or tabs are
# allowed between `#` and `include` (`#  include <x.h>`), and the header name
# may not span lines.
INCLUDE_RE = re.compile(r'#[ \t]*include\s*["<]([^">\n]+)[">]')


def parse_cpp_includes(source: str) -> list[str]:
    """Parse all include specifiers from C/C++ source code.

    Returns the header names in order of appearance, without the surrounding
    quotes or angle brackets.
    """
    return [match.group(1) for match in INCLUDE_RE.finditer(source)]
# Single-line import with an optional alias: `import "fmt"`,
# `import str "strings"`, `import _ "net/http/pprof"`, `import . "math"`.
IMPORT_SINGLE_RE = re.compile(r'\bimport\s+(?:[\w.]+\s+)?[\'"]([^\'"]+)[\'"]')
# Parenthesised import group: `import ( ... )`.
IMPORT_BLOCK_RE = re.compile(r'\bimport\s*\(([^)]+)\)')
# First quoted string on one line of an import group.
_QUOTED_PATH_RE = re.compile(r'[\'"]([^\'"]+)[\'"]')


def parse_go_imports(source: str) -> list[str]:
    """Return the import paths found in Go source code.

    Single-line imports are listed first, followed by the entries of
    parenthesised import groups, each in order of appearance. Aliases are
    dropped and `//` comment lines inside a group are skipped.
    """
    imports = [match.group(1) for match in IMPORT_SINGLE_RE.finditer(source)]
    for block in IMPORT_BLOCK_RE.finditer(source):
        for line in block.group(1).splitlines():
            line = line.strip()
            if line.startswith("//"):
                continue
            quoted = _QUOTED_PATH_RE.search(line)
            if quoted:
                imports.append(quoted.group(1))
    return imports
# Three alternatives, one capture group each:
#   1. static `import ... from "x"`, `export ... from "x"`, bare `import "x"`
#   2. CommonJS `require("x")`
#   3. dynamic `import("x")`
IMPORT_RE = re.compile(
    r'(?:import|export)\s+(?:[\w*\s{},]*\s+from\s+)?[\'"]([^\'"]+)[\'"]'
    r'|require\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)'
    r'|\bimport\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)'
)


def parse_javascript_imports(source: str) -> list[str]:
    """Return the module specifiers imported by JavaScript/TypeScript source.

    Covers static `import`/`export ... from`, side-effect imports,
    `require(...)` calls and dynamic `import(...)` calls with a string
    literal argument. Specifiers are returned in order of appearance.
    """
    imports = []
    for match in IMPORT_RE.finditer(source):
        spec = match.group(1) or match.group(2) or match.group(3)
        if spec:
            imports.append(spec)
    return imports
= () + level: int = 0 + + +def parse_python_imports(path: Path, source: str) -> list[ImportRecord]: + try: + tree = ast.parse(source, filename=str(path)) + except SyntaxError: + return [] + + imports: list[ImportRecord] = [] + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + imports.append(ImportRecord(kind="import", module=alias.name, names=(), level=0)) + elif isinstance(node, ast.ImportFrom): + module = node.module or "" + names = tuple(alias.name for alias in node.names if alias.name != "*") + imports.append(ImportRecord(kind="from", module=module, names=names, level=node.level)) + return imports + + +def _is_under(rel: Path, root: str) -> bool: + if root in {"", "."}: + return True + root_path = PurePosixPath(root) + rel_path = PurePosixPath(rel.as_posix()) + try: + rel_path.relative_to(root_path) + return True + except ValueError: + return False + + +def _relative_to_root(rel: Path, root: str) -> Path: + if root in {"", "."}: + return rel + return Path(PurePosixPath(rel.as_posix()).relative_to(PurePosixPath(root))) + + +def module_name_for_file(file: FileNode, python: PythonConfig) -> str | None: + if file.language != "python": + return None + rel = file.relative + roots = sorted(python.source_roots, key=lambda item: 0 if item == "." 
else len(item), reverse=True) + for source_root in roots: + if not _is_under(rel, source_root): + continue + under = _relative_to_root(rel, source_root) + if under.suffix not in {".py", ".pyi"}: + continue + parts = list(under.with_suffix("").parts) + if not parts: + continue + if under.name in python.module_init_files: + parts = parts[:-1] + if not parts: + continue + return ".".join(parts) + return None + + +def build_module_map(files: dict[Path, FileNode], python: PythonConfig) -> tuple[dict[str, Path], dict[Path, str]]: + module_to_path: dict[str, Path] = {} + path_to_module: dict[Path, str] = {} + for rel, file in files.items(): + module = module_name_for_file(file, python) + if not module: + continue + path_to_module[rel] = module + module_to_path.setdefault(module, rel) + return module_to_path, path_to_module + + +def resolve_relative_module(current_module: str, current_is_init: bool, record: ImportRecord) -> str: + if record.level <= 0: + return record.module + if current_is_init: + package = current_module + else: + package = current_module.rsplit(".", 1)[0] if "." 
in current_module else "" + parts = package.split(".") if package else [] + up = max(0, record.level - 1) + if up: + parts = parts[:-up] if up <= len(parts) else [] + if record.module: + parts.extend(record.module.split(".")) + return ".".join(part for part in parts if part) + + +def resolve_import_record( + record: ImportRecord, + *, + current_file: FileNode, + current_module: str, + module_to_path: dict[str, Path], +) -> set[Path]: + candidates: list[str] = [] + current_is_init = current_file.absolute.name == "__init__.py" + + if record.kind == "import": + candidates.append(record.module) + else: + base = resolve_relative_module(current_module, current_is_init, record) if record.level else record.module + for name in record.names: + if base: + candidates.append(f"{base}.{name}") + else: + candidates.append(name) + if base: + candidates.append(base) + + resolved: set[Path] = set() + for candidate in candidates: + if not candidate: + continue + parts = candidate.split(".") + # Try the exact module first, then walk up to a package. This handles + # both `from package import symbol` and `from package import module`. 
+ for end in range(len(parts), 0, -1): + module = ".".join(parts[:end]) + path = module_to_path.get(module) + if path is not None: + resolved.add(path) + break + return resolved diff --git a/src/scriber/graph/languages/rust.py b/src/scriber/graph/languages/rust.py new file mode 100644 index 0000000..14feecc --- /dev/null +++ b/src/scriber/graph/languages/rust.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +import re +from pathlib import Path +from scriber.core.models import FileNode + + +MOD_RE = re.compile(r'\bmod\s+(\w+)\s*;') +USE_RE = re.compile(r'\buse\s+([^;]+)\s*;') + + +def parse_rust_imports(source: str) -> list[tuple[str, str]]: + imports = [] + for match in MOD_RE.finditer(source): + imports.append(("mod", match.group(1))) + for match in USE_RE.finditer(source): + spec = match.group(1).strip() + if "{" in spec: + base, rest = spec.split("{", 1) + base = base.strip() + rest = rest.replace("}", "").strip() + for part in rest.split(","): + part = part.strip() + if part: + imports.append(("use", f"{base}{part}")) + else: + imports.append(("use", spec)) + return imports + + +def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]) -> set[Path]: + resolved = set() + parent = current_file.absolute.parent + + if kind == "mod": + candidates = [ + parent / f"{spec}.rs", + parent / spec / "mod.rs" + ] + for cand in candidates: + node = absolute_to_file.get(cand) + if node: + resolved.add(node.relative) + return resolved + return resolved + + parts = spec.split("::") + if not parts: + return resolved + + if parts[0] == "crate": + crate_root = None + curr = current_file.absolute.parent + while curr != curr.parent: + if (curr / "Cargo.toml").exists() or (curr / "src").exists(): + crate_root = curr / "src" if (curr / "src").exists() else curr + break + curr = curr.parent + if not crate_root: + crate_root = current_file.absolute.parent + + sub_parts = parts[1:] + if sub_parts: + for end in 
range(len(sub_parts), 0, -1): + module_path = crate_root / Path(*sub_parts[:end]) + candidates = [ + module_path.with_name(module_path.name + ".rs"), + module_path / "mod.rs" + ] + for cand in candidates: + node = absolute_to_file.get(cand) + if node: + resolved.add(node.relative) + return resolved + elif parts[0] == "super": + sub_parts = parts[1:] + crate_root = parent.parent + if sub_parts: + for end in range(len(sub_parts), 0, -1): + module_path = crate_root / Path(*sub_parts[:end]) + candidates = [ + module_path.with_name(module_path.name + ".rs"), + module_path / "mod.rs" + ] + for cand in candidates: + node = absolute_to_file.get(cand) + if node: + resolved.add(node.relative) + return resolved + elif parts[0] == "self": + sub_parts = parts[1:] + crate_root = parent + if sub_parts: + for end in range(len(sub_parts), 0, -1): + module_path = crate_root / Path(*sub_parts[:end]) + candidates = [ + module_path.with_name(module_path.name + ".rs"), + module_path / "mod.rs" + ] + for cand in candidates: + node = absolute_to_file.get(cand) + if node: + resolved.add(node.relative) + return resolved + + return resolved diff --git a/src/scriber/native.py b/src/scriber/native.py new file mode 100644 index 0000000..08b415c --- /dev/null +++ b/src/scriber/native.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Any + +_NATIVE_MODULE = None +_IMPORT_ERROR = None + + +def _load_native() -> Any: + global _NATIVE_MODULE, _IMPORT_ERROR + if _NATIVE_MODULE is not None: + return _NATIVE_MODULE + if _IMPORT_ERROR is not None: + raise _IMPORT_ERROR + try: + from scriber import _native + _NATIVE_MODULE = _native + return _NATIVE_MODULE + except ImportError as e: + _IMPORT_ERROR = e + raise e + + +def is_native_available() -> bool: + """Returns True if the native Rust module scriber._native is available.""" + try: + _load_native() + return True + except ImportError: + return False + + +def require_native() -> Any: + """Returns the native Rust module _native 
or raises ImportError with instructions.""" + try: + native = _load_native() + if hasattr(native, "native_api_version") and native.native_api_version() != 1: + raise RuntimeError("Niezgodna wersja natywnego backendu Scriber (oczekiwano wersji 1).") + return native + except ImportError as e: + raise ImportError( + "Natywny moduΕ‚ 'scriber._native' nie jest dostΔ™pny.\n" + "Upewnij siΔ™, ΕΌe projekt zostaΕ‚ poprawnie skompilowany " + "za pomocΔ… 'uv run maturin develop' lub 'uv sync'." + ) from e diff --git a/src/scriber/pack.py b/src/scriber/pack.py new file mode 100644 index 0000000..1b9626e --- /dev/null +++ b/src/scriber/pack.py @@ -0,0 +1,3 @@ +from .packer.pack import build_pack, build_and_write_pack + +__all__ = ["build_pack", "build_and_write_pack"] diff --git a/src/scriber/packer/__init__.py b/src/scriber/packer/__init__.py new file mode 100644 index 0000000..2a38d37 --- /dev/null +++ b/src/scriber/packer/__init__.py @@ -0,0 +1,3 @@ +from .pack import build_pack, build_and_write_pack + +__all__ = ["build_pack", "build_and_write_pack"] diff --git a/src/scriber/packer/pack.py b/src/scriber/packer/pack.py new file mode 100644 index 0000000..2e7011c --- /dev/null +++ b/src/scriber/packer/pack.py @@ -0,0 +1,365 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Callable + +from scriber.core.config import apply_overrides, load_config +from scriber.core.errors import ScriberError +from scriber.core.models import Candidate, FileNode, ScriberPack, SeedPath +from scriber.core.root import ensure_inside_root, project_root_from_config, rel_to_root, resolve_config_path +from scriber.engine.scorer import score_candidates +from scriber.graph.builder import build_graph +from scriber.rendering.renderer import render_pack +from scriber.scanner.files import classify_file, is_text_readable, read_text_lossy +from scriber.tokens import estimate_tokens +from scriber.scanner.scan import scan_project + + +def _resolve_input(path_value: str, root: 
Path, allow_external: bool, path_base: str = "cwd") -> Path: + path = Path(path_value).expanduser() + if not path.is_absolute(): + if path_base == "project": + path = (root / path).resolve(strict=False) + else: + path = (Path.cwd() / path).resolve(strict=False) + else: + path = path.resolve(strict=False) + if not path.exists(): + # Try relative to project root as a convenience for programmatic calls. + alt = (root / path_value).resolve(strict=False) + if alt.exists(): + path = alt + if not path.exists(): + raise ScriberError(f"Input path not found: {path_value}") + ensure_inside_root(path, root, allow_external) + return path.resolve() + + +def _ensure_seed_file(path: Path, root: Path, files: dict[Path, FileNode], config) -> FileNode: + rel = rel_to_root(path, root) + existing = files.get(rel) + if existing is not None: + return existing + node = classify_file(path, root, config) + if node is not None: + files[rel] = node + return node + # Explicit seed overrides hard-ignore classification if it is readable text. 
+ node = FileNode( + absolute=path.resolve(), + relative=rel, + kind="other", + language="text", + size_bytes=path.stat().st_size, + is_binary=not is_text_readable(path), + support_category=None, + content_policy="auto", + ) + files[rel] = node + return node + + +def _expand_seed(path: Path, root: Path, files: dict[Path, FileNode], config) -> SeedPath: + rel = rel_to_root(path, root) + if path.is_file(): + node = _ensure_seed_file(path, root, files, config) + return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=False, expanded_files=[node.relative]) + + expanded: list[Path] = [] + for file_rel, node in files.items(): + try: + file_rel.relative_to(rel) + except ValueError: + continue + if not node.is_binary: + expanded.append(file_rel) + expanded.sort(key=lambda item: item.as_posix()) + if not expanded: + raise ScriberError(f"No readable project files found inside seed folder: {rel.as_posix()}") + return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=True, expanded_files=expanded) + + +def _decide_content(candidate: Candidate, *, config, only_tree: bool, budget_left: int | None, is_seed: bool) -> tuple[bool, str | None, str | None, int]: + if only_tree: + return False, None, "only-tree mode", 0 + file = candidate.file + if file.is_binary: + return False, None, "binary file", 0 + + should_include = False + reason: str | None = None + + if is_seed: + should_include = True + elif file.kind == "code": + should_include = candidate.score >= config.modules_config.content_min_score + if not should_include: + reason = f"score below content_min_score={config.modules_config.content_min_score}" + elif file.kind == "support": + if file.content_policy == "tree_only": + should_include = False + reason = "support content policy: tree_only" + elif file.content_policy == "full": + should_include = True + else: + should_include = file.size_bytes <= config.support_content.auto_max_bytes + if not should_include: + reason = f"support file larger 
than auto_max_bytes={config.support_content.auto_max_bytes}" + else: + should_include = is_seed + if not should_include: + reason = "other file not selected for content" + + if not should_include: + return False, None, reason, 0 + + try: + content = file.read_text() + except OSError as exc: + return False, None, f"read error: {exc}", 0 + + tokens = estimate_tokens(content, config.tokens) + if budget_left is not None and tokens > budget_left and not is_seed: + return False, None, "token budget exceeded", 0 + return True, content, None, tokens + + +def _apply_content_policy(pack: ScriberPack, config) -> None: + if pack.mode == "focused": + explicit_seed_files = {rel for seed in pack.seed_paths for rel in seed.expanded_files} + else: + explicit_seed_files = {rel for seed in pack.seed_paths if not seed.is_dir for rel in seed.expanded_files} + budget_left = config.max_tokens if config.max_tokens > 0 else None + total = 0 + for candidate in pack.candidates: + is_explicit_seed = candidate.file.relative in explicit_seed_files + include, content, omitted, tokens = _decide_content( + candidate, + config=config, + only_tree=pack.only_tree, + budget_left=budget_left, + is_seed=is_explicit_seed, + ) + candidate.include_content = include + candidate.content = content + candidate.omitted_reason = omitted + candidate.token_estimate = tokens + if include: + total += tokens + if budget_left is not None and not is_explicit_seed: + budget_left = max(0, budget_left - tokens) + pack.total_tokens = total + + +def build_pack( + paths: list[str] | None = None, + *, + config_path: str | None = None, + output: str | None = None, + output_format: str | None = None, + only_tree: bool | None = None, + modules: bool | None = None, + support: bool | None = None, + max_files: int | None = None, + max_tokens: int | None = None, + min_score: int | None = None, + support_content: str | None = None, + progress_callback: Callable[[str], None] | None = None, + project: bool | None = None, + path_base: 
str = "project", +) -> ScriberPack: + from time import perf_counter + timings = {} + + t_start = perf_counter() + paths = paths or ["."] + resolved_config = resolve_config_path(paths, config_path) + root = project_root_from_config(resolved_config) + config = load_config(resolved_config) + config = apply_overrides( + config, + output=output, + output_format=output_format, + only_tree=only_tree, + modules=modules, + support=support, + max_files=max_files, + max_tokens=max_tokens, + min_score=min_score, + support_content=support_content, + ) + timings["config_load"] = perf_counter() - t_start + + t_scan = perf_counter() + if progress_callback: progress_callback("Skanowanie plikow...") + from scriber.native import require_native, is_native_available + native_files = None + if is_native_available(): + from scriber.scanner.scan import scan_project_with_native + files, native_files = scan_project_with_native(root, config) + else: + files = scan_project(root, config) + resolved_inputs = [_resolve_input(item, root, config.allow_external_paths, path_base) for item in paths] + seeds = [_expand_seed(path, root, files, config) for path in resolved_inputs] + timings["scan"] = perf_counter() - t_scan + + # Detect mode + is_project_snapshot = False + if project: + is_project_snapshot = True + else: + for path in resolved_inputs: + if path == root: + is_project_snapshot = True + break + mode = "project_snapshot" if is_project_snapshot else "focused" + + # Use native code pack builder if available + if is_native_available(): + native = require_native() + + t_graph = perf_counter() + if progress_callback: progress_callback("Budowanie grafu modulow (natywnie)...") + + assert native_files is not None + + edges = native.build_import_graph( + str(root), + native_files, + config.python.source_roots, + config.python.module_init_files + ) + + from scriber.core.models import ModuleGraph + graph = ModuleGraph() + for edge in edges: + from_path = Path(getattr(edge, "from")) + to_path = 
Path(edge.to) + graph.imports.setdefault(from_path, set()).add(to_path) + graph.imported_by.setdefault(to_path, set()).add(from_path) + + timings["graph_build"] = perf_counter() - t_graph + + t_score = perf_counter() + if progress_callback: progress_callback("Ocenianie zaleznosci (natywnie)...") + scoring = config.modules_config.scoring + opts = native.NativePackOptions( + mode=mode, + max_files=config.max_files, + min_score=config.min_score, + tree_min_score=config.modules_config.tree_min_score, + seed_file_score=scoring.get("seed_file", 100), + seed_folder_file_score=scoring.get("seed_folder_file", 100), + direct_dependency_score=scoring.get("direct_dependency", 90), + reverse_dependency_score=scoring.get("reverse_dependency", 85), + same_package_score=scoring.get("same_package", 65), + parent_entrypoint_score=scoring.get("parent_entrypoint", 60), + related_test_score=scoring.get("related_test", 80), + name_similarity_score=scoring.get("name_similarity", 45), + support_near_seed_score=scoring.get("support_near_seed", 60), + project_config_score=scoring.get("project_config", 55), + dependency_file_score=scoring.get("dependency_file", 52), + runtime_support_score=scoring.get("runtime_support", 50), + documentation_score=scoring.get("documentation", 45), + shared_dependency_bonus=scoring.get("shared_dependency_bonus", 10), + modules_enabled=config.modules, + include_direct_dependencies=config.modules_config.include_direct_dependencies, + include_reverse_dependencies=config.modules_config.include_reverse_dependencies, + include_same_package=config.modules_config.include_same_package, + include_parent_entrypoints=config.modules_config.include_parent_entrypoints, + include_tests=config.modules_config.include_tests, + include_project_configs=config.modules_config.include_project_configs, + depth=config.modules_config.depth, + support_enabled=config.support, + entrypoint_patterns=config.python.entrypoint_patterns, + test_roots=config.python.test_roots, + ) + + 
rs_candidates = native.score_candidates_native( + native_files, + [seed.relative.as_posix() for seed in seeds], + edges, + opts + ) + + candidates = [] + for rc in rs_candidates: + rel = Path(rc.path) + file_node = files.get(rel) + if file_node: + c = Candidate( + file=file_node, + score=rc.score, + reasons=rc.reasons, + reason_summary=rc.reason_summary, + include_content=rc.include_content, + omitted_reason=rc.omitted_reason, + ) + candidates.append(c) + timings["scoring"] = perf_counter() - t_score + else: + t_graph = perf_counter() + if progress_callback: progress_callback("Budowanie grafu modulow...") + graph = build_graph(files, config) + timings["graph_build"] = perf_counter() - t_graph + + t_score = perf_counter() + if progress_callback: progress_callback("Ocenianie zaleznosci...") + candidates = score_candidates(files=files, seeds=seeds, graph=graph, config=config, mode=mode) + timings["scoring"] = perf_counter() - t_score + + pack = ScriberPack( + project_root=root, + config_path=resolved_config, + seed_paths=seeds, + candidates=candidates, + graph=graph, + only_tree=config.only_tree, + output_format=config.format, + mode=mode, + ) + + t_content = perf_counter() + if progress_callback: progress_callback("Aplikowanie regul zawartosci...") + _apply_content_policy(pack, config) + timings["content_read"] = perf_counter() - t_content + + pack.timings = timings + return pack + + +def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path | None, ScriberPack]: + explain_selection = kwargs.pop("explain_selection", False) + pack = build_pack(paths, **kwargs) + config_path = resolve_config_path(paths or ["."], kwargs.get("config_path")) + config = load_config(config_path) + config = apply_overrides( + config, + output=kwargs.get("output"), + output_format=kwargs.get("output_format"), + only_tree=kwargs.get("only_tree"), + modules=kwargs.get("modules"), + support=kwargs.get("support"), + max_files=kwargs.get("max_files"), + 
max_tokens=kwargs.get("max_tokens"), + min_score=kwargs.get("min_score"), + support_content=kwargs.get("support_content"), + ) + progress = kwargs.get("progress_callback") + if progress: progress("Renderowanie Markdown...") + rendered = render_pack(pack, explain_selection=explain_selection) + output = config.output + if str(output) == "-": + import sys + try: + sys.stdout.buffer.write(rendered.encode("utf-8")) + sys.stdout.flush() + except (AttributeError, OSError): + print(rendered) + return None, pack + if not output.is_absolute(): + output = pack.project_root / output + output.parent.mkdir(parents=True, exist_ok=True) + from scriber.native import require_native + require_native().write_text(str(output), rendered) + return output, pack diff --git a/src/scriber/render.py b/src/scriber/render.py new file mode 100644 index 0000000..c9eb710 --- /dev/null +++ b/src/scriber/render.py @@ -0,0 +1,3 @@ +from .rendering.renderer import render_pack, render_markdown, render_text + +__all__ = ["render_pack", "render_markdown", "render_text"] diff --git a/src/scriber/rendering/__init__.py b/src/scriber/rendering/__init__.py new file mode 100644 index 0000000..72031fd --- /dev/null +++ b/src/scriber/rendering/__init__.py @@ -0,0 +1,3 @@ +from .renderer import render_pack, render_markdown, render_text + +__all__ = ["render_pack", "render_markdown", "render_text"] diff --git a/src/scriber/rendering/renderer.py b/src/scriber/rendering/renderer.py new file mode 100644 index 0000000..657cd55 --- /dev/null +++ b/src/scriber/rendering/renderer.py @@ -0,0 +1,278 @@ +from __future__ import annotations + +from pathlib import Path + +from scriber.core.models import Candidate, ScriberPack + + +def _path(path: Path) -> str: + return path.as_posix() + + +def _escape_table(value: str) -> str: + return value.replace("|", "\\|").replace("\n", " ") + + +def _content_flag(candidate: Candidate) -> str: + if candidate.include_content: + return "yes" + if candidate.omitted_reason: + return f"no: 
{candidate.omitted_reason}" + return "no" + + +def _table(candidates: list[Candidate], explain_selection: bool = False) -> str: + if not candidates: + return "_None._\n" + lines = ["| Score | Content | Path | Reason |", "|---:|---|---|---|"] + for candidate in candidates: + reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary + lines.append( + f"| {candidate.score} | {_escape_table(_content_flag(candidate))} | `{_escape_table(_path(candidate.file.relative))}` | {_escape_table(reason)} |" + ) + return "\n".join(lines) + "\n" + + +def render_tree(paths: list[Path]) -> str: + tree: dict[str, dict] = {} + for path in sorted(paths, key=lambda item: item.as_posix()): + node = tree + for part in path.parts: + node = node.setdefault(part, {}) + + def walk(node: dict[str, dict], prefix: str = "") -> list[str]: + lines: list[str] = [] + items = sorted(node.items(), key=lambda item: item[0]) + for index, (name, child) in enumerate(items): + is_last = index == len(items) - 1 + branch = "└── " if is_last else "β”œβ”€β”€ " + lines.append(f"{prefix}{branch}{name}") + extension = " " if is_last else "β”‚ " + lines.extend(walk(child, prefix + extension)) + return lines + + return ".\n" + "\n".join(walk(tree)) if tree else "." 
def render_module_graph(pack: ScriberPack) -> str:
    """Render the import relationships between the files included in *pack*.

    Only edges whose two ends are both in ``pack.included_paths`` are shown.

    * In ``"project_snapshot"`` mode the result is two rankings: the five
      included files that import the most other included files, and the five
      included files imported by the most other included files. Ties are
      broken by POSIX path so the output is deterministic.
    * In every other mode each expanded seed file is listed, followed by a
      small tree of the included files it imports and the included files
      that import it.

    Returns:
        The rendered text without surrounding whitespace, or
        ``"No module graph available."`` when nothing was rendered.
    """
    included = set(pack.included_paths)
    imports_of = pack.graph.imports
    importers_of = pack.graph.imported_by
    lines: list[str] = []

    if pack.mode == "project_snapshot":
        import_counts = []
        imported_by_counts = []
        for path in included:
            outgoing = len(imports_of.get(path, set()) & included)
            if outgoing > 0:
                import_counts.append((path, outgoing))

            incoming = len(importers_of.get(path, set()) & included)
            if incoming > 0:
                imported_by_counts.append((path, incoming))

        def by_count_then_path(entry):
            # Highest count first; path as a stable tie-breaker.
            path, count = entry
            return (-count, path.as_posix())

        import_counts.sort(key=by_count_then_path)
        imported_by_counts.sort(key=by_count_then_path)

        lines.append("Top 5 files with most dependencies:")
        for path, count in import_counts[:5]:
            lines.append(f"- `{path.as_posix()}`: imports {count} included files")

        lines.append("")
        lines.append("Top 5 most imported files:")
        for path, count in imported_by_counts[:5]:
            lines.append(f"- `{path.as_posix()}`: imported by {count} included files")

        return "\n".join(lines).strip() or "No module graph available."

    def by_posix(item):
        return item.as_posix()

    for seed in pack.seed_paths:
        for seed_file in seed.expanded_files:
            lines.append(seed_file.as_posix())
            imports = sorted(imports_of.get(seed_file, set()) & included, key=by_posix)
            imported_by = sorted(importers_of.get(seed_file, set()) & included, key=by_posix)
            edges = [("imports", item) for item in imports]
            edges += [("imported by", item) for item in imported_by]
            if edges:
                last = len(edges) - 1
                for index, (kind, target) in enumerate(edges):
                    # Box-drawing connectors: the final edge closes the tree.
                    branch = "└──" if index == last else "├──"
                    lines.append(f"{branch} {kind} {target.as_posix()}")
            else:
                lines.append("└── no included import edges")
            # Blank separator between seed files; the trailing one is stripped below.
            lines.append("")
    return "\n".join(lines).strip() or "No module graph available."
+ + +def _language_fence(language: str) -> str: + if language in {"python", "rust", "javascript", "typescript", "go", "java", "kotlin", "c", "cpp", "toml", "yaml", "json", "markdown", "dockerfile", "ini"}: + return language + return "text" + + +def _fence_for(content: str) -> str: + longest = 0 + current = 0 + for char in content: + if char == "`": + current += 1 + longest = max(longest, current) + else: + current = 0 + return "`" * max(3, longest + 1) + + +def render_summary(pack: ScriberPack) -> str: + code_count = len([c for c in pack.candidates if c.file.kind == "code"]) + support_count = len([c for c in pack.candidates if c.file.kind == "support"]) + content_count = len([c for c in pack.candidates if c.include_content]) + tree_only_count = len([c for c in pack.candidates if not c.include_content]) + + lines = [ + "## Pack summary", + "", + f"- Mode: `{pack.mode}`", + f"- Seed paths: `{len(pack.seed_paths)}`", + f"- Included code files: `{code_count}`", + f"- Included support files: `{support_count}`", + f"- Content files: `{content_count}`", + f"- Tree-only files: `{tree_only_count}`", + f"- Estimated tokens: `{pack.total_tokens}`", + "" + ] + return "\n".join(lines) + + +def render_summary_text(pack: ScriberPack) -> str: + code_count = len([c for c in pack.candidates if c.file.kind == "code"]) + support_count = len([c for c in pack.candidates if c.file.kind == "support"]) + content_count = len([c for c in pack.candidates if c.include_content]) + tree_only_count = len([c for c in pack.candidates if not c.include_content]) + + lines = [ + "PACK SUMMARY", + "------------", + f"Mode: {pack.mode}", + f"Seed paths: {len(pack.seed_paths)}", + f"Included code files: {code_count}", + f"Included support files: {support_count}", + f"Content files: {content_count}", + f"Tree-only files: {tree_only_count}", + f"Estimated tokens: {pack.total_tokens}", + "" + ] + return "\n".join(lines) + + +def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str: + 
code = [candidate for candidate in pack.candidates if candidate.file.kind == "code"] + support = [candidate for candidate in pack.candidates if candidate.file.kind == "support"] + other = [candidate for candidate in pack.candidates if candidate.file.kind == "other"] + + lines: list[str] = [] + lines.append("# Scriber 2.0 Pack") + lines.append("") + lines.append(render_summary(pack).rstrip()) + lines.append("") + lines.append("## Project") + lines.append("") + lines.append(f"Root: `{pack.project_root}`") + lines.append(f"Config: `{pack.config_path.relative_to(pack.project_root).as_posix()}`") + lines.append(f"Format: `{pack.output_format}`") + lines.append(f"Only tree: `{str(pack.only_tree).lower()}`") + lines.append("") + lines.append("## Input paths") + lines.append("") + for seed in pack.seed_paths: + lines.append(f"- `{_path(seed.relative)}`") + lines.append("") + lines.append("## Included code files") + lines.append("") + lines.append(_table(code, explain_selection).rstrip()) + lines.append("") + lines.append("## Included support files") + lines.append("") + lines.append(_table(support, explain_selection).rstrip()) + if other: + lines.append("") + lines.append("## Included other files") + lines.append("") + lines.append(_table(other, explain_selection).rstrip()) + lines.append("") + lines.append("## Module graph") + lines.append("") + lines.append("```text") + lines.append(render_module_graph(pack)) + lines.append("```") + lines.append("") + lines.append("## Included project tree") + lines.append("") + lines.append("```text") + lines.append(render_tree(pack.included_paths)) + lines.append("```") + + if not pack.only_tree: + lines.append("") + lines.append("## File contents") + for candidate in pack.candidates: + lines.append("") + lines.append(f"### `{_path(candidate.file.relative)}`") + lines.append("") + if not candidate.include_content: + lines.append(f"_Content omitted: {candidate.omitted_reason or 'not selected for content'}._") + continue + content = 
candidate.content or "" + fence = _fence_for(content) + language = _language_fence(candidate.file.language) + lines.append(f"{fence}{language}") + lines.append(content.rstrip("\n")) + lines.append(fence) + + lines.append("") + return "\n".join(lines) + + +def render_text(pack: ScriberPack, explain_selection: bool = False) -> str: + lines: list[str] = [] + lines.append("SCRIBER 2.0 PACK") + lines.append("================") + lines.append("") + lines.append(render_summary_text(pack).rstrip()) + lines.append("") + lines.append(f"PROJECT ROOT: {pack.project_root}") + lines.append(f"CONFIG: {pack.config_path.relative_to(pack.project_root).as_posix()}") + lines.append(f"FORMAT: {pack.output_format}") + lines.append(f"ONLY TREE: {str(pack.only_tree).lower()}") + lines.append("") + lines.append("INPUT PATHS") + for seed in pack.seed_paths: + lines.append(f"- {_path(seed.relative)}") + lines.append("") + lines.append("INCLUDED FILES") + for candidate in pack.candidates: + reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary + lines.append(f"[{candidate.score:03d}] {_path(candidate.file.relative)}") + lines.append(f" kind: {candidate.file.kind}") + lines.append(f" content: {_content_flag(candidate)}") + lines.append(f" reason: {reason}") + lines.append("") + lines.append("MODULE GRAPH") + lines.append(render_module_graph(pack)) + lines.append("") + lines.append("INCLUDED PROJECT TREE") + lines.append(render_tree(pack.included_paths)) + + if not pack.only_tree: + lines.append("") + lines.append("FILE CONTENTS") + lines.append("=============") + for candidate in pack.candidates: + lines.append("") + lines.append(f"--- FILE: {_path(candidate.file.relative)} ---") + if not candidate.include_content: + lines.append(f"[content omitted: {candidate.omitted_reason or 'not selected for content'}]") + continue + lines.append(candidate.content or "") + lines.append("") + return "\n".join(lines) + + +def render_pack(pack: ScriberPack, 
explain_selection: bool = False) -> str: + if pack.output_format == "txt": + return render_text(pack, explain_selection=explain_selection) + return render_markdown(pack, explain_selection=explain_selection) diff --git a/src/scriber/scanner/__init__.py b/src/scriber/scanner/__init__.py new file mode 100644 index 0000000..9070647 --- /dev/null +++ b/src/scriber/scanner/__init__.py @@ -0,0 +1,21 @@ +from .files import ( + classify_file, + is_probably_binary, + is_text_readable, + language_for, + read_text_lossy, + support_category, + support_content_policy, +) +from .scan import scan_project + +__all__ = [ + "classify_file", + "is_probably_binary", + "is_text_readable", + "language_for", + "read_text_lossy", + "support_category", + "support_content_policy", + "scan_project", +] diff --git a/src/scriber/scanner/files.py b/src/scriber/scanner/files.py new file mode 100644 index 0000000..f203dde --- /dev/null +++ b/src/scriber/scanner/files.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +from pathlib import Path + +from scriber.core.matchers import match_pattern, matches_any +from scriber.core.models import ContentPolicy, FileKind, FileNode, ScriberConfig + +LANGUAGE_BY_SUFFIX = { + ".py": "python", + ".pyi": "python", + ".rs": "rust", + ".js": "javascript", + ".jsx": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".go": "go", + ".java": "java", + ".kt": "kotlin", + ".c": "c", + ".cpp": "cpp", + ".cc": "cpp", + ".cxx": "cpp", + ".h": "c", + ".hpp": "cpp", + ".hh": "cpp", + ".hxx": "cpp", + ".toml": "toml", + ".yaml": "yaml", + ".yml": "yaml", + ".json": "json", + ".md": "markdown", + ".rst": "rst", + ".txt": "text", + ".ini": "ini", + ".cfg": "ini", + ".lock": "lock", +} + + +def is_probably_binary(path: Path) -> bool: + from scriber.native import require_native + try: + return require_native().is_probably_binary(str(path)) + except Exception: + return True + + +def language_for(path: Path) -> str: + if path.name.startswith("Dockerfile"): + 
return "dockerfile" + return LANGUAGE_BY_SUFFIX.get(path.suffix.lower(), "text") + + +def support_category(rel: Path) -> str: + s = rel.as_posix() + name = rel.name + if name == "pyproject.toml" or name.endswith(".toml") or name in {"setup.py", "setup.cfg", "tox.ini", "pytest.ini", "mypy.ini", "ruff.toml", ".ruff.toml"}: + return "project config" + if name.endswith(".lock") or name in {"requirements.txt", "poetry.lock", "uv.lock", "Pipfile", "Pipfile.lock", "package.json", "package-lock.json", "pnpm-lock.yaml", "yarn.lock", "Cargo.toml", "Cargo.lock", "go.mod", "go.sum"} or s.startswith("requirements/"): + return "dependency file" + if name.startswith("README") or name in {"CHANGELOG.md", "CONTRIBUTING.md"} or s.startswith("docs/"): + return "documentation" + if name.startswith("Dockerfile") or name.startswith("docker-compose") or name.startswith("compose"): + return "runtime support" + if s.startswith(".github/workflows/") or name == ".gitlab-ci.yml": + return "ci support" + if name.startswith(".env") or s.startswith("config/") or s.startswith("settings/"): + return "runtime config" + if name in {".pre-commit-config.yaml", "tsconfig.json"} or name.startswith("vite.config") or name.startswith("webpack.config"): + return "tooling config" + return "support file" + + +def support_content_policy(rel: Path, config: ScriberConfig) -> ContentPolicy: + s = rel.as_posix() + if matches_any(s, config.support_content.tree_only): + return "tree_only" + if matches_any(s, config.support_content.full): + return "full" + return config.support_content.default + + +def classify_file(path: Path, root: Path, config: ScriberConfig) -> FileNode | None: + rel = path.resolve().relative_to(root.resolve()) + rel_s = rel.as_posix() + + if matches_any(rel_s, config.hard_ignore_patterns): + return None + + binary = is_probably_binary(path) + kind: FileKind = "other" + category = None + policy: ContentPolicy = "auto" + + if matches_any(rel_s, config.code_patterns): + kind = "code" + elif 
def is_text_readable(path: Path) -> bool:
    """Return ``True`` when *path* is non-binary and decodes as UTF-8.

    A file that ``is_probably_binary`` flags is rejected without being read.
    Otherwise the whole file is decoded once; a decoding failure or an OS
    error while reading makes the function return ``False``.
    """
    if is_probably_binary(path):
        return False
    try:
        # The decoded text is discarded; only whether decoding succeeds matters.
        path.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError):
        return False
    return True
def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
    """Scan *root* with ``os.walk`` and return the classified files.

    Directories and files are skipped when they match the hard-ignore
    patterns or, if ``config.use_gitignore`` is set, the rules loaded from
    ``<root>/.gitignore``.  Each remaining file is looked up in a
    ``ScriberCache`` by relative path, mtime (ns) and size; on a miss it is
    classified with ``classify_file`` and, when that yields a node, the
    node's fields are written back to the cache.

    Returns:
        Mapping from each node's ``relative`` path to its ``FileNode``.
        Files whose ``stat()`` fails, or for which ``classify_file`` returns
        ``None``, are absent from the mapping.
    """
    root = root.resolve()
    if config.use_gitignore:
        gitignore = SimpleGitIgnore.from_file(root / ".gitignore")
    else:
        gitignore = SimpleGitIgnore([])

    from scriber.cache import ScriberCache

    cache = ScriberCache(config, root)
    found: dict[Path, FileNode] = {}
    seen: set[Path] = set()

    def skipped(rel_path: Path, is_dir: bool) -> bool:
        # Hard-ignore patterns are consulted first, then .gitignore if enabled.
        if should_hard_ignore(rel_path, config):
            return True
        if not config.use_gitignore:
            return False
        return bool(gitignore.ignores(rel_path.as_posix(), is_dir=is_dir))

    for dirpath, dirnames, filenames in os.walk(root):
        here = Path(dirpath)
        rel_dir = here.relative_to(root)

        # Prune in place so os.walk never descends into ignored directories.
        # For the root itself rel_dir is "." and joining drops that component.
        dirnames[:] = [
            dirname for dirname in dirnames if not skipped(rel_dir / dirname, True)
        ]

        for filename in filenames:
            path = here / filename
            rel = path.relative_to(root)
            if skipped(rel, False):
                continue

            try:
                info = path.stat()
            except OSError:
                # File vanished or is unreadable: leave it out entirely.
                continue
            mtime_ns = info.st_mtime_ns
            size = info.st_size

            # Recorded before classification, so this also contains files
            # that classify_file later rejects.
            seen.add(rel)

            cached = cache.get_file(rel, mtime_ns, size)
            if cached is not None:
                cached_rel = Path(cached["relative"])
                node = FileNode(
                    absolute=(root / cached_rel).resolve(strict=False),
                    relative=cached_rel,
                    kind=cached["kind"],
                    language=cached["language"],
                    size_bytes=cached["size_bytes"],
                    is_binary=cached["is_binary"],
                    support_category=cached["support_category"],
                    content_policy=cached["content_policy"],
                )
                found[node.relative] = node
                continue

            node = classify_file(path, root, config)
            if node is None:
                continue
            found[node.relative] = node
            cache.set_file(rel, mtime_ns, size, {
                "relative": node.relative.as_posix(),
                "kind": node.kind,
                "language": node.language,
                "size_bytes": node.size_bytes,
                "is_binary": node.is_binary,
                "support_category": node.support_category,
                "content_policy": node.content_policy,
            })

    # Hand the cache every path seen this run (presumably so it can drop
    # stale entries -- confirm against ScriberCache.save).
    cache.save(seen)
    return found
def estimate_tokens(text: str, config: TokenConfig | None = None) -> int:
    """Estimate how many tokens *text* occupies.

    The estimate is ``len(text) // chars_per_token`` and is never below 1.
    The divisor is ``config.chars_per_token`` when a config is given, its
    ``estimator`` is ``"chars"`` and the value is positive; in every other
    case the default of 4 characters per token is used.

    Args:
        text: The text to measure.
        config: Optional token settings; ``None`` selects the default.

    Returns:
        The estimated token count, always an ``int`` >= 1.
    """
    chars_per_token = 4
    if config is not None and config.estimator == "chars":
        # A zero divisor would raise ZeroDivisionError and a negative one
        # would collapse every estimate to 1, so non-positive values fall
        # back to the default instead.
        if config.chars_per_token > 0:
            chars_per_token = config.chars_per_token
    # int() keeps the declared return type even if the divisor is a float.
    return max(1, int(len(text) // chars_per_token))
== data + assert new_cache.get_imports(rel_path) == imports diff --git a/tests/test_config_schema.py b/tests/test_config_schema.py new file mode 100644 index 0000000..ab377de --- /dev/null +++ b/tests/test_config_schema.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from pathlib import Path +from scriber.core.config import load_config + + +def test_config_schema_parsing(tmp_path: Path) -> None: + config_file = tmp_path / "pyproject.toml" + config_file.write_text(""" +[tool.scriber] +format = "txt" +max_tokens = 50000 +max_files = 30 +only_tree = true +allow_external_paths = true + +[tool.scriber.modules] +enabled = false +content_min_score = 40 + +[tool.scriber.code_files] +patterns = ["**/*.py", "**/*.rs"] + +[tool.scriber.support_files] +enabled = true +patterns = ["pyproject.toml", "Dockerfile"] + +[tool.scriber.support_files.content] +default = "tree_only" +auto_max_bytes = 20000 +full = ["pyproject.toml"] +tree_only = ["Dockerfile"] + +[tool.scriber.hard_ignore] +patterns = [".git/**", "node_modules/**"] +""".strip(), encoding="utf-8") + + config = load_config(config_file) + + assert config.format == "txt" + assert config.max_tokens == 50000 + assert config.max_files == 30 + assert config.only_tree is True + assert config.allow_external_paths is True + + assert config.modules is False + assert config.modules_config.enabled is False + assert config.modules_config.content_min_score == 40 + + assert config.code_patterns == ["**/*.py", "**/*.rs"] + + assert config.support is True + assert config.support_patterns == ["pyproject.toml", "Dockerfile"] + + assert config.support_content.default == "tree_only" + assert config.support_content.auto_max_bytes == 20000 + assert config.support_content.full == ["pyproject.toml"] + assert config.support_content.tree_only == ["Dockerfile"] + + assert config.hard_ignore_patterns == [".git/**", "node_modules/**"] + + +def test_validate_config_cli(tmp_path: Path, monkeypatch) -> None: + from scriber.cli.main import main 
+ + # 1. Valid config + config_file = tmp_path / "pyproject.toml" + config_file.write_text("[tool.scriber]\nformat = 'md'\n", encoding="utf-8") + monkeypatch.chdir(tmp_path) + + code = main(["--validate-config"]) + assert code == 0 + + # 2. Invalid config format + config_file.write_text("[tool.scriber]\nformat = 'invalid'\n", encoding="utf-8") + code = main(["--validate-config"]) + assert code == 1 diff --git a/tests/test_init_config.py b/tests/test_init_config.py new file mode 100644 index 0000000..59d0e85 --- /dev/null +++ b/tests/test_init_config.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import pytest +from pathlib import Path +from scriber.core.errors import ScriberError +from scriber.core.init_config import init_project, replace_existing_tool_scriber_block + + +def test_replace_existing_block() -> None: + content = """ +[build-system] +requires = ["setuptools>=61"] + +[tool.scriber] +version = "1" + +[tool.scriber.code_files] +patterns = ["*.py"] + +[tool.pytest.ini_options] +addopts = "-q" +""".strip() + + default_block = """ +[tool.scriber] +version = "2" +""".strip() + + expected = """ +[build-system] +requires = ["setuptools>=61"] + +[tool.pytest.ini_options] +addopts = "-q" + +[tool.scriber] +version = "2" +""".strip() + "\n" + + res = replace_existing_tool_scriber_block(content, default_block) + assert res == expected + + +def test_init_project_file_missing(tmp_path: Path) -> None: + config_path = tmp_path / "pyproject.toml" + assert not config_path.exists() + + path = init_project(str(config_path)) + assert path == config_path.resolve() + assert config_path.exists() + assert "[tool.scriber]" in config_path.read_text(encoding="utf-8") + + +def test_init_project_exists_no_scriber(tmp_path: Path) -> None: + config_path = tmp_path / "pyproject.toml" + config_path.write_text("[build-system]\n", encoding="utf-8") + + init_project(str(config_path)) + content = config_path.read_text(encoding="utf-8") + assert "[build-system]" in content + 
assert "[tool.scriber]" in content + + +def test_init_project_exists_with_scriber_raises(tmp_path: Path) -> None: + config_path = tmp_path / "pyproject.toml" + config_path.write_text("[tool.scriber]\nversion = '1'\n", encoding="utf-8") + + with pytest.raises(ScriberError, match="Scriber config already exists"): + init_project(str(config_path)) + + +def test_init_project_exists_with_scriber_force(tmp_path: Path) -> None: + config_path = tmp_path / "pyproject.toml" + config_path.write_text(""" +[build-system] +requires = ["setuptools>=61"] + +[tool.scriber] +version = '1' +""".strip() + "\n", encoding="utf-8") + + init_project(str(config_path), force=True) + content = config_path.read_text(encoding="utf-8") + assert "[build-system]" in content + assert "[tool.scriber]" in content + assert "version = '1'" not in content # must be replaced with the default block + + # Ensure there is exactly one [tool.scriber] header in pyproject.toml + assert content.count("[tool.scriber]") == 1 diff --git a/tests/test_languages.py b/tests/test_languages.py new file mode 100644 index 0000000..5f53f23 --- /dev/null +++ b/tests/test_languages.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from pathlib import Path +from scriber.core.models import FileNode, ScriberConfig +from scriber.graph.builder import build_graph + + +def test_javascript_typescript_graph(tmp_path: Path) -> None: + config = ScriberConfig() + + auth_path = tmp_path / "src/auth.ts" + auth_path.parent.mkdir(parents=True, exist_ok=True) + auth_path.write_text("export class Auth {}", encoding="utf-8") + + main_path = tmp_path / "src/main.ts" + main_path.write_text("import { Auth } from './auth';\nimport 'lodash';", encoding="utf-8") + + files = { + Path("src/auth.ts"): FileNode( + absolute=auth_path.resolve(), + relative=Path("src/auth.ts"), + kind="code", + language="typescript", + size_bytes=auth_path.stat().st_size + ), + Path("src/main.ts"): FileNode( + absolute=main_path.resolve(), + 
def test_rust_graph(tmp_path: Path) -> None:
    """A Rust ``mod``/``use`` of a sibling file produces edges in both directions."""
    (tmp_path / "Cargo.toml").write_text("[package]\nname = 'test'", encoding="utf-8")

    # Relative path -> file body; insertion order is auth.rs, then main.rs.
    sources = {
        Path("src/auth.rs"): "pub struct Auth;",
        Path("src/main.rs"): "mod auth;\nuse crate::auth::Auth;\nuse super::unrelated;",
    }

    files = {}
    for rel, body in sources.items():
        target = tmp_path / rel
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(body, encoding="utf-8")
        files[rel] = FileNode(
            absolute=target.resolve(),
            relative=rel,
            kind="code",
            language="rust",
            size_bytes=target.stat().st_size,
        )

    graph = build_graph(files, ScriberConfig())

    auth_rel = Path("src/auth.rs")
    main_rel = Path("src/main.rs")
    assert auth_rel in graph.imports[main_rel]
    assert main_rel in graph.imported_by[auth_rel]
def test_cpp_graph(tmp_path: Path) -> None:
    """Quoted ``#include`` directives in a C++ file become import-graph edges."""
    config = ScriberConfig()

    header_path = tmp_path / "src/auth.h"
    header_path.parent.mkdir(parents=True, exist_ok=True)
    header_path.write_text("class Auth {};", encoding="utf-8")

    main_path = tmp_path / "src/main.cpp"
    # The middle directive is an angle-bracket include of a header that is
    # not part of the project.
    # NOTE(review): the header name was missing in this copy of the test
    # (the line read "#include " with no operand); <vector> is a stand-in --
    # confirm against the original fixture.
    main_path.write_text(
        '#include "auth.h"\n#include <vector>\n#include "utils/helper.hpp"',
        encoding="utf-8",
    )

    helper_path = tmp_path / "src/utils/helper.hpp"
    helper_path.parent.mkdir(parents=True, exist_ok=True)
    helper_path.write_text("void helper();", encoding="utf-8")

    files = {
        Path("src/auth.h"): FileNode(
            absolute=header_path.resolve(),
            relative=Path("src/auth.h"),
            kind="code",
            language="c",
            size_bytes=header_path.stat().st_size,
        ),
        Path("src/main.cpp"): FileNode(
            absolute=main_path.resolve(),
            relative=Path("src/main.cpp"),
            kind="code",
            language="cpp",
            size_bytes=main_path.stat().st_size,
        ),
        Path("src/utils/helper.hpp"): FileNode(
            absolute=helper_path.resolve(),
            relative=Path("src/utils/helper.hpp"),
            kind="code",
            language="cpp",
            size_bytes=helper_path.stat().st_size,
        ),
    }

    graph = build_graph(files, config)
    assert Path("src/auth.h") in graph.imports[Path("src/main.cpp")]
    assert Path("src/main.cpp") in graph.imported_by[Path("src/auth.h")]
    assert Path("src/utils/helper.hpp") in graph.imports[Path("src/main.cpp")]
    assert Path("src/main.cpp") in graph.imported_by[Path("src/utils/helper.hpp")]
def test_native_read_write(tmp_path: Path) -> None:
    """Text written with the native ``write_text`` is returned unchanged by ``read_text``."""
    native = require_native()
    test_file = tmp_path / "test.txt"
    # Non-ASCII letters make the round trip exercise multi-byte characters.
    # The literal had been mangled into mojibake (UTF-8 bytes displayed
    # through a single-byte code page); these are the intended Polish letters.
    content = "Hello, native Rust world!\nWith some special characters: łóądźś\n"

    native.write_text(str(test_file), content)
    assert test_file.exists()

    read_back = native.read_text(str(test_file))
    assert read_back == content
"README.md", "requirements.txt"], + hard_ignore_patterns=[".git/**", "**/binary.dat"], + ) + + # Create gitignore + (tmp_path / ".gitignore").write_text("*.pyc\n", encoding="utf-8") + + rust_result = scan_rust(tmp_path, config) + python_result = scan_python(tmp_path, config) + + # They should find the exact same relative paths + assert set(rust_result.keys()) == set(python_result.keys()) + + for path, rust_node in rust_result.items(): + py_node = python_result[path] + + # Verify fields match exactly + assert rust_node.relative == py_node.relative + assert rust_node.kind == py_node.kind + assert rust_node.language == py_node.language + assert rust_node.size_bytes == py_node.size_bytes + assert rust_node.is_binary == py_node.is_binary + assert rust_node.support_category == py_node.support_category + assert rust_node.content_policy == py_node.content_policy + + +def test_native_no_support(tmp_path: Path) -> None: + (tmp_path / "src").mkdir() + (tmp_path / "src" / "main.py").write_text("print('hello')", encoding="utf-8") + (tmp_path / "README.md").write_text("# Test Project", encoding="utf-8") + (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8") + + config = ScriberConfig( + support=False, + code_patterns=["**/*.py"], + support_patterns=["pyproject.toml", "README.md"], + ) + + rust_result = scan_rust(tmp_path, config) + # Check that README.md and pyproject.toml are NOT in the result (they are support files) + for path, node in rust_result.items(): + assert node.kind != "support" + assert Path("README.md") not in rust_result + assert Path("pyproject.toml") not in rust_result + + +def test_native_write_creates_parent_dirs(tmp_path: Path) -> None: + native = require_native() + path = tmp_path / "a" / "b" / "out.txt" + + native.write_text(str(path), "hello") + + assert path.read_text(encoding="utf-8") == "hello" + + +def write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + 
def make_mixed_project(root: Path) -> None:
    """Create a small multi-language fixture project under *root*.

    The tree holds Python, Rust and TypeScript sources, TOML/lock/README
    support files, a ``node_modules`` entry, a ``.gitignore`` that excludes
    ``*.tmp`` together with a matching ``ignored.tmp``, and one binary file.
    """
    # (relative path, text content), written in this order.
    text_files = (
        ("pyproject.toml", "[tool.scriber]\nversion='2'\n"),
        ("Cargo.toml", "[package]\nname='x'\n"),
        ("Cargo.lock", "# lock\n"),
        ("README.md", "# readme\n"),
        ("src/main.py", "from .auth import Auth\n"),
        ("src/auth.py", "class Auth: pass\n"),
        ("src/main.rs", "mod auth;\n"),
        ("src/auth.rs", "pub struct Auth;\n"),
        ("frontend/main.ts", "import './auth'\n"),
        ("frontend/auth.ts", "export const x = 1\n"),
        ("node_modules/pkg/index.js", "ignored\n"),
        (".gitignore", "*.tmp\n"),
        ("ignored.tmp", "ignored\n"),
    )
    for rel, body in text_files:
        write(root / rel, body)

    # Written as raw bytes; the leading NUL byte makes it a binary fixture.
    (root / "binary.bin").write_bytes(b"\x00\x01")
from scriber.graph.builder import build_graph as build_python_graph + py_graph = build_python_graph(python_files, config) + + native = require_native() + native_files = native.scan_project( + str(tmp_path), + config.use_gitignore, + config.hard_ignore_patterns, + config.code_patterns, + config.support_patterns, + config.support_content.full, + config.support_content.tree_only, + config.support_content.default, + config.support + ) + edges = native.build_import_graph( + str(tmp_path), + native_files, + config.python.source_roots, + config.python.module_init_files + ) + + rs_imports = {} + for edge in edges: + rs_imports.setdefault(Path(getattr(edge, "from")), set()).add(Path(edge.to)) + + for path, targets in py_graph.imports.items(): + file = python_files[path] + if file.language in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}: + rs_targets = rs_imports.get(path, set()) + assert rs_targets == targets + + +def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None: + make_mixed_project(tmp_path) + config = make_config() + + python_files = scan_python(tmp_path, config) + from scriber.graph.builder import build_graph as build_python_graph + py_graph = build_python_graph(python_files, config) + + from scriber.engine.scorer import score_candidates as score_python + from scriber.core.models import SeedPath + seed = SeedPath( + original=Path("src/main.py"), + absolute=(tmp_path / "src/main.py").resolve(), + relative=Path("src/main.py"), + is_dir=False, + expanded_files=[Path("src/main.py")] + ) + py_candidates = score_python(files=python_files, seeds=[seed], graph=py_graph, config=config, mode="focused") + + native = require_native() + native_files = native.scan_project( + str(tmp_path), + config.use_gitignore, + config.hard_ignore_patterns, + config.code_patterns, + config.support_patterns, + config.support_content.full, + config.support_content.tree_only, + config.support_content.default, + config.support + ) + edges = 
native.build_import_graph( + str(tmp_path), + native_files, + config.python.source_roots, + config.python.module_init_files + ) + + scoring = config.modules_config.scoring + opts = native.NativePackOptions( + mode="focused", + max_files=config.max_files, + min_score=config.min_score, + tree_min_score=config.modules_config.tree_min_score, + seed_file_score=scoring.get("seed_file", 100), + seed_folder_file_score=scoring.get("seed_folder_file", 100), + direct_dependency_score=scoring.get("direct_dependency", 90), + reverse_dependency_score=scoring.get("reverse_dependency", 85), + same_package_score=scoring.get("same_package", 65), + parent_entrypoint_score=scoring.get("parent_entrypoint", 60), + related_test_score=scoring.get("related_test", 80), + name_similarity_score=scoring.get("name_similarity", 45), + support_near_seed_score=scoring.get("support_near_seed", 60), + project_config_score=scoring.get("project_config", 55), + dependency_file_score=scoring.get("dependency_file", 52), + runtime_support_score=scoring.get("runtime_support", 50), + documentation_score=scoring.get("documentation", 45), + shared_dependency_bonus=scoring.get("shared_dependency_bonus", 10), + modules_enabled=config.modules, + include_direct_dependencies=config.modules_config.include_direct_dependencies, + include_reverse_dependencies=config.modules_config.include_reverse_dependencies, + include_same_package=config.modules_config.include_same_package, + include_parent_entrypoints=config.modules_config.include_parent_entrypoints, + include_tests=config.modules_config.include_tests, + include_project_configs=config.modules_config.include_project_configs, + depth=config.modules_config.depth, + support_enabled=config.support, + entrypoint_patterns=config.python.entrypoint_patterns, + test_roots=config.python.test_roots, + ) + + rs_candidates = native.score_candidates_native( + native_files, + ["src/main.py"], + edges, + opts + ) + + py_map = {c.file.relative.as_posix(): c for c in py_candidates} + 
def test_native_render_tree_matches_python() -> None:
    """The native tree renderer and the Python one agree, ignoring outer whitespace."""
    native = require_native()
    paths = [
        "src/main.py",
        "src/auth.py",
        "tests/test_auth.py",
        "pyproject.toml",
        "README.md",
    ]

    from scriber.rendering.renderer import render_tree as render_python_tree

    # The Python renderer is given Path objects; the native one gets the raw strings.
    expected = render_python_tree(list(map(Path, paths))).strip()
    actual = native.render_tree(paths).strip()

    assert actual == expected
def test_native_import_complex_python(tmp_path: Path) -> None:
    """The native import graph resolves aliased, multi-name and parenthesised imports.

    ``src/main.py`` imports from ``.a``, ``.b`` and ``.c`` (plus stdlib
    modules); ``src/d.py`` exists but is never imported, so it must not
    appear among the edges leaving ``main.py``.
    """
    src = tmp_path / "src"
    src.mkdir()
    for stem in ("a", "b", "c", "d"):
        (src / f"{stem}.py").write_text(f"class {stem.upper()}: pass", encoding="utf-8")

    import_test_content = """
import os, sys
import math as m, json
from .a import A as AliasA
from .b import (
    B,  # some comment here
    C as AliasC
)
from .c import D
"""
    (src / "main.py").write_text(import_test_content, encoding="utf-8")
    (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")

    config = ScriberConfig(
        use_gitignore=False,
        code_patterns=["**/*.py"],
        support_patterns=["pyproject.toml"],
    )

    native = require_native()
    native_files = native.scan_project(
        str(tmp_path),
        config.use_gitignore,
        config.hard_ignore_patterns,
        config.code_patterns,
        config.support_patterns,
        config.support_content.full,
        config.support_content.tree_only,
        config.support_content.default,
        config.support,
    )
    edges = native.build_import_graph(
        str(tmp_path),
        native_files,
        config.python.source_roots,
        config.python.module_init_files,
    )

    # Group edge targets by source file.  "from" is a Python keyword, so the
    # attribute has to be read with getattr().
    imports: dict[Path, set[Path]] = {}
    for edge in edges:
        imports.setdefault(Path(getattr(edge, "from")), set()).add(Path(edge.to))

    main_path = Path("src/main.py")
    assert main_path in imports

    expected_imports = {
        Path("src/a.py"),
        Path("src/b.py"),
        Path("src/c.py"),
    }
    assert imports[main_path] == expected_imports
b/tests/test_processing_modes.py deleted file mode 100644 index a795d26..0000000 --- a/tests/test_processing_modes.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Tests for single-process and multi-process execution modes in Scriber. -""" -from pathlib import Path -from unittest.mock import MagicMock, patch - -from src.scriber.core import Scriber - - -def test_single_process_mode_avoids_process_pool(tmp_path: Path): - """ - Verifies that ProcessPoolExecutor is not used when single_process is True. - """ - (tmp_path / "test.txt").write_text("hello world") - - with patch('src.scriber.core.ProcessPoolExecutor') as mock_executor: - config = {"single_process": True, "exclude": []} - scriber = Scriber(root_path=tmp_path, config=config) - scriber.map_project() - - mock_executor.assert_not_called() - stats = scriber.get_stats() - assert stats['total_files'] == 1 - assert stats['total_tokens'] > 0 - - -def test_multi_process_mode_uses_process_pool(tmp_path: Path): - """ - Verifies that ProcessPoolExecutor is used by default (single_process is False). - - This test uses a more advanced mock to simulate the return of futures - and ensure the statistics are correctly aggregated from the mocked results. 
- """ - (tmp_path / "test.txt").write_text("hello world") - expected_stats = {"size": 11, "tokens": 2, "lang": "text"} - - with patch('src.scriber.core.ProcessPoolExecutor') as MockProcessPoolExecutor, \ - patch('src.scriber.core.as_completed') as mock_as_completed: - mock_future = MagicMock() - mock_future.result.return_value = expected_stats - mock_as_completed.return_value = [mock_future] - - mock_executor_instance = MockProcessPoolExecutor.return_value.__enter__.return_value - - config = {"single_process": False, "exclude": []} - scriber = Scriber(root_path=tmp_path, config=config) - scriber.map_project() - - MockProcessPoolExecutor.assert_called_once() - assert mock_executor_instance.submit.called - mock_as_completed.assert_called_once() - - stats = scriber.get_stats() - assert stats['total_files'] == 1 - assert stats['total_size_bytes'] == expected_stats['size'] - assert stats['total_tokens'] == expected_stats['tokens'] \ No newline at end of file diff --git a/tests/test_scriber.py b/tests/test_scriber.py new file mode 100644 index 0000000..8ddf870 --- /dev/null +++ b/tests/test_scriber.py @@ -0,0 +1,222 @@ +from __future__ import annotations + +from pathlib import Path + +from scriber.pack import build_pack +from scriber.render import render_markdown + + +def write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def make_project(tmp_path: Path) -> Path: + write( + tmp_path / "pyproject.toml", + """ +[tool.scriber] +version = "2" +format = "md" +output = ".scriber/out.md" +use_gitignore = false +max_files = 50 +max_tokens = 100000 +min_score = 30 + +[tool.scriber.code_files] +patterns = ["**/*.py"] + +[tool.scriber.support_files] +enabled = true +patterns = ["pyproject.toml", "README.md", "requirements.txt", "poetry.lock", "Dockerfile"] + +[tool.scriber.support_files.content] +default = "auto" +full = ["pyproject.toml", "README.md", "requirements.txt", "Dockerfile"] 
+tree_only = ["poetry.lock"] + +[tool.scriber.modules] +enabled = true +depth = 2 +include_direct_dependencies = true +include_reverse_dependencies = true +include_tests = true +include_same_package = true +include_parent_entrypoints = true +include_project_configs = true +content_min_score = 50 +tree_min_score = 30 + +[tool.scriber.python] +source_roots = ["src", "."] +test_roots = ["tests"] +entrypoint_patterns = ["main.py", "routes.py"] + +[tool.scriber.hard_ignore] +patterns = [".git/**"] +""".strip() + + "\n", + ) + write(tmp_path / "README.md", "# Example\n") + write(tmp_path / "requirements.txt", "fastapi\n") + write(tmp_path / "poetry.lock", "very large lock in real life\n") + write(tmp_path / "Dockerfile", "FROM python:3.12\n") + write(tmp_path / "src/app/__init__.py", "") + write(tmp_path / "src/app/auth.py", "from .session import Session\nfrom .config import SETTINGS\n\nclass Auth: pass\n") + write(tmp_path / "src/app/session.py", "class Session: pass\n") + write(tmp_path / "src/app/config.py", "SETTINGS = {}\n") + write(tmp_path / "src/app/main.py", "from app.auth import Auth\n") + write(tmp_path / "src/api/routes.py", "from app.auth import Auth\n") + write(tmp_path / "tests/test_auth.py", "from app.auth import Auth\n\ndef test_auth():\n assert Auth\n") + write(tmp_path / "src/app/unrelated.py", "VALUE = 1\n") + return tmp_path + + +def test_build_pack_includes_seed_dependencies_reverse_tests_and_support(tmp_path: Path, monkeypatch) -> None: + project = make_project(tmp_path) + monkeypatch.chdir(project) + + pack = build_pack(["src/app/auth.py"], config_path="pyproject.toml") + paths = [path.as_posix() for path in pack.included_paths] + + assert "src/app/auth.py" in paths + assert "src/app/session.py" in paths + assert "src/app/config.py" in paths + assert "src/api/routes.py" in paths + assert "tests/test_auth.py" in paths + assert "pyproject.toml" in paths + assert "README.md" in paths + assert "requirements.txt" in paths + assert "poetry.lock" in 
paths + + by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates} + assert by_path["src/app/auth.py"].score == 100 + assert by_path["src/app/session.py"].score >= 80 + assert by_path["src/api/routes.py"].score >= 80 + assert by_path["tests/test_auth.py"].score >= 80 + assert by_path["poetry.lock"].include_content is False + assert "tree_only" in (by_path["poetry.lock"].omitted_reason or "") + + +def test_only_tree_omits_contents(tmp_path: Path, monkeypatch) -> None: + project = make_project(tmp_path) + monkeypatch.chdir(project) + + pack = build_pack(["src/app/auth.py"], config_path="pyproject.toml", only_tree=True) + assert pack.only_tree is True + assert all(candidate.include_content is False for candidate in pack.candidates) + + rendered = render_markdown(pack) + assert "## Pack summary" in rendered + assert "Mode: `focused`" in rendered + assert "## File contents" not in rendered + assert "## Module graph" in rendered + + +def test_multiple_paths_promote_shared_dependency(tmp_path: Path, monkeypatch) -> None: + project = make_project(tmp_path) + write(tmp_path / "src/app/billing.py", "from .config import SETTINGS\n") + monkeypatch.chdir(project) + + pack = build_pack(["src/app/auth.py", "src/app/billing.py"], config_path="pyproject.toml") + by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates} + assert "src/app/config.py" in by_path + assert by_path["src/app/config.py"].score == 100 + assert any("shared by multiple seed paths" in reason for reason in by_path["src/app/config.py"].reasons) + + +def test_no_modules_keeps_seed_and_pyproject(tmp_path: Path, monkeypatch) -> None: + project = make_project(tmp_path) + monkeypatch.chdir(project) + + pack = build_pack(["src/app/auth.py"], config_path="pyproject.toml", modules=False) + paths = [path.as_posix() for path in pack.included_paths] + assert "src/app/auth.py" in paths + assert "pyproject.toml" in paths + assert "src/app/session.py" not in 
paths + + +def test_folder_seed_expands_files(tmp_path: Path, monkeypatch) -> None: + project = make_project(tmp_path) + monkeypatch.chdir(project) + + pack = build_pack(["src/app"], config_path="pyproject.toml", modules=False) + paths = [path.as_posix() for path in pack.included_paths] + assert "src/app/auth.py" in paths + assert "src/app/session.py" in paths + assert "src/app/config.py" in paths + + +def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None: + project = make_project(tmp_path) + monkeypatch.chdir(project) + + pack = build_pack(["."], config_path="pyproject.toml") + assert pack.mode == "project_snapshot" + + by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates} + + # Entrypoint (e.g., src/app/main.py matches main.py pattern) + assert by_path["src/app/main.py"].score == 90 + assert by_path["src/app/main.py"].reason_summary == "entrypoint file" + + # Test file (tests/test_auth.py) + assert by_path["tests/test_auth.py"].score == 60 + assert by_path["tests/test_auth.py"].reason_summary == "test file" + + # Regular code file + assert by_path["src/app/auth.py"].score == 80 + assert by_path["src/app/auth.py"].reason_summary == "code file" + + # Support files + assert by_path["README.md"].score == 45 + assert by_path["README.md"].reason_summary == "project support file" + + # Ensure no near-seed duplication in project snapshot mode + assert "near" not in by_path["README.md"].reason_summary + assert "shared by multiple seed paths" not in by_path["README.md"].reasons + + +def test_dry_run_and_open_cli(tmp_path: Path, monkeypatch) -> None: + from scriber.cli.main import main + project = make_project(tmp_path) + monkeypatch.chdir(project) + + # Test dry run + code = main(["src/app/auth.py", "--dry-run"]) + assert code == 0 + + # Ensure no output file was created under .scriber/out.md if it didn't exist + assert not (tmp_path / ".scriber/out.md").exists() + + # Test open flag by mocking open_path to verify it gets 
called + called_with = None + + def mock_open_path(path: Path) -> None: + nonlocal called_with + called_with = path + + monkeypatch.setattr("scriber.core.open_file.open_path", mock_open_path) + code = main(["src/app/auth.py", "--open"]) + assert code == 0 + assert called_with == (tmp_path / ".scriber/out.md").resolve() + + +def test_no_support_excludes_support_files_project_snapshot(tmp_path: Path) -> None: + project = make_project(tmp_path) + + pack = build_pack(["."], config_path=str(project / "pyproject.toml"), support=False) + + assert all(c.file.kind != "support" for c in pack.candidates) + + +def test_no_support_excludes_support_files_folder_seed(tmp_path: Path) -> None: + project = make_project(tmp_path) + + pack = build_pack(["."], config_path=str(project / "pyproject.toml"), support=False) + + paths = {c.file.relative.as_posix() for c in pack.candidates} + assert "README.md" not in paths + assert "pyproject.toml" not in paths + diff --git a/tests/test_suite.py b/tests/test_suite.py deleted file mode 100644 index 197293c..0000000 --- a/tests/test_suite.py +++ /dev/null @@ -1,457 +0,0 @@ -""" -Tests for the main Scriber application, covering both core logic and the CLI. 
-""" -import io -import json -from collections import Counter -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest -import tiktoken - -try: - import tomllib -except ImportError: - import tomli as tomllib - -from src.scriber.cli import format_bytes -from src.scriber.cli import main as cli_main -from src.scriber.config import ScriberConfig -from src.scriber.core import Scriber - - -def test_direct_import(): - """Tests that the Scriber class can be imported directly from the package.""" - try: - from src.scriber import Scriber - except ImportError: - pytest.fail("Could not import Scriber from src.scriber") - assert callable(Scriber) - - -# --- Test Core Scriber Functionality --- - -class TestCore: - """Groups tests for the Scriber core logic found in `src.scriber.core`.""" - - def test_default_exclusion(self, tmp_path: Path): - """Tests that default patterns like .git/ and __pycache__/ are excluded.""" - (tmp_path / ".git").mkdir() - (tmp_path / ".git" / "config").touch() - (tmp_path / "main.py").touch() - (tmp_path / "__pycache__").mkdir() - (tmp_path / "__pycache__" / "cache.pyc").touch() - (tmp_path / "build").mkdir() - (tmp_path / "build" / "app").touch() - - scriber = Scriber(root_path=tmp_path) - scriber.map_project() - - paths = {p.relative_to(tmp_path).as_posix() for p in scriber.mapped_files} - assert "main.py" in paths - assert not any(p.startswith('.git/') for p in paths) - assert not any(p.startswith('__pycache__/') for p in paths) - assert not any(p.startswith('build/') for p in paths) - - def test_directory_only_exclusion(self, tmp_path: Path): - """Tests that a pattern with a trailing slash only excludes the directory.""" - (tmp_path / "my_app").mkdir() - (tmp_path / "my_app" / "code.py").touch() - (tmp_path / "my_app_file").touch() - - config = ScriberConfig(exclude=["my_app/"], include=[]) - - scriber = Scriber(root_path=tmp_path, config=config) - scriber.map_project() - paths = {p.name for p in scriber.mapped_files} 
- - assert "my_app_file" in paths - assert "code.py" not in paths - assert len(paths) == 1 - - def test_root_anchored_exclusion(self, tmp_path: Path): - """Tests that a pattern with a leading slash only excludes at the root.""" - (tmp_path / "src").mkdir() - (tmp_path / "src" / "config.yml").touch() - (tmp_path / "config.yml").touch() - config = ScriberConfig(exclude=["/config.yml"], include=[]) - - scriber = Scriber(root_path=tmp_path, config=config) - scriber.map_project() - paths = {p.relative_to(tmp_path).as_posix() for p in scriber.mapped_files} - - assert "src/config.yml" in paths - assert "config.yml" not in paths - - def test_unanchored_exclusion(self, tmp_path: Path): - """Tests that a pattern without slashes excludes files/dirs anywhere.""" - (tmp_path / "src").mkdir() - (tmp_path / "src" / "temp.log").touch() - (tmp_path / "temp.log").touch() - config = ScriberConfig(exclude=["temp.log"], include=[]) - - scriber = Scriber(root_path=tmp_path, config=config) - scriber.map_project() - - assert not scriber.mapped_files - - def test_gitignore_handling(self, tmp_path: Path): - """Ensures .gitignore rules are correctly applied when enabled.""" - (tmp_path / "main.py").touch() - (tmp_path / "ignored.log").touch() - (tmp_path / ".gitignore").write_text("*.log") - - scriber = Scriber(root_path=tmp_path) - scriber.map_project() - - paths = {p.name for p in scriber.mapped_files} - assert "main.py" in paths - assert "ignored.log" not in paths - - def test_disable_gitignore(self, tmp_path: Path): - """Ensures .gitignore is ignored when `use_gitignore` is false in the config.""" - (tmp_path / "main.py").touch() - (tmp_path / "not_ignored.log").touch() - (tmp_path / ".gitignore").write_text("*.log") - config = {"use_gitignore": False, "exclude": []} - (tmp_path / ".scriber.json").write_text(json.dumps(config)) - - scriber = Scriber(root_path=tmp_path) - scriber.map_project() - - paths = {p.name for p in scriber.mapped_files} - assert "main.py" in paths - assert 
"not_ignored.log" in paths - - def test_binary_file_skipping(self, tmp_path: Path): - """Tests that binary files are detected and correctly skipped.""" - (tmp_path / "app.exe").write_bytes(b"\x4d\x5a\x90\x00\x03\x00\x00\x00") - - config = ScriberConfig(include=["app.exe"], exclude=[]) - scriber = Scriber(root_path=tmp_path, config=config) - scriber.map_project() - - assert len(scriber.mapped_files) == 0 - assert scriber.get_stats()['skipped_binary'] == 1 - - def test_include_patterns(self, tmp_path: Path): - """Tests that 'include' patterns correctly filter files when provided.""" - (tmp_path / "main.py").touch() - (tmp_path / "script.js").touch() - (tmp_path / "style.css").touch() - (tmp_path / ".scriber.json").write_text(json.dumps({"include": ["*.py", "*.js"]})) - - scriber = Scriber(root_path=tmp_path) - scriber.map_project() - - paths = {p.name for p in scriber.mapped_files} - assert paths == {"main.py", "script.js"} - - def test_exclude_map_dictionary(self, tmp_path: Path): - """Tests that the exclude_map dictionary filter works as intended.""" - (tmp_path / "app.py").touch() - (tmp_path / "utils_test.py").touch() - (tmp_path / "script.js").touch() - (tmp_path / "archive.log").touch() - (tmp_path / "README.md").touch() - - config = ScriberConfig( - exclude_map={ - "python": ["*_test.py"], - "global": ["*.log"] - }, - exclude=[], - include=[] - ) - scriber = Scriber(root_path=tmp_path, config=config) - files = scriber.get_mapped_files() - mapped_names = {p.name for p in files} - - assert "app.py" in mapped_names - assert "script.js" in mapped_names - assert "README.md" in mapped_names - assert "utils_test.py" not in mapped_names - assert "archive.log" not in mapped_names - assert len(mapped_names) == 3 - - def test_hidden_files_are_in_tree_but_content_is_skipped(self, tmp_path: Path): - """Tests that hidden files appear in the tree but their content is not in the output.""" - (tmp_path / "main.py").write_text("print('hello')") - lock_content = 
"some-lock-file-content" - (tmp_path / "poetry.lock").write_text(lock_content) - config = {"hidden": ["poetry.lock"], "exclude": []} - (tmp_path / ".scriber.json").write_text(json.dumps(config)) - - scriber = Scriber(root_path=tmp_path) - scriber.map_project() - - output_buffer = io.StringIO() - scriber._write_output(output_buffer, tree_only=False, progress=None, task_id=None) - output = output_buffer.getvalue() - - assert "poetry.lock" in output - assert "[Content hidden based on configuration]" in output - assert lock_content not in output - assert "print('hello')" in output - - def test_hidden_files_are_excluded_from_token_count(self, tmp_path: Path): - """Tests that hidden files contribute to size but not token count.""" - main_py_content = "def main(): pass" - (tmp_path / "main.py").write_text(main_py_content) - (tmp_path / "poetry.lock").write_text("some-lock-file-content") - config = {"hidden": ["poetry.lock"], "exclude": [".scriber.json"]} - (tmp_path / ".scriber.json").write_text(json.dumps(config)) - - scriber = Scriber(root_path=tmp_path) - scriber.map_project() - stats = scriber.get_stats() - - tokenizer = tiktoken.get_encoding("cl100k_base") - expected_tokens = len(tokenizer.encode(main_py_content)) - - assert stats["total_files"] == 2 - assert stats["total_tokens"] == expected_tokens - assert stats["total_size_bytes"] == ( - (tmp_path / "main.py").stat().st_size + - (tmp_path / "poetry.lock").stat().st_size - ) - - def test_init_with_direct_config_object(self, tmp_path: Path): - """Tests that Scriber can be configured directly with a ScriberConfig object.""" - (tmp_path / "app.py").touch() - (tmp_path / "data.json").touch() - direct_config = ScriberConfig(include=["*.py"], exclude=[]) - - scriber = Scriber(root_path=tmp_path, config=direct_config) - files = scriber.get_mapped_files() - - paths = {p.name for p in files} - assert paths == {"app.py"} - assert scriber.config_path_used is None - - def test_get_output_as_string(self, tmp_path: Path): - 
"""Tests that the full project map can be retrieved as a string.""" - (tmp_path / "main.py").write_text("print('test')") - scriber = Scriber(root_path=tmp_path) - output_str = scriber.get_output_as_string() - - assert isinstance(output_str, str) - assert "Mapped Folder Structure" in output_str - assert "main.py" in output_str - assert "print('test')" in output_str - - def test_getters_trigger_map_project_automatically(self, tmp_path: Path): - """Tests that getter methods automatically call map_project if not already run.""" - (tmp_path / "test.txt").touch() - scriber = Scriber(root_path=tmp_path) - - assert not scriber.mapped_files - stats = scriber.get_stats() - assert len(scriber.mapped_files) == 1 - assert stats["total_files"] == 1 - - def test_core_loads_external_toml_config(self, tmp_path: Path): - """Tests core logic loads config from an external pyproject.toml via config_path.""" - config_dir = tmp_path / "config" - config_dir.mkdir() - toml_path = config_dir / "pyproject.toml" - toml_path.write_text("[tool.scriber]\ninclude = ['*.py']") - - project_dir = tmp_path / "project" - project_dir.mkdir() - (project_dir / "app.py").touch() - (project_dir / "data.json").touch() - - scriber = Scriber(root_path=project_dir, config_path=toml_path) - scriber.map_project() - - paths = {p.name for p in scriber.mapped_files} - assert paths == {"app.py"} - assert scriber.config_path_used == toml_path - - def test_core_handles_nonexistent_config_path(self, tmp_path: Path, capsys): - """Tests that a warning is printed for a non-existent --config path.""" - non_existent_path = tmp_path / "nonexistent.json" - Scriber(root_path=tmp_path, config_path=non_existent_path) - captured = capsys.readouterr() - assert "Warning: Config file specified by --config not found" in captured.err - - def test_tree_representation(self, tmp_path: Path): - """Checks if the folder tree string is formatted correctly.""" - (tmp_path / "src").mkdir() - (tmp_path / "src" / "main.py").touch() - (tmp_path / 
"README.md").touch() - - scriber = Scriber(root_path=tmp_path) - scriber.map_project() - tree_str = scriber._get_tree_representation() - - expected_lines = [ - tmp_path.name, - "β”œβ”€β”€ README.md", - "└── src", - " └── main.py", - ] - actual_lines = tree_str.split('\n') - # The tree formatting can have subtle whitespace differences, so we check line by line - assert actual_lines[0] == expected_lines[0] - assert "README.md" in actual_lines[1] - assert "src" in actual_lines[2] - assert "main.py" in actual_lines[3] - - - @pytest.mark.parametrize("filename, expected_lang", [ - ("test.py", "python"), - ("script.js", "javascript"), - ("style.css", "css"), - ("Dockerfile", "dockerfile"), - ("unknown.xyz", ""), - ]) - def test_language_detection(self, tmp_path: Path, filename: str, expected_lang: str): - """Tests the language mapping utility for various file types.""" - scriber = Scriber(root_path=tmp_path) - lang = scriber._get_language(Path(filename)) - assert lang == expected_lang - - def test_multi_root_collection(self, tmp_path: Path): - """Tests that files from multiple root directories are collected.""" - project_a = tmp_path / "project_a" - project_a.mkdir() - (project_a / "a.py").touch() - - project_b = tmp_path / "project_b" - project_b.mkdir() - (project_b / "b.js").touch() - - scriber = Scriber(root_path=[project_a, project_b]) - scriber.map_project() - mapped_names = {p.name for p in scriber.mapped_files} - - assert mapped_names == {"a.py", "b.js"} - assert len(scriber.mapped_files) == 2 - - def test_multi_root_tree_and_output(self, tmp_path: Path): - """Tests tree and output format for multiple roots.""" - project_a = tmp_path / "project_a" - project_a.mkdir() - (project_a / "a.py").write_text("print('a')") - - project_b = tmp_path / "project_b" - project_b.mkdir() - (project_b / "b.js").write_text("console.log('b')") - - scriber = Scriber(root_path=[project_a, project_b]) - output = scriber.get_output_as_string() - - assert "project_a\n└── a.py" in output 
- assert "project_b\n└── b.js" in output - assert f"File: project_a/a.py" in output - assert f"File: project_b/b.js" in output - -# --- Test CLI Functionality --- - -class TestCli: - """Groups tests for the command-line interface in `src.scriber.cli`.""" - - @patch('src.scriber.cli.run_scriber') - def test_cli_run_command_is_default(self, mock_run_scriber, mocker): - """Tests that the 'run' command is triggered by default with no subcommand.""" - mocker.patch('sys.argv', ['scriber']) - cli_main() - mock_run_scriber.assert_called_once() - - @patch('src.scriber.cli.Scriber') - def test_cli_arguments_are_passed_correctly(self, mock_scriber, mocker, tmp_path: Path): - """Tests if CLI arguments are correctly parsed and passed to the Scriber class.""" - mock_instance = MagicMock() - mock_instance.get_output_as_string.return_value = "Mocked Output" - mock_instance.config = ScriberConfig(output="default_name.txt") - mock_instance.get_stats.return_value = {'total_files': 0, 'language_counts': Counter()} - mock_instance.get_file_count.return_value = 0 - mock_scriber.return_value = mock_instance - mocker.patch('pyperclip.copy') - - project_dir = tmp_path / "project" - project_dir.mkdir() - config_file = tmp_path / "config.json" - config_file.touch() - - test_path_str = str(project_dir) - test_output = "output.txt" - test_config_str = str(config_file) - - mocker.patch('sys.argv', [ - 'scriber', 'run', test_path_str, '--output', test_output, '--config', test_config_str, '--tree-only' - ]) - - cli_main() - - mock_scriber.assert_called_with(Path(test_path_str).resolve(), config_path=Path(test_config_str)) - - mock_instance.get_output_as_string.assert_called_once() - call_kwargs = mock_instance.get_output_as_string.call_args.kwargs - assert call_kwargs['tree_only'] is True - - output_file = project_dir / test_output - assert output_file.is_file() - assert output_file.read_text() == "Mocked Output" - - @patch('src.scriber.cli.Confirm.ask') - @patch('src.scriber.cli.Prompt.ask') - 
def test_cli_init_command_creates_config(self, mock_prompt, mock_confirm, tmp_path: Path, mocker): - """Tests the interactive 'init' command for config file creation.""" - mocker.patch('pathlib.Path.cwd', return_value=tmp_path) - mock_confirm.return_value = False - mock_prompt.side_effect = ["*.tmp, *.log", "*.py", "", "1"] - - mocker.patch('sys.argv', ['scriber', 'init']) - cli_main() - - config_path = tmp_path / ".scriber.json" - assert config_path.exists() - - with open(config_path, "r", encoding="utf-8") as f: - data = json.load(f) - - assert not data['use_gitignore'] - assert data['exclude'] == ['*.tmp', '*.log'] - assert data['include'] == ['*.py'] - - @patch('src.scriber.cli.Confirm.ask') - @patch('src.scriber.cli.Prompt.ask') - def test_cli_init_command_creates_config_in_toml(self, mock_prompt, mock_confirm, tmp_path: Path, mocker): - """Tests the interactive 'init' command for saving config to pyproject.toml.""" - mocker.patch('pathlib.Path.cwd', return_value=tmp_path) - - pyproject_path = tmp_path / "pyproject.toml" - pyproject_path.write_text("[project]\nname = 'test-project'") - - mock_confirm.return_value = True - mock_prompt.side_effect = ["*.log, .env", "*.py", "*.lock", "2"] - - mocker.patch('sys.argv', ['scriber', 'init']) - cli_main() - - assert pyproject_path.exists() - - with open(pyproject_path, "rb") as f: - data = tomllib.load(f) - - assert "tool" in data - assert "scriber" in data["tool"] - scriber_config = data["tool"]["scriber"] - assert scriber_config['use_gitignore'] is True - assert scriber_config['exclude'] == ['*.log', '.env'] - assert scriber_config['include'] == ['*.py'] - assert scriber_config['hidden'] == ['*.lock'] - - @pytest.mark.parametrize("bytes_val, expected_str", [ - (500, "500 Bytes"), - (2048, "2.00 KB"), - (1500000, "1.43 MB"), - (2 * 1024 * 1024, "2.00 MB"), - ]) - def test_format_bytes_utility(self, bytes_val: int, expected_str: str): - """Tests the byte formatting utility function.""" - assert format_bytes(bytes_val) 
== expected_str \ No newline at end of file diff --git a/tests/test_tokens.py b/tests/test_tokens.py new file mode 100644 index 0000000..fe1e2b2 --- /dev/null +++ b/tests/test_tokens.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from pathlib import Path +from scriber.tokens import estimate_tokens +from scriber.core.models import TokenConfig +from scriber.core.config import load_config + + +def test_token_estimation_default() -> None: + text = "hello world" + # default chars_per_token is 4, len("hello world") == 11, 11 // 4 == 2 + assert estimate_tokens(text) == 2 + + +def test_token_estimation_custom_config() -> None: + text = "hello world" + config = TokenConfig(estimator="chars", chars_per_token=2) + # len("hello world") == 11, 11 // 2 == 5 + assert estimate_tokens(text, config) == 5 + + +def test_token_estimation_parsing_from_config(tmp_path: Path) -> None: + config_file = tmp_path / "pyproject.toml" + config_file.write_text(""" +[tool.scriber.tokens] +estimator = "chars" +chars_per_token = 5 +""".strip(), encoding="utf-8") + + config = load_config(config_file) + assert config.tokens.estimator == "chars" + assert config.tokens.chars_per_token == 5 + + text = "hello world" + # len("hello world") == 11, 11 // 5 == 2 + assert estimate_tokens(text, config.tokens) == 2