From dd34fc24e4cda6c5852ce400704ed71da98f3576 Mon Sep 17 00:00:00 2001
From: SunneV
Date: Sat, 30 May 2026 17:42:48 +0200
Subject: [PATCH] init project scriber 2.0 - boost by rust
---
.github/workflows/ci.yml | 59 +-
.github/workflows/release.yml | 85 ++-
CHANGELOG.md | 10 +
Cargo.toml | 19 +
LICENSE | 2 +-
README.md | 473 ++++---------
examples/example_pyproject.toml | 48 ++
pyproject.toml | 121 +++-
rust/scriber_native/src/import.rs | 622 ++++++++++++++++
rust/scriber_native/src/io.rs | 48 ++
rust/scriber_native/src/lib.rs | 92 +++
rust/scriber_native/src/render.rs | 42 ++
rust/scriber_native/src/scan.rs | 356 ++++++++++
rust/scriber_native/src/score.rs | 817 ++++++++++++++++++++++
src/run.py | 6 -
src/scriber/__init__.py | 14 +-
src/scriber/__main__.py | 4 +
src/scriber/cache.py | 135 ++++
src/scriber/cli.py | 361 ----------
src/scriber/cli/__init__.py | 3 +
src/scriber/cli/main.py | 219 ++++++
src/scriber/config.py | 46 --
src/scriber/core.py | 682 ------------------
src/scriber/core/__init__.py | 64 ++
src/scriber/core/config.py | 447 ++++++++++++
src/scriber/core/errors.py | 2 +
src/scriber/core/init_config.py | 57 ++
src/scriber/core/matchers.py | 101 +++
src/scriber/core/models.py | 167 +++++
src/scriber/core/open_file.py | 22 +
src/scriber/core/root.py | 67 ++
src/scriber/engine/__init__.py | 3 +
src/scriber/engine/scorer.py | 302 ++++++++
src/scriber/graph/__init__.py | 3 +
src/scriber/graph/builder.py | 138 ++++
src/scriber/graph/languages/__init__.py | 1 +
src/scriber/graph/languages/cpp.py | 51 ++
src/scriber/graph/languages/go.py | 48 ++
src/scriber/graph/languages/javascript.py | 51 ++
src/scriber/graph/languages/python.py | 139 ++++
src/scriber/graph/languages/rust.py | 106 +++
src/scriber/native.py | 45 ++
src/scriber/pack.py | 3 +
src/scriber/packer/__init__.py | 3 +
src/scriber/packer/pack.py | 365 ++++++++++
src/scriber/render.py | 3 +
src/scriber/rendering/__init__.py | 3 +
src/scriber/rendering/renderer.py | 278 ++++++++
src/scriber/scanner/__init__.py | 21 +
src/scriber/scanner/files.py | 142 ++++
src/scriber/scanner/scan.py | 75 ++
src/scriber/scanner/scan_py.py | 79 +++
src/scriber/tokens.py | 14 +
tests/test_cache.py | 43 ++
tests/test_config_schema.py | 77 ++
tests/test_init_config.py | 89 +++
tests/test_languages.py | 154 ++++
tests/test_native.py | 422 +++++++++++
tests/test_processing_modes.py | 56 --
tests/test_scriber.py | 222 ++++++
tests/test_suite.py | 457 ------------
tests/test_tokens.py | 36 +
62 files changed, 6603 insertions(+), 2017 deletions(-)
create mode 100644 Cargo.toml
create mode 100644 examples/example_pyproject.toml
create mode 100644 rust/scriber_native/src/import.rs
create mode 100644 rust/scriber_native/src/io.rs
create mode 100644 rust/scriber_native/src/lib.rs
create mode 100644 rust/scriber_native/src/render.rs
create mode 100644 rust/scriber_native/src/scan.rs
create mode 100644 rust/scriber_native/src/score.rs
delete mode 100644 src/run.py
create mode 100644 src/scriber/__main__.py
create mode 100644 src/scriber/cache.py
delete mode 100644 src/scriber/cli.py
create mode 100644 src/scriber/cli/__init__.py
create mode 100644 src/scriber/cli/main.py
delete mode 100644 src/scriber/config.py
delete mode 100644 src/scriber/core.py
create mode 100644 src/scriber/core/__init__.py
create mode 100644 src/scriber/core/config.py
create mode 100644 src/scriber/core/errors.py
create mode 100644 src/scriber/core/init_config.py
create mode 100644 src/scriber/core/matchers.py
create mode 100644 src/scriber/core/models.py
create mode 100644 src/scriber/core/open_file.py
create mode 100644 src/scriber/core/root.py
create mode 100644 src/scriber/engine/__init__.py
create mode 100644 src/scriber/engine/scorer.py
create mode 100644 src/scriber/graph/__init__.py
create mode 100644 src/scriber/graph/builder.py
create mode 100644 src/scriber/graph/languages/__init__.py
create mode 100644 src/scriber/graph/languages/cpp.py
create mode 100644 src/scriber/graph/languages/go.py
create mode 100644 src/scriber/graph/languages/javascript.py
create mode 100644 src/scriber/graph/languages/python.py
create mode 100644 src/scriber/graph/languages/rust.py
create mode 100644 src/scriber/native.py
create mode 100644 src/scriber/pack.py
create mode 100644 src/scriber/packer/__init__.py
create mode 100644 src/scriber/packer/pack.py
create mode 100644 src/scriber/render.py
create mode 100644 src/scriber/rendering/__init__.py
create mode 100644 src/scriber/rendering/renderer.py
create mode 100644 src/scriber/scanner/__init__.py
create mode 100644 src/scriber/scanner/files.py
create mode 100644 src/scriber/scanner/scan.py
create mode 100644 src/scriber/scanner/scan_py.py
create mode 100644 src/scriber/tokens.py
create mode 100644 tests/test_cache.py
create mode 100644 tests/test_config_schema.py
create mode 100644 tests/test_init_config.py
create mode 100644 tests/test_languages.py
create mode 100644 tests/test_native.py
delete mode 100644 tests/test_processing_modes.py
create mode 100644 tests/test_scriber.py
delete mode 100644 tests/test_suite.py
create mode 100644 tests/test_tokens.py
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 24d6934..d0ffdb8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,28 +1,57 @@
-name: Continuous Integration
+name: CI
on:
push:
- branches:
- - develop
+ branches: [main, develop]
+ pull_request:
jobs:
- run_tests:
- runs-on: ubuntu-latest
+ test:
+ name: ${{ matrix.os }} / py${{ matrix.python-version }}
+ runs-on: ${{ matrix.os }}
+
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest, windows-latest, macos-latest]
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
steps:
- - name: Checkout code
- uses: actions/checkout@v4
+ - uses: actions/checkout@v4
- - name: Set up Python 3.10
- uses: actions/setup-python@v4
- with:
- python-version: '3.10'
+ - name: Set up Rust
+ uses: dtolnay/rust-toolchain@stable
+
+ - name: Rust cache
+ uses: Swatinem/rust-cache@v2
+ env:
+ TAR: ${{ matrix.os == 'windows-latest' && 'C:\Windows\System32\tar.exe' || 'tar' }}
- name: Install uv
- run: pipx install uv
+ uses: astral-sh/setup-uv@v5
+ with:
+ cache-dependency-glob: "pyproject.toml"
+
+ - name: Set up Python
+ run: uv python install ${{ matrix.python-version }}
- - name: Install dependencies
- run: uv pip install -e .[dev] --system
+ - name: Sync
+ run: uv sync --all-extras
+
+ - name: Check native import
+ run: uv run python -c "import scriber._native; print('native ok')"
+
+ - name: Rust format check
+ run: cargo fmt --check
+
+ - name: Rust clippy
+ run: cargo clippy --all-targets -- -D warnings
+
+ - name: Rust tests
+ run: cargo test
- name: Run tests
- run: pytest
\ No newline at end of file
+ run: uv run pytest
+
+ - name: CLI smoke
+ run: uv run scriber . --only-tree --output -
\ No newline at end of file
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index c94a6a9..2887303 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,26 +1,83 @@
-name: Publish to PyPI
+name: Release
on:
push:
tags:
- - 'v*' # Triggers on any tag starting with v, like v0.0.3
+ - "v*"
jobs:
- build_and_publish:
+ build:
+ name: Build ${{ matrix.os }}
+ runs-on: ${{ matrix.os }}
+
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest, macos-latest, windows-latest]
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Build wheels
+ uses: PyO3/maturin-action@v1
+ with:
+ command: build
+ args: --release --locked --out dist --compatibility pypi
+ manylinux: "2014"
+ sccache: "true"
+
+ - name: Build sdist
+ if: matrix.os == 'ubuntu-latest'
+ uses: PyO3/maturin-action@v1
+ with:
+ command: sdist
+ args: --out dist
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.10"
+
+ - name: Wheel smoke install
+ shell: bash
+ run: |
+ python -m venv test_venv
+ if [ "${{ matrix.os }}" = "windows-latest" ]; then
+ test_venv/Scripts/pip install dist/*.whl
+ test_venv/Scripts/python -c "import scriber._native; print('native ok')"
+ test_venv/Scripts/scriber . --only-tree --output -
+ else
+ test_venv/bin/pip install dist/*.whl
+ test_venv/bin/python -c "import scriber._native; print('native ok')"
+ test_venv/bin/scriber . --only-tree --output -
+ fi
+
+ - name: Upload artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: wheels-${{ matrix.os }}
+ path: dist
+
+ publish:
+ name: Publish
+ needs: [build]
runs-on: ubuntu-latest
+
permissions:
- id-token: write # Required for trusted publishing
- contents: read # Required to read the repository content
+ id-token: write
+ contents: read
steps:
- - name: Checkout code
- uses: actions/checkout@v3
-
- - name: Install uv
- run: pipx install uv
+ - uses: actions/download-artifact@v4
+ with:
+ path: dist-artifacts
+ pattern: wheels-*
+ merge-multiple: true
- - name: Build distributions
- run: uv build
+ - name: List artifacts
+ run: ls -la dist-artifacts
- - name: Publish to PyPI
- run: uv publish
\ No newline at end of file
+ - name: Publish
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ packages-dir: dist-artifacts
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2cf07d6..bcd7481 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [2.0.0] - 2026-05-30
+
+### Added
+- **⚡ Native Rust Acceleration (`scriber._native`)**: Full transition of filesystem scanning, high-performance file reading/writing, and binary classification to a compiled Rust extension built using Maturin and PyO3.
+- **🌳 Fast Parallel Scanner**: Re-engineered directory scanning utilizing the `WalkBuilder` from the `ignore` crate, fully respecting `.gitignore` rules with blazing fast native execution.
+- **🧪 Rigorous Verification & Equivalence Testing**: Comprehensive suite of regression and equivalence tests validating 100% exact matching behavior between Rust and Python scanner modules.
+- **📦 Multi-Platform Binary Wheels**: CI/CD integration using `PyO3/maturin-action` to compile and distribute native wheels across Linux, macOS, and Windows.
+
+
## [1.1.2] - 2025-09-30
### Fixed
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..91e0426
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "project-scriber-native"
+version = "2.0.0"
+edition = "2021"
+
+[lib]
+name = "_native"
+crate-type = ["cdylib"]
+path = "rust/scriber_native/src/lib.rs"
+
+[dependencies]
+pyo3 = { version = "0.21", features = ["extension-module", "abi3-py310"] }
+ignore = "0.4"
+globset = "0.4"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+memchr = "2.7"
+regex = "1.10"
+
diff --git a/LICENSE b/LICENSE
index 222cae3..8039da5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2025 SunneV (Wojciech Mariusz CichoΕ)
+Copyright (c) 2026 SunneV
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index a9849e2..7dc7d1a 100644
--- a/README.md
+++ b/README.md
@@ -9,50 +9,44 @@
-An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for
-Large Language Models (LLMs), available as both a powerful CLI and a flexible Python library.
+An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for Large Language Models (LLMs). **Version 2.0** brings advanced dependency graph analysis, strict whitelist-based file inclusion, zero-dependency lightweight execution, and progress tracking!
-----
## π Table of Contents
-- [π€ Why ProjectScriber?](#-why-projectscriber)
+- [🤔 Why ProjectScriber 2.0?](#-why-projectscriber-20)
- [β¨ Key Features](#-key-features)
- [π Quick Start](#-quick-start)
- [πΎ Installation](#-installation)
- [π₯οΈ Command-Line Usage](#οΈ-command-line-usage)
-- [π Library Usage (API)](#-library-usage-api)
- [βοΈ Configuration](#οΈ-configuration)
- [π€ Contributing & Development](#-contributing--development)
-----
-## π€ Why ProjectScriber?
+## 🤔 Why ProjectScriber 2.0?
-When working with Large Language Models, providing the full context of a codebase is crucial for getting accurate
-analysis, documentation, or refactoring suggestions. Manually copying and pasting files is tedious, error-prone, and
-unsustainable for projects of any real size. **ProjectScriber automates this entire process.** It intelligently scans
-your project, respects your existing
-`.gitignore` rules, applies custom filters, and bundles all relevant code into a single, clean, and readable format
-perfect for any AI model.
+When working with Large Language Models, providing the full context of a codebase is crucial for getting accurate analysis, documentation, or refactoring suggestions. However, blindly pasting an entire project wastes tokens and introduces noise.
+
+**ProjectScriber 2.0** automates context building using a **Whitelist-First** philosophy and an **Intelligent Scoring Engine**. It analyzes your codebase's dependency graph (e.g., Python imports), determines which files are most relevant to the code you're working on, and bundles them into a single, clean markdown file, strictly respecting your token budgets and file-type configurations.
- π Your Codebase β π¦ ProjectScriber β π LLM-Ready Context
+ π Your Codebase β π¦ ProjectScriber 2.0 β π LLM-Ready Context
-----
## β¨ Key Features
-|Feature |Description |
-|:-------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| **π³ Smart Project Mapping** | Generates a clear and intuitive tree view of your project's structure. |
-| **βοΈ Intelligent Filtering** | Automatically respects `.gitignore` and supports custom `include`, `exclude`, and `hidden` patterns using `.gitignore`-style syntax for precise control. |
-| **π In-depth Code Analysis** | Provides a summary with total file size, estimated token count (using `cl100k_base`), and a language breakdown. |
-| **π Flexible Python Library** | Import and use the `Scriber` class directly in your Python projects for full programmatic control. |
-| **β¨ Interactive CLI** | A simple `scriber init` command walks you through creating a configuration file for your project. |
-| **π Clipboard Integration** | Use the `--copy` or `--copy-only` flags to automatically send the entire output to your clipboard, ready for pasting. |
-| **π¨ Lightweight & Fast** | The default installation is minimal, and file analysis is multi-threaded for improved performance. A single-process mode is available for compatibility. |
+| Feature | Description |
+|:---|:---|
+| **🌳 Smart Project Mapping** | Generates a clear and intuitive tree view of your project's structure. |
+| **⚡ Native Rust Acceleration** | Accelerates heavy I/O and directory scanning natively via a high-performance Rust backend. |
+| **🛡️ Whitelist Philosophy** | By default, only recognized code and support files are included. Binary and lock files are automatically ignored. |
+| **🧠 Intelligent Scoring Engine** | Analyzes import graphs and file proximity to prioritize code modules that are directly related to your provided seed files. |
+| **💰 Token Budgets** | Set a hard limit on `--max-tokens`. Scriber will fit the most relevant files within your budget to save API costs. |
+| **📊 Live Progress & Stats** | Built-in zero-dependency progress spinner and detailed statistics summary at the end of the run. |
-----
@@ -62,72 +56,52 @@ perfect for any AI model.
```shell
pip install project-scriber
+ ```
+
+2. **Navigate to your project's root and initialize config:**
- ````
+ ```shell
+ scriber --init
+ ```
+ *(This appends a `[tool.scriber]` block to your `pyproject.toml`. Use `--force` to overwrite it.)*
-2. **Navigate to your project's root and run:**
+3. **Pack your context!** Just point it to a file, folder, or let it scan the whole project:
```shell
- scriber
+ scriber src/main.py --output context.md
```
-3. **That's it\!** A `scriber_output.txt` file is now in your directory. It will look something like this:
-
- ````text
- ===
- Mapped Folder Structure
- ===
-
- ProjectScriber
- βββ .github
- β βββ workflows
- β βββ ci.yml
- β βββ release.yml
- βββ README.md
- βββ src
- βββ scriber
- βββ __init__.py
- βββ core.py
-
- ---
- File: .github/workflows/ci.yml
- Size: 512 bytes
- ---
- ```yaml
- name: Continuous Integration
-
- on:
- push:
- branches:
- - develop
-
- jobs:
- run_tests:
- ...
- ````
+4. **Review your stats:**
+ ```text
+ Scriber build completed.
+ ----------------------------------------
+ Code files included: 15
+ Support files included: 4
+ Files omitted/skipped: 2
+ Estimated tokens: 12500
+ ----------------------------------------
+ Scriber pack written to: context.md
+ ```
-----
## πΎ Installation
-You have two options for installation.
-
-#### Standard Installation
-
-This provides the core functionality with a minimal, text-based interface.
+ProjectScriber distributes pre-compiled binary wheels for Linux, macOS, and Windows. A simple pip command is all you need:
```shell
pip install project-scriber
```
-#### With Rich UI β¨
-
-For an enhanced terminal experience with colors, tables, and progress bars, install the `rich` extra:
+Or if you use `uv`:
```shell
-pip install project-scriber[rich]
+uv pip install project-scriber
```
+> [!NOTE]
+> If a pre-compiled wheel is not available for your platform/architecture, the package will automatically build from source, which requires a Rust compiler toolchain (Rust 1.70+) installed on your machine.
+
-----
## π₯οΈ Command-Line Usage
@@ -136,315 +110,140 @@ pip install project-scriber[rich]
- **Scan the current directory**:
```shell
- scriber
+ scriber .
```
-- **Scan a different directory**:
+- **Scan a specific file and its dependencies**:
```shell
- scriber /path/to/your/project
+ scriber src/my_module.py
```
-- **Interactive Setup**: Create a configuration file (`.scriber.json` or `pyproject.toml`) for your project.
+- **Initialize Configuration**: Append a default `[tool.scriber]` block to `pyproject.toml` (use `--force` to overwrite an existing block).
```shell
- scriber init
+ scriber --init
```
### CLI Options
-|Option | Alias | Description |
-|:------------------|:------|:--------------------------------------------------------------------------------------------------------|
-| `root_path` | | The project directory to map. Defaults to the current directory. |
-| `--output [file]` | `-o` | Set a custom name for the output file. |
-| `--config [path]` | | Path to a custom config file (e.g., a `pyproject.toml` in a monorepo). |
-| `--copy` | `-c` | Copy the final output to the clipboard in addition to saving it. |
-| `--copy-only` | | Generate the output and copy it to the clipboard without saving to a file. |
-| `--tree-only` | | Generate only the file tree structure, without any file content. |
-| `--single-process`| | Run file analysis in a single process. Recommended for use in environments like Celery. |
-| `--version` | `-v` | Show the installed version of ProjectScriber. |
-| `--help` | `-h` | Display the help message. |
-
-### Advanced Example
-
-Scan another project, save the output to `custom_map.txt`, and copy the result to the clipboard in one go:
-
-```shell
-scriber ../my-other-project --output custom_map.txt --copy
-```
-
------
-
-## π Library Usage (API)
-
-Use `ProjectScriber` directly in your Python code for maximum flexibility and automation.
-
-### Basic Example: Get Context as a String
-
-Initialize `Scriber`, and it will automatically handle mapping and analysis.
-
-```python
-from pathlib import Path
-from scriber import Scriber # The class is exposed for direct import
-
-# 1. Initialize Scriber for the current directory
-scriber = Scriber(root_path=Path('.'))
-
-# 2. Get the complete output directly as a string
-project_context = scriber.get_output_as_string()
-
-# 3. Use the context for your application
-print(f"Generated context of {len(project_context)} characters.")
-
-# 4. Access the calculated statistics
-stats = scriber.get_stats()
-print(f"Total files mapped: {stats['total_files']}")
-print(f"Estimated tokens: {stats['total_tokens']:,}")
-```
-
-### Advanced Configuration via ScriberConfig
-
-Bypass all on-disk configuration files by passing a `ScriberConfig` object directly to the constructor. This is perfect
-for dynamic or controlled environments.
-
-```python
-from pathlib import Path
-from scriber import Scriber, ScriberConfig
-
-# 1. Create a config object and customize it
-config = ScriberConfig()
-config.single_process = True
-config.exclude.append("tests/")
-config.exclude.append("assets/scriber_*")
-
-# 2. Initialize Scriber with the root path and config object
-current_directory = Path('.').resolve()
-scriber = Scriber(root_path=current_directory, config=config)
-
-# 3. Get the output
-project_context = scriber.get_output_as_string()
-print(project_context)
-```
-
-### Scanning Multiple Directories
-
-You can pass a list of paths to the `Scriber` constructor to map multiple directories into a single output. The first
-path in the list is treated as the "primary root" for loading configurations (`.gitignore`, `pyproject.toml`, etc.).
-
-```python
-from pathlib import Path
-from scriber import Scriber
-
-# Example: Scan both a 'backend' and a 'frontend' directory
-backend_path = Path('./my_backend_project')
-frontend_path = Path('./my_frontend_project')
-
-# Create dummy directories and files for the example
-backend_path.mkdir(exist_ok=True)
-(backend_path / "main.py").write_text("print('hello from backend')")
-frontend_path.mkdir(exist_ok=True)
-(frontend_path / "app.js").write_text("console.log('hello from frontend')")
-
-# Initialize with a list of paths. `backend_path` is the primary root.
-scriber = Scriber(root_path=[backend_path, frontend_path])
-
-# Get the combined context as a single string
-combined_context = scriber.get_output_as_string()
-print(combined_context)
-
-# The output will contain two separate trees and file content blocks,
-# with file paths prefixed by their root folder's name.
-```
-
-### Accessing Intermediate Data
-
-You can also access the generated file tree and the list of mapped files before the final output is compiled.
-
-```python
-from pathlib import Path
-from scriber import Scriber
-
-scriber = Scriber(root_path=Path('.'))
-
-# Get just the formatted file tree
-tree_representation = scriber.get_tree()
-print("--- Project Tree ---")
-print(tree_representation)
-
-# Get a list of all mapped file paths
-print("\n--- Mapped Files ---")
-file_paths = scriber.get_mapped_files()
-for path in file_paths:
- print(path.relative_to(scriber.primary_root))
-```
-
-### Practical Example: Preparing Context for an LLM
-
-Here's a small function demonstrating how you can use ProjectScriber to generate a complete, well-formatted prompt for
-an LLM.
-
-```python
-from pathlib import Path
-from scriber import Scriber
-
-
-def get_llm_context(project_path: Path, task: str) -> str:
- '''
- Generates a complete project context string ready for an LLM.
-
- Args:
- project_path: The root directory of the project.
- task: The specific task you want the LLM to perform.
-
- Returns:
- A formatted string to be used as a prompt for an LLM.
- '''
- # Initialize Scriber and get the project map
- scriber = Scriber(root_path=project_path)
- project_map = scriber.get_output_as_string()
-
- # Get some stats for the context header
- stats = scriber.get_stats()
- token_count = stats.get("total_tokens", 0)
-
- # Assemble the final prompt for the LLM
- prompt = (
- f"Please perform the following task: {task}\n\n"
- f"Here is the full context of the project codebase. "
- f"It includes a file tree and the content of all relevant files.\n"
- f"Estimated Token Count: {token_count:,}\n\n"
- "--- PROJECT CONTEXT BEGINS ---\n"
- f"{project_map}"
- "--- PROJECT CONTEXT ENDS ---"
- )
-
- return prompt
-
-
-# --- Usage ---
-if __name__ == "__main__":
- my_project_path = Path('.')
- user_task = "Analyze the code for potential bugs and suggest improvements."
- llm_prompt = get_llm_context(my_project_path, user_task)
-
- print(llm_prompt)
-
- # Now you can send `llm_prompt` to your favorite LLM API.
-```
+| Option | Description |
+|:---|:---|
+| `paths` | Project file/folder paths used as seeds. Defaults to current directory `.`. |
+| `--config [path]` | Path to `pyproject.toml`. Its parent directory becomes the project root. |
+| `--path-base [base]`| Base for relative paths: `project` (default) or `cwd`. |
+| `--format [md, txt]` | Output format. Defaults to `md` (Markdown). |
+| `--output [file]` | Output file path. Use `-` for stdout. |
+| `--dry-run` | Show pack summary without writing the output file. |
+| `--open` | Open the generated file in the default editor. |
+| `--validate-config`| Validate the `[tool.scriber]` configuration and exit. |
+| `--only-tree` | Render only the scored tree/map, without any file contents. |
+| `--[no-]modules` | Enable/Disable automatic related module selection (dependency graph scanning). |
+| `--[no-]support` | Enable/Disable support files (like `.env.example`, `.github/workflows`). |
+| `--support-content` | Override support file content policy (`full`, `auto`, `tree_only`). |
+| `--max-files` | Maximum number of files in the pack. |
+| `--max-tokens` | Approximate token budget using char-based estimation. `0` disables budget. |
+| `--min-score` | Minimum relevance score (0-100) for non-seed files to be included. |
+| `--init` | Append a default `[tool.scriber]` config to `pyproject.toml` and exit. |
+| `--force` | Force overwrite of the config block when used with `--init`. |
+| `--version` | Show program's version number and exit. |
-----
## βοΈ Configuration
-ProjectScriber is configured via a file in your project's root. It searches for configurations in the following order of
-precedence:
+ProjectScriber 2.0 configures itself through the standard `pyproject.toml` using the `[tool.scriber]` table.
+Generate the default block using:
-1. **Direct `config` object/dictionary** (Library mode only).
-2. **`--config [path]` flag** (CLI mode only).
-3. **`.scriber.json`** in the project root.
-4. **`[tool.scriber]`** section in `pyproject.toml`.
-5. **Default Behavior**: If no file is found, a default configuration is used, and a `.scriber.json` may be created to
- guide you.
-
-### Configuration Keys
-
-|Key |Type |Default |Description |
-|:----------------|:--------|:-----------------------|:------------------------------------------------------------------------------------------------------------------------------------------------|
-| `use_gitignore` | boolean | `true` |If `true`, all patterns in the `.gitignore` file will be used for exclusion. |
-| `exclude` |list |See `config.py` |A list of file/folder names or `.gitignore`-style patterns to exclude globally (e.g., `"node_modules"`, `"*.log"`, `build/`). |
-| `include` |list |`[]` |If not empty, **only** files matching these `.gitignore`-style patterns will be included. |
-| `hidden` |list |`[]` |Files matching these patterns will appear in the tree but their content will be replaced with a placeholder. Useful for large lock files. |
-| `exclude_map` |object |`{}` |A dictionary for language-specific and global exclusion patterns. See example below. |
-| `output` |string |`"scriber_output.txt"` |The default name for the output file. |
-| `single_process`|boolean |`false` |If `true`, runs file analysis in a single process. This is slower but required for environments like Celery that do not allow child processes. |
+```shell
+scriber --init
+```
-### Example `pyproject.toml` Configuration
+### Example `pyproject.toml`
-Here is an example of a well-configured `[tool.scriber]` section in your `pyproject.toml` file:
+> [!NOTE]
+> This is a minimal example. Run `scriber --init` to generate the full default configuration.
```toml
[tool.scriber]
-# Respect the project's .gitignore file
-use_gitignore = true
-
-# Globally exclude common folders and file types using gitignore-style patterns
-exclude = [
- "__pycache__/",
- "node_modules/",
- "dist/",
- "build/",
- ".venv/",
+format = "md"
+max_tokens = 0 # 0 means unlimited
+max_files = 0 # 0 means unlimited
+only_tree = false # If true, file contents are omitted
+allow_external_paths = false
+
+[tool.scriber.modules]
+enabled = true
+content_min_score = 50
+
+[tool.scriber.tokens]
+estimator = "chars"
+chars_per_token = 4
+
+[tool.scriber.code_files]
+# Only files matching these are considered "Code"
+patterns = [
+ "**/*.py",
+ "**/*.js",
+ "**/*.ts",
+ "**/*.tsx"
]
-# Only include files with these extensions
-include = [
- "*.py",
- "*.js",
- "*.css",
- "*.md"
+[tool.scriber.support_files]
+enabled = true
+# Only files matching these are considered "Support"
+patterns = [
+ "pyproject.toml",
+ "Dockerfile",
+ "**/*.svg"
]
-# Show these files in the tree, but hide their content
-hidden = [
- "poetry.lock"
+[tool.scriber.support_files.content]
+default = "auto"
+auto_max_bytes = 10000
+full = [
+ "pyproject.toml",
+ "requirements.txt",
+ "README.md"
+]
+tree_only = [
+ "**/*.svg"
]
-# Run in a single process to prevent issues in certain environments
-single_process = false
-
-# Language-specific and global exclusion rules
-[tool.scriber.exclude_map]
-# Exclude these patterns from all files
-global = ["*.log", "*.tmp"]
-# In Python files, exclude tests and setup scripts
-python = ["*_test.py", "setup.py"]
-# In JavaScript files, exclude spec files
-javascript = ["*.spec.js"]
+[tool.scriber.hard_ignore]
+# Folders ignored entirely during the initial scan
+patterns = [
+ ".git/**",
+ "__pycache__/**",
+ "node_modules/**",
+ ".venv/**"
+]
```
-> **π‘ Note on Pattern Matching:** The `exclude` and `include` options support `.gitignore`-style pattern matching. This
-allows for more precise rules, such as matching directories only (e.g., `build/`), root-level files (e.g.,
-`/config.yaml`), or standard wildcards (`*.log`).
+### Whitelist Policy
+ProjectScriber 2.0 uses a strict **whitelist** approach:
+1. Files must match a pattern listed under either `[tool.scriber.code_files]` or `[tool.scriber.support_files]` to be considered.
+2. Unrecognized extensions and binary files are automatically excluded, keeping your LLM context safe from binary garbage.
+3. Lock files are included in the tree by default, but their contents are omitted to save tokens.
+4. Support files can be marked as `tree_only` (e.g., `**/*.svg`), meaning they'll show up in the project map but their contents won't be read.
-----
## π€ Contributing & Development
-Contributions are welcome\! If you have a suggestion or find a bug, please open an issue to discuss it first.
+Contributions are welcome!
### Development Setup
-1. **Prerequisites**:
-
- * Python 3.10 or higher.
-
-2. **Clone the Repository**:
-
+1. **Clone the Repository**:
```shell
git clone https://github.com/SunneV/ProjectScriber.git
+ cd ProjectScriber
```
-3. **Navigate to the Project Directory**:
-
+2. **Install Dependencies & Compile Extension** (using `uv` is recommended):
```shell
- cd ProjectScriber
+ uv sync --all-extras
```
+ *(This synchronizes the virtual environment and compiles the native Rust extension automatically!)*
-4. **Install Dependencies**:
- Choose one of the following methods to install the project in editable mode with all development dependencies.
-
- * **Using `pip`**:
-
- ```shell
- pip install -e .[dev]
- ```
-
- * **Using `uv`** (Recommended):
-
- ```shell
- uv pip install -e .[dev]
- ```
-
-### Running Tests
-
-Run the test suite using `pytest`:
-
-```shell
-pytest
-```
\ No newline at end of file
+3. **Run Tests**:
+ ```shell
+ uv run pytest
+ ```
\ No newline at end of file
diff --git a/examples/example_pyproject.toml b/examples/example_pyproject.toml
new file mode 100644
index 0000000..15dc70e
--- /dev/null
+++ b/examples/example_pyproject.toml
@@ -0,0 +1,48 @@
+[tool.scriber]
+version = "2"
+format = "md"
+output = ".scriber/scriber_pack.md"
+use_gitignore = true
+max_files = 60
+max_tokens = 100000
+min_score = 45
+
+[tool.scriber.code_files]
+patterns = ["**/*.py", "**/*.pyi", "**/*.rs", "**/*.ts", "**/*.tsx", "**/*.js", "**/*.jsx"]
+
+[tool.scriber.support_files]
+enabled = true
+patterns = [
+ "pyproject.toml",
+ "README.md",
+ "requirements.txt",
+ "requirements/*.txt",
+ "poetry.lock",
+ "uv.lock",
+ ".env.example",
+ "Dockerfile",
+ "docker-compose.yml",
+ ".github/workflows/*.yml",
+]
+
+[tool.scriber.support_files.content]
+default = "auto"
+full = ["pyproject.toml", "README.md", "requirements.txt", "requirements/*.txt", ".env.example", "Dockerfile", "docker-compose.yml", ".github/workflows/*.yml"]
+tree_only = ["poetry.lock", "uv.lock"]
+
+[tool.scriber.modules]
+enabled = true
+depth = 2
+include_direct_dependencies = true
+include_reverse_dependencies = true
+include_tests = true
+include_same_package = true
+include_parent_entrypoints = true
+include_project_configs = true
+content_min_score = 50
+tree_min_score = 30
+
+[tool.scriber.python]
+source_roots = ["src", "app", "."]
+test_roots = ["tests", "test"]
+entrypoint_patterns = ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"]
diff --git a/pyproject.toml b/pyproject.toml
index 86f56ef..6339246 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,53 +1,106 @@
+[build-system]
+requires = ["maturin>=1.7,<2"]
+build-backend = "maturin"
+
[project]
name = "project-scriber"
-version = "1.1.2"
-authors = [
- { name="SunneV (Wojciech Mariusz CichoΕ)", email="wojciech.m.cichon@gmail.com" },
-]
-description = "An intelligent tool to map, analyze, and compile project source code for LLM context."
+version = "2.0.0"
+description = "Scriber 2.0: build intelligent code packs from one or more project paths."
readme = "README.md"
requires-python = ">=3.10"
-license = { file="LICENSE" }
-keywords = ["llm", "code-analysis", "developer-tools", "context-builder", "source-code"]
+license = { text = "MIT" }
+authors = [
+ { name = "SunneV" }
+]
+keywords = ["code-context", "llm", "project-map", "developer-tools"]
classifiers = [
- "Programming Language :: Python :: 3",
+ "Development Status :: 4 - Beta",
+ "Environment :: Console",
+ "Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
- "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Topic :: Software Development :: Documentation",
"Topic :: Software Development :: Libraries :: Python Modules",
- "Topic :: Utilities",
]
dependencies = [
- "pathspec",
- "python-dotenv",
- "tiktoken",
- "pyperclip",
- "tomlkit",
- "tomli; python_version < '3.11'",
+ "tomli>=2.0; python_version < '3.11'",
]
-[project.urls]
-Homepage = "https://github.com/SunneV/ProjectScriber"
-Issues = "https://github.com/SunneV/ProjectScriber/issues"
+[project.optional-dependencies]
+dev = [
+ "pytest>=8",
+ "maturin>=1.7,<2",
+]
[project.scripts]
scriber = "scriber.cli:main"
-[project.optional-dependencies]
-rich = ["rich"]
-dev = [
- "pytest",
- "pytest-mock",
- "rich"
+[tool.maturin]
+python-source = "src"
+module-name = "scriber._native"
+features = ["pyo3/extension-module", "pyo3/abi3-py310"]
+
+[tool.pytest.ini_options]
+addopts = "-q"
+testpaths = ["tests"]
+
+[tool.scriber]
+version = "2"
+format = "md"
+output = ".scriber/scriber_pack.md"
+only_tree = false
+use_gitignore = true
+max_files = 60
+max_tokens = 100000
+min_score = 45
+path_style = "project-relative"
+allow_external_paths = false
+
+[tool.scriber.code_files]
+patterns = ["**/*.py", "**/*.pyi", "**/*.rs", "**/*.js", "**/*.jsx", "**/*.ts", "**/*.tsx"]
+
+[tool.scriber.support_files]
+enabled = true
+patterns = [
+ "**/*.toml",
+ "**/*.lock",
+ "pyproject.toml",
+ "README.md",
+ "requirements.txt",
+ "requirements/*.txt",
+ ".env.example",
+ "Dockerfile",
+ "docker-compose.yml",
+ ".github/workflows/*.yml",
+ "**/*.svg",
]
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
+[tool.scriber.support_files.content]
+default = "auto"
+full = ["**/*.toml", "pyproject.toml", "README.md", "requirements.txt", "requirements/*.txt", ".env.example", "Dockerfile", "docker-compose.yml", ".github/workflows/*.yml"]
+tree_only = ["**/*.svg", "**/*.lock"]
-[tool.hatch.build.targets.wheel]
-packages = ["src/scriber"]
+[tool.scriber.modules]
+enabled = true
+depth = 2
+include_direct_dependencies = true
+include_reverse_dependencies = true
+include_tests = true
+include_same_package = true
+include_parent_entrypoints = true
+include_project_configs = true
+content_min_score = 50
+tree_min_score = 30
-[tool.pytest.ini_options]
-pythonpath = [
- "."
-]
\ No newline at end of file
+[tool.scriber.python]
+source_roots = ["src", "app", "."]
+test_roots = ["tests", "test"]
+entrypoint_patterns = ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"]
+
+[tool.scriber.tokens]
+estimator = "chars"
+chars_per_token = 4
diff --git a/rust/scriber_native/src/import.rs b/rust/scriber_native/src/import.rs
new file mode 100644
index 0000000..c86a963
--- /dev/null
+++ b/rust/scriber_native/src/import.rs
@@ -0,0 +1,622 @@
+use crate::scan::NativeFileInfo;
+use pyo3::prelude::*;
+use regex::Regex;
+use std::collections::{HashMap, HashSet};
+use std::path::Path;
+
+/// One directed dependency edge produced by `build_import_graph`.
+///
+/// Exposed to Python as a class whose fields are read-only attributes.
+#[pyclass]
+#[derive(Clone, Debug)]
+pub struct NativeImportEdge {
+    /// Relative path of the file that contains the import statement.
+    // NOTE(review): `from` is a reserved word in Python, so Python callers
+    // presumably need `getattr(edge, "from")` to read it — confirm on the Python side.
+    #[pyo3(get)]
+    pub from: String,
+    /// Relative path of the file the import was resolved to.
+    #[pyo3(get)]
+    pub to: String,
+    /// Edge label; this file emits "import", "mod", "use" and "include".
+    #[pyo3(get)]
+    pub kind: String,
+}
+
+/// Report whether `relative` lies inside `root`, comparing whole `/`-separated
+/// components rather than raw string prefixes.
+///
+/// An empty root or "." contains every path.
+fn is_under(relative: &str, root: &str) -> bool {
+    if matches!(root, "" | ".") {
+        return true;
+    }
+    let path_segments: Vec<&str> = relative.split('/').collect();
+    let root_segments: Vec<&str> = root.split('/').collect();
+    // Slice prefix test: false when the path has fewer segments than the root.
+    path_segments.starts_with(&root_segments)
+}
+
+/// Strip the leading components of `root` from `relative`.
+///
+/// An empty root or "." leaves the path untouched. The function only counts
+/// components; it does not check that `relative` really starts with `root`
+/// (callers use `is_under` for that). When `relative` has fewer components
+/// than `root` the result is an empty string instead of an out-of-range
+/// slice panic.
+fn relative_to_root(relative: &str, root: &str) -> String {
+    if root.is_empty() || root == "." {
+        return relative.to_string();
+    }
+    let depth = root.split('/').count();
+    relative
+        .split('/')
+        .skip(depth)
+        .collect::<Vec<&str>>()
+        .join("/")
+}
+
+/// Derive the dotted Python module name for a `.py` / `.pyi` file.
+///
+/// Source roots are tried longest first, with "." last. The first root that
+/// contains the file and leaves a non-empty module path wins. A file whose
+/// name is listed in `module_init_files` (e.g. `__init__.py`) is named after
+/// its package directory only.
+///
+/// Returns `None` for files that are not Python sources, that sit under no
+/// source root, or whose file name is not valid UTF-8.
+fn module_name_for_file(
+    relative: &str,
+    source_roots: &[String],
+    module_init_files: &[String],
+) -> Option<String> {
+    // Ascending by length ("." counts as 0), then reversed => most specific root first.
+    let mut roots = source_roots.to_vec();
+    roots.sort_by_key(|r| if r == "." { 0 } else { r.len() });
+    roots.reverse();
+
+    for r in roots {
+        if !is_under(relative, &r) {
+            continue;
+        }
+        let under = relative_to_root(relative, &r);
+        if under.is_empty() {
+            continue;
+        }
+        let p = Path::new(&under);
+        let file_name = p.file_name()?.to_str()?;
+        if file_name.ends_with(".py") || file_name.ends_with(".pyi") {
+            let mut parts: Vec<String> = Vec::new();
+            if let Some(parent) = p.parent() {
+                for c in parent.components() {
+                    parts.push(c.as_os_str().to_string_lossy().to_string());
+                }
+            }
+            // Package init files contribute no segment of their own.
+            let is_init = module_init_files.iter().any(|init| init == file_name);
+            if !is_init {
+                if let Some(stem) = p.file_stem() {
+                    parts.push(stem.to_string_lossy().to_string());
+                }
+            }
+            if parts.is_empty() {
+                // e.g. an `__init__.py` directly in the root: try the next root.
+                continue;
+            }
+            return Some(parts.join("."));
+        }
+    }
+    None
+}
+
+/// Turn a relative Python import (`from ..pkg import x`) into an absolute
+/// dotted module path.
+///
+/// `level` is the number of leading dots; level 0 returns `module` unchanged.
+/// The anchor package is `current_module` itself for an `__init__` file and
+/// its parent otherwise; each dot beyond the first climbs one more package.
+fn resolve_relative_module(
+    current_module: &str,
+    current_is_init: bool,
+    level: usize,
+    module: &str,
+) -> String {
+    if level == 0 {
+        return module.to_string();
+    }
+    let mut segments: Vec<&str> = current_module.split('.').collect();
+    if !current_is_init {
+        // A plain module lives inside its parent package.
+        segments.pop();
+    }
+    // One dot means "this package", so only `level - 1` extra steps up.
+    let keep = segments.len().saturating_sub(level - 1);
+    segments.truncate(keep);
+    if !module.is_empty() {
+        segments.extend(module.split('.'));
+    }
+    segments.join(".")
+}
+
+/// Collapse `.`, `..` and empty segments in a `/`-separated path.
+///
+/// A `..` with nothing left to pop is dropped, so the result never climbs
+/// above the starting directory.
+fn normalize_posix_path(path: &str) -> String {
+    let mut stack: Vec<&str> = Vec::new();
+    for segment in path.split('/') {
+        match segment {
+            "" | "." => {}
+            ".." => {
+                stack.pop();
+            }
+            other => stack.push(other),
+        }
+    }
+    stack.join("/")
+}
+
+/// Find the directory that `crate::` paths of `file_relative` start from.
+///
+/// Walks the file's ancestor directories, nearest first, and stops at the
+/// first one that has a scanned `Cargo.toml` or any scanned file under its
+/// `src/` directory. The answer is that `src` directory when it holds scanned
+/// files, otherwise the ancestor itself. `parent` (the file's own directory)
+/// is the fallback when nothing is found or the answer would be empty.
+fn rust_crate_root(
+    file_relative: &str,
+    parent: &str,
+    known_files: &HashMap<String, &NativeFileInfo>,
+) -> String {
+    let mut curr = Path::new(file_relative).parent();
+    while let Some(c) = curr {
+        let c_str = c.to_string_lossy().replace("\\", "/");
+        let (cargo_toml, src_dir) = if c_str.is_empty() {
+            ("Cargo.toml".to_string(), "src".to_string())
+        } else {
+            (format!("{}/Cargo.toml", c_str), format!("{}/src", c_str))
+        };
+        let src_prefix = format!("{}/", src_dir);
+
+        let has_cargo = known_files.contains_key(&cargo_toml);
+        let has_src = known_files
+            .keys()
+            .any(|k| k.starts_with(&src_prefix) || *k == src_dir);
+
+        if has_cargo || has_src {
+            let root = if has_src { src_dir } else { c_str };
+            if !root.is_empty() {
+                return root;
+            }
+            break;
+        }
+        curr = c.parent();
+    }
+    parent.to_string()
+}
+
+/// Build the file-to-file dependency edges for a scanned project.
+///
+/// Every non-binary file of kind "code" is read from `root` and scanned with
+/// per-language regular expressions (Python, JavaScript/TypeScript, Rust, Go,
+/// C/C++). Only imports that resolve to another file in `files` produce an
+/// edge; a file never gets an edge to itself. Unreadable files are skipped.
+///
+/// * `root` - directory that the `relative` paths in `files` are relative to.
+/// * `files` - scan results; paths are expected to use `/` separators.
+/// * `python_source_roots` - directories that Python module names are relative to.
+/// * `python_module_init_files` - file names that denote a package (e.g. `__init__.py`).
+///
+/// Edge order is not guaranteed: Python imports are collected in a `HashSet`.
+#[pyfunction]
+pub fn build_import_graph(
+    root: &str,
+    files: Vec<NativeFileInfo>,
+    python_source_roots: Vec<String>,
+    python_module_init_files: Vec<String>,
+) -> PyResult<Vec<NativeImportEdge>> {
+    let mut edges = Vec::new();
+    if files.is_empty() {
+        return Ok(edges);
+    }
+
+    // Despite the name, the keys are the project-relative paths.
+    let absolute_to_file: HashMap<String, &NativeFileInfo> =
+        files.iter().map(|f| (f.relative.clone(), f)).collect();
+
+    // Directory -> files directly inside it (used for Go package imports).
+    let mut dir_to_files: HashMap<String, Vec<String>> = HashMap::new();
+    for file in &files {
+        let parent = Path::new(&file.relative)
+            .parent()
+            .unwrap_or(Path::new(""))
+            .to_string_lossy()
+            .replace("\\", "/");
+        dir_to_files
+            .entry(parent)
+            .or_default()
+            .push(file.relative.clone());
+    }
+
+    // Pre-calculate Python module map
+    let mut module_to_path: HashMap<String, String> = HashMap::new();
+    let mut path_to_module: HashMap<String, String> = HashMap::new();
+    for file in &files {
+        if let Some(mod_name) = module_name_for_file(
+            &file.relative,
+            &python_source_roots,
+            &python_module_init_files,
+        ) {
+            path_to_module.insert(file.relative.clone(), mod_name.clone());
+            // First file seen for a module name wins.
+            module_to_path
+                .entry(mod_name)
+                .or_insert_with(|| file.relative.clone());
+        }
+    }
+
+    // Go module resolution
+    let mut go_module_name = None;
+    let go_mod_path = Path::new(root).join("go.mod");
+    if go_mod_path.exists() {
+        if let Ok(content) = std::fs::read_to_string(go_mod_path) {
+            let go_mod_re = Regex::new(r"(?m)^\s*module\s+(\S+)").unwrap();
+            if let Some(m) = go_mod_re.captures(&content) {
+                go_module_name = Some(m.get(1).unwrap().as_str().to_string());
+            }
+        }
+    }
+
+    // Regex compile
+    let py_import_re = Regex::new(r"(?m)^\s*import\s+([a-zA-Z0-9_.,\t ]+)").unwrap();
+    let py_from_paren_re =
+        Regex::new(r"(?m)^\s*from\s+(\.+[a-zA-Z0-9_.]*|[a-zA-Z0-9_.]+)\s+import\s+\(([^)]+)\)")
+            .unwrap();
+    let py_from_simple_re = Regex::new(
+        r"(?m)^\s*from\s+(\.+[a-zA-Z0-9_.]*|[a-zA-Z0-9_.]+)\s+import\s+([a-zA-Z0-9_.,\t ]+)",
+    )
+    .unwrap();
+
+    let js_import_re = Regex::new(r#"(?:import|export)\s+(?:[\w*\s{},]*\s+from\s+)?['"]([^'"]+)['"]|require\s*\(\s*['"]([^'"]+)['"]\s*\)"#).unwrap();
+
+    let rust_mod_re = Regex::new(r"\bmod\s+(\w+)\s*;").unwrap();
+    let rust_use_re = Regex::new(r"\buse\s+([^;]+)\s*;").unwrap();
+
+    let go_import_single_re = Regex::new(r#"\bimport\s+['"]([^'"]+)['"]"#).unwrap();
+    let go_import_block_re = Regex::new(r"(?s)\bimport\s*\(([^)]+)\)").unwrap();
+
+    let cpp_include_re = Regex::new(r#"#include\s*["<]([^">]+)[">]"#).unwrap();
+
+    for file in &files {
+        if file.kind != "code" || file.is_binary {
+            continue;
+        }
+
+        let file_abs_path = Path::new(root).join(&file.relative);
+        let mut source = match std::fs::read(&file_abs_path) {
+            Ok(bytes) => String::from_utf8_lossy(&bytes).to_string(),
+            Err(_) => continue,
+        };
+
+        if file.language == "python" {
+            // Drop everything after the first '#' on each line, then join
+            // backslash continuations. A '#' inside a string literal is cut too.
+            let normalized = source.replace("\r\n", "\n");
+            let mut clean = String::new();
+            for line in normalized.lines() {
+                if let Some(idx) = line.find('#') {
+                    clean.push_str(&line[..idx]);
+                } else {
+                    clean.push_str(line);
+                }
+                clean.push('\n');
+            }
+            source = clean.replace("\\\n", " ");
+        }
+
+        // (module, is plain `import`, relative level, imported names)
+        let mut resolved_set: HashSet<(String, bool, usize, Vec<String>)> = HashSet::new();
+
+        if file.language == "python" {
+            if let Some(current_module) = path_to_module.get(&file.relative) {
+                let current_is_init = file.relative.ends_with("__init__.py");
+
+                // Parse standard imports
+                for cap in py_import_re.captures_iter(&source) {
+                    if let Some(m) = cap.get(1) {
+                        for alias in m.as_str().split(',') {
+                            // "x as y" -> keep only "x".
+                            let parts: Vec<&str> = alias.split_whitespace().collect();
+                            if !parts.is_empty() {
+                                let imported_module = parts[0].to_string();
+                                resolved_set.insert((imported_module, true, 0, Vec::new()));
+                            }
+                        }
+                    }
+                }
+
+                // Parse `from ... import (...)` and then `from ... import ...`.
+                for from_re in [&py_from_paren_re, &py_from_simple_re] {
+                    for cap in from_re.captures_iter(&source) {
+                        let from_module = cap.get(1).unwrap().as_str().trim().to_string();
+                        let names_str = cap.get(2).unwrap().as_str().trim();
+                        let mut names = Vec::new();
+                        for name in names_str.split(',') {
+                            let parts: Vec<&str> = name.split_whitespace().collect();
+                            if !parts.is_empty() && parts[0] != "*" {
+                                names.push(parts[0].to_string());
+                            }
+                        }
+
+                        // Leading dots give the relative-import level.
+                        let module = from_module.trim_start_matches('.').to_string();
+                        let level = from_module.len() - module.len();
+
+                        resolved_set.insert((module, false, level, names));
+                    }
+                }
+
+                // Resolve python imports
+                for (module, is_import, level, names) in resolved_set {
+                    let mut candidates = Vec::new();
+                    if is_import {
+                        candidates.push(module);
+                    } else {
+                        let base = if level > 0 {
+                            resolve_relative_module(current_module, current_is_init, level, &module)
+                        } else {
+                            module
+                        };
+                        // Each imported name may itself be a submodule.
+                        for name in &names {
+                            if !base.is_empty() {
+                                candidates.push(format!("{}.{}", base, name));
+                            } else {
+                                candidates.push(name.clone());
+                            }
+                        }
+                        if !base.is_empty() {
+                            candidates.push(base);
+                        }
+                    }
+
+                    for candidate in candidates {
+                        if candidate.is_empty() {
+                            continue;
+                        }
+                        // Longest dotted prefix that maps to a known file wins.
+                        let parts: Vec<&str> = candidate.split('.').collect();
+                        for end in (1..=parts.len()).rev() {
+                            let mod_name = parts[..end].join(".");
+                            if let Some(target_path) = module_to_path.get(&mod_name) {
+                                if target_path != &file.relative {
+                                    edges.push(NativeImportEdge {
+                                        from: file.relative.clone(),
+                                        to: target_path.clone(),
+                                        kind: "import".to_string(),
+                                    });
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        } else if file.language == "javascript" || file.language == "typescript" {
+            let parent = Path::new(&file.relative)
+                .parent()
+                .unwrap_or(Path::new(""))
+                .to_string_lossy()
+                .replace("\\", "/");
+            for cap in js_import_re.captures_iter(&source) {
+                let spec = cap
+                    .get(1)
+                    .or_else(|| cap.get(2))
+                    .map(|m| m.as_str())
+                    .unwrap_or("");
+                // Only relative specifiers can point at project files.
+                if !spec.starts_with('.') {
+                    continue;
+                }
+
+                let raw_base = if parent.is_empty() {
+                    spec.to_string()
+                } else {
+                    format!("{}/{}", parent, spec)
+                };
+                let base_normalized = normalize_posix_path(&raw_base);
+
+                let mut resolved = false;
+                let extensions = vec!["", ".ts", ".tsx", ".js", ".jsx", ".d.ts"];
+                for ext in extensions {
+                    let cand = if ext.is_empty() {
+                        base_normalized.clone()
+                    } else {
+                        format!("{}{}", base_normalized, ext)
+                    };
+                    if let Some(target) = absolute_to_file.get(&cand) {
+                        if !target.is_binary && target.relative != file.relative {
+                            edges.push(NativeImportEdge {
+                                from: file.relative.clone(),
+                                to: target.relative.clone(),
+                                kind: "import".to_string(),
+                            });
+                            resolved = true;
+                            break;
+                        }
+                    }
+                }
+
+                if !resolved {
+                    // Directory import: fall back to its index file.
+                    let index_names = vec!["index.ts", "index.tsx", "index.js", "index.jsx"];
+                    for idx in index_names {
+                        let cand = format!("{}/{}", base_normalized, idx);
+                        if let Some(target) = absolute_to_file.get(&cand) {
+                            if !target.is_binary && target.relative != file.relative {
+                                edges.push(NativeImportEdge {
+                                    from: file.relative.clone(),
+                                    to: target.relative.clone(),
+                                    kind: "import".to_string(),
+                                });
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        } else if file.language == "rust" {
+            let parent = Path::new(&file.relative)
+                .parent()
+                .unwrap_or(Path::new(""))
+                .to_string_lossy()
+                .replace("\\", "/");
+            let mut mod_specs = Vec::new();
+
+            for cap in rust_mod_re.captures_iter(&source) {
+                if let Some(m) = cap.get(1) {
+                    mod_specs.push(("mod".to_string(), m.as_str().to_string()));
+                }
+            }
+
+            for cap in rust_use_re.captures_iter(&source) {
+                if let Some(m) = cap.get(1) {
+                    let spec = m.as_str().trim();
+                    if spec.contains('{') {
+                        // `a::b::{c, d}` -> `a::b::c`, `a::b::d` (one brace level only).
+                        if let Some(idx) = spec.find('{') {
+                            let base = spec[..idx].trim();
+                            let rest = spec[idx + 1..].replace('}', "");
+                            for part in rest.split(',') {
+                                let part_trimmed = part.trim();
+                                if !part_trimmed.is_empty() {
+                                    mod_specs.push((
+                                        "use".to_string(),
+                                        format!("{}{}", base, part_trimmed),
+                                    ));
+                                }
+                            }
+                        }
+                    } else {
+                        mod_specs.push(("use".to_string(), spec.to_string()));
+                    }
+                }
+            }
+
+            // The crate root depends only on the file, so it is computed at
+            // most once per file instead of once per `use` item.
+            let mut crate_root_cache: Option<String> = None;
+
+            // Resolve rust
+            for (kind, spec) in mod_specs {
+                if kind == "mod" {
+                    let cand1 = if parent.is_empty() {
+                        format!("{}.rs", spec)
+                    } else {
+                        format!("{}/{}.rs", parent, spec)
+                    };
+                    let cand2 = if parent.is_empty() {
+                        format!("{}/mod.rs", spec)
+                    } else {
+                        format!("{}/{}/mod.rs", parent, spec)
+                    };
+                    for cand in &[cand1, cand2] {
+                        if let Some(target) = absolute_to_file.get(cand) {
+                            if target.relative != file.relative {
+                                edges.push(NativeImportEdge {
+                                    from: file.relative.clone(),
+                                    to: target.relative.clone(),
+                                    kind: "mod".to_string(),
+                                });
+                                break;
+                            }
+                        }
+                    }
+                } else {
+                    let parts: Vec<&str> = spec.split("::").collect();
+                    if parts.is_empty() {
+                        continue;
+                    }
+
+                    // Only crate-local paths can resolve to project files.
+                    let base_dir = match parts[0] {
+                        "crate" => crate_root_cache
+                            .get_or_insert_with(|| {
+                                rust_crate_root(&file.relative, &parent, &absolute_to_file)
+                            })
+                            .clone(),
+                        "super" => Path::new(&parent)
+                            .parent()
+                            .unwrap_or(Path::new(""))
+                            .to_string_lossy()
+                            .replace("\\", "/"),
+                        "self" => parent.clone(),
+                        _ => continue,
+                    };
+
+                    let sub_parts = &parts[1..];
+                    if !sub_parts.is_empty() {
+                        let mut resolved = false;
+                        // Try the longest path first; trailing segments may be items, not modules.
+                        for end in (1..=sub_parts.len()).rev() {
+                            let sub_path = sub_parts[..end].join("/");
+                            let path_str = if base_dir.is_empty() {
+                                sub_path
+                            } else {
+                                format!("{}/{}", base_dir, sub_path)
+                            };
+                            let cand1 = format!("{}.rs", path_str);
+                            let cand2 = format!("{}/mod.rs", path_str);
+                            for cand in &[cand1, cand2] {
+                                if let Some(target) = absolute_to_file.get(cand) {
+                                    if target.relative != file.relative {
+                                        edges.push(NativeImportEdge {
+                                            from: file.relative.clone(),
+                                            to: target.relative.clone(),
+                                            kind: "use".to_string(),
+                                        });
+                                        resolved = true;
+                                        break;
+                                    }
+                                }
+                            }
+                            if resolved {
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        } else if file.language == "go" {
+            let mut specs = Vec::new();
+            for cap in go_import_single_re.captures_iter(&source) {
+                specs.push(cap.get(1).unwrap().as_str().to_string());
+            }
+            for cap in go_import_block_re.captures_iter(&source) {
+                let block = cap.get(1).unwrap().as_str();
+                for line in block.lines() {
+                    let line_trimmed = line.trim();
+                    if line_trimmed.starts_with("//") {
+                        continue;
+                    }
+                    // Take the text between the first pair of double quotes.
+                    if let Some(idx) = line_trimmed.find('"') {
+                        let sub = &line_trimmed[idx + 1..];
+                        if let Some(end) = sub.find('"') {
+                            specs.push(sub[..end].to_string());
+                        }
+                    }
+                }
+            }
+
+            if let Some(ref mod_name) = go_module_name {
+                for spec in specs {
+                    let Some(rest) = spec.strip_prefix(mod_name.as_str()) else {
+                        continue;
+                    };
+                    // Require a path boundary so module `example.com/foo` does
+                    // not claim imports of `example.com/foobar/...`.
+                    if !(rest.is_empty() || rest.starts_with('/')) {
+                        continue;
+                    }
+                    let rel_spec = rest.trim_start_matches('/').to_string();
+                    if let Some(targets) = dir_to_files.get(&rel_spec) {
+                        // A Go import names a package: link to every .go file in the directory.
+                        for target in targets {
+                            if target.ends_with(".go") && target != &file.relative {
+                                edges.push(NativeImportEdge {
+                                    from: file.relative.clone(),
+                                    to: target.clone(),
+                                    kind: "import".to_string(),
+                                });
+                            }
+                        }
+                    }
+                }
+            }
+        } else if file.language == "c" || file.language == "cpp" {
+            let parent = Path::new(&file.relative)
+                .parent()
+                .unwrap_or(Path::new(""))
+                .to_string_lossy()
+                .replace("\\", "/");
+            for cap in cpp_include_re.captures_iter(&source) {
+                let spec = cap.get(1).unwrap().as_str();
+                let raw_base = if parent.is_empty() {
+                    spec.to_string()
+                } else {
+                    format!("{}/{}", parent, spec)
+                };
+                let base_normalized = normalize_posix_path(&raw_base);
+
+                if let Some(target) = absolute_to_file.get(&base_normalized) {
+                    if !target.is_binary && target.relative != file.relative {
+                        edges.push(NativeImportEdge {
+                            from: file.relative.clone(),
+                            to: target.relative.clone(),
+                            kind: "include".to_string(),
+                        });
+                    }
+                } else {
+                    // Not next to the including file: take any scanned file whose
+                    // path ends with the include spec. When several match, which
+                    // one is picked depends on HashMap iteration order.
+                    for (rel_path, target) in &absolute_to_file {
+                        if target.is_binary {
+                            continue;
+                        }
+                        if (*rel_path == spec || rel_path.ends_with(&format!("/{}", spec)))
+                            && target.relative != file.relative
+                        {
+                            edges.push(NativeImportEdge {
+                                from: file.relative.clone(),
+                                to: target.relative.clone(),
+                                kind: "include".to_string(),
+                            });
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(edges)
+}
diff --git a/rust/scriber_native/src/io.rs b/rust/scriber_native/src/io.rs
new file mode 100644
index 0000000..2d4508b
--- /dev/null
+++ b/rust/scriber_native/src/io.rs
@@ -0,0 +1,48 @@
+use pyo3::exceptions::PyOSError;
+use pyo3::prelude::*;
+use std::fs;
+use std::path::Path;
+
+/// Wrap an I/O failure in a Python `OSError` whose message reads
+/// "<context>: <path>: <err>".
+pub fn io_err(context: &str, path: &str, err: std::io::Error) -> PyErr {
+    let message = format!("{}: {}: {}", context, path, err);
+    PyOSError::new_err(message)
+}
+
+/// Read a whole file and decode it as UTF-8, replacing invalid byte
+/// sequences with U+FFFD instead of failing.
+///
+/// Returns a Python `OSError` when the file cannot be read.
+pub fn read_text_lossy_native(path: &str) -> PyResult<String> {
+    let bytes = fs::read(path).map_err(|e| io_err("Failed to read", path, e))?;
+    Ok(String::from_utf8_lossy(&bytes).into_owned())
+}
+
+/// Write `content` to `path`, creating any missing parent directories first.
+///
+/// Both failure modes surface as a Python `OSError`; the message always
+/// names `path`, even when creating the parent directory is what failed.
+pub fn write_text_native(path: &str, content: &str) -> PyResult<()> {
+    let target = Path::new(path);
+    match target.parent() {
+        Some(dir) => {
+            fs::create_dir_all(dir)
+                .map_err(|e| io_err("Failed to create parent directory", path, e))?;
+        }
+        None => {}
+    }
+    fs::write(path, content).map_err(|e| io_err("Failed to write", path, e))
+}
+
+/// `PyResult` wrapper around `is_binary` for string paths; never returns `Err`.
+pub fn is_binary_native(path: &str) -> PyResult<bool> {
+    Ok(is_binary(Path::new(path)))
+}
+
+/// Heuristic binary check: a file is binary when a NUL byte shows up in the
+/// data returned by a single read of up to 4096 bytes.
+///
+/// Files that cannot be opened or read are reported as binary.
+pub fn is_binary(path: &Path) -> bool {
+    use std::io::Read;
+
+    let mut head = [0u8; 4096];
+    let opened = std::fs::File::open(path);
+    let read_len = match opened.and_then(|mut handle| handle.read(&mut head)) {
+        Ok(len) => len,
+        Err(_) => return true,
+    };
+    memchr::memchr(0, &head[..read_len]).is_some()
+}
+
+/// Read several files with `read_text_lossy_native`, preserving input order.
+///
+/// Stops at the first failure and returns its `OSError`; no partial result
+/// is returned.
+pub fn read_many_text_native(paths: Vec<String>) -> PyResult<Vec<String>> {
+    paths
+        .into_iter()
+        .map(|path| read_text_lossy_native(&path))
+        .collect()
+}
diff --git a/rust/scriber_native/src/lib.rs b/rust/scriber_native/src/lib.rs
new file mode 100644
index 0000000..4b854dd
--- /dev/null
+++ b/rust/scriber_native/src/lib.rs
@@ -0,0 +1,92 @@
+use pyo3::prelude::*;
+
+mod import;
+mod io;
+mod render;
+mod scan;
+mod score;
+
+/// Python `read_text(path)`: lossy UTF-8 read of one file.
+#[pyfunction]
+#[pyo3(name = "read_text")]
+fn read_text(path: &str) -> PyResult<String> {
+    io::read_text_lossy_native(path)
+}
+
+/// Python `write_text(path, content)`: delegates to `io::write_text_native`,
+/// which creates missing parent directories before writing.
+#[pyfunction]
+#[pyo3(name = "write_text")]
+fn write_text(path: &str, content: &str) -> PyResult<()> {
+    io::write_text_native(path, content)
+}
+
+/// Python `is_probably_binary(path)`: NUL-byte heuristic on the first bytes
+/// of the file; unreadable files count as binary.
+#[pyfunction]
+#[pyo3(name = "is_probably_binary")]
+fn is_probably_binary(path: &str) -> PyResult<bool> {
+    io::is_binary_native(path)
+}
+
+/// Python `read_many_text(paths)`: lossy UTF-8 read of each path, in order.
+/// The first unreadable path aborts the whole call with `OSError`.
+#[pyfunction]
+#[pyo3(name = "read_many_text")]
+fn read_many_text(paths: Vec<String>) -> PyResult<Vec<String>> {
+    io::read_many_text_native(paths)
+}
+
+/// Python `scan_project(...)`: walk `root_path` and classify every file that
+/// matches a code or support pattern. Arguments are forwarded unchanged to
+/// `scan::scan_project_native`.
+#[pyfunction]
+#[pyo3(name = "scan_project")]
+#[allow(clippy::too_many_arguments)]
+fn scan_project(
+    root_path: &str,
+    use_gitignore: bool,
+    hard_ignore_patterns: Vec<String>,
+    code_patterns: Vec<String>,
+    support_patterns: Vec<String>,
+    support_full_patterns: Vec<String>,
+    support_tree_only_patterns: Vec<String>,
+    support_default_policy: String,
+    support_enabled: bool,
+) -> PyResult<Vec<scan::NativeFileInfo>> {
+    scan::scan_project_native(
+        root_path,
+        use_gitignore,
+        hard_ignore_patterns,
+        code_patterns,
+        support_patterns,
+        support_full_patterns,
+        support_tree_only_patterns,
+        support_default_policy,
+        support_enabled,
+    )
+}
+
+/// Version number of the native API exposed to Python; currently the constant 1.
+// NOTE(review): presumably checked by the Python loader for compatibility — confirm in `scriber/native.py`.
+#[pyfunction]
+fn native_api_version() -> u32 {
+    1
+}
+
+/// Return "scriber-native <crate version> <target OS>", e.g. for diagnostics.
+#[pyfunction]
+fn build_info() -> PyResult<String> {
+    Ok(format!(
+        "scriber-native {} {}",
+        env!("CARGO_PKG_VERSION"),
+        std::env::consts::OS
+    ))
+}
+
+/// Module initialiser for `scriber._native`: registers the classes and
+/// functions that the Python package imports.
+#[pymodule]
+#[allow(deprecated)]
+fn _native(_py: Python, m: &PyModule) -> PyResult<()> {
+    // Classes returned by the functions below.
+    m.add_class::<scan::NativeFileInfo>()?;
+    m.add_class::<import::NativeImportEdge>()?;
+    m.add_class::<score::NativeCandidate>()?;
+    m.add_class::<score::NativePackOptions>()?;
+    m.add_function(wrap_pyfunction!(read_text, m)?)?;
+    m.add_function(wrap_pyfunction!(write_text, m)?)?;
+    m.add_function(wrap_pyfunction!(is_probably_binary, m)?)?;
+    m.add_function(wrap_pyfunction!(read_many_text, m)?)?;
+    m.add_function(wrap_pyfunction!(scan_project, m)?)?;
+    m.add_function(wrap_pyfunction!(import::build_import_graph, m)?)?;
+    m.add_function(wrap_pyfunction!(score::score_candidates_native, m)?)?;
+    m.add_function(wrap_pyfunction!(render::render_tree, m)?)?;
+    m.add_function(wrap_pyfunction!(native_api_version, m)?)?;
+    m.add_function(wrap_pyfunction!(build_info, m)?)?;
+    Ok(())
+}
diff --git a/rust/scriber_native/src/render.rs b/rust/scriber_native/src/render.rs
new file mode 100644
index 0000000..c3189c6
--- /dev/null
+++ b/rust/scriber_native/src/render.rs
@@ -0,0 +1,42 @@
+use pyo3::prelude::*;
+use std::collections::BTreeMap;
+
+/// Directory-tree node: child name -> subtree. A `BTreeMap` keeps siblings
+/// sorted by name, so rendering is deterministic.
+#[derive(Default)]
+struct TreeNode {
+    children: BTreeMap<String, TreeNode>,
+}
+
+/// Render the children of `node` as `tree`-style lines, depth first.
+///
+/// `prefix` is the indentation inherited from the ancestors. The last child
+/// of a node gets an elbow connector and its descendants are indented with
+/// blanks; every other child gets a tee and a vertical rule.
+fn walk(node: &TreeNode, prefix: &str) -> Vec<String> {
+    let mut lines = Vec::new();
+    let last_index = node.children.len().saturating_sub(1);
+    for (index, (name, child)) in node.children.iter().enumerate() {
+        let is_last = index == last_index;
+        let branch = if is_last { "└── " } else { "├── " };
+        lines.push(format!("{}{}{}", prefix, branch, name));
+        let extension = if is_last { "    " } else { "│   " };
+        lines.extend(walk(child, &format!("{}{}", prefix, extension)));
+    }
+    lines
+}
+
+/// Render a list of file paths as a text tree rooted at ".".
+///
+/// Backslashes are treated as separators; empty and "." segments are
+/// ignored. With no usable paths the result is just ".".
+#[pyfunction]
+pub fn render_tree(paths: Vec<String>) -> PyResult<String> {
+    let mut root = TreeNode::default();
+    for path_str in paths {
+        let mut curr = &mut root;
+        // Support both backslash and forward slash
+        let clean_path = path_str.replace("\\", "/");
+        for part in clean_path.split('/') {
+            if part.is_empty() || part == "." {
+                continue;
+            }
+            curr = curr.children.entry(part.to_string()).or_default();
+        }
+    }
+
+    if root.children.is_empty() {
+        Ok(".".to_string())
+    } else {
+        Ok(format!(".\n{}", walk(&root, "").join("\n")))
+    }
+}
diff --git a/rust/scriber_native/src/scan.rs b/rust/scriber_native/src/scan.rs
new file mode 100644
index 0000000..ce43103
--- /dev/null
+++ b/rust/scriber_native/src/scan.rs
@@ -0,0 +1,356 @@
+use globset::GlobBuilder;
+use ignore::WalkBuilder;
+use pyo3::prelude::*;
+use std::path::Path;
+
+/// One scanned file, exposed to Python with read-only attributes.
+#[pyclass]
+#[derive(Clone)]
+pub struct NativeFileInfo {
+    /// Path relative to the scan root, with `/` separators.
+    #[pyo3(get)]
+    pub relative: String,
+    /// "code" or "support".
+    #[pyo3(get)]
+    pub kind: String,
+    /// Language label derived from the file name (see `language_for`).
+    #[pyo3(get)]
+    pub language: String,
+    /// File size in bytes as reported by the file system.
+    #[pyo3(get)]
+    pub size_bytes: u64,
+    /// Result of the NUL-byte heuristic in `crate::io::is_binary`.
+    #[pyo3(get)]
+    pub is_binary: bool,
+    /// Category label for support files; `None` for code files.
+    #[pyo3(get)]
+    pub support_category: Option<String>,
+    /// "tree_only", "full", or the configured default; "auto" for code files.
+    #[pyo3(get)]
+    pub content_policy: String,
+    /// Modification time in nanoseconds since the Unix epoch; 0 when unavailable.
+    #[pyo3(get)]
+    pub mtime_ns: u64,
+}
+
+/// A user pattern compiled once so that matching does no parsing.
+#[derive(Clone)]
+pub struct PreparedPattern {
+    /// Pattern with `/` separators and one leading / trailing slash removed.
+    pub normalized_pat: String,
+    /// For patterns ending in `/**`: the directory part before it.
+    pub prefix_star_star: Option<String>,
+    /// Glob matcher for the whole normalized pattern.
+    pub matcher: globset::GlobMatcher,
+    /// For patterns starting with `**/`: matcher for the remainder, so the
+    /// pattern also applies to files directly in the root.
+    pub double_star_short_matcher: Option<globset::GlobMatcher>,
+}
+
+/// A set of prepared patterns; a path matches when any one pattern matches.
+#[derive(Clone)]
+pub struct PathMatcher {
+    patterns: Vec<PreparedPattern>,
+}
+
+impl PathMatcher {
+    /// Compile `raw_patterns` into a matcher.
+    ///
+    /// Each pattern is normalized (backslashes to `/`, surrounding whitespace
+    /// trimmed, one leading and one trailing `/` removed). Blank patterns and
+    /// patterns that are not valid globs are dropped without an error.
+    pub fn new(raw_patterns: &[String]) -> Self {
+        let mut prepared = Vec::new();
+        for raw in raw_patterns {
+            let slashed = raw.replace("\\", "/");
+            let trimmed = slashed.trim();
+            if trimmed.is_empty() {
+                continue;
+            }
+            let without_lead = trimmed.strip_prefix('/').unwrap_or(trimmed);
+            let pat = without_lead
+                .strip_suffix('/')
+                .unwrap_or(without_lead)
+                .to_string();
+
+            // `dir/**` also matches `dir` itself and everything below it.
+            let prefix_star_star = pat
+                .strip_suffix("/**")
+                .map(|dir| dir.trim_matches('/').to_string());
+
+            // `**/x` should also match `x` at the root.
+            let double_star_short_matcher = pat
+                .strip_prefix("**/")
+                .and_then(|short| GlobBuilder::new(short).literal_separator(false).build().ok())
+                .map(|glob| glob.compile_matcher());
+
+            let compiled = GlobBuilder::new(&pat).literal_separator(false).build();
+            if let Ok(glob) = compiled {
+                prepared.push(PreparedPattern {
+                    matcher: glob.compile_matcher(),
+                    normalized_pat: pat,
+                    prefix_star_star,
+                    double_star_short_matcher,
+                });
+            }
+        }
+        PathMatcher { patterns: prepared }
+    }
+
+    /// Return true when `rel_path` matches any pattern.
+    ///
+    /// A pattern matches on: exact equality, being the `dir/**` prefix, the
+    /// full glob, the file name alone (patterns without `/`), or the part
+    /// after `**/` against the path or the file name.
+    pub fn matches(&self, rel_path: &str) -> bool {
+        if self.patterns.is_empty() {
+            return false;
+        }
+        let rel = rel_path.replace("\\", "/").trim_matches('/').to_string();
+        // `rsplit` always yields at least one item, so this is the last segment.
+        let filename = rel.rsplit('/').next().unwrap_or(rel.as_str());
+
+        self.patterns.iter().any(|p| {
+            if rel == p.normalized_pat {
+                return true;
+            }
+            let under_prefix = p.prefix_star_star.as_ref().is_some_and(|prefix| {
+                rel == *prefix
+                    || rel
+                        .strip_prefix(prefix.as_str())
+                        .is_some_and(|rest| rest.starts_with('/'))
+            });
+            if under_prefix || p.matcher.is_match(&rel) {
+                return true;
+            }
+            if !p.normalized_pat.contains('/') && p.matcher.is_match(filename) {
+                return true;
+            }
+            p.double_star_short_matcher
+                .as_ref()
+                .is_some_and(|short| short.is_match(&rel) || short.is_match(filename))
+        })
+    }
+}
+
+/// Convert a path to a string with `/` separators (lossy for non-UTF-8 names).
+fn to_posix_string(path: &Path) -> String {
+    let text = path.to_string_lossy();
+    text.replace('\\', "/")
+}
+
+/// Map a file name to a language label.
+///
+/// Names starting with "Dockerfile" are "dockerfile"; otherwise the text
+/// after the last '.' decides, case-insensitively. Unknown or missing
+/// extensions fall back to "text".
+fn language_for(name: &str) -> String {
+    if name.starts_with("Dockerfile") {
+        return "dockerfile".to_string();
+    }
+    let extension = name
+        .rsplit_once('.')
+        .map(|(_, ext)| ext.to_lowercase())
+        .unwrap_or_default();
+    let label = match extension.as_str() {
+        "py" | "pyi" => "python",
+        "rs" => "rust",
+        "js" | "jsx" => "javascript",
+        "ts" | "tsx" => "typescript",
+        "go" => "go",
+        "java" => "java",
+        "kt" => "kotlin",
+        "c" | "h" => "c",
+        "cpp" | "hpp" | "cc" | "cxx" | "hh" | "hxx" => "cpp",
+        "toml" => "toml",
+        "yaml" | "yml" => "yaml",
+        "json" => "json",
+        "md" => "markdown",
+        "rst" => "rst",
+        "txt" => "text",
+        "ini" | "cfg" => "ini",
+        "lock" => "lock",
+        _ => "text",
+    };
+    label.to_string()
+}
+
+/// Classify a support file by its name and relative path.
+///
+/// Rules are checked in order and the first hit wins. Because every `.toml`
+/// file is caught by the "project config" rule, `Cargo.toml` never reaches
+/// the "dependency file" rule even though it is listed there.
+fn support_category(rel_s: &str, name: &str) -> String {
+    let is_project_config = name.ends_with(".toml")
+        || matches!(
+            name,
+            "pyproject.toml"
+                | "setup.py"
+                | "setup.cfg"
+                | "tox.ini"
+                | "pytest.ini"
+                | "mypy.ini"
+                | "ruff.toml"
+                | ".ruff.toml"
+        );
+    let is_dependency_file = name.ends_with(".lock")
+        || rel_s.starts_with("requirements/")
+        || matches!(
+            name,
+            "requirements.txt"
+                | "poetry.lock"
+                | "uv.lock"
+                | "Pipfile"
+                | "Pipfile.lock"
+                | "package.json"
+                | "package-lock.json"
+                | "pnpm-lock.yaml"
+                | "yarn.lock"
+                | "Cargo.toml"
+                | "Cargo.lock"
+                | "go.mod"
+                | "go.sum"
+        );
+    let is_documentation = name.starts_with("README")
+        || rel_s.starts_with("docs/")
+        || matches!(name, "CHANGELOG.md" | "CONTRIBUTING.md");
+    let is_runtime_support = ["Dockerfile", "docker-compose", "compose"]
+        .iter()
+        .any(|prefix| name.starts_with(prefix));
+    let is_ci_support = rel_s.starts_with(".github/workflows/") || name == ".gitlab-ci.yml";
+    let is_runtime_config = name.starts_with(".env")
+        || rel_s.starts_with("config/")
+        || rel_s.starts_with("settings/");
+    let is_tooling_config = matches!(name, ".pre-commit-config.yaml" | "tsconfig.json")
+        || name.starts_with("vite.config")
+        || name.starts_with("webpack.config");
+
+    let label = if is_project_config {
+        "project config"
+    } else if is_dependency_file {
+        "dependency file"
+    } else if is_documentation {
+        "documentation"
+    } else if is_runtime_support {
+        "runtime support"
+    } else if is_ci_support {
+        "ci support"
+    } else if is_runtime_config {
+        "runtime config"
+    } else if is_tooling_config {
+        "tooling config"
+    } else {
+        "support file"
+    };
+    label.to_string()
+}
+
+/// Walk `root_path` and return every file that matches the whitelist.
+///
+/// A file is kept as "code" when it matches `code_patterns`, otherwise as
+/// "support" when support files are enabled and it matches
+/// `support_patterns`; everything else is skipped. Support files get a
+/// category and a content policy: `tree_only` patterns win over `full`
+/// patterns, and `support_default_policy` applies when neither matches.
+///
+/// Paths matching `hard_ignore_patterns` are excluded; an ignored directory
+/// is not descended into. Entries that cannot be read are skipped silently.
+#[allow(clippy::too_many_arguments)]
+pub fn scan_project_native(
+    root_path: &str,
+    use_gitignore: bool,
+    hard_ignore_patterns: Vec<String>,
+    code_patterns: Vec<String>,
+    support_patterns: Vec<String>,
+    support_full_patterns: Vec<String>,
+    support_tree_only_patterns: Vec<String>,
+    support_default_policy: String,
+    support_enabled: bool,
+) -> PyResult<Vec<NativeFileInfo>> {
+    let root = Path::new(root_path);
+    let hard_ignore_matcher = PathMatcher::new(&hard_ignore_patterns);
+    let code_matcher = PathMatcher::new(&code_patterns);
+    let support_matcher = PathMatcher::new(&support_patterns);
+    let support_tree_only_matcher = PathMatcher::new(&support_tree_only_patterns);
+    let support_full_matcher = PathMatcher::new(&support_full_patterns);
+
+    let mut builder = WalkBuilder::new(root);
+    builder.standard_filters(use_gitignore);
+    // Set after `standard_filters`, so hidden files are always visited.
+    builder.hidden(false);
+
+    // Prune hard-ignored directories during the walk instead of filtering
+    // their contents afterwards.
+    let hard_ignore_matcher_clone = hard_ignore_matcher.clone();
+    let root_clone = root.to_path_buf();
+    builder.filter_entry(move |entry| {
+        if let Ok(rel) = entry.path().strip_prefix(&root_clone) {
+            let rel_s = to_posix_string(rel);
+            if rel_s != "." && !rel_s.is_empty() && hard_ignore_matcher_clone.matches(&rel_s) {
+                return false;
+            }
+        }
+        true
+    });
+
+    let mut file_infos = Vec::new();
+
+    for result in builder.build() {
+        let entry = match result {
+            Ok(e) => e,
+            Err(_) => continue,
+        };
+
+        if !entry.file_type().is_some_and(|ft| ft.is_file()) {
+            continue;
+        }
+
+        let path = entry.path();
+        let rel = match path.strip_prefix(root) {
+            Ok(r) => r,
+            Err(_) => continue,
+        };
+        let rel_s = to_posix_string(rel);
+
+        if rel_s.is_empty() {
+            continue;
+        }
+
+        if hard_ignore_matcher.matches(&rel_s) {
+            continue;
+        }
+
+        // Non-UTF-8 file names are treated as an empty name.
+        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
+
+        let kind;
+        let mut category = None;
+        let mut policy = "auto".to_string();
+
+        if code_matcher.matches(&rel_s) {
+            kind = "code";
+        } else if support_enabled && support_matcher.matches(&rel_s) {
+            kind = "support";
+            category = Some(support_category(&rel_s, name));
+            if support_tree_only_matcher.matches(&rel_s) {
+                policy = "tree_only".to_string();
+            } else if support_full_matcher.matches(&rel_s) {
+                policy = "full".to_string();
+            } else {
+                policy = support_default_policy.clone();
+            }
+        } else {
+            // Whitelist policy: unmatched files are not reported at all.
+            continue;
+        }
+
+        let metadata = match entry.metadata() {
+            Ok(m) => m,
+            Err(_) => continue,
+        };
+        let size_bytes = metadata.len();
+
+        let mtime_ns = match metadata.modified() {
+            Ok(t) => t
+                .duration_since(std::time::SystemTime::UNIX_EPOCH)
+                .map_or(0, |d| d.as_nanos() as u64),
+            Err(_) => 0,
+        };
+
+        let is_binary = crate::io::is_binary(path);
+
+        file_infos.push(NativeFileInfo {
+            relative: rel_s,
+            kind: kind.to_string(),
+            language: language_for(name),
+            size_bytes,
+            is_binary,
+            support_category: category,
+            content_policy: policy,
+            mtime_ns,
+        });
+    }
+
+    Ok(file_infos)
+}
+
+// Unit tests for the pattern rules implemented by `PathMatcher`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // `**/x` matches at any depth, including directly in the root.
+    #[test]
+    fn matches_double_star_suffix() {
+        let matcher = PathMatcher::new(&["**/*.py".to_string()]);
+        assert!(matcher.matches("src/main.py"));
+        assert!(matcher.matches("main.py"));
+        assert!(!matcher.matches("src/main.rs"));
+    }
+
+    // `dir/**` matches below `dir` but not a file that merely shares the name.
+    #[test]
+    fn matches_dir_prefix() {
+        let matcher = PathMatcher::new(&["target/**".to_string()]);
+        assert!(matcher.matches("target/debug/x"));
+        assert!(!matcher.matches("src/target.rs"));
+    }
+
+    // A pattern without `/` is compared against the file name at any depth.
+    #[test]
+    fn matches_basename() {
+        let matcher = PathMatcher::new(&["Cargo.toml".to_string()]);
+        assert!(matcher.matches("Cargo.toml"));
+        assert!(matcher.matches("crates/a/Cargo.toml"));
+    }
+}
diff --git a/rust/scriber_native/src/score.rs b/rust/scriber_native/src/score.rs
new file mode 100644
index 0000000..a4efbff
--- /dev/null
+++ b/rust/scriber_native/src/score.rs
@@ -0,0 +1,817 @@
+use crate::import::NativeImportEdge;
+use crate::scan::NativeFileInfo;
+use pyo3::prelude::*;
+use std::collections::{HashMap, HashSet};
+use std::path::Path;
+
+/// One scored file handed back to Python by `score_candidates_native`.
+///
+/// `path` and `kind` are read-only from Python; the remaining fields are
+/// exposed with setters.
+#[pyclass]
+#[derive(Clone, Debug)]
+pub struct NativeCandidate {
+    /// Path of the file, copied from `NativeFileInfo::relative`.
+    #[pyo3(get)]
+    pub path: String,
+    /// File kind copied from the scan result (`"code"` or `"support"`).
+    #[pyo3(get)]
+    pub kind: String,
+    /// Final relevance score; higher sorts first.
+    #[pyo3(get, set)]
+    pub score: i32,
+    /// De-duplicated, human-readable reasons that contributed to the score.
+    #[pyo3(get, set)]
+    pub reasons: Vec<String>,
+    /// Short `"; "`-joined digest of the reasons.
+    #[pyo3(get, set)]
+    pub reason_summary: String,
+    /// Initialised to `true` by the scorer.
+    #[pyo3(get, set)]
+    pub include_content: bool,
+    /// Initialised to `None` by the scorer.
+    // NOTE(review): element type assumed to be `String` - confirm against the Python model.
+    #[pyo3(get, set)]
+    pub omitted_reason: Option<String>,
+}
+
+/// Scoring configuration passed from Python to `score_candidates_native`.
+///
+/// Every field is readable and writable from Python.
+#[pyclass]
+#[derive(Clone, Debug)]
+pub struct NativePackOptions {
+    /// `"project_snapshot"` selects snapshot scoring; any other value selects
+    /// focused (seed-based) scoring.
+    #[pyo3(get, set)]
+    pub mode: String,
+    /// Upper bound on returned candidates; `0` disables the limit.
+    #[pyo3(get, set)]
+    pub max_files: usize,
+    /// A candidate is kept when its score reaches `min_score` or `tree_min_score`.
+    #[pyo3(get, set)]
+    pub min_score: i32,
+    /// Also used as the floor for distance-decayed dependency scores.
+    #[pyo3(get, set)]
+    pub tree_min_score: i32,
+
+    // Config scoring values
+    #[pyo3(get, set)]
+    pub seed_file_score: i32,
+    #[pyo3(get, set)]
+    pub seed_folder_file_score: i32,
+    #[pyo3(get, set)]
+    pub direct_dependency_score: i32,
+    #[pyo3(get, set)]
+    pub reverse_dependency_score: i32,
+    #[pyo3(get, set)]
+    pub same_package_score: i32,
+    #[pyo3(get, set)]
+    pub parent_entrypoint_score: i32,
+    #[pyo3(get, set)]
+    pub related_test_score: i32,
+    #[pyo3(get, set)]
+    pub name_similarity_score: i32,
+    #[pyo3(get, set)]
+    pub support_near_seed_score: i32,
+    #[pyo3(get, set)]
+    pub project_config_score: i32,
+    #[pyo3(get, set)]
+    pub dependency_file_score: i32,
+    #[pyo3(get, set)]
+    pub runtime_support_score: i32,
+    #[pyo3(get, set)]
+    pub documentation_score: i32,
+    /// Added (capped at 100) when a file is related to more than one seed file.
+    #[pyo3(get, set)]
+    pub shared_dependency_bonus: i32,
+
+    // Module flags
+    /// Master switch for all relationship-based scoring in focused mode.
+    #[pyo3(get, set)]
+    pub modules_enabled: bool,
+    #[pyo3(get, set)]
+    pub include_direct_dependencies: bool,
+    #[pyo3(get, set)]
+    pub include_reverse_dependencies: bool,
+    #[pyo3(get, set)]
+    pub include_same_package: bool,
+    #[pyo3(get, set)]
+    pub include_parent_entrypoints: bool,
+    #[pyo3(get, set)]
+    pub include_tests: bool,
+    /// Allows root-level support files that are not near any seed to be scored.
+    #[pyo3(get, set)]
+    pub include_project_configs: bool,
+    /// Maximum number of import-graph hops followed from each seed file.
+    #[pyo3(get, set)]
+    pub depth: usize,
+
+    // Support file scanning
+    #[pyo3(get, set)]
+    pub support_enabled: bool,
+
+    // Python module info
+    /// File-name patterns (with `*` wildcards) identifying entrypoints.
+    #[pyo3(get, set)]
+    pub entrypoint_patterns: Vec<String>,
+    /// Path components that mark everything beneath them as test code.
+    #[pyo3(get, set)]
+    pub test_roots: Vec<String>,
+}
+
+#[pymethods]
+impl NativePackOptions {
+    /// Python constructor; every argument is optional and falls back to the
+    /// default listed in the `signature` attribute below.
+    #[new]
+    #[pyo3(signature = (
+        mode = "focused".to_string(),
+        max_files = 0,
+        min_score = 0,
+        tree_min_score = 0,
+        seed_file_score = 100,
+        seed_folder_file_score = 90,
+        direct_dependency_score = 80,
+        reverse_dependency_score = 70,
+        same_package_score = 75,
+        parent_entrypoint_score = 70,
+        related_test_score = 85,
+        name_similarity_score = 65,
+        support_near_seed_score = 50,
+        project_config_score = 70,
+        dependency_file_score = 60,
+        runtime_support_score = 50,
+        documentation_score = 45,
+        shared_dependency_bonus = 10,
+        modules_enabled = true,
+        include_direct_dependencies = true,
+        include_reverse_dependencies = true,
+        include_same_package = true,
+        include_parent_entrypoints = true,
+        include_tests = true,
+        include_project_configs = true,
+        depth = 2,
+        support_enabled = true,
+        entrypoint_patterns = Vec::new(),
+        test_roots = Vec::new(),
+    ))]
+    #[allow(clippy::too_many_arguments)]
+    fn new(
+        mode: String,
+        max_files: usize,
+        min_score: i32,
+        tree_min_score: i32,
+        seed_file_score: i32,
+        seed_folder_file_score: i32,
+        direct_dependency_score: i32,
+        reverse_dependency_score: i32,
+        same_package_score: i32,
+        parent_entrypoint_score: i32,
+        related_test_score: i32,
+        name_similarity_score: i32,
+        support_near_seed_score: i32,
+        project_config_score: i32,
+        dependency_file_score: i32,
+        runtime_support_score: i32,
+        documentation_score: i32,
+        shared_dependency_bonus: i32,
+        modules_enabled: bool,
+        include_direct_dependencies: bool,
+        include_reverse_dependencies: bool,
+        include_same_package: bool,
+        include_parent_entrypoints: bool,
+        include_tests: bool,
+        include_project_configs: bool,
+        depth: usize,
+        support_enabled: bool,
+        entrypoint_patterns: Vec<String>,
+        test_roots: Vec<String>,
+    ) -> Self {
+        // Field-init shorthand: each argument is stored under the same name.
+        NativePackOptions {
+            mode,
+            max_files,
+            min_score,
+            tree_min_score,
+            seed_file_score,
+            seed_folder_file_score,
+            direct_dependency_score,
+            reverse_dependency_score,
+            same_package_score,
+            parent_entrypoint_score,
+            related_test_score,
+            name_similarity_score,
+            support_near_seed_score,
+            project_config_score,
+            dependency_file_score,
+            runtime_support_score,
+            documentation_score,
+            shared_dependency_bonus,
+            modules_enabled,
+            include_direct_dependencies,
+            include_reverse_dependencies,
+            include_same_package,
+            include_parent_entrypoints,
+            include_tests,
+            include_project_configs,
+            depth,
+            support_enabled,
+            entrypoint_patterns,
+            test_roots,
+        }
+    }
+}
+
+// Internal candidate builder used while scoring: it accumulates the score and
+// the reasons for one scanned file before a `NativeCandidate` is emitted.
+struct ScoringCandidate {
+    // Scan record this candidate was created from.
+    info: NativeFileInfo,
+    // Best score seen so far (scores are combined with `max`).
+    score: i32,
+    // De-duplicated human-readable reason labels, in insertion order.
+    reasons: Vec<String>,
+    // Number of times each reason kind was recorded.
+    // NOTE(review): integer type assumed to be `usize` - confirm.
+    reason_counts: HashMap<String, usize>,
+    // De-duplicated example paths recorded per reason kind.
+    reason_examples: HashMap<String, Vec<String>>,
+    // Seed files this candidate was linked to; drives the shared-dependency bonus.
+    seed_sources: HashSet<String>,
+}
+
+/// Records one occurrence of reason `kind` on `c`.
+///
+/// The per-kind counter is always incremented; `example` and `label` are only
+/// stored when they have not been recorded before.
+fn add_reason(c: &mut ScoringCandidate, kind: &str, label: &str, example: Option<&str>) {
+    let counter = c.reason_counts.entry(kind.to_string()).or_default();
+    *counter += 1;
+
+    if let Some(ex) = example {
+        let known = c.reason_examples.entry(kind.to_string()).or_default();
+        let already_listed = known.iter().any(|seen| seen == ex);
+        if !already_listed {
+            known.push(ex.to_string());
+        }
+    }
+
+    let label_present = c.reasons.iter().any(|seen| seen == label);
+    if !label_present {
+        c.reasons.push(label.to_string());
+    }
+}
+
+/// Builds the short `"; "`-separated summary of why a candidate was selected.
+///
+/// Reason kinds are emitted in a fixed priority order. Dependency kinds report
+/// a count when recorded more than once, otherwise the file name of the first
+/// recorded example, falling back to the word `seed` when no example exists.
+fn build_reason_summary(c: &ScoringCandidate) -> String {
+    const ORDER: [&str; 15] = [
+        "seed_file",
+        "seed_folder_file",
+        "direct_dependency",
+        "reverse_dependency",
+        "related_test",
+        "same_package",
+        "parent_entrypoint",
+        "name_similarity",
+        "support_near_seed",
+        "project_support",
+        "shared_dependency",
+        "entrypoint",
+        "test_file",
+        "code_file",
+        "other_file",
+    ];
+
+    let mut parts: Vec<String> = Vec::new();
+    for kind in ORDER {
+        // Only kinds that were actually recorded contribute to the summary.
+        let Some(&count) = c.reason_counts.get(kind) else {
+            continue;
+        };
+        let first_example = c.reason_examples.get(kind).and_then(|exs| exs.first());
+
+        let part = match kind {
+            "seed_file" => "seed file".to_string(),
+            "seed_folder_file" => "seed folder file".to_string(),
+            "direct_dependency" | "reverse_dependency" => {
+                let verb = if kind == "direct_dependency" {
+                    "imports"
+                } else {
+                    "imported by"
+                };
+                if count > 1 {
+                    format!("{} {} included files", verb, count)
+                } else if let Some(example) = first_example {
+                    let filename = Path::new(example)
+                        .file_name()
+                        .unwrap_or(std::ffi::OsStr::new(""))
+                        .to_string_lossy();
+                    format!("{} {}", verb, filename)
+                } else {
+                    format!("{} seed", verb)
+                }
+            }
+            "related_test" => "related test".to_string(),
+            "same_package" => "same package".to_string(),
+            "parent_entrypoint" => "parent entrypoint".to_string(),
+            "name_similarity" => "name similarity".to_string(),
+            "support_near_seed" => "support file".to_string(),
+            "project_support" => "project support file".to_string(),
+            "shared_dependency" => "shared dependency bonus".to_string(),
+            "entrypoint" => "entrypoint file".to_string(),
+            "test_file" => "test file".to_string(),
+            "code_file" => "code file".to_string(),
+            "other_file" => "other file".to_string(),
+            _ => continue,
+        };
+        parts.push(part);
+    }
+    parts.join("; ")
+}
+
+/// Returns `true` when `rel` looks like a test file.
+///
+/// A file qualifies when any UTF-8 path component equals one of `test_roots`,
+/// or when its lower-cased file name starts with `test_` or ends with
+/// `_test.py` / `.test.py`.
+fn is_test_file(rel: &str, test_roots: &[String]) -> bool {
+    let path = Path::new(rel);
+
+    let under_test_root = path
+        .components()
+        .filter_map(|component| component.as_os_str().to_str())
+        .any(|part| test_roots.iter().any(|root| root == part));
+    if under_test_root {
+        return true;
+    }
+
+    let lowered = match path.file_name() {
+        Some(file_name) => file_name.to_string_lossy().to_lowercase(),
+        None => String::new(),
+    };
+    let has_test_prefix = lowered.starts_with("test_");
+    let has_test_suffix = lowered.ends_with("_test.py") || lowered.ends_with(".test.py");
+    has_test_prefix || has_test_suffix
+}
+
+/// Returns `true` when the file stems of `a` and `b` are related.
+///
+/// Each stem is lower-cased and stripped of every `test_` and `_test`
+/// occurrence; the names are related when one normalised stem contains the
+/// other. An empty normalised stem never matches.
+fn name_related(a: &str, b: &str) -> bool {
+    let normalised_stem = |path: &str| -> String {
+        let stem = Path::new(path)
+            .file_stem()
+            .unwrap_or(std::ffi::OsStr::new(""))
+            .to_string_lossy()
+            .to_lowercase();
+        stem.replace("test_", "").replace("_test", "")
+    };
+
+    let left = normalised_stem(a);
+    let right = normalised_stem(b);
+    if left.is_empty() || right.is_empty() {
+        return false;
+    }
+    left.contains(&right) || right.contains(&left)
+}
+
+/// Returns `true` when `support_file` is located "near" `seed`.
+///
+/// A support file with no parent directory (project root) is always near.
+/// Otherwise the two parent directories must be equal or one must be an
+/// ancestor of the other.
+fn is_near_seed(support_file: &str, seed: &str) -> bool {
+    let root = Path::new("");
+
+    let support_dir = Path::new(support_file).parent().unwrap_or(root);
+    if support_dir == root {
+        return true;
+    }
+
+    let seed_dir = Path::new(seed).parent().unwrap_or(root);
+    // `starts_with` is also true for identical paths, which covers the
+    // same-directory case.
+    support_dir.starts_with(seed_dir) || seed_dir.starts_with(support_dir)
+}
+
+/// Breadth-first walk over `edges` starting at `start`.
+///
+/// Returns every node reachable within `depth` hops, mapped to the hop count
+/// at which it was first reached (1 = direct neighbour). `start` itself is
+/// never included, and a `depth` of 0 yields an empty map.
+fn walk_neighbors(
+    edges: &HashMap<String, HashSet<String>>,
+    start: &str,
+    depth: usize,
+) -> HashMap<String, usize> {
+    let mut found: HashMap<String, usize> = HashMap::new();
+    let mut visited: HashSet<String> = HashSet::new();
+    visited.insert(start.to_string());
+    let mut frontier: HashSet<String> = HashSet::new();
+    frontier.insert(start.to_string());
+
+    for distance in 1..=depth {
+        let mut next_frontier: HashSet<String> = HashSet::new();
+        for item in &frontier {
+            let Some(neighbors) = edges.get(item) else {
+                continue;
+            };
+            for neighbor in neighbors {
+                // `insert` returns false for nodes already reached on an
+                // earlier (shorter or equal) path; those keep their distance.
+                if !visited.insert(neighbor.clone()) {
+                    continue;
+                }
+                found.insert(neighbor.clone(), distance);
+                next_frontier.insert(neighbor.clone());
+            }
+        }
+        frontier = next_frontier;
+        if frontier.is_empty() {
+            break;
+        }
+    }
+    found
+}
+
+/// Maps a support file's category to its configured base score.
+///
+/// Files without a category, and categories not listed here, receive the
+/// documentation score.
+fn support_base_score(file: &NativeFileInfo, options: &NativePackOptions) -> i32 {
+    let category = match file.support_category.as_deref() {
+        Some(name) => name,
+        None => "support file",
+    };
+
+    if category == "project config" {
+        return options.project_config_score;
+    }
+    if category == "dependency file" {
+        return options.dependency_file_score;
+    }
+    let is_runtime = matches!(
+        category,
+        "runtime support" | "runtime config" | "ci support" | "tooling config"
+    );
+    if is_runtime {
+        return options.runtime_support_score;
+    }
+    // "documentation" and every unrecognised category share the same score.
+    options.documentation_score
+}
+
+/// Matches `name` against `pattern`, where `*` stands for any (possibly empty)
+/// run of characters and every other character matches itself.
+fn wildcard_match(pattern: &str, name: &str) -> bool {
+    let pieces: Vec<&str> = pattern.split('*').collect();
+    if pieces.len() == 1 {
+        // No wildcard at all: exact comparison.
+        return pattern == name;
+    }
+
+    let first = pieces[0];
+    let last = pieces[pieces.len() - 1];
+    // The literal prefix and suffix must both fit without overlapping.
+    if name.len() < first.len() + last.len()
+        || !name.starts_with(first)
+        || !name.ends_with(last)
+    {
+        return false;
+    }
+
+    // Remaining literal pieces must appear, in order, between prefix and suffix.
+    let mut rest = &name[first.len()..name.len() - last.len()];
+    for piece in &pieces[1..pieces.len() - 1] {
+        match rest.find(piece) {
+            Some(at) => rest = &rest[at + piece.len()..],
+            None => return false,
+        }
+    }
+    true
+}
+
+/// Returns `true` when the file name of `rel` matches any entrypoint pattern.
+///
+/// Patterns are compared against the file name only (not the directory part)
+/// and may contain `*` wildcards in any position. Prefix (`cli*`), suffix
+/// (`*main.py`), substring (`*main*`) and exact patterns behave as before;
+/// patterns with an interior `*` such as `run_*_dev.py` are now matched as
+/// wildcards instead of being compared literally.
+fn matches_entrypoint(rel: &str, entrypoint_patterns: &[String]) -> bool {
+    let name = Path::new(rel)
+        .file_name()
+        .unwrap_or(std::ffi::OsStr::new(""))
+        .to_string_lossy()
+        .to_string();
+    entrypoint_patterns
+        .iter()
+        .any(|pattern| wildcard_match(pattern, &name))
+}
+
+/// Returns `true` when `path` equals one of `seeds` or lies below a seed
+/// directory (`<seed>/...`).
+fn is_seed_path(path: &str, seeds: &[String]) -> bool {
+    seeds.iter().any(|seed| {
+        path == seed
+            || path
+                .strip_prefix(seed.as_str())
+                .is_some_and(|rest| rest.starts_with('/'))
+    })
+}
+
+/// Output ordering: score descending, code files before other kinds, then
+/// path ascending.
+fn compare_candidates(a: &NativeCandidate, b: &NativeCandidate) -> std::cmp::Ordering {
+    b.score
+        .cmp(&a.score)
+        .then_with(|| (b.kind == "code").cmp(&(a.kind == "code")))
+        .then_with(|| a.path.cmp(&b.path))
+}
+
+/// Scores scanned files and returns the candidates to include in a pack.
+///
+/// * `files` - scan results; each becomes at most one candidate.
+/// * `seeds_list` - seed paths; a seed matches a file with the same relative
+///   path or any file below `<seed>/`.
+/// * `edges` - import edges (`from` imports `to`).
+/// * `options` - scores, feature flags and limits.
+///
+/// In `"project_snapshot"` mode code files get fixed scores (entrypoint 90,
+/// plain code 80, test 60) and support files get their category score. In any
+/// other mode scoring is seed-based: seeds, import neighbours, same-package
+/// files, entrypoints, related tests, similarly named files and support files
+/// each raise a candidate's score to at least the configured value.
+///
+/// A file is returned when it is a seed or its score reaches `min_score` or
+/// `tree_min_score`. Results are sorted by score (descending), code before
+/// other kinds, then path. When `max_files > 0`, seeds, `pyproject.toml` and
+/// `README.md` are always kept and the remaining slots are filled in sorted
+/// order, so the result can exceed `max_files` when the pinned files alone do.
+#[pyfunction]
+pub fn score_candidates_native(
+    files: Vec<NativeFileInfo>,
+    seeds_list: Vec<String>,
+    edges: Vec<NativeImportEdge>,
+    options: NativePackOptions,
+) -> PyResult<Vec<NativeCandidate>> {
+    let mut mapped_files: HashMap<String, ScoringCandidate> = HashMap::new();
+    for f in files {
+        mapped_files.insert(
+            f.relative.clone(),
+            ScoringCandidate {
+                info: f,
+                score: 0,
+                reasons: Vec::new(),
+                reason_counts: HashMap::new(),
+                reason_examples: HashMap::new(),
+                seed_sources: HashSet::new(),
+            },
+        );
+    }
+
+    // Build graph edges maps
+    let mut graph_imports: HashMap<String, HashSet<String>> = HashMap::new();
+    let mut graph_imported_by: HashMap<String, HashSet<String>> = HashMap::new();
+    for edge in edges {
+        graph_imports
+            .entry(edge.from.clone())
+            .or_default()
+            .insert(edge.to.clone());
+        graph_imported_by
+            .entry(edge.to.clone())
+            .or_default()
+            .insert(edge.from.clone());
+    }
+
+    let root = Path::new("");
+
+    if options.mode == "project_snapshot" {
+        for (rel, c) in &mut mapped_files {
+            if c.info.kind == "code" {
+                if matches_entrypoint(rel, &options.entrypoint_patterns) {
+                    c.score = 90;
+                    add_reason(c, "entrypoint", "entrypoint file", None);
+                } else if is_test_file(rel, &options.test_roots) {
+                    c.score = 60;
+                    add_reason(c, "test_file", "test file", None);
+                } else {
+                    c.score = 80;
+                    add_reason(c, "code_file", "code file", None);
+                }
+            } else if c.info.kind == "support" && options.support_enabled {
+                let base = support_base_score(&c.info, &options);
+                let cat = c
+                    .info
+                    .support_category
+                    .clone()
+                    .unwrap_or_else(|| "support file".to_string());
+                c.score = base;
+                add_reason(c, "project_support", &cat, None);
+            }
+        }
+    } else {
+        // Focused mode scoring.
+        // Each matching file is collected exactly once, even when several
+        // seeds cover it, and the list is sorted so that reason order does not
+        // depend on hash-map iteration order.
+        let mut seed_files: Vec<String> = mapped_files
+            .keys()
+            .filter(|rel| is_seed_path(rel.as_str(), &seeds_list))
+            .cloned()
+            .collect();
+        seed_files.sort();
+        let seed_set: HashSet<String> = seed_files.iter().cloned().collect();
+
+        // 1. Seed paths scores
+        for s in &seeds_list {
+            for rel in &seed_files {
+                if !is_seed_path(rel, std::slice::from_ref(s)) {
+                    continue;
+                }
+                // A file that differs from the seed path was reached through a
+                // seed directory.
+                let is_dir = rel != s;
+                let (score, key, reason) = if is_dir {
+                    (
+                        options.seed_folder_file_score,
+                        "seed_folder_file",
+                        format!("file inside seed folder `{}`", s),
+                    )
+                } else {
+                    (
+                        options.seed_file_score,
+                        "seed_file",
+                        "seed file".to_string(),
+                    )
+                };
+                if let Some(c) = mapped_files.get_mut(rel) {
+                    c.score = std::cmp::max(c.score, score);
+                    add_reason(c, key, &reason, Some(rel.as_str()));
+                    c.seed_sources.insert(rel.clone());
+                }
+            }
+        }
+
+        // 2. Dependencies / Related files scores
+        if options.modules_enabled {
+            for seed_rel in &seed_files {
+                // Direct dependencies
+                if options.include_direct_dependencies {
+                    for (dep, distance) in walk_neighbors(&graph_imports, seed_rel, options.depth) {
+                        // Each extra hop costs 10 points, floored at tree_min_score.
+                        let score = std::cmp::max(
+                            options.tree_min_score,
+                            options.direct_dependency_score - ((distance as i32 - 1) * 10),
+                        );
+                        if let Some(c) = mapped_files.get_mut(&dep) {
+                            c.score = std::cmp::max(c.score, score);
+                            add_reason(
+                                c,
+                                "direct_dependency",
+                                &format!("direct dependency of `{}`", seed_rel),
+                                Some(seed_rel.as_str()),
+                            );
+                            c.seed_sources.insert(seed_rel.clone());
+                        }
+                    }
+                }
+
+                // Reverse dependencies
+                if options.include_reverse_dependencies {
+                    for (dep, distance) in
+                        walk_neighbors(&graph_imported_by, seed_rel, options.depth)
+                    {
+                        let score = std::cmp::max(
+                            options.tree_min_score,
+                            options.reverse_dependency_score - ((distance as i32 - 1) * 10),
+                        );
+                        if let Some(c) = mapped_files.get_mut(&dep) {
+                            c.score = std::cmp::max(c.score, score);
+                            add_reason(
+                                c,
+                                "reverse_dependency",
+                                &format!("imports seed `{}`", seed_rel),
+                                Some(seed_rel.as_str()),
+                            );
+                            c.seed_sources.insert(seed_rel.clone());
+                        }
+                    }
+                }
+
+                let seed_p = Path::new(seed_rel);
+                let seed_parent = seed_p.parent().unwrap_or(root);
+
+                // Same package
+                if options.include_same_package {
+                    for (rel, c) in &mut mapped_files {
+                        if c.info.kind != "code" || seed_set.contains(rel) {
+                            continue;
+                        }
+                        let rel_parent = Path::new(rel).parent().unwrap_or(root);
+                        if rel_parent.to_string_lossy() == seed_parent.to_string_lossy() {
+                            c.score = std::cmp::max(c.score, options.same_package_score);
+                            add_reason(
+                                c,
+                                "same_package",
+                                &format!("same package as `{}`", seed_rel),
+                                Some(seed_rel.as_str()),
+                            );
+                            c.seed_sources.insert(seed_rel.clone());
+                        }
+                    }
+                }
+
+                // Parent entrypoints
+                if options.include_parent_entrypoints {
+                    for (rel, c) in &mut mapped_files {
+                        if c.info.kind != "code"
+                            || !matches_entrypoint(rel, &options.entrypoint_patterns)
+                        {
+                            continue;
+                        }
+                        let rel_p = Path::new(rel);
+                        // `parent()` is `None` for an empty path; treat that as
+                        // the project root instead of panicking.
+                        let rel_parent = rel_p.parent().unwrap_or(root);
+                        let is_parent = rel_parent == root
+                            || seed_p.starts_with(rel_parent)
+                            || rel_p.starts_with(seed_parent);
+                        if is_parent {
+                            c.score = std::cmp::max(c.score, options.parent_entrypoint_score);
+                            add_reason(
+                                c,
+                                "parent_entrypoint",
+                                &format!("parent/entrypoint near `{}`", seed_rel),
+                                Some(seed_rel.as_str()),
+                            );
+                            c.seed_sources.insert(seed_rel.clone());
+                        }
+                    }
+                }
+
+                // Related tests
+                if options.include_tests {
+                    for (rel, c) in &mut mapped_files {
+                        if c.info.kind != "code" || !is_test_file(rel, &options.test_roots) {
+                            continue;
+                        }
+                        let matches_name = name_related(rel, seed_rel);
+                        let is_dep = graph_imports
+                            .get(rel)
+                            .is_some_and(|deps| deps.contains(seed_rel));
+                        if matches_name || is_dep {
+                            c.score = std::cmp::max(c.score, options.related_test_score);
+                            add_reason(
+                                c,
+                                "related_test",
+                                &format!("related test for `{}`", seed_rel),
+                                Some(seed_rel.as_str()),
+                            );
+                            c.seed_sources.insert(seed_rel.clone());
+                        }
+                    }
+                }
+
+                // Name similarity
+                for (rel, c) in &mut mapped_files {
+                    if c.info.kind == "code"
+                        && !seed_set.contains(rel)
+                        && name_related(rel, seed_rel)
+                    {
+                        c.score = std::cmp::max(c.score, options.name_similarity_score);
+                        add_reason(
+                            c,
+                            "name_similarity",
+                            &format!("name similarity with `{}`", seed_rel),
+                            Some(seed_rel.as_str()),
+                        );
+                        c.seed_sources.insert(seed_rel.clone());
+                    }
+                }
+            }
+
+            // Support files
+            if options.support_enabled {
+                for (rel, c) in &mut mapped_files {
+                    if c.info.kind != "support" {
+                        continue;
+                    }
+                    let base = support_base_score(&c.info, &options);
+                    let cat = c
+                        .info
+                        .support_category
+                        .clone()
+                        .unwrap_or_else(|| "support file".to_string());
+                    if rel == "pyproject.toml" {
+                        c.score = std::cmp::max(c.score, options.project_config_score);
+                        add_reason(c, "project_support", "project config/root file", None);
+                        continue;
+                    }
+
+                    let mut added = false;
+                    for seed_rel in &seed_files {
+                        if is_near_seed(rel, seed_rel) {
+                            c.score = std::cmp::max(
+                                c.score,
+                                std::cmp::max(base, options.support_near_seed_score),
+                            );
+                            add_reason(
+                                c,
+                                "support_near_seed",
+                                &format!("{} near `{}`", cat, seed_rel),
+                                Some(seed_rel.as_str()),
+                            );
+                            c.seed_sources.insert(seed_rel.clone());
+                            added = true;
+                        }
+                    }
+
+                    // Root-level support files unrelated to any seed still
+                    // count as project configuration when that is enabled.
+                    if !added
+                        && Path::new(rel).parent() == Some(root)
+                        && options.include_project_configs
+                    {
+                        c.score = std::cmp::max(c.score, base);
+                        add_reason(c, "project_support", &cat, None);
+                    }
+                }
+            }
+        } else {
+            // Modules disabled fallback
+            if options.support_enabled {
+                if let Some(c) = mapped_files.get_mut("pyproject.toml") {
+                    c.score = std::cmp::max(c.score, options.project_config_score);
+                    add_reason(c, "project_support", "project config/root file", None);
+                }
+            }
+        }
+
+        // Shared dependency bonus
+        for c in mapped_files.values_mut() {
+            if c.seed_sources.len() > 1 {
+                c.score = std::cmp::min(100, c.score + options.shared_dependency_bonus);
+                add_reason(
+                    c,
+                    "shared_dependency",
+                    "shared by multiple seed paths",
+                    None,
+                );
+            }
+        }
+    }
+
+    // Build summaries & NativeCandidate objects
+    let mut candidates: Vec<NativeCandidate> = Vec::new();
+    for c in mapped_files.values() {
+        let is_seed = is_seed_path(&c.info.relative, &seeds_list);
+        let is_valid_score = c.score >= options.min_score || c.score >= options.tree_min_score;
+        if is_seed || is_valid_score {
+            candidates.push(NativeCandidate {
+                path: c.info.relative.clone(),
+                kind: c.info.kind.clone(),
+                score: c.score,
+                reasons: c.reasons.clone(),
+                reason_summary: build_reason_summary(c),
+                include_content: true,
+                omitted_reason: None,
+            });
+        }
+    }
+
+    // Sort by score desc, kind desc (code first), relative path asc
+    candidates.sort_by(compare_candidates);
+
+    // Enforce max files limit
+    if options.max_files > 0 && candidates.len() > options.max_files {
+        let is_snap = options.mode == "project_snapshot";
+
+        // Pinned files are always kept: the two well-known root files, plus
+        // the seeds themselves outside snapshot mode.
+        let (mut kept, rest): (Vec<NativeCandidate>, Vec<NativeCandidate>) =
+            candidates.into_iter().partition(|cand| {
+                let well_known = cand.path == "pyproject.toml" || cand.path == "README.md";
+                well_known || (!is_snap && is_seed_path(&cand.path, &seeds_list))
+            });
+        let remaining = options.max_files.saturating_sub(kept.len());
+        kept.extend(rest.into_iter().take(remaining));
+
+        // Resort final list
+        kept.sort_by(compare_candidates);
+        candidates = kept;
+    }
+
+    Ok(candidates)
+}
diff --git a/src/run.py b/src/run.py
deleted file mode 100644
index 25db586..0000000
--- a/src/run.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import os
-from scriber.cli import main
-
-os.environ['SCRIBER_EXEC_MODE'] = 'RUN_PY'
-
-main()
\ No newline at end of file
diff --git a/src/scriber/__init__.py b/src/scriber/__init__.py
index 3c21a13..1aef752 100644
--- a/src/scriber/__init__.py
+++ b/src/scriber/__init__.py
@@ -1,10 +1,8 @@
-"""
-ProjectScriber: A tool for mapping and compiling project source code.
+"""ProjectScriber 2.0."""
-This package provides the core functionality and command-line interface for
-ProjectScriber. The main `Scriber` class can be imported directly for
-programmatic use.
-"""
-from .core import Scriber, ScriberConfig
+from .packer.pack import build_pack, build_and_write_pack
+from .core.models import ScriberPack
-__all__ = ["Scriber", "ScriberConfig"]
\ No newline at end of file
+__all__ = ["build_pack", "build_and_write_pack", "ScriberPack"]
+
+__version__ = "2.0.0"
diff --git a/src/scriber/__main__.py b/src/scriber/__main__.py
new file mode 100644
index 0000000..89b4af8
--- /dev/null
+++ b/src/scriber/__main__.py
@@ -0,0 +1,4 @@
+from .cli.main import main
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/src/scriber/cache.py b/src/scriber/cache.py
new file mode 100644
index 0000000..f96f0fc
--- /dev/null
+++ b/src/scriber/cache.py
@@ -0,0 +1,135 @@
+from __future__ import annotations
+
+import os
+import sys
+import json
+import hashlib
+from pathlib import Path
+from typing import Any, TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from scriber.core.models import ScriberConfig
+
+
+def get_config_hash(config: ScriberConfig) -> str:
+ from scriber import __version__
+ data = {
+ "code_patterns": config.code_patterns,
+ "support_patterns": config.support_patterns,
+ "hard_ignore_patterns": config.hard_ignore_patterns,
+ "support": config.support,
+ "support_content_default": config.support_content.default,
+ "support_content_full": config.support_content.full,
+ "support_content_tree_only": config.support_content.tree_only,
+ "support_content_auto_max_bytes": config.support_content.auto_max_bytes,
+ "use_gitignore": config.use_gitignore,
+ "python_source_roots": config.python.source_roots,
+ "python_module_init_files": config.python.module_init_files,
+ "scriber_version": __version__,
+ "native_scanner_version": 1,
+ }
+ dump = json.dumps(data, sort_keys=True)
+ return hashlib.sha256(dump.encode("utf-8")).hexdigest()
+
+
+class ScriberCache:
+ def __init__(self, config: ScriberConfig, project_root: Path):
+ self.enabled = config.cache.enabled
+ self.cache_dir = project_root / config.cache.dir
+ self.files_cache_path = self.cache_dir / "files.json"
+ self.graph_cache_path = self.cache_dir / "import_graph.json"
+ self.config_hash = get_config_hash(config)
+ self.python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+
+ self.files_data: dict[str, dict[str, Any]] = {}
+ self.graph_data: dict[str, list[str]] = {}
+ self._load()
+
+ def _load(self) -> None:
+ if not self.enabled:
+ return
+
+ try:
+ if self.files_cache_path.exists():
+ with self.files_cache_path.open("r", encoding="utf-8") as f:
+ self.files_data = json.load(f)
+ if self.graph_cache_path.exists():
+ with self.graph_cache_path.open("r", encoding="utf-8") as f:
+ self.graph_data = json.load(f)
+ except Exception:
+ # Silently fallback to empty cache on read errors
+ self.files_data = {}
+ self.graph_data = {}
+
+ def get_file(self, rel_path: Path, mtime_ns: int, size: int) -> dict[str, Any] | None:
+ if not self.enabled:
+ return None
+
+ key = rel_path.as_posix()
+ entry = self.files_data.get(key)
+ if entry is None:
+ return None
+
+ if (entry.get("mtime_ns") == mtime_ns and
+ entry.get("size") == size and
+ entry.get("python_version") == self.python_version and
+ entry.get("config_hash") == self.config_hash):
+ return entry.get("data")
+ return None
+
+ def set_file(self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any]) -> None:
+ if not self.enabled:
+ return
+ key = rel_path.as_posix()
+ self.files_data[key] = {
+ "mtime_ns": mtime_ns,
+ "size": size,
+ "python_version": self.python_version,
+ "config_hash": self.config_hash,
+ "data": data
+ }
+
+ def get_imports(self, rel_path: Path) -> set[Path] | None:
+ if not self.enabled:
+ return None
+ key = rel_path.as_posix()
+ imports = self.graph_data.get(key)
+ if imports is not None:
+ return {Path(p) for p in imports}
+ return None
+
+ def set_imports(self, rel_path: Path, imports: set[Path]) -> None:
+ if not self.enabled:
+ return
+ key = rel_path.as_posix()
+ self.graph_data[key] = [p.as_posix() for p in sorted(imports)]
+
+ def save(self, active_files: set[Path] | None = None) -> None:
+ if not self.enabled:
+ return
+
+ try:
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+ # Simple cleanup mechanism:
+ # 1. Prune stale cache entries (entries for files no longer in active_files)
+ if active_files is not None:
+ active_keys = {p.as_posix() for p in active_files}
+ self.files_data = {k: v for k, v in self.files_data.items() if k in active_keys}
+ self.graph_data = {k: v for k, v in self.graph_data.items() if k in active_keys}
+
+ # 2. Enforce absolute limit of max 1000 entries to prevent infinite growth
+ if len(self.files_data) > 1000:
+ # Remove oldest keys
+ sorted_keys = sorted(self.files_data.keys(), key=lambda k: self.files_data[k].get("mtime_ns", 0))
+ to_remove = sorted_keys[:len(sorted_keys) - 1000]
+ for k in to_remove:
+ self.files_data.pop(k, None)
+ self.graph_data.pop(k, None)
+
+ with self.files_cache_path.open("w", encoding="utf-8") as f:
+ json.dump(self.files_data, f, indent=2)
+ with self.graph_cache_path.open("w", encoding="utf-8") as f:
+ json.dump(self.graph_data, f, indent=2)
+ except Exception:
+ pass # Fail silently on write errors to not interrupt execution
diff --git a/src/scriber/cli.py b/src/scriber/cli.py
deleted file mode 100644
index e689c64..0000000
--- a/src/scriber/cli.py
+++ /dev/null
@@ -1,361 +0,0 @@
-import argparse
-import io
-import json
-import os
-import re
-import sys
-from importlib import metadata
-from pathlib import Path
-from typing import Any
-
-import pyperclip
-import tomlkit
-from dotenv import load_dotenv
-
-from .core import DEFAULT_CONFIG, Scriber
-
-try:
- import rich.box
- from rich.console import Console
- from rich.panel import Panel
- from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn
- from rich.prompt import Confirm, Prompt
- from rich.table import Table
- from rich.text import Text
-
- RICH_AVAILABLE = True
-except ImportError:
- RICH_AVAILABLE = False
-
-load_dotenv()
-
-
-class SimpleConsole:
- """A fallback console that mimics rich.Console with simple print statements."""
-
- def print(self, message: Any = "") -> None:
- """Strips rich markup and prints the message, handling potential Unicode errors.
-
- This method attempts to print the message directly. If a UnicodeEncodeError
- occurs, it falls back to encoding the message using the system's stdout
- encoding, replacing any unsupported characters to prevent crashes.
-
- Args:
- message: The object or text to print.
- """
- message_str = str(message)
- cleaned_message = re.sub(r'\[/?[a-zA-Z\s=]+\]', '', message_str)
- try:
- print(cleaned_message)
- except UnicodeEncodeError:
- safe_message = cleaned_message.encode(
- sys.stdout.encoding, errors='replace'
- ).decode(sys.stdout.encoding)
- print(safe_message)
-
-
-def format_bytes(byte_count: int) -> str:
- """Formats a byte count into a human-readable string (KB, MB).
-
- Args:
- byte_count: The number of bytes.
-
- Returns:
- A formatted string representing the size.
- """
- if byte_count > 1024 * 1024:
- return f"{byte_count / (1024 * 1024):.2f} MB"
- if byte_count > 1024:
- return f"{byte_count / 1024:.2f} KB"
- return f"{byte_count} Bytes"
-
-
-def save_to_json(console: Any, config: dict[str, Any]):
- """Saves configuration to a .scriber.json file.
-
- Args:
- console: The console instance for printing output.
- config: The configuration dictionary to save.
- """
- config_path = Path.cwd() / ".scriber.json"
- try:
- with open(config_path, "w", encoding="utf-8") as f:
- json.dump(config, f, indent=2)
- console.print(f"\nβ
[bold green]Configuration saved to:[/] {config_path}")
- except IOError as e:
- console.print(f"\nβ [bold red]Error saving config file:[/] {e}")
-
-
-def save_to_toml(console: Any, config: dict[str, Any]):
- """Saves configuration to the pyproject.toml file.
-
- Args:
- console: The console instance for printing output.
- config: The configuration dictionary to save.
- """
- toml_path = Path.cwd() / "pyproject.toml"
- if not toml_path.exists():
- console.print(f"\nβ [bold red]Error: `pyproject.toml` not found in the current directory.[/]")
- return
-
- try:
- with open(toml_path, "r+", encoding="utf-8") as f:
- doc = tomlkit.parse(f.read())
-
- tool_table = doc.setdefault("tool", tomlkit.table())
- scriber_table = tool_table.setdefault("scriber", tomlkit.table())
- scriber_table.update(config)
-
- f.seek(0)
- f.truncate()
- f.write(tomlkit.dumps(doc))
-
- console.print(f"\nβ
[bold green]Configuration saved to:[/] {toml_path}")
- except Exception as e:
- console.print(f"\nβ [bold red]Error updating `pyproject.toml`:[/] {e}")
-
-
-def handle_init(args: argparse.Namespace, console: Any, rich_available: bool):
- """Handles the interactive initialization of a config file.
-
- Args:
- args: The parsed command-line arguments.
- console: The console instance for printing output.
- rich_available: A boolean indicating if the 'rich' library is installed.
- """
- if rich_available:
- console.print(Panel("[bold cyan]Scriber Configuration Setup[/]", expand=False))
- else:
- console.print("--- Scriber Configuration Setup ---")
- console.print("This utility will help you create a configuration file.\n")
-
- config: dict[str, Any] = {}
-
- if rich_available:
- config["use_gitignore"] = Confirm.ask("β¨ Would you like to respect `.gitignore` rules?", default=True)
- default_exclude = ", ".join(DEFAULT_CONFIG.exclude)
- exclude_str = Prompt.ask("π Enter patterns to exclude (comma-separated)", default=default_exclude)
- include_str = Prompt.ask("π Enter patterns to include (optional, comma-separated)", default="")
- hidden_str = Prompt.ask("π Enter patterns to hide content for (e.g., lock files, optional, comma-separated)",
- default="")
- config["single_process"] = Confirm.ask("βοΈ Run in a single process? (for Celery or similar environments)",
- default=False)
- else:
- answer = input("β¨ Would you like to respect `.gitignore` rules? (Y/n) ").strip().lower()
- config["use_gitignore"] = answer not in ['n', 'no']
- default_exclude = ", ".join(DEFAULT_CONFIG.exclude)
- exclude_str = input(
- f"π Enter patterns to exclude (comma-separated, default: {default_exclude}): ") or default_exclude
- include_str = input("π Enter patterns to include (optional, comma-separated): ")
- hidden_str = input("π Enter patterns to hide content for (e.g., lock files, optional, comma-separated): ")
- answer = input("βοΈ Run in a single process? (for Celery or similar environments) (y/N) ").strip().lower()
- config["single_process"] = answer in ['y', 'yes']
-
- config["exclude"] = [item.strip() for item in exclude_str.split(',') if item.strip()]
- include_patterns = [item.strip() for item in include_str.split(',') if item.strip()]
- if include_patterns:
- config["include"] = include_patterns
- hidden_patterns = [item.strip() for item in hidden_str.split(",") if item.strip()]
- if hidden_patterns:
- config["hidden"] = hidden_patterns
-
- console.print("\n[bold]Choose a save location:[/bold]")
- console.print(" [cyan]1[/]: Save to `.scriber.json` (project-specific override)")
- console.print(" [cyan]2[/]: Save to `pyproject.toml` (project default)")
-
- if rich_available:
- save_target = Prompt.ask("Enter your choice", choices=["1", "2"], default="1")
- else:
- save_target = input("Enter your choice (1/2, default: 1): ") or "1"
-
- if save_target == '1':
- save_to_json(console, config)
- elif save_target == '2':
- save_to_toml(console, config)
-
-
-def run_scriber(args: argparse.Namespace, console: Any, version: str, rich_available: bool):
- """Handles the main logic of mapping and generating the project output.
-
- Args:
- args: The parsed command-line arguments.
- console: The console instance for printing output.
- version: The current version of the application.
- rich_available: A boolean indicating if the 'rich' library is installed.
- """
- if rich_available:
- title_text = Text(f"Scriber v{version}", justify="center", style="bold magenta")
- subtitle_text = Text("An intelligent tool to map, analyze, and compile project source code for LLM context.",
- justify="center", style="cyan")
- console.print(Panel(Text.assemble(title_text, "\n", subtitle_text), expand=False, border_style="blue"))
- else:
- console.print(f"--- Scriber v{version} ---")
-
- scriber = Scriber(args.root_path.resolve(), config_path=args.config)
- if args.single_process:
- scriber.single_process = True
-
- scriber.map_project()
-
- progress = None
- task_id = None
- if rich_available:
- progress_manager = Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
- BarColumn(), TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
- console=console, transient=True)
- total_files = scriber.get_file_count()
- if total_files > 0 and not args.tree_only:
- task_id = progress_manager.add_task("[green]Processing files...", total=total_files)
- progress = progress_manager
- else:
- console.print("Processing files...")
-
- output_content = ""
- if progress:
- with progress:
- output_content = scriber.get_output_as_string(tree_only=args.tree_only, progress=progress, task_id=task_id)
- else:
- output_content = scriber.get_output_as_string(tree_only=args.tree_only)
-
- stats = scriber.get_stats()
- config_file_display = str(scriber.config_path_used) if scriber.config_path_used else "Defaults"
-
- if rich_available:
- summary_table = Table(box=rich.box.ROUNDED, show_header=False, title="[bold]Run Summary[/]",
- title_justify="left")
- summary_table.add_column("Parameter", style="cyan", no_wrap=True)
- summary_table.add_column("Value", style="magenta")
- summary_table.add_row("Project Path", str(args.root_path.resolve()))
- summary_table.add_row("Config File", config_file_display)
- if not args.copy_only:
- summary_table.add_row("Output File", args.output or scriber.config.output)
- console.print(summary_table)
- else:
- console.print("\n--- Run Summary ---")
- console.print(f"Project Path: {str(args.root_path.resolve())}")
- console.print(f"Config File: {config_file_display}")
- if not args.copy_only:
- console.print(f"Output File: {args.output or scriber.config.output}")
-
- if stats['total_files'] > 0:
- if rich_available:
- results_table = Table(box=rich.box.ROUNDED, show_header=False, title="[bold]π Analysis Results[/]",
- title_justify="left")
- results_table.add_column("Metric", style="cyan", no_wrap=True)
- results_table.add_column("Value", style="magenta", justify="right")
- results_table.add_row("Files Mapped", str(stats['total_files']))
- if stats.get('skipped_binary') > 0:
- results_table.add_row("Binary Skipped", str(stats['skipped_binary']))
- results_table.add_section()
- results_table.add_row("Total Size", format_bytes(stats['total_size_bytes']))
- results_table.add_row("Est. Tokens (cl100k)", f"{stats['total_tokens']:,}")
- results_table.add_section()
- results_table.add_row("[bold]Language Breakdown[/]", "")
- for lang, count in stats['language_counts'].most_common():
- results_table.add_row(f" {lang.capitalize()}", str(count))
- console.print(results_table)
- else:
- console.print("\n--- Analysis Results ---")
- console.print(f"Files Mapped: {stats['total_files']}")
- if stats.get('skipped_binary') > 0:
- console.print(f"Binary Skipped: {stats['skipped_binary']}")
- console.print(f"Total Size: {format_bytes(stats['total_size_bytes'])}")
- console.print(f"Est. Tokens (cl100k): {stats['total_tokens']:,}")
- console.print("Language Breakdown:")
- for lang, count in stats['language_counts'].most_common():
- console.print(f" {lang.capitalize()}: {count}")
- else:
- if rich_available:
- console.print(Panel("[yellow]No files were mapped based on the current configuration.[/]", expand=False))
- else:
- console.print("No files were mapped based on the current configuration.")
-
- if not args.copy_only:
- output_filename = args.output or scriber.config.output
- output_location = Path(args.root_path).resolve() / output_filename
- try:
- with open(output_location, 'w', encoding='utf-8') as f:
- f.write(output_content)
- console.print("\nβ
[green]Success! Output saved to:[/green]")
- console.print(str(output_location))
- except IOError as e:
- console.print(f"\nβ [bold red]Error saving output file:[/] {e}")
-
- if args.copy or args.copy_only:
- try:
- pyperclip.copy(output_content)
- if args.copy_only:
- console.print("\nβ
[green]Success! Output copied to clipboard.[/green]")
- else:
- console.print("π [green]Content copied to clipboard.[/green]")
- except Exception as e:
- console.print(f"β [bold red]Could not copy to clipboard: {e}[/bold red]")
-
-
-def main() -> None:
- """Parses arguments and runs the appropriate command."""
- if RICH_AVAILABLE:
- # On Windows, the default console (cmd.exe) often doesn't support Unicode
- # emojis. We detect this environment and disable emojis to prevent crashes,
- # unless we are in a modern terminal like Windows Terminal.
- is_legacy_windows = (
- sys.platform == "win32"
- and not os.environ.get("WT_SESSION")
- and not os.environ.get("TERMINUS_SUCKS")
- and sys.stdout.encoding != "utf-8"
- )
- console = Console(emoji=not is_legacy_windows)
- else:
- console = SimpleConsole()
-
- try:
- version = metadata.version("project-scriber")
- except metadata.PackageNotFoundError:
- version = "1.0.0 (local)"
-
- parser = argparse.ArgumentParser(
- description="Scriber: An intelligent tool to map, analyze, and compile project source code for LLM context.")
- parser.add_argument("-v", "--version", action="version", version=f"%(prog)s v{version}",
- help="Show the version number and exit.")
- subparsers = parser.add_subparsers(dest="command", title="Commands")
-
- init_parser = subparsers.add_parser("init", help="Create a new configuration file interactively.")
- init_parser.set_defaults(func=lambda args: handle_init(args, console, RICH_AVAILABLE))
-
- run_parser = subparsers.add_parser("run", help="Map the project structure (default command).")
- exec_mode = os.environ.get('SCRIBER_EXEC_MODE')
- default_path = Path.cwd().parent if exec_mode == 'RUN_PY' else Path.cwd()
- if exec_mode == 'RUN_PY':
- del os.environ['SCRIBER_EXEC_MODE']
-
- run_parser.add_argument("root_path", nargs="?", default=os.environ.get("PROJECT_SCRIBER_ROOT", default_path),
- type=Path,
- help="The root directory of the project to map. Defaults to the current directory.")
- run_parser.add_argument("-o", "--output", help="The name of the output file. Overrides config file settings.")
- run_parser.add_argument("--config", default=os.environ.get("PROJECT_SCRIBER_CONFIG"), type=Path,
- help="Path to a custom configuration file.")
- run_parser.add_argument("-c", "--copy", action="store_true", help="Copy the final output to the clipboard.")
- run_parser.add_argument("--copy-only", action="store_true",
- help="Generate the output and copy it to the clipboard without saving to a file.")
- run_parser.add_argument("--tree-only", action="store_true",
- help="Generate only the file tree structure without file content.")
- run_parser.add_argument("--single-process", action="store_true",
- help="Run in a single process to avoid issues in daemonic environments.")
- run_parser.set_defaults(func=lambda args: run_scriber(args, console, version, RICH_AVAILABLE))
-
- args_to_parse = sys.argv[1:]
- global_flags = ['-h', '--help', '-v', '--version']
-
- if not args_to_parse or args_to_parse[0] not in list(subparsers.choices) + global_flags:
- args_to_parse.insert(0, 'run')
-
- args = parser.parse_args(args_to_parse)
-
- if hasattr(args, 'func'):
- args.func(args)
- else:
- parser.print_help()
-
-
-if __name__ == "__main__":
- main()
\ No newline at end of file
diff --git a/src/scriber/cli/__init__.py b/src/scriber/cli/__init__.py
new file mode 100644
index 0000000..021eb45
--- /dev/null
+++ b/src/scriber/cli/__init__.py
@@ -0,0 +1,3 @@
+from .main import main, build_parser
+
+__all__ = ["main", "build_parser"]
diff --git a/src/scriber/cli/main.py b/src/scriber/cli/main.py
new file mode 100644
index 0000000..c943226
--- /dev/null
+++ b/src/scriber/cli/main.py
@@ -0,0 +1,219 @@
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Sequence
+
+from scriber.core.config import DEFAULT_CONFIG_BLOCK, load_raw_pyproject, load_config, validate_config, validate_raw_config
+from scriber.core.errors import ScriberError
+from scriber.core.init_config import init_project
+from scriber.core.root import resolve_config_path
+from scriber.packer.pack import build_and_write_pack
+
+
+
+
+
+ def _progress(msg: str) -> None:
+     """Overwrite the current stderr line with a ``[Scriber]`` status message.
+
+     The line starts with a carriage return and is space-padded to 80
+     characters so a shorter message covers the previous one; this avoids
+     external dependencies like rich.
+     """
+     stream = sys.stderr
+     status_line = "\r[Scriber] {}".format(msg)
+     stream.write(status_line.ljust(80))
+     stream.flush()
+
+
+ def build_parser() -> argparse.ArgumentParser:
+     """Create the argument parser for the ``scriber`` command line.
+
+     ``--modules``/``--no-modules`` and ``--support``/``--no-support`` are
+     tri-state: their destination stays ``None`` unless one of the two flags
+     is given.
+     """
+     cli = argparse.ArgumentParser(
+         prog="scriber",
+         description="Scriber 2.0: build an intelligent code pack from one or more project paths.",
+     )
+     add = cli.add_argument
+
+     # Inputs and output selection.
+     add("paths", nargs="*", help="Project file/folder paths used as seeds. Defaults to current directory.")
+     add("--config", help="Path to pyproject.toml. Its parent directory becomes the project root.")
+     add(
+         "--path-base",
+         choices=["project", "cwd"],
+         default="project",
+         help="Base directory for relative paths when --config is used.",
+     )
+     add("--format", choices=["md", "txt"], dest="output_format", help="Output format.")
+     add("--output", help="Output file path, relative to project root unless absolute. Use '-' for stdout.")
+     add("--only-tree", action="store_true", help="Render only scored tree/map, without file contents.")
+
+     # Paired on/off switches whose default is None (neither flag given).
+     toggles = (
+         ("modules", "Enable automatic related module selection.", "Disable automatic related module selection."),
+         ("support", "Enable support files.", "Disable support files."),
+     )
+     for dest, on_help, off_help in toggles:
+         add(f"--{dest}", dest=dest, action="store_true", help=on_help)
+         add(f"--no-{dest}", dest=dest, action="store_false", help=off_help)
+         cli.set_defaults(**{dest: None})
+
+     add(
+         "--support-content",
+         choices=["full", "auto", "tree_only"],
+         help="Override default support file content policy.",
+     )
+
+     # Integer limits.
+     int_options = (
+         ("--max-files", "Maximum number of files in the pack."),
+         ("--max-tokens", "Approximate token budget for included file contents. 0 disables budget."),
+         ("--min-score", "Minimum score for non-seed files."),
+     )
+     for flag, text in int_options:
+         add(flag, type=int, help=text)
+
+     # Plain boolean switches.
+     switches = (
+         ("--init", "Append a default [tool.scriber] config to pyproject.toml and exit."),
+         ("--force", "Allow --init to append even if [tool.scriber] already exists."),
+         ("--project", "Force project snapshot mode."),
+         ("--explain-selection", "Explain reason for file selection in detail."),
+         ("--validate-config", "Validate pyproject.toml scriber config."),
+         ("--dry-run", "Perform a dry run without saving the pack file."),
+         ("--open", "Open the output file automatically after creation."),
+         ("--timings", "Show execution timings for each phase."),
+         ("--version", "Show version information and exit."),
+     )
+     for flag, text in switches:
+         add(flag, action="store_true", help=text)
+
+     return cli
+
+
+ def main(argv: Sequence[str] | None = None) -> int:
+ """Run the Scriber command line and return a process exit status.
+
+ Arguments are parsed with ``build_parser()``; ``argv=None`` lets argparse
+ read ``sys.argv``. The modes are checked in this order: ``--version``,
+ ``--validate-config``, ``--init``, ``--dry-run``, and finally the normal
+ build-and-write run.
+
+ Returns:
+ 0 on success; 1 when ``--validate-config`` cannot find or load the
+ config, or reports at least one error.
+
+ Raises:
+ SystemExit: via ``parser.exit`` with status 2 on ``ScriberError`` and
+ status 130 on ``KeyboardInterrupt``.
+ """
+ parser = build_parser()
+ args = parser.parse_args(argv)
+
+ try:
+ # --version: print the package version and, when the native module is
+ # available and exposes build_info(), its build information.
+ if args.version:
+ from scriber import __version__
+ print(f"scriber {__version__}")
+ from scriber.native import is_native_available, require_native
+ if is_native_available():
+ native = require_native()
+ if hasattr(native, "build_info"):
+ print(f"native {native.build_info()}")
+ return 0
+
+ # --validate-config: check the raw pyproject data first; the loaded
+ # config is only validated when the raw data produced no issues.
+ if args.validate_config:
+ config_path = resolve_config_path(args.paths or ["."], args.config)
+ if not config_path.exists():
+ print(f"Error: Config file not found at {config_path}", file=sys.stderr)
+ return 1
+ try:
+ raw_data = load_raw_pyproject(config_path)
+ raw_issues = validate_raw_config(raw_data)
+ if raw_issues:
+ issues = raw_issues
+ else:
+ config = load_config(config_path)
+ issues = validate_config(config, raw_data, config_path)
+
+ if not issues:
+ print("Scriber config is valid.", file=sys.stderr)
+ return 0
+ else:
+ errors = 0
+ warnings = 0
+ for issue in issues:
+ severity = issue.severity.upper()
+ # Every severity other than "ERROR" is counted as a warning.
+ if severity == "ERROR":
+ errors += 1
+ else:
+ warnings += 1
+ print(f"[{severity}] {issue.message}", file=sys.stderr)
+ print(f"\nValidation completed: {errors} error(s), {warnings} warning(s)", file=sys.stderr)
+ # Warnings alone do not produce a failing exit status.
+ return 1 if errors > 0 else 0
+ except Exception as exc:
+ # Any failure while loading or validating is reported as a parse failure.
+ print(f"Error: Failed to parse pyproject.toml: {exc}", file=sys.stderr)
+ return 1
+
+ # --init: delegate to init_project and report the path it returns.
+ if args.init:
+ path = init_project(args.config, args.force)
+ print(f"Scriber config written to: {path}")
+ return 0
+
+ # --dry-run: build the pack with build_pack (instead of
+ # build_and_write_pack) and print a summary to stderr.
+ if args.dry_run:
+ from scriber.packer.pack import build_pack
+ from scriber.core.config import apply_overrides
+ pack = build_pack(
+ args.paths or ["."],
+ config_path=args.config,
+ output=args.output,
+ output_format=args.output_format,
+ only_tree=True if args.only_tree else None,
+ modules=args.modules,
+ support=args.support,
+ max_files=args.max_files,
+ max_tokens=args.max_tokens,
+ min_score=args.min_score,
+ support_content=args.support_content,
+ progress_callback=_progress,
+ project=args.project,
+ path_base=args.path_base,
+ )
+ # Blank out the status line left on stderr by _progress.
+ sys.stderr.write("\r".ljust(80) + "\r")
+ sys.stderr.flush()
+
+ # Only candidates whose content is included count as selected code/support files.
+ code_count = len([c for c in pack.candidates if c.file.kind == "code" and c.include_content])
+ support_count = len([c for c in pack.candidates if c.file.kind == "support" and c.include_content])
+ total_count = len(pack.candidates)
+
+ print("Scriber dry-run completed.", file=sys.stderr)
+ print("----------------------------------------", file=sys.stderr)
+ print(f" Mode: {pack.mode}", file=sys.stderr)
+ print(f" Code files selected: {code_count}", file=sys.stderr)
+ print(f" Support files selected: {support_count}", file=sys.stderr)
+ print(f" Total files in pack: {total_count}", file=sys.stderr)
+ print(f" Estimated tokens: {pack.total_tokens}", file=sys.stderr)
+ if args.timings and pack.timings:
+ print("----------------------------------------", file=sys.stderr)
+ print("Timings:", file=sys.stderr)
+ for phase, duration in pack.timings.items():
+ print(f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s", file=sys.stderr)
+ print(f" total: {sum(pack.timings.values()):.4f}s", file=sys.stderr)
+
+ # Load the config again with the --output override to report where
+ # the pack would be written; relative paths resolve against the project root.
+ config = load_config(pack.config_path)
+ config = apply_overrides(config, output=args.output)
+ output_path = config.output
+ if not output_path.is_absolute():
+ output_path = pack.project_root / output_path
+ print(f" Proposed output path: {output_path}", file=sys.stderr)
+ print("----------------------------------------", file=sys.stderr)
+ return 0
+
+ # Normal run: build the pack and write it.
+ output, pack = build_and_write_pack(
+ args.paths or ["."],
+ config_path=args.config,
+ output=args.output,
+ output_format=args.output_format,
+ only_tree=True if args.only_tree else None,
+ modules=args.modules,
+ support=args.support,
+ max_files=args.max_files,
+ max_tokens=args.max_tokens,
+ min_score=args.min_score,
+ support_content=args.support_content,
+ progress_callback=_progress,
+ project=args.project,
+ explain_selection=args.explain_selection,
+ path_base=args.path_base,
+ )
+
+ # Blank out the status line left on stderr by _progress.
+ sys.stderr.write("\r".ljust(80) + "\r")
+ sys.stderr.flush()
+
+ # Tally candidates: "code"/"support" files with included content are
+ # counted per kind; candidates without content count as omitted.
+ code_count = 0
+ support_count = 0
+ omitted_count = 0
+ for cand in pack.candidates:
+ if cand.include_content:
+ if cand.file.kind == "code":
+ code_count += 1
+ elif cand.file.kind == "support":
+ support_count += 1
+ else:
+ omitted_count += 1
+
+ # The summary goes to stderr; only the final "written to" line is printed to stdout.
+ sys.stderr.write("Scriber build completed.\n")
+ sys.stderr.write("----------------------------------------\n")
+ sys.stderr.write(f" Code files included: {code_count}\n")
+ sys.stderr.write(f" Support files included: {support_count}\n")
+ sys.stderr.write(f" Files omitted/skipped: {omitted_count}\n")
+ sys.stderr.write(f" Estimated tokens: {pack.total_tokens}\n")
+ sys.stderr.write("----------------------------------------\n")
+ if args.timings and pack.timings:
+ sys.stderr.write("Timings:\n")
+ for phase, duration in pack.timings.items():
+ sys.stderr.write(f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n")
+ sys.stderr.write(f" - total: {sum(pack.timings.values()):.4f}s\n")
+ sys.stderr.write("----------------------------------------\n")
+
+ # NOTE(review): output is presumably None when the pack was sent to stdout
+ # ("--output -"); confirm against build_and_write_pack.
+ if output is not None:
+ print(f"Scriber pack written to: {output}")
+ if args.open:
+ from scriber.core.open_file import open_path
+ open_path(output)
+ return 0
+ except ScriberError as exc:
+ # parser.exit() writes the message to stderr and raises SystemExit(status).
+ parser.exit(2, f"scriber: error: {exc}\n")
+ except KeyboardInterrupt:
+ parser.exit(130, "scriber: interrupted\n")
+
+
+ # Script entry point: exit the interpreter with the status returned by main().
+ if __name__ == "__main__":
+     sys.exit(main())
diff --git a/src/scriber/config.py b/src/scriber/config.py
deleted file mode 100644
index 6f205d5..0000000
--- a/src/scriber/config.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""
-Configuration data structure for the Scriber application.
-"""
-from dataclasses import asdict, dataclass, field
-from typing import Any, Dict, List, Set
-
-_DEFAULT_OUTPUT_FILENAME = "scriber_output.txt"
-_CONFIG_FILE_NAME = ".scriber.json"
-
-
-@dataclass
-class ScriberConfig:
- """
- A dataclass to hold all configuration settings for Scriber.
-
- This provides a structured, type-safe way to manage configuration,
- replacing the previous dictionary-based approach. It includes methods
- for easy conversion to and from dictionaries.
- """
- use_gitignore: bool = True
- exclude: List[str] = field(default_factory=lambda: [
- "LICENSE",
- ".git/",
- ".idea/", ".vscode/", ".project/", ".settings/", ".classpath/",
- "__pycache__/", "*.pyc", ".venv/", "venv/", ".pytest_cache/", "uv.lock",
- "node_modules/", "npm-debug.log*", "yarn-error.log",
- "build/", "dist/", "target/", "bin/", "obj/", "out/",
- "vendor/", "bower_components/",
- "*.log", "*.lock", "*.tmp", "temp/", "tmp/",
- ".DS_Store", "Thumbs.db", "*~", "*.swp", "*.swo",
- _DEFAULT_OUTPUT_FILENAME, _CONFIG_FILE_NAME
- ])
- include: List[str] = field(default_factory=list)
- hidden: List[str] = field(default_factory=list)
- exclude_map: Dict[str, List[str]] = field(default_factory=dict)
- output: str = _DEFAULT_OUTPUT_FILENAME
- single_process: bool = False
-
- def to_dict(self) -> Dict[str, Any]:
- """
- Converts the configuration dataclass to a dictionary.
-
- Returns:
- A dictionary representation of the configuration settings.
- """
- return asdict(self)
\ No newline at end of file
diff --git a/src/scriber/core.py b/src/scriber/core.py
deleted file mode 100644
index 185a5b8..0000000
--- a/src/scriber/core.py
+++ /dev/null
@@ -1,682 +0,0 @@
-import fnmatch
-import io
-import json
-import multiprocessing
-import os
-import sys
-from collections import Counter
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, TextIO, Union
-
-try:
- import tomllib
-except ImportError:
- import tomli as tomllib
-
-try:
- import pathspec
-except ImportError:
- pathspec = None
-
-import tiktoken
-
-from .config import ScriberConfig
-
-DEFAULT_CONFIG = ScriberConfig()
-
-
-def _process_file_worker(
- file_path: Path,
- containing_root: Path,
- hidden_patterns: Set[str],
- language_map: Dict[str, str],
- tokenizer: Optional[Any],
-) -> Dict[str, Any]:
- """Processes a single file to gather stats; safe for multiprocessing.
-
- This function is defined at the top level to avoid pickling issues with
- instance methods that have un-pickleable attributes (like rich.Console).
-
- Args:
- file_path: The path of the file to process.
- containing_root: The root directory that contains the file.
- hidden_patterns: A set of patterns for files whose content should be hidden.
- language_map: A dictionary mapping file extensions to languages.
- tokenizer: The tiktoken tokenizer instance.
-
- Returns:
- A dictionary containing the size, token count, and language of the file.
- """
- stats: Dict[str, Any] = {"size": 0, "tokens": 0, "lang": "other"}
- try:
- stats["size"] = file_path.stat().st_size
- stats["lang"] = language_map.get(file_path.suffix, language_map.get(file_path.name, "")) or "other"
-
- is_hidden = False
- if hidden_patterns:
- relative_path_str = file_path.relative_to(containing_root).as_posix()
- if any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in hidden_patterns):
- is_hidden = True
-
- if not is_hidden and tokenizer:
- content = file_path.read_text(encoding="utf-8", errors="ignore")
- stats["tokens"] = len(tokenizer.encode(content))
- except Exception:
- pass
- return stats
-
-
-class Scriber:
- """
- Maps, analyzes, and compiles a project's source code into a single output.
-
- This class can be used programmatically to gain fine-grained control over the
- project mapping process, access intermediate data like file lists and
- statistics, and get the final output as a string for further processing.
- """
- _CONFIG_FILE_NAME = ".scriber.json"
- _LANGUAGE_MAP = {
- ".asm": "asm", ".s": "asm", ".html": "html", ".htm": "html", ".css": "css",
- ".scss": "scss", ".sass": "sass", ".less": "less", ".js": "javascript",
- ".mjs": "javascript", ".cjs": "javascript", ".jsx": "jsx", ".ts": "typescript",
- ".tsx": "tsx", ".vue": "vue", ".svelte": "svelte", ".py": "python", ".pyw": "python",
- ".rb": "ruby", ".java": "java", ".kt": "kotlin", ".kts": "kotlin", ".scala": "scala",
- ".go": "go", ".php": "php", ".c": "c", ".h": "c", ".cpp": "cpp", ".hpp": "cpp",
- ".cs": "csharp", ".rs": "rust", ".swift": "swift", ".dart": "dart", ".pl": "perl",
- ".pm": "perl", ".hs": "haskell", ".lua": "lua", ".erl": "erlang", ".ex": "elixir",
- ".exs": "elixir", ".clj": "clojure", ".lisp": "lisp", ".f": "fortran",
- ".f90": "fortran", ".zig": "zig", ".d": "d", ".v": "v", ".cr": "crystal",
- ".nim": "nim", ".pas": "pascal", ".ml": "ocaml", ".sh": "bash", ".bash": "bash",
- ".zsh": "zsh", ".fish": "fish", ".ps1": "powershell", ".bat": "batch",
- ".json": "json", ".jsonc": "jsonc", ".xml": "xml", ".yaml": "yaml", ".yml": "yaml",
- ".toml": "toml", ".ini": "ini", ".properties": "properties", ".env": "dotenv",
- "Dockerfile": "dockerfile", ".tf": "terraform", ".hcl": "hcl", ".groovy": "groovy",
- ".gradle": "groovy", ".cmake": "cmake", "CMakeLists.txt": "cmake", ".md": "markdown",
- ".mdx": "mdx", ".rst": "rst", ".tex": "latex", "LICENSE": "text", ".sql": "sql",
- ".graphql": "graphql", ".proto": "protobuf", ".glsl": "glsl", ".frag": "glsl",
- ".vert": "glsl", ".vb": "vbnet", ".vbs": "vbscript",
- }
-
- def __init__(
- self,
- root_path: Union[Path, List[Path]],
- config: Optional[Union[Dict[str, Any], ScriberConfig]] = None,
- config_path: Optional[Path] = None
- ):
- """Initializes the Scriber instance.
-
- Args:
- root_path: An absolute path or a list of absolute paths to the root
- directories of the project(s) to be mapped.
- config: An optional dictionary or ScriberConfig object of settings.
- Takes the highest precedence if provided.
- config_path: An optional path to a specific configuration file.
- """
- raw_paths = [root_path] if isinstance(root_path, Path) else root_path
- self.root_paths: List[Path] = [p.resolve() for p in raw_paths]
- self.primary_root: Path = self.root_paths[0]
-
- self.mapped_files: List[Path] = []
- self._user_config_path = config_path
- self._user_config_input = config
- self.config: ScriberConfig = ScriberConfig()
- self.config_path_used: Optional[Path] = None
- self.gitignore_spec: Optional[Any] = None
- self.dir_exclude_spec: Optional[Any] = None
- self.general_exclude_spec: Optional[Any] = None
- self.hidden_patterns: Set[str] = set()
- self.include_patterns: List[str] = []
- self.exclude_patterns: List[str] = []
- self.exclude_map: Dict[str, List[str]] = {}
- self.single_process: bool = False
-
- self.stats = {}
- self._has_mapped = False
- self._reset_stats()
- self._load_config()
- try:
- self._tokenizer = tiktoken.get_encoding("cl100k_base")
- except Exception:
- self._tokenizer = None
-
- def _reset_stats(self):
- """Resets the statistics and mapped files to their initial state."""
- self.mapped_files = []
- self.stats = {
- "total_files": 0,
- "total_size_bytes": 0,
- "total_tokens": 0,
- "language_counts": Counter(),
- "skipped_binary": 0,
- }
- self._has_mapped = False
-
- def _create_default_config_file(self) -> None:
- """Creates a default .scriber.json config file if no other config is found."""
- config_path = self.primary_root / self._CONFIG_FILE_NAME
- print(f"β¨ No config found. Creating default configuration at: {config_path}", file=sys.stderr)
-
- file_config = {
- "use_gitignore": DEFAULT_CONFIG.use_gitignore,
- "exclude": DEFAULT_CONFIG.exclude,
- "include": DEFAULT_CONFIG.include,
- "hidden": DEFAULT_CONFIG.hidden
- }
- try:
- with config_path.open("w", encoding="utf-8") as f:
- json.dump(file_config, f, indent=2)
- except IOError as e:
- print(f"β Could not create default config file: {e}", file=sys.stderr)
-
- def _load_config(self) -> None:
- """Loads configuration with a clear precedence: direct config > config_path > local files."""
- config_data = DEFAULT_CONFIG.to_dict()
- config_source_loaded = False
-
- if self._user_config_input:
- if isinstance(self._user_config_input, ScriberConfig):
- config_data.update(self._user_config_input.to_dict())
- else:
- config_data.update(self._user_config_input)
- config_source_loaded = True
- self.config_path_used = None
- else:
- config_path_to_use = self._user_config_path
- if config_path_to_use:
- if not config_path_to_use.is_file():
- print(f"Warning: Config file specified by --config not found at {self._user_config_path}", file=sys.stderr)
- config_path_to_use = None
- else:
- json_path = self.primary_root / self._CONFIG_FILE_NAME
- toml_path = self.primary_root / "pyproject.toml"
- if json_path.is_file():
- config_path_to_use = json_path
- elif toml_path.is_file():
- config_path_to_use = toml_path
-
- if config_path_to_use:
- self.config_path_used = config_path_to_use
- try:
- if config_path_to_use.suffix == ".toml":
- with config_path_to_use.open("rb") as f:
- toml_data = tomllib.load(f)
- if "tool" in toml_data and "scriber" in toml_data["tool"]:
- config_data.update(toml_data["tool"]["scriber"])
- config_source_loaded = True
- else:
- with config_path_to_use.open("r", encoding="utf-8") as f:
- config_data.update(json.load(f))
- config_source_loaded = True
- except (json.JSONDecodeError, tomllib.TOMLDecodeError, IOError) as e:
- print(f"Error parsing config file {self.config_path_used}: {e}", file=sys.stderr)
-
- if not config_source_loaded and not self._user_config_input and self._user_config_path is None:
- self._create_default_config_file()
-
- self.config = ScriberConfig(**config_data)
- self.include_patterns = self.config.include
- self.exclude_patterns = self.config.exclude
- self.hidden_patterns = set(self.config.hidden)
- self.exclude_map = self.config.exclude_map
- self.single_process = self.config.single_process
-
- if not pathspec:
- print("Warning: 'pathspec' not installed. .gitignore and advanced exclude patterns will be ignored.", file=sys.stderr)
- else:
- dir_exclude_patterns = [p for p in self.exclude_patterns if p.endswith('/')]
- general_exclude_patterns = [p for p in self.exclude_patterns if not p.endswith('/')]
-
- self.dir_exclude_spec = pathspec.PathSpec.from_lines("gitwildmatch", dir_exclude_patterns)
- self.general_exclude_spec = pathspec.PathSpec.from_lines("gitwildmatch", general_exclude_patterns)
- self._load_gitignore(self.config.use_gitignore)
-
- def _load_gitignore(self, use_gitignore: bool) -> None:
- """Loads gitignore patterns from the .gitignore file if enabled.
-
- Args:
- use_gitignore: A boolean indicating whether to use .gitignore rules.
- """
- self.gitignore_spec: Optional[pathspec.PathSpec] = None
- if not use_gitignore or not pathspec:
- return
-
- gitignore_path = self.primary_root / ".gitignore"
- if gitignore_path.is_file():
- try:
- with gitignore_path.open("r", encoding="utf-8") as f:
- self.gitignore_spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
- except IOError:
- pass
-
- def _find_containing_root(self, path: Path) -> Optional[Path]:
- """Finds which root directory from self.root_paths contains the given path.
-
- Args:
- path: The path to check.
-
- Returns:
- The containing root path, or None if not found.
- """
- for r in self.root_paths:
- try:
- if path.is_relative_to(r):
- return r
- except ValueError:
- continue
- return None
-
- def _is_binary(self, path: Path) -> bool:
- """Checks if a file is likely a binary file.
-
- Args:
- path: The path to the file.
-
- Returns:
- True if the file contains null bytes, False otherwise.
- """
- try:
- with path.open('rb') as f:
- return b'\0' in f.read(1024)
- except IOError:
- return True
-
- def _is_excluded(self, path: Path) -> bool:
- """Determines if a file or directory should be excluded from mapping.
-
- Args:
- path: The path to check.
-
- Returns:
- True if the path should be excluded, False otherwise.
- """
- containing_root = self._find_containing_root(path)
- if not containing_root:
- return True
-
- # When checking a directory for pruning, its path might not have a trailing
- # slash, so we treat it as such for matching.
- is_dir = path.is_dir()
-
- if self.gitignore_spec:
- try:
- relative_path_for_gitignore = path.relative_to(self.primary_root).as_posix()
- if is_dir and not relative_path_for_gitignore.endswith('/'):
- relative_path_for_gitignore += '/'
- if self.gitignore_spec.match_file(relative_path_for_gitignore):
- return True
- except ValueError:
- pass
-
- relative_path_str = path.relative_to(containing_root).as_posix()
-
- if is_dir:
- path_for_dir_spec = relative_path_str + '/'
- if self.dir_exclude_spec and self.dir_exclude_spec.match_file(path_for_dir_spec):
- return True
- if self.general_exclude_spec and self.general_exclude_spec.match_file(relative_path_str):
- return True
- else: # Is a file
- if self.general_exclude_spec and self.general_exclude_spec.match_file(relative_path_str):
- return True
-
- if path.is_file():
- global_patterns = self.exclude_map.get("global", [])
- if any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in global_patterns):
- return True
-
- lang = self._get_language(path)
- if lang and lang in self.exclude_map:
- lang_patterns = self.exclude_map.get(lang, [])
- if any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in lang_patterns):
- return True
-
- if self.include_patterns:
- return not any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in self.include_patterns)
-
- return False
-
- def _is_hidden(self, path: Path) -> bool:
- """Checks if a path matches any of the hidden patterns.
-
- Args:
- path: The path to check.
-
- Returns:
- True if the path matches a hidden pattern, False otherwise.
- """
- if not self.hidden_patterns:
- return False
- containing_root = self._find_containing_root(path)
- if not containing_root:
- return False
- relative_path_str = path.relative_to(containing_root).as_posix()
- return any(fnmatch.fnmatch(relative_path_str, pattern) for pattern in self.hidden_patterns)
-
- def _collect_files(self, perform_binary_check: bool = True) -> None:
- """Walks the project directory and collects all non-excluded files.
-
- Args:
- perform_binary_check: If False, skips the check for binary files.
- """
- collected = set()
- for root_dir in self.root_paths:
- for root, dirs, files in os.walk(root_dir, topdown=True):
- current_root = Path(root)
- dirs[:] = [d for d in dirs if not self._is_excluded(current_root / d)]
- for file in files:
- file_path = current_root / file
- if not self._is_excluded(file_path):
- if perform_binary_check and self._is_binary(file_path):
- self.stats['skipped_binary'] += 1
- continue
- collected.add(file_path)
- self.mapped_files = sorted(list(collected))
-
- def map_project(self) -> None:
- """Maps all relevant project files and gathers statistics."""
- self._reset_stats()
- self._collect_files(perform_binary_check=True)
- self._gather_stats()
- self._has_mapped = True
-
- def map_tree_only(self) -> None:
- """Maps only the project file structure without reading file contents."""
- self._reset_stats()
- self._collect_files(perform_binary_check=False)
- self.stats['total_files'] = len(self.mapped_files)
- self._has_mapped = True
-
- def _gather_stats(self) -> None:
- """Gathers statistics about the mapped files."""
- if not self.mapped_files:
- return
-
- self.stats['total_files'] = len(self.mapped_files)
- total_size = 0
- total_tokens = 0
- language_counts: Counter = Counter()
-
- if self.single_process:
- for path in self.mapped_files:
- containing_root = self._find_containing_root(path)
- if containing_root:
- try:
- file_stats = _process_file_worker(
- path, containing_root, self.hidden_patterns, self._LANGUAGE_MAP, self._tokenizer
- )
- total_size += file_stats["size"]
- total_tokens += file_stats["tokens"]
- language_counts[file_stats["lang"]] += 1
- except Exception as exc:
- print(f"File processing generated an exception: {exc}", file=sys.stderr)
- else:
- context = multiprocessing.get_context("spawn")
- with ProcessPoolExecutor(mp_context=context) as executor:
- futures = []
- for path in self.mapped_files:
- containing_root = self._find_containing_root(path)
- if containing_root:
- futures.append(executor.submit(
- _process_file_worker,
- path,
- containing_root,
- self.hidden_patterns,
- self._LANGUAGE_MAP,
- self._tokenizer,
- ))
-
- for future in as_completed(futures):
- try:
- file_stats = future.result()
- total_size += file_stats["size"]
- total_tokens += file_stats["tokens"]
- language_counts[file_stats["lang"]] += 1
- except Exception as exc:
- print(f"File processing generated an exception: {exc}", file=sys.stderr)
-
- self.stats['total_size_bytes'] = total_size
- self.stats['total_tokens'] = total_tokens
- self.stats['language_counts'] = language_counts
-
- def get_stats(self) -> Dict:
- """Returns the collected project statistics.
-
- If the project has not been mapped yet, `map_project()` will be called first.
-
- Returns:
- A dictionary containing project statistics.
- """
- if not self._has_mapped:
- self.map_project()
- return self.stats
-
- def get_file_count(self) -> int:
- """Returns the number of files that will be mapped.
-
- If the project has not been mapped yet, `map_project()` will be called first.
-
- Returns:
- The total count of mapped files.
- """
- if not self._has_mapped:
- self.map_project()
- return len(self.mapped_files)
-
- def get_mapped_files(self) -> List[Path]:
- """Returns a list of all mapped file paths.
-
- If the project has not been mapped yet, `map_project()` will be called first.
-
- Returns:
- A sorted list of `pathlib.Path` objects for all included files.
- """
- if not self._has_mapped:
- self.map_project()
- return self.mapped_files
-
- def get_tree(self) -> str:
- """Returns the formatted file tree representation as a string.
-
- If the project has not been mapped yet, `map_project()` will be called first.
-
- Returns:
- A string containing the formatted file tree.
- """
- if not self._has_mapped:
- self.map_project()
- return self._get_tree_representation()
-
- def get_output_as_string(self, tree_only: bool = False, progress=None, task_id=None) -> str:
- """Generates the consolidated project output and returns it as a string.
-
- If the project has not been mapped yet, `map_project()` will be called first.
-
- Args:
- tree_only: If True, the string will only contain the file tree.
- progress: An optional Rich Progress instance for updating a progress bar.
- task_id: An optional ID for the task in the Rich Progress instance.
-
- Returns:
- A string containing the complete project map and file contents.
- """
- if not self._has_mapped:
- if tree_only:
- self.map_tree_only()
- else:
- self.map_project()
- output_buffer = io.StringIO()
- self._write_output(output_buffer, tree_only, progress=progress, task_id=task_id)
- return output_buffer.getvalue()
-
- def generate_output_file(self, output_filename: str, tree_only: bool = False, progress=None, task_id=None) -> None:
- """Generates the consolidated project structure output file.
-
- Args:
- output_filename: The name for the output file.
- tree_only: If True, only the file tree is written.
- progress: A Rich Progress instance for updating the progress bar.
- task_id: The ID of the task in the Rich Progress instance.
- """
- if not self._has_mapped:
- if tree_only:
- self.map_tree_only()
- else:
- self.map_project()
- output_filepath = self.primary_root / output_filename
- with output_filepath.open("w", encoding="utf-8") as f:
- self._write_output(f, tree_only, progress, task_id)
-
- def _write_output(self, f: TextIO, tree_only: bool, progress, task_id) -> None:
- """Writes the complete project map and file contents to an open file stream.
-
- Args:
- f: The file stream to write to.
- tree_only: If True, only write the file tree.
- progress: A Rich Progress instance for updating the progress bar.
- task_id: The ID of the task in the Rich Progress instance.
- """
- f.write("=" * 3 + "\n Mapped Folder Structure\n" + "=" * 3 + "\n\n")
- f.write(self._get_tree_representation() + "\n")
-
- if tree_only:
- return
-
- for file_path in self.mapped_files:
- if self._is_hidden(file_path):
- self._write_hidden_file_placeholder(f, file_path)
- else:
- self._write_file_content(f, file_path)
- if progress and task_id is not None:
- progress.update(task_id, advance=1)
-
- def _get_display_path(self, file_path: Path) -> str:
- """Gets the path to display in the output header.
-
- Args:
- file_path: The absolute path to the file.
-
- Returns:
- A string representing the path for display.
- """
- containing_root = self._find_containing_root(file_path)
- if not containing_root:
- return file_path.name
-
- relative_path = file_path.relative_to(containing_root)
- if len(self.root_paths) > 1:
- return (Path(containing_root.name) / relative_path).as_posix()
- return relative_path.as_posix()
-
- def _write_hidden_file_placeholder(self, f: TextIO, file_path: Path) -> None:
- """Writes a placeholder for a hidden file's content.
-
- Args:
- f: The file stream to write to.
- file_path: The path of the hidden file.
- """
- try:
- display_path = self._get_display_path(file_path)
- file_size = file_path.stat().st_size
- except (OSError, ValueError):
- return
-
- f.write("\n" + "-" * 3 + "\n")
- f.write(f"File: {display_path}\nSize: {file_size} bytes\n" + "-" * 3 + "\n")
- f.write("```\n[Content hidden based on configuration]\n```\n")
-
- def _write_file_content(self, f: TextIO, file_path: Path) -> None:
- """Writes a single file's content to the output stream.
-
- Args:
- f: The file stream to write to.
- file_path: The path of the file to write.
- """
- try:
- display_path = self._get_display_path(file_path)
- file_size = file_path.stat().st_size
- lang = self._get_language(file_path)
- content = file_path.read_text(encoding="utf-8", errors="ignore")
- except (OSError, ValueError):
- return
-
- f.write("\n" + "-" * 3 + "\n")
- f.write(f"File: {display_path}\nSize: {file_size} bytes\n" + "-" * 3 + "\n")
- f.write(f"```{lang}\n{content}\n```\n")
-
- def _get_language(self, file_path: Path) -> str:
- """Determines the programming language of a file based on its extension.
-
- Args:
- file_path: The path to the file.
-
- Returns:
- A string representing the language, or an empty string if not found.
- """
- return self._LANGUAGE_MAP.get(file_path.suffix, self._LANGUAGE_MAP.get(file_path.name, ""))
-
- def _get_tree_representation(self) -> str:
- """Generates a string representation of the project's file tree.
-
- Returns:
- A formatted string of the file tree.
- """
- tree = self._build_file_tree()
- if not tree: return "No files or folders to map."
-
- def format_tree(d: Dict, prefix: str = "") -> List[str]:
- lines = []
- items = sorted(d.keys())
- for i, key in enumerate(items):
- is_last = i == len(items) - 1
- connector = "βββ " if is_last else "βββ "
- lines.append(f"{prefix}{connector}{key}")
- if d[key]:
- new_prefix = prefix + (" " if is_last else "β ")
- lines.extend(format_tree(d[key], new_prefix))
- return lines
-
- if len(self.root_paths) == 1:
- root_name = list(tree.keys())[0]
- output_lines = [root_name]
- output_lines.extend(format_tree(tree[root_name]))
- else:
- output_lines = []
- for root_name, subtree in sorted(tree.items()):
- output_lines.append(root_name)
- output_lines.extend(format_tree(subtree))
- return "\n".join(output_lines)
-
- def _build_file_tree(self) -> Dict[str, Any]:
- """Builds a nested dictionary representing the file tree structure.
-
- Returns:
- A dictionary representing the project's file hierarchy.
- """
- if not self.mapped_files: return {}
-
- if len(self.root_paths) == 1:
- tree = {self.primary_root.name: {}}
- project_level = tree[self.primary_root.name]
- for path in self.mapped_files:
- parts = path.relative_to(self.primary_root).parts
- current_level = project_level
- for part in parts:
- current_level = current_level.setdefault(part, {})
- return tree
- else:
- tree = {}
- for path in self.mapped_files:
- containing_root = self._find_containing_root(path)
- if not containing_root:
- continue
-
- root_name = containing_root.name
- if root_name not in tree:
- tree[root_name] = {}
-
- parts = path.relative_to(containing_root).parts
- current_level = tree[root_name]
- for part in parts:
- current_level = current_level.setdefault(part, {})
- return tree
\ No newline at end of file
diff --git a/src/scriber/core/__init__.py b/src/scriber/core/__init__.py
new file mode 100644
index 0000000..b3b8256
--- /dev/null
+++ b/src/scriber/core/__init__.py
@@ -0,0 +1,64 @@
+from .errors import ScriberError
+from .models import (
+ Candidate,
+ ContentPolicy,
+ FileKind,
+ FileNode,
+ ModuleConfig,
+ ModuleGraph,
+ OutputFormat,
+ PythonConfig,
+ ScriberConfig,
+ ScriberPack,
+ SeedPath,
+ SupportContentConfig,
+)
+from .matchers import SimpleGitIgnore, match_pattern, matches_any, normalize_rel
+from .root import (
+ resolve_config_path,
+ project_root_from_config,
+ ensure_inside_root,
+ rel_to_root,
+)
+from .config import (
+ DEFAULT_CODE_PATTERNS,
+ DEFAULT_SUPPORT_PATTERNS,
+ DEFAULT_SUPPORT_FULL,
+ DEFAULT_SUPPORT_TREE_ONLY,
+ DEFAULT_HARD_IGNORE,
+ DEFAULT_CONFIG_BLOCK,
+ load_config,
+ apply_overrides,
+)
+
+__all__ = [
+ "ScriberError",
+ "Candidate",
+ "ContentPolicy",
+ "FileKind",
+ "FileNode",
+ "ModuleConfig",
+ "ModuleGraph",
+ "OutputFormat",
+ "PythonConfig",
+ "ScriberConfig",
+ "ScriberPack",
+ "SeedPath",
+ "SupportContentConfig",
+ "SimpleGitIgnore",
+ "match_pattern",
+ "matches_any",
+ "normalize_rel",
+ "resolve_config_path",
+ "project_root_from_config",
+ "ensure_inside_root",
+ "rel_to_root",
+ "DEFAULT_CODE_PATTERNS",
+ "DEFAULT_SUPPORT_PATTERNS",
+ "DEFAULT_SUPPORT_FULL",
+ "DEFAULT_SUPPORT_TREE_ONLY",
+ "DEFAULT_HARD_IGNORE",
+ "DEFAULT_CONFIG_BLOCK",
+ "load_config",
+ "apply_overrides",
+]
diff --git a/src/scriber/core/config.py b/src/scriber/core/config.py
new file mode 100644
index 0000000..fed0545
--- /dev/null
+++ b/src/scriber/core/config.py
@@ -0,0 +1,447 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+from dataclasses import dataclass
+
+try: # pragma: no cover - exercised on Python < 3.11 only
+ import tomllib
+except ModuleNotFoundError: # pragma: no cover
+ import tomli as tomllib # type: ignore[no-redef]
+
+from .models import CacheConfig, ModuleConfig, PythonConfig, ScriberConfig, SupportContentConfig, TokenConfig
+
+DEFAULT_CODE_PATTERNS = [  # glob patterns; load_config() seeds ScriberConfig.code_patterns with a copy of this list
+ "**/*.py",
+ "**/*.pyi",
+ "**/*.rs",
+ "**/*.js",
+ "**/*.jsx",
+ "**/*.ts",
+ "**/*.tsx",
+ "**/*.go",
+ "**/*.java",
+ "**/*.kt",
+ "**/*.c",
+ "**/*.cpp",
+ "**/*.h",
+ "**/*.hpp",
+]
+
+DEFAULT_SUPPORT_PATTERNS = [  # glob patterns; load_config() seeds ScriberConfig.support_patterns with a copy of this list
+ "**/*.toml",
+ "**/*.lock",
+ "pyproject.toml",
+ "setup.py",
+ "setup.cfg",
+ "requirements.txt",
+ "requirements/*.txt",
+ "tox.ini",
+ "pytest.ini",
+ "mypy.ini",
+ "ruff.toml",
+ ".ruff.toml",
+ "Pipfile",
+ "README.md",
+ "README.rst",
+ "CHANGELOG.md",
+ "CONTRIBUTING.md",
+ "docs/**/*.md",
+ ".env.example",
+ ".env.template",
+ "config/*.toml",
+ "config/*.yaml",
+ "config/*.yml",
+ "config/*.json",
+ "settings/*.toml",
+ "settings/*.yaml",
+ "settings/*.yml",
+ "Dockerfile",
+ "Dockerfile.*",
+ "docker-compose.yml",
+ "docker-compose.yaml",
+ "compose.yml",
+ "compose.yaml",
+ ".github/workflows/*.yml",
+ ".github/workflows/*.yaml",
+ ".gitlab-ci.yml",
+ ".pre-commit-config.yaml",
+ "package.json",
+ "tsconfig.json",
+ "vite.config.*",
+ "webpack.config.*",
+ "Cargo.toml",
+ "Cargo.lock",
+ "go.mod",
+ "go.sum",
+ "poetry.lock",
+ "uv.lock",
+ "Pipfile.lock",
+ "package-lock.json",
+ "pnpm-lock.yaml",
+ "yarn.lock",
+ "**/*.svg",
+]
+
+DEFAULT_SUPPORT_FULL = [  # fallback that load_config() assigns to support_content.full when that list is empty
+ "**/*.toml",
+ "pyproject.toml",
+ "requirements.txt",
+ "requirements/*.txt",
+ "pytest.ini",
+ "tox.ini",
+ "mypy.ini",
+ "ruff.toml",
+ ".ruff.toml",
+ ".env.example",
+ ".env.template",
+ "Dockerfile",
+ "Dockerfile.*",
+ "docker-compose.yml",
+ "docker-compose.yaml",
+ ".github/workflows/*.yml",
+ ".github/workflows/*.yaml",
+ "README.md",
+ "Cargo.toml",
+ "go.mod",
+]
+
+DEFAULT_SUPPORT_TREE_ONLY = [  # fallback that load_config() assigns to support_content.tree_only when that list is empty
+ "**/*.svg",
+ "**/*.lock",
+ "Cargo.lock",
+ "poetry.lock",
+ "uv.lock",
+ "Pipfile.lock",
+ "package-lock.json",
+ "pnpm-lock.yaml",
+ "yarn.lock",
+ "go.sum",
+]
+
+DEFAULT_HARD_IGNORE = [  # glob patterns; load_config() seeds ScriberConfig.hard_ignore_patterns with a copy, which a configured hard_ignore.patterns list replaces
+ ".git/**",
+ ".idea/**",
+ ".hg/**",
+ ".svn/**",
+ ".scriber/**",
+ ".venv/**",
+ "venv/**",
+ "env/**",
+ "__pycache__/**",
+ ".pytest_cache/**",
+ ".mypy_cache/**",
+ ".ruff_cache/**",
+ "node_modules/**",
+ "dist/**",
+ "build/**",
+ "target/**",
+ ".next/**",
+ ".turbo/**",
+]
+
+DEFAULT_CONFIG_BLOCK = """
+[tool.scriber]
+version = "2"
+format = "md"
+output = ".scriber/scriber_pack.md"
+only_tree = false
+use_gitignore = true
+max_files = 60
+max_tokens = 100000
+min_score = 45
+path_style = "project-relative"
+allow_external_paths = false
+
+[tool.scriber.code_files]
+patterns = ["**/*.py", "**/*.pyi", "**/*.rs", "**/*.js", "**/*.jsx", "**/*.ts", "**/*.tsx"]
+
+[tool.scriber.support_files]
+enabled = true
+patterns = [
+ "**/*.toml",
+ "**/*.lock",
+ "pyproject.toml",
+ "README.md",
+ "requirements.txt",
+ "requirements/*.txt",
+ ".env.example",
+ "Dockerfile",
+ "docker-compose.yml",
+ ".github/workflows/*.yml",
+ "**/*.svg",
+]
+
+[tool.scriber.support_files.content]
+default = "auto"
+full = ["**/*.toml", "pyproject.toml", "README.md", "requirements.txt", "requirements/*.txt", ".env.example", "Dockerfile", "docker-compose.yml", ".github/workflows/*.yml"]
+tree_only = ["**/*.svg", "**/*.lock"]
+
+[tool.scriber.modules]
+enabled = true
+depth = 2
+include_direct_dependencies = true
+include_reverse_dependencies = true
+include_tests = true
+include_same_package = true
+include_parent_entrypoints = true
+include_project_configs = true
+content_min_score = 50
+tree_min_score = 30
+
+[tool.scriber.python]
+source_roots = ["src", "app", "."]
+test_roots = ["tests", "test"]
+entrypoint_patterns = ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"]
+
+[tool.scriber.tokens]
+estimator = "chars"
+chars_per_token = 4
+""".strip()  # TOML text for a [tool.scriber] section. NOTE(review): code_files.patterns above omits the .go/.java/.kt/.c/.cpp/.h/.hpp globs that DEFAULT_CODE_PATTERNS contains - confirm the narrower list is intended
+
+
+def load_raw_pyproject(config_path: Path) -> dict[str, Any]:
+    with config_path.open("rb") as toml_stream:  # tomllib.load() only accepts a binary file object
+        return tomllib.load(toml_stream)
+
+
+def load_config(config_path: Path) -> ScriberConfig:  # Build a ScriberConfig from the [tool.scriber] table of the TOML file at config_path, layered over the built-in defaults
+ raw = load_raw_pyproject(config_path)
+ tool = raw.get("tool", {}) if isinstance(raw, dict) else {}
+ data = tool.get("scriber", {}) if isinstance(tool, dict) else {}  # a non-dict "tool" value yields an empty mapping
+
+ config = ScriberConfig(  # list(...) copies, so the module-level default lists are never shared with the config object
+ code_patterns=list(DEFAULT_CODE_PATTERNS),
+ support_patterns=list(DEFAULT_SUPPORT_PATTERNS),
+ hard_ignore_patterns=list(DEFAULT_HARD_IGNORE),
+ )
+
+ if not isinstance(data, dict):  # "scriber" is present but is not a table: return the defaults built above
+ return config
+
+ config.version = str(data.get("version", config.version))  # scalar options: an absent key falls back to the value already on config
+ config.format = data.get("format", config.format)  # stored as given; no coercion or validation happens here
+ config.output = Path(data.get("output", str(config.output)))
+ config.only_tree = bool(data.get("only_tree", config.only_tree))
+ config.use_gitignore = bool(data.get("use_gitignore", config.use_gitignore))
+ config.max_files = int(data.get("max_files", config.max_files))
+ config.max_tokens = int(data.get("max_tokens", config.max_tokens))
+ config.min_score = int(data.get("min_score", config.min_score))
+ config.path_style = str(data.get("path_style", config.path_style))
+ config.allow_external_paths = bool(data.get("allow_external_paths", config.allow_external_paths))
+
+ code_files = data.get("code_files", {})
+ if isinstance(code_files, dict) and isinstance(code_files.get("patterns"), list):  # a "patterns" list replaces the default code patterns rather than extending them
+ config.code_patterns = [str(item) for item in code_files["patterns"]]
+
+ support_files = data.get("support_files", {})
+ if isinstance(support_files, dict):
+ config.support = bool(support_files.get("enabled", config.support))
+ if isinstance(support_files.get("patterns"), list):  # likewise replaces the default support patterns
+ config.support_patterns = [str(item) for item in support_files["patterns"]]
+ content = support_files.get("content", {})
+ if isinstance(content, dict):
+ config.support_content = SupportContentConfig(
+ default=content.get("default", config.support_content.default),
+ full=[str(item) for item in content.get("full", config.support_content.full)],
+ tree_only=[str(item) for item in content.get("tree_only", config.support_content.tree_only)],
+ auto_max_bytes=int(content.get("auto_max_bytes", config.support_content.auto_max_bytes)),
+ )
+ if not config.support_content.full:  # an empty "full" list is replaced with a copy of DEFAULT_SUPPORT_FULL
+ config.support_content.full = list(DEFAULT_SUPPORT_FULL)
+ if not config.support_content.tree_only:  # an empty "tree_only" list is replaced with a copy of DEFAULT_SUPPORT_TREE_ONLY
+ config.support_content.tree_only = list(DEFAULT_SUPPORT_TREE_ONLY)
+
+ hard_ignore = data.get("hard_ignore", {})
+ if isinstance(hard_ignore, dict) and isinstance(hard_ignore.get("patterns"), list):  # a "patterns" list replaces the default hard-ignore patterns outright
+ config.hard_ignore_patterns = [str(item) for item in hard_ignore["patterns"]]
+
+ modules = data.get("modules", {})
+ if isinstance(modules, dict):
+ scoring = dict(config.modules_config.scoring)  # start from a copy of the current scoring table, then overlay configured values
+ raw_scoring = modules.get("scoring", {})
+ if isinstance(raw_scoring, dict):
+ scoring.update({str(key): int(value) for key, value in raw_scoring.items()})  # keys coerced to str, values to int
+ config.modules_config = ModuleConfig(
+ enabled=bool(modules.get("enabled", config.modules_config.enabled)),
+ depth=int(modules.get("depth", config.modules_config.depth)),
+ include_direct_dependencies=bool(modules.get("include_direct_dependencies", config.modules_config.include_direct_dependencies)),
+ include_reverse_dependencies=bool(modules.get("include_reverse_dependencies", config.modules_config.include_reverse_dependencies)),
+ include_tests=bool(modules.get("include_tests", config.modules_config.include_tests)),
+ include_same_package=bool(modules.get("include_same_package", config.modules_config.include_same_package)),
+ include_parent_entrypoints=bool(modules.get("include_parent_entrypoints", config.modules_config.include_parent_entrypoints)),
+ include_project_configs=bool(modules.get("include_project_configs", config.modules_config.include_project_configs)),
+ content_min_score=int(modules.get("content_min_score", config.modules_config.content_min_score)),
+ tree_min_score=int(modules.get("tree_min_score", config.modules_config.tree_min_score)),
+ scoring=scoring,
+ )
+ config.modules = config.modules_config.enabled  # copy the enabled flag onto the top-level config.modules attribute
+
+ python = data.get("python", {})
+ if isinstance(python, dict):
+ config.python = PythonConfig(
+ source_roots=[str(item) for item in python.get("source_roots", config.python.source_roots)],
+ test_roots=[str(item) for item in python.get("test_roots", config.python.test_roots)],
+ module_init_files=[str(item) for item in python.get("module_init_files", config.python.module_init_files)],
+ entrypoint_patterns=[str(item) for item in python.get("entrypoint_patterns", config.python.entrypoint_patterns)],
+ )
+
+ tokens = data.get("tokens", {})
+ if isinstance(tokens, dict):
+ config.tokens = TokenConfig(
+ estimator=str(tokens.get("estimator", config.tokens.estimator)),
+ chars_per_token=int(tokens.get("chars_per_token", config.tokens.chars_per_token)),
+ )
+
+ cache = data.get("cache", {})
+ if isinstance(cache, dict):
+ config.cache = CacheConfig(
+ enabled=bool(cache.get("enabled", config.cache.enabled)),
+ dir=str(cache.get("dir", config.cache.dir)),
+ )
+
+ return config
+
+
+def apply_overrides(
+    config: ScriberConfig,
+    *,
+    output: str | None = None,
+    output_format: str | None = None,
+    only_tree: bool | None = None,
+    modules: bool | None = None,
+    support: bool | None = None,
+    max_files: int | None = None,
+    max_tokens: int | None = None,
+    min_score: int | None = None,
+    support_content: str | None = None,
+) -> ScriberConfig:
+    """Apply every override that is not None to *config* in place and return it."""
+    simple_overrides = (
+        ("format", output_format),
+        ("only_tree", only_tree),
+        ("support", support),
+        ("max_files", max_files),
+        ("max_tokens", max_tokens),
+        ("min_score", min_score),
+    )
+    if output is not None:
+        config.output = Path(output)
+    for attribute, supplied in simple_overrides:
+        if supplied is not None:
+            setattr(config, attribute, supplied)
+    if modules is not None:
+        # The top-level switch and the nested module settings always move together.
+        config.modules = config.modules_config.enabled = modules
+    if support_content is not None:
+        if support_content not in {"full", "auto", "tree_only"}:
+            raise ValueError("support_content must be one of: full, auto, tree_only")
+        config.support_content.default = support_content
+    return config
+
+
+@dataclass(slots=True)
+class ConfigIssue:  # one finding reported by the config validators in this module
+ severity: str # "warning" or "error"
+ message: str  # human-readable description of the problem
+
+
+def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]:
+    """Check the raw ``[tool.scriber]`` table and collect problems as issues.
+
+    Values of the wrong shape are reported, never raised: membership tests use
+    tuples rather than sets so that an unhashable value (for example an array
+    given where a string is expected) cannot trigger ``TypeError``.
+
+    Args:
+        raw_data: The whole parsed pyproject.toml document.
+
+    Returns:
+        Issues in check order. A missing or empty section yields one warning;
+        a ``tool`` or ``tool.scriber`` value that is not a table yields one error.
+    """
+    issues: list[ConfigIssue] = []
+
+    def error(message: str) -> None:
+        issues.append(ConfigIssue("error", message))
+
+    # 1. Locate [tool.scriber]; nothing else can be checked without it.
+    tool = raw_data.get("tool", {}) if isinstance(raw_data, dict) else {}
+    if not isinstance(tool, dict):
+        error("[tool] in pyproject.toml must be a table.")
+        return issues
+    data = tool.get("scriber", {})
+    if not data:
+        issues.append(ConfigIssue("warning", "[tool.scriber] section is missing or empty."))
+        return issues
+
+    if not isinstance(data, dict):
+        error("[tool.scriber] must be a table.")
+        return issues
+
+    # 2. Output format. A tuple is used for the membership test because
+    #    `value in {...}` hashes the value and raises TypeError for a list or
+    #    table; tuple membership only compares.
+    if "format" in data and data["format"] not in ("md", "txt"):
+        error(f"Invalid format: '{data['format']}'. Must be 'md' or 'txt'.")
+
+    # 3. Default content policy for support files.
+    support_files = data.get("support_files", {})
+    content = support_files.get("content", {}) if isinstance(support_files, dict) else None
+    if isinstance(content, dict) and "default" in content:
+        val = content["default"]
+        if val not in ("full", "auto", "tree_only"):
+            error(f"Invalid support_files.content.default: '{val}'. Must be 'full', 'auto', or 'tree_only'.")
+
+    # 4. Numeric limits must convert to an integer >= 0.
+    for name in ("max_files", "max_tokens", "min_score"):
+        if name in data:
+            try:
+                val = int(data[name])
+            except (ValueError, TypeError):
+                error(f"{name} must be an integer. Got: {data[name]}")
+            else:
+                if val < 0:
+                    error(f"{name} must be a number >= 0. Got: {val}")
+
+    # 5. Pattern settings must be lists of strings.
+    def check_string_list(parent: dict[str, Any], key: str, path_name: str) -> None:
+        if key not in parent:
+            return
+        if not isinstance(parent[key], list):
+            error(f"{path_name} must be a list of strings.")
+            return
+        for item in parent[key]:
+            if not isinstance(item, str):
+                error(f"Pattern in {path_name} must be a string. Got: {item}")
+
+    # Every section must be a table; support_files additionally carries the
+    # content.full / content.tree_only lists.
+    for section in ("code_files", "support_files", "hard_ignore"):
+        table = data.get(section, {})
+        if not isinstance(table, dict):
+            error(f"{section} must be a table.")
+            continue
+        check_string_list(table, "patterns", f"{section}.patterns")
+        content = table.get("content", {}) if section == "support_files" else None
+        if isinstance(content, dict):
+            for key in ("full", "tree_only"):
+                check_string_list(content, key, f"support_files.content.{key}")
+
+    return issues
+
+def validate_config(config: ScriberConfig, raw_data: dict[str, Any], config_path: Path | None = None) -> list[ConfigIssue]:
+    """Validate the raw table, then sanity-check the resolved output path."""
+    found = validate_raw_config(raw_data)
+
+    # A relative output is interpreted relative to the config file's directory.
+    target = config.output
+    if config_path and not target.is_absolute():
+        target = config_path.parent / target
+    looks_like_dir = not target.suffix and not str(target).endswith("-")
+    if looks_like_dir:
+        found.append(ConfigIssue("warning", f"Output path '{target}' has no extension. Is it a directory?"))
+    if target.exists() and target.is_dir():
+        found.append(ConfigIssue("error", f"Output path '{target}' points to an existing directory."))
+    return found
+
diff --git a/src/scriber/core/errors.py b/src/scriber/core/errors.py
new file mode 100644
index 0000000..08992d1
--- /dev/null
+++ b/src/scriber/core/errors.py
@@ -0,0 +1,2 @@
+class ScriberError(Exception):  # raised for user-facing problems, e.g. a config file that cannot be found
+ """Base exception for expected Scriber failures."""
diff --git a/src/scriber/core/init_config.py b/src/scriber/core/init_config.py
new file mode 100644
index 0000000..f345fe8
--- /dev/null
+++ b/src/scriber/core/init_config.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from pathlib import Path
+from scriber.core.errors import ScriberError
+from scriber.core.config import DEFAULT_CONFIG_BLOCK
+
+
+def replace_existing_tool_scriber_block(content: str, default_block: str) -> str:
+    """Remove every ``[tool.scriber]`` table from *content* and append *default_block*.
+
+    Sub-tables (``[tool.scriber.x]``) and arrays of tables
+    (``[[tool.scriber.x]]``) are removed as well, and a trailing ``# comment``
+    on a header line does not hide the header. This is a line scan, not a
+    TOML parser: a bracketed line inside a multi-line value ends the block.
+
+    Returns the remaining text, a blank line, the block and a final newline.
+    """
+    kept: list[str] = []
+    in_scriber = False
+    for line in content.splitlines():
+        # Drop a trailing comment so `[tool.scriber]  # note` is still a header.
+        bare = line.split("#", 1)[0].strip()
+        if bare.startswith("[") and bare.endswith("]"):
+            header = bare.strip("[]").strip()
+            in_scriber = header == "tool.scriber" or header.startswith("tool.scriber.")
+        if not in_scriber:
+            kept.append(line)
+    cleaned = "\n".join(kept).strip()
+    return f"{cleaned}\n\n{default_block}\n" if cleaned else f"{default_block}\n"
+
+
+def init_project(config_path: str | None = None, force: bool = False) -> Path:
+    """Write the default ``[tool.scriber]`` block into a pyproject.toml and return its path.
+
+    *config_path* may name the file or its directory (default: ./pyproject.toml).
+    Raises ScriberError when a scriber table already exists and *force* is false.
+    """
+    path = Path(config_path or "pyproject.toml")
+    if path.is_dir():
+        path = path / "pyproject.toml"
+    if not path.is_absolute():
+        path = Path.cwd() / path
+    if not path.exists():
+        path.write_text(DEFAULT_CONFIG_BLOCK + "\n", encoding="utf-8")
+        return path
+    content = path.read_text(encoding="utf-8")
+    # Detect real table headers (sub-tables included) instead of any substring.
+    bare_lines = (line.split("#", 1)[0].strip() for line in content.splitlines())
+    headers = (line.strip("[]").strip() for line in bare_lines if line.startswith("[") and line.endswith("]"))
+    if any(name == "tool.scriber" or name.startswith("tool.scriber.") for name in headers):
+        if not force:
+            raise ScriberError("Scriber config already exists. Use --force to replace it.")
+        new_content = replace_existing_tool_scriber_block(content, DEFAULT_CONFIG_BLOCK)
+    else:
+        new_content = content + ("\n" if content and not content.endswith("\n") else "") + "\n" + DEFAULT_CONFIG_BLOCK + "\n"
+    path.write_text(new_content, encoding="utf-8")
+    return path
diff --git a/src/scriber/core/matchers.py b/src/scriber/core/matchers.py
new file mode 100644
index 0000000..703a8c0
--- /dev/null
+++ b/src/scriber/core/matchers.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import fnmatch
+from pathlib import PurePosixPath
+
+
+def normalize_rel(value: str) -> str:  # backslashes become "/", then leading/trailing "/" are trimmed
+ return value.replace("\\", "/").strip("/")
+
+
+def match_pattern(path: str | PurePosixPath, pattern: str) -> bool:
+    """Match a project-relative POSIX path against a pragmatic glob pattern.
+
+    This intentionally stays dependency-free. It is not a full gitwildmatch
+    implementation, but it handles the common patterns used in pyproject config:
+    `*.py`, `**/*.py`, `dir/**`, `dir/`, exact file paths and basename patterns.
+    A trailing-slash pattern (`dir/`) also matches everything below `dir`.
+    """
+
+    rel = normalize_rel(str(path))
+    pat = pattern.replace("\\", "/").strip()
+    if not pat:
+        return False
+    if pat.startswith("/"):
+        pat = pat[1:]
+    if pat.endswith("/"):
+        # Directory pattern: besides the directory itself (handled below), it
+        # covers every path underneath it, mirroring SimpleGitIgnore.
+        pat = pat.strip("/")
+        if pat and rel.startswith(pat + "/"):
+            return True
+    if rel == pat:
+        return True
+    if pat.endswith("/**"):
+        prefix = pat[:-3].strip("/")
+        return rel == prefix or rel.startswith(prefix + "/")
+    if fnmatch.fnmatch(rel, pat):
+        return True
+    name = rel.rsplit("/", 1)[-1]
+    if "/" not in pat and fnmatch.fnmatch(name, pat):
+        return True
+    if pat.startswith("**/"):
+        short = pat[3:]
+        if fnmatch.fnmatch(rel, short) or fnmatch.fnmatch(name, short):
+            return True
+    try:
+        return PurePosixPath(rel).match(pat)
+    except ValueError:
+        return False
+
+
+def matches_any(path: str | PurePosixPath, patterns: list[str]) -> bool:  # True if at least one pattern matches; False for an empty list
+ return any(match_pattern(path, pattern) for pattern in patterns)
+
+
+class SimpleGitIgnore:
+    """Small .gitignore-style matcher used only for dependency-free defaults.
+
+    Patterns are ``(negated, pattern)`` pairs; the last one that matches decides.
+    """
+
+    def __init__(self, patterns: list[tuple[bool, str]]) -> None:
+        self.patterns = patterns
+
+    @classmethod
+    def from_file(cls, path):  # path: a pathlib.Path-like object; a missing file yields no patterns
+        if not path.exists():
+            return cls([])
+        parsed: list[tuple[bool, str]] = []
+        for raw in path.read_text(encoding="utf-8", errors="ignore").splitlines():
+            line = raw.strip()
+            if not line or line.startswith("#"):
+                continue
+            negated = line.startswith("!")
+            line = line[1:].strip() if negated else line
+            if line:
+                parsed.append((negated, line))
+        return cls(parsed)
+
+    def ignores(self, rel_path: str, is_dir: bool = False) -> bool:
+        rel = normalize_rel(rel_path)
+        ignored = False
+        for negated, pattern in self.patterns:
+            if self._matches(rel, pattern, is_dir):
+                ignored = not negated
+        return ignored
+
+    def _matches(self, rel: str, pattern: str, is_dir: bool) -> bool:
+        pat = pattern.replace("\\", "/").strip()
+        if not pat:
+            return False
+        anchored = pat.startswith("/")
+        pat = pat[1:] if anchored else pat
+        if pat.endswith("/"):
+            prefix = pat.strip("/")
+            dirs = rel.split("/") if is_dir else rel.split("/")[:-1]  # a file's own name is never a directory
+            floating = not anchored and "/" not in prefix  # gitignore: such patterns apply at any depth
+            return rel == prefix or rel.startswith(prefix + "/") or (floating and any(fnmatch.fnmatch(d, prefix) for d in dirs))
+        if "/" not in pat:
+            return any(fnmatch.fnmatch(part, pat) for part in rel.split("/"))
+        return match_pattern(rel, pat)
diff --git a/src/scriber/core/models.py b/src/scriber/core/models.py
new file mode 100644
index 0000000..c521c49
--- /dev/null
+++ b/src/scriber/core/models.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+FileKind = Literal["code", "support", "other"]  # classification the scorer branches on (file.kind)
+ContentPolicy = Literal["full", "auto", "tree_only"]  # same names accepted for support_files.content.default
+OutputFormat = Literal["md", "txt"]  # the two values the config validator accepts for `format`
+PackMode = Literal["focused", "project_snapshot"]  # scoring modes understood by score_candidates
+
+
+
+DEFAULT_SCORING: dict[str, int] = {  # default weights; looked up by key in the scorer, missing keys score 0
+ "seed_file": 100,  # a file passed explicitly as a seed
+ "seed_folder_file": 100,  # a file expanded from a seed directory
+ "direct_dependency": 90,  # file imported by a seed; the scorer subtracts 10 per extra hop
+ "reverse_dependency": 85,  # file that imports a seed; the scorer subtracts 10 per extra hop
+ "related_test": 80,  # test file linked to a seed by name or by import
+ "same_package": 65,  # code file in the seed's own directory
+ "parent_entrypoint": 60,  # entrypoint-named file near a seed
+ "support_near_seed": 60,  # minimum score for a support file near a seed
+ "project_config": 55,  # pyproject.toml and "project config" support files
+ "dependency_file": 52,  # support files categorised as "dependency file"
+ "runtime_support": 50,  # runtime/CI/tooling support categories
+ "documentation": 45,  # documentation and any uncategorised support file
+ "name_similarity": 45,  # code file whose stem contains, or is contained in, a seed's stem
+ "shared_dependency_bonus": 10,  # added (capped at 100) when several seeds lead to the same file
+}
+
+
+@dataclass(slots=True)
+class ModuleConfig:  # settings for the import-graph based part of scoring
+ enabled: bool = True
+ depth: int = 2  # import hops followed from each seed; the scorer treats values below 1 as 1
+ include_direct_dependencies: bool = True  # score files the seed imports
+ include_reverse_dependencies: bool = True  # score files that import the seed
+ include_tests: bool = True  # score test files related to the seed
+ include_same_package: bool = True  # score code files in the seed's own directory
+ include_parent_entrypoints: bool = True  # score entrypoint-named files near the seed
+ include_project_configs: bool = True  # keep root-level support files that are not near any seed
+ content_min_score: int = 50
+ tree_min_score: int = 30  # floor for dependency scores; also an inclusion threshold in the scorer
+ scoring: dict[str, int] = field(default_factory=lambda: dict(DEFAULT_SCORING))  # fresh copy per instance
+
+
+@dataclass(slots=True)
+class PythonConfig:  # Python-specific layout hints
+ source_roots: list[str] = field(default_factory=lambda: ["src", "app", "."])  # NOTE(review): presumably import roots for module resolution - confirm in the graph builder
+ test_roots: list[str] = field(default_factory=lambda: ["tests", "test"])  # a path containing one of these components counts as a test file
+ module_init_files: list[str] = field(default_factory=lambda: ["__init__.py"])
+ entrypoint_patterns: list[str] = field(  # matched against the file name only
+ default_factory=lambda: ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"]
+ )
+
+
+@dataclass(slots=True)
+class SupportContentConfig:  # content policy settings for support files
+ default: ContentPolicy = "auto"  # can be overridden through apply_overrides(support_content=...)
+ full: list[str] = field(default_factory=list)  # the config loader substitutes its defaults when this ends up empty
+ tree_only: list[str] = field(default_factory=list)  # the config loader substitutes its defaults when this ends up empty
+ auto_max_bytes: int = 80_000  # NOTE(review): presumably the size limit applied under the "auto" policy - confirm in the packer
+
+
+@dataclass(slots=True)
+class TokenConfig:  # settings for token estimation
+ estimator: str = "chars"
+ chars_per_token: int = 4  # NOTE(review): presumably characters counted as one token by the "chars" estimator - confirm
+
+
+@dataclass(slots=True)
+class CacheConfig:  # cache settings read from the [tool.scriber] `cache` table
+ enabled: bool = True
+ dir: str = ".scriber/cache"  # NOTE(review): presumably relative to the project root - confirm in ScriberCache
+
+
+@dataclass(slots=True)
+class ScriberConfig:  # complete configuration; nested sections use their own dataclasses
+ version: str = "2"
+ format: OutputFormat = "md"
+ output: Path = Path(".scriber/scriber_pack.md")  # a relative path is checked relative to the config file's directory
+ only_tree: bool = False
+ modules: bool = True  # the loader and apply_overrides keep this equal to modules_config.enabled
+ support: bool = True  # when False the scorer adds no support files
+ use_gitignore: bool = True
+ max_files: int = 60  # cap on returned candidates; the scorer skips the cap for values <= 0
+ max_tokens: int = 100_000
+ min_score: int = 45  # inclusion threshold (the scorer also accepts modules_config.tree_min_score)
+ path_style: str = "project-relative"
+ allow_external_paths: bool = False
+ code_patterns: list[str] = field(default_factory=list)
+ support_patterns: list[str] = field(default_factory=list)
+ hard_ignore_patterns: list[str] = field(default_factory=list)
+ support_content: SupportContentConfig = field(default_factory=SupportContentConfig)
+ modules_config: ModuleConfig = field(default_factory=ModuleConfig)
+ python: PythonConfig = field(default_factory=PythonConfig)
+ tokens: TokenConfig = field(default_factory=TokenConfig)
+ cache: CacheConfig = field(default_factory=CacheConfig)
+
+
+@dataclass(frozen=True, slots=True)
+class FileNode:
+    absolute: Path
+    relative: Path
+    kind: FileKind
+    language: str
+    size_bytes: int
+    is_binary: bool = False
+    support_category: str | None = None
+    content_policy: ContentPolicy = "auto"
+    _cached_text: str | None = field(default=None, init=False, repr=False, compare=False, hash=False)
+
+    def read_text(self) -> str:  # loads the text through the native reader once, then reuses it
+        cached = self._cached_text
+        if cached is None:
+            from scriber.native import require_native
+            cached = require_native().read_text(str(self.absolute))
+            object.__setattr__(self, "_cached_text", cached)  # the dataclass is frozen
+        return cached
+
+
+@dataclass(slots=True)
+class SeedPath:  # one path the user asked to pack
+ original: Path
+ absolute: Path
+ relative: Path  # shown in reason labels via as_posix()
+ is_dir: bool  # True selects the "seed_folder_file" score and label in the scorer
+ expanded_files: list[Path] = field(default_factory=list)  # files this seed stands for; used as keys into the scanned-file map
+
+
+@dataclass(slots=True)
+class Candidate:  # a file selected by the scorer together with the reasons for selecting it
+ file: FileNode
+ score: int  # highest score assigned to this file so far
+ reasons: list[str] = field(default_factory=list)  # distinct reason labels in first-seen order
+ seed_sources: set[Path] = field(default_factory=set)  # seed files that led to this candidate
+ include_content: bool = False
+ content: str | None = None
+ token_estimate: int = 0
+ omitted_reason: str | None = None
+ reason_counts: dict[str, int] = field(default_factory=dict)  # number of hits per reason kind
+ reason_examples: dict[str, list[Path]] = field(default_factory=dict)  # distinct example paths per reason kind
+ reason_summary: str = ""  # filled in by the scorer after all reasons are recorded
+
+
+@dataclass(slots=True)
+class ModuleGraph:  # import relationships between scanned files, stored in both directions
+ imports: dict[Path, set[Path]] = field(default_factory=dict)  # file -> files it imports
+ imported_by: dict[Path, set[Path]] = field(default_factory=dict)  # file -> files that import it
+
+
+@dataclass(slots=True)
+class ScriberPack:  # result bundle: selected candidates plus the context they were selected in
+ project_root: Path
+ config_path: Path
+ seed_paths: list[SeedPath]
+ candidates: list[Candidate]
+ graph: ModuleGraph
+ only_tree: bool
+ output_format: OutputFormat
+ mode: PackMode
+ total_tokens: int = 0
+ timings: dict[str, float] = field(default_factory=dict)
+
+ @property
+ def included_paths(self) -> list[Path]:  # relative path of every candidate, in candidate order
+ return [candidate.file.relative for candidate in self.candidates]
diff --git a/src/scriber/core/open_file.py b/src/scriber/core/open_file.py
new file mode 100644
index 0000000..55537b2
--- /dev/null
+++ b/src/scriber/core/open_file.py
@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+import sys
+import os
+import subprocess
+from pathlib import Path
+
+
+def open_path(path: Path) -> None:
+    """Open *path* with the platform's default application (best effort)."""
+    if not path.exists():
+        return
+    target = str(path.resolve())
+    try:
+        if sys.platform == "win32":
+            os.startfile(target)
+        else:
+            opener = "open" if sys.platform == "darwin" else "xdg-open"
+            subprocess.run([opener, target], check=True)
+    except Exception as exc:
+        # Never fail the run just because no viewer could be launched.
+        sys.stderr.write(f"Warning: Failed to open pack file: {exc}\n")
diff --git a/src/scriber/core/root.py b/src/scriber/core/root.py
new file mode 100644
index 0000000..b8042b5
--- /dev/null
+++ b/src/scriber/core/root.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from .errors import ScriberError
+
+
+def resolve_config_path(paths: list[str], explicit_config: str | None = None) -> Path:
+    """Locate the pyproject.toml that governs this run.
+
+    An explicit config (file or directory) wins and must exist; otherwise the
+    nearest pyproject.toml above each given path, then above the current
+    directory, is returned. Raises ScriberError when nothing is found.
+    """
+    cwd = Path.cwd()
+    if explicit_config:
+        config = Path(explicit_config).expanduser()
+        if config.is_dir():
+            config /= "pyproject.toml"
+        config = (config if config.is_absolute() else cwd / config).resolve()
+        if not config.exists():
+            raise ScriberError(f"Config not found: {config}")
+        if config.name != "pyproject.toml":
+            raise ScriberError("Scriber 2.0 expects --config to point to pyproject.toml")
+        return config
+
+    def search_start(raw: str) -> Path:
+        # Missing paths are tolerated here so a clearer error can be raised
+        # later; discovery then begins at the closest plausible directory.
+        probe = Path(raw).expanduser()
+        probe = (probe if probe.is_absolute() else cwd / probe).resolve(strict=False)
+        is_file_like = probe.is_file() if probe.exists() else bool(probe.suffix)
+        return probe.parent if is_file_like else probe
+
+    starts = [search_start(raw) for raw in paths or ["."]]
+    starts.append(cwd.resolve())
+
+    visited: set[Path] = set()
+    for start in starts:
+        for folder in (start, *start.parents):
+            if folder in visited:
+                continue
+            visited.add(folder)
+            candidate = folder / "pyproject.toml"
+            if candidate.exists():
+                return candidate.resolve()
+    raise ScriberError("No pyproject.toml found. Run `scriber init` or pass `--config /path/to/pyproject.toml`.")
+
+
+def project_root_from_config(config_path: Path) -> Path:  # the resolved directory that contains the config file
+ return config_path.resolve().parent
+
+
+def ensure_inside_root(path: Path, root: Path, allow_external: bool) -> None:
+    """Raise ScriberError unless *path* resolves inside *root* (or externals are allowed)."""
+    if not allow_external:
+        try:
+            path.resolve().relative_to(root.resolve())
+        except ValueError as exc:
+            raise ScriberError(f"Path is outside project root: {path}") from exc
+
+
+def rel_to_root(path: Path, root: Path) -> Path:
+    """Return *path* relative to *root*, or its resolved absolute form when it lies outside."""
+    absolute, base = path.resolve(), root.resolve()
+    inside = absolute == base or base in absolute.parents
+    return absolute.relative_to(base) if inside else absolute
diff --git a/src/scriber/engine/__init__.py b/src/scriber/engine/__init__.py
new file mode 100644
index 0000000..eca1ba7
--- /dev/null
+++ b/src/scriber/engine/__init__.py
@@ -0,0 +1,3 @@
+from .scorer import score_candidates
+
+__all__ = ["score_candidates"]
diff --git a/src/scriber/engine/scorer.py b/src/scriber/engine/scorer.py
new file mode 100644
index 0000000..0dbfb24
--- /dev/null
+++ b/src/scriber/engine/scorer.py
@@ -0,0 +1,302 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scriber.core.matchers import match_pattern
+from scriber.core.models import Candidate, FileNode, ModuleGraph, ScriberConfig, SeedPath
+
+
+def _score(config: ScriberConfig, key: str) -> int:  # configured weight for *key*; a key missing from the table scores 0
+ return int(config.modules_config.scoring.get(key, 0))
+
+
+def _add_reason(candidate: Candidate, kind: str, label: str, example: Path | None = None) -> None:
+    """Count one more *kind* hit on *candidate*, remembering *example* and *label* once each."""
+    candidate.reason_counts[kind] = candidate.reason_counts.get(kind, 0) + 1
+    if example is not None:
+        seen = candidate.reason_examples.setdefault(kind, [])
+        if example not in seen:
+            seen.append(example)
+    if label not in candidate.reasons:
+        candidate.reasons.append(label)
+
+
+def _build_reason_summary(candidate: Candidate) -> str:
+    """Render the recorded reason kinds of *candidate* as one "; "-joined summary.
+
+    Kinds appear in the order they were first recorded. The two dependency
+    kinds mention either how many files are involved or the first example's
+    file name; a kind without a dedicated phrase falls back to its own name
+    with underscores replaced by spaces.
+    """
+    # Kinds whose summary text never varies.
+    fixed_phrases = {
+        "seed_file": "seed file",
+        "seed_folder_file": "seed folder file",
+        "related_test": "related test",
+        "same_package": "same package",
+        "parent_entrypoint": "parent entrypoint",
+        "name_similarity": "name similarity",
+        "support_near_seed": "support file",
+        "project_support": "project support file",
+        "shared_dependency": "shared dependency bonus",
+        "entrypoint": "entrypoint file",
+        "test_file": "test file",
+        "code_file": "code file",
+        "other_file": "other file",
+    }
+    # Verb that introduces each dependency relationship.
+    dependency_verbs = {
+        "direct_dependency": "imports",
+        "reverse_dependency": "imported by",
+    }
+
+    phrases: list[str] = []
+    for kind, count in candidate.reason_counts.items():
+        verb = dependency_verbs.get(kind)
+        if verb is None:
+            phrases.append(fixed_phrases.get(kind, kind.replace("_", " ")))
+            continue
+
+        examples = candidate.reason_examples.get(kind, [])
+        if count > 1:
+            phrases.append(f"{verb} {count} included files")
+        elif examples:
+            phrases.append(f"{verb} {examples[0].name}")
+        else:
+            # Counted once but no example path was recorded.
+            phrases.append(f"{verb} seed")
+
+    return "; ".join(phrases)
+
+
+def _add(
+    candidates: dict[Path, Candidate],
+    files: dict[Path, FileNode],
+    rel: Path,
+    score: int,
+    kind: str,
+    label: str,
+    *,
+    seed: Path | None = None,
+) -> None:
+    """Register *rel* (only if it is a known file), keep its best score and record the reason."""
+    node = files.get(rel)
+    if node is None:
+        return
+    entry = candidates.get(rel)
+    if entry is None:
+        entry = candidates[rel] = Candidate(file=node, score=score)
+    elif score > entry.score:
+        entry.score = score
+
+    _add_reason(entry, kind, label, example=seed)
+    if seed is not None:
+        entry.seed_sources.add(seed)
+
+
+def _is_test_file(rel: Path, config: ScriberConfig) -> bool:
+    """True when *rel* sits under a configured test root or has a test-style file name."""
+    if not set(config.python.test_roots).isdisjoint(rel.parts):
+        return True
+    lowered = rel.name.lower()
+    return lowered.startswith("test_") or lowered.endswith(("_test.py", ".test.py"))
+
+
+def _name_related(a: Path, b: Path) -> bool:
+    """True when one stem contains the other after dropping "test_"/"_test" markers."""
+    first, second = (p.stem.lower().replace("test_", "").replace("_test", "") for p in (a, b))
+    if first and second:
+        return first in second or second in first
+    return False
+
+
+def _walk_neighbors(edges: dict[Path, set[Path]], start: Path, depth: int) -> dict[Path, int]:
+    """Breadth-first walk from *start*; maps every reached node to its hop distance.
+
+    *start* itself is never reported and at least one hop is always explored.
+    """
+    distances: dict[Path, int] = {}
+    seen = {start}
+    layer = {start}
+    hops = 0
+    limit = max(1, depth)
+    while layer and hops < limit:
+        hops += 1
+        fresh = {peer for node in layer for peer in edges.get(node, set())} - seen
+        distances.update(dict.fromkeys(fresh, hops))
+        seen |= fresh
+        layer = fresh
+    return distances
+
+
+def _support_base_score(file: FileNode, config: ScriberConfig) -> int:
+    """Pick the base score for a support file from its support category.
+
+    Unknown or missing categories are scored like documentation.
+    """
+    runtime_like = {"runtime support", "runtime config", "ci support", "tooling config"}
+    category = file.support_category
+    if category in ("project config", "dependency file"):
+        return _score(config, category.replace(" ", "_"))
+    key = "runtime_support" if category in runtime_like else "documentation"
+    return _score(config, key)
+
+
+def _is_near_seed(support_file: Path, seed: Path) -> bool:
+    """True for root-level support files and for ones in, above or below the seed's folder."""
+    here, there = support_file.parent, seed.parent
+    checks = (here == Path("."), here == there, here in there.parents, there in here.parents)
+    return any(checks)
+
+
+def _matches_entrypoint(rel: Path, config: ScriberConfig) -> bool:  # only the file name is compared with the configured entrypoint patterns
+ return any(match_pattern(rel.name, pattern) for pattern in config.python.entrypoint_patterns)
+
+
+def score_candidates_project_snapshot(
+    *,
+    files: dict[Path, FileNode],
+    graph: ModuleGraph,
+    config: ScriberConfig,
+) -> list[Candidate]:
+    """Score every scanned file by its kind alone (no seeds); *graph* is not consulted.
+
+    Returns candidates sorted by score (descending), code before non-code, then
+    path. When ``max_files`` is exceeded, pyproject.toml/README.md are kept first.
+    """
+    pinned_names = {"pyproject.toml", "README.md"}
+    def order(item: Candidate) -> tuple[int, bool, str]:
+        return (-item.score, item.file.kind != "code", item.file.relative.as_posix())
+
+    candidates: dict[Path, Candidate] = {}
+    for rel, file in files.items():
+        if file.kind == "code":
+            if _matches_entrypoint(rel, config):
+                score, kind, label = 90, "entrypoint", "entrypoint file"
+            elif _is_test_file(rel, config):
+                score, kind, label = 60, "test_file", "test file"
+            else:
+                score, kind, label = 80, "code_file", "code file"
+        elif file.kind == "support" and config.support:
+            score, kind = _support_base_score(file, config), "project_support"
+            label = file.support_category or "support file"
+        elif file.kind == "other":
+            score, kind, label = 40, "other_file", "other file"
+        else:
+            continue
+        _add(candidates, files, rel, score, kind, label)
+    for candidate in candidates.values():
+        candidate.reason_summary = _build_reason_summary(candidate)
+    threshold = min(config.min_score, config.modules_config.tree_min_score)
+    ranked = sorted((c for c in candidates.values() if c.score >= threshold), key=order)
+    if 0 < config.max_files < len(ranked):
+        pinned = [c for c in ranked if c.file.relative.name in pinned_names]
+        others = [c for c in ranked if c.file.relative.name not in pinned_names]
+        ranked = sorted(pinned + others[: max(0, config.max_files - len(pinned))], key=order)
+    return ranked
+
+
+def score_candidates(  # rank the files related to the seed paths and return them best-first
+ *,
+ files: dict[Path, FileNode],
+ seeds: list[SeedPath],
+ graph: ModuleGraph,
+ config: ScriberConfig,
+ mode: str = "focused",
+) -> list[Candidate]:
+ if mode == "project_snapshot":  # snapshot mode scores by file kind and ignores the seeds
+ return score_candidates_project_snapshot(files=files, graph=graph, config=config)
+
+ candidates: dict[Path, Candidate] = {}
+ scoring = config.modules_config
+ seed_files = [file for seed in seeds for file in seed.expanded_files]  # every file expanded from every seed
+ seed_set = set(seed_files)
+
+ for seed in seeds:  # the seed files themselves always become candidates
+ for rel in seed.expanded_files:
+ key = "seed_folder_file" if seed.is_dir else "seed_file"
+ reason = f"file inside seed folder `{seed.relative.as_posix()}`" if seed.is_dir else "seed file"
+ _add(candidates, files, rel, _score(config, key), "seed_folder_file" if seed.is_dir else "seed_file", reason, seed=rel)
+
+ if config.modules and scoring.enabled:
+ for seed_rel in seed_files:
+ if scoring.include_direct_dependencies:
+ for dep, distance in _walk_neighbors(graph.imports, seed_rel, scoring.depth).items():
+ score = max(scoring.tree_min_score, _score(config, "direct_dependency") - ((distance - 1) * 10))  # 10 points less per extra hop, never below tree_min_score
+ _add(candidates, files, dep, score, "direct_dependency", f"direct dependency of `{seed_rel.as_posix()}`", seed=seed_rel)
+
+ if scoring.include_reverse_dependencies:
+ for dep, distance in _walk_neighbors(graph.imported_by, seed_rel, scoring.depth).items():
+ score = max(scoring.tree_min_score, _score(config, "reverse_dependency") - ((distance - 1) * 10))  # same decay as for direct dependencies
+ _add(candidates, files, dep, score, "reverse_dependency", f"imports seed `{seed_rel.as_posix()}`", seed=seed_rel)
+
+ if scoring.include_same_package:
+ seed_parent = seed_rel.parent
+ for rel, file in files.items():
+ if file.kind == "code" and rel.parent == seed_parent and rel not in seed_set:  # siblings of the seed, excluding seeds themselves
+ _add(candidates, files, rel, _score(config, "same_package"), "same_package", f"same package as `{seed_rel.as_posix()}`", seed=seed_rel)
+
+ if scoring.include_parent_entrypoints:
+ for rel, file in files.items():
+ if file.kind == "code" and _matches_entrypoint(rel, config):
+ if rel.parent == Path(".") or rel.parent in seed_rel.parents or seed_rel.parent in rel.parents:  # at the project root, above the seed, or below the seed's folder
+ _add(candidates, files, rel, _score(config, "parent_entrypoint"), "parent_entrypoint", f"parent/entrypoint near `{seed_rel.as_posix()}`", seed=seed_rel)
+
+ if scoring.include_tests:
+ for rel, file in files.items():
+ if file.kind != "code" or not _is_test_file(rel, config):
+ continue
+ if _name_related(rel, seed_rel) or seed_rel in graph.imports.get(rel, set()):  # linked by name, or the test imports the seed directly
+ _add(candidates, files, rel, _score(config, "related_test"), "related_test", f"related test for `{seed_rel.as_posix()}`", seed=seed_rel)
+
+ for rel, file in files.items():
+ if file.kind == "code" and rel not in seed_set and _name_related(rel, seed_rel):
+ _add(candidates, files, rel, _score(config, "name_similarity"), "name_similarity", f"name similarity with `{seed_rel.as_posix()}`", seed=seed_rel)
+
+ if config.support:
+ for rel, file in files.items():
+ if file.kind != "support":
+ continue
+ base = _support_base_score(file, config)
+ reason = file.support_category or "support file"
+ if rel.name == "pyproject.toml":  # any file named pyproject.toml gets the project_config score
+ _add(candidates, files, rel, _score(config, "project_config"), "project_support", "project config/root file")
+ continue
+ added = False
+ for seed_rel in seed_files:
+ if _is_near_seed(rel, seed_rel):
+ _add(candidates, files, rel, max(base, _score(config, "support_near_seed")), "support_near_seed", f"{reason} near `{seed_rel.as_posix()}`", seed=seed_rel)
+ added = True
+ if not added and file.relative.parent == Path(".") and scoring.include_project_configs:  # root-level support file that no seed is near
+ _add(candidates, files, rel, base, "project_support", reason)
+ else:  # NOTE(review): presumably the branch taken when module scoring is off - confirm against the original indentation
+ if config.support:
+ pyproject = files.get(Path("pyproject.toml"))
+ if pyproject:
+ _add(candidates, files, Path("pyproject.toml"), _score(config, "project_config"), "project_support", "project config/root file")
+
+ for candidate in candidates.values():
+ if len(candidate.seed_sources) > 1:  # reached from more than one seed file
+ candidate.score = min(100, candidate.score + _score(config, "shared_dependency_bonus"))  # bonus is capped at 100
+ _add_reason(candidate, "shared_dependency", "shared by multiple seed paths")
+
+ for candidate in candidates.values():
+ candidate.reason_summary = _build_reason_summary(candidate)
+
+ required = set(seed_files)  # seed files survive filtering regardless of score
+ filtered = [
+ candidate
+ for rel, candidate in candidates.items()
+ if rel in required or candidate.score >= config.min_score or candidate.score >= config.modules_config.tree_min_score  # either threshold suffices, so the lower one is the effective cut-off
+ ]
+ filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix()))  # best score first, code before non-code, then path
+
+ if config.max_files > 0 and len(filtered) > config.max_files:  # max_files <= 0 disables the cap
+ seeds_first = [candidate for candidate in filtered if candidate.file.relative in required or candidate.file.relative.name in {"pyproject.toml", "README.md"}]  # always kept, even beyond the cap
+ rest = [candidate for candidate in filtered if candidate.file.relative not in required and candidate.file.relative.name not in {"pyproject.toml", "README.md"}]
+ remaining = max(0, config.max_files - len(seeds_first))
+ filtered = seeds_first + rest[:remaining]
+ filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix()))
+
+ return filtered
diff --git a/src/scriber/graph/__init__.py b/src/scriber/graph/__init__.py
new file mode 100644
index 0000000..1a60029
--- /dev/null
+++ b/src/scriber/graph/__init__.py
@@ -0,0 +1,3 @@
+from .builder import build_graph
+
+__all__ = ["build_graph"]
diff --git a/src/scriber/graph/builder.py b/src/scriber/graph/builder.py
new file mode 100644
index 0000000..a181441
--- /dev/null
+++ b/src/scriber/graph/builder.py
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scriber.core.models import FileNode, ModuleGraph, ScriberConfig
+from scriber.graph.languages.python import build_module_map, parse_python_imports, resolve_import_record
+from scriber.scanner.files import read_text_lossy
+
+
+def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGraph:
+    """Build the import graph between the scanned project files.
+
+    For every code file in a supported language the imports are either taken
+    from the cache (when the cache has an entry for the file's current mtime
+    and size) or parsed and resolved again with the language-specific helpers.
+    Each resolved edge is recorded in both directions on the graph.
+
+    Args:
+        files: Scanned files keyed by project-relative path.
+        config: Active configuration; ``config.python`` drives the Python
+            module map and the whole config is handed to ``ScriberCache``.
+
+    Returns:
+        A ``ModuleGraph`` whose ``imports`` and ``imported_by`` mappings hold
+        an entry (possibly an empty set) for every key of ``files``.
+    """
+    graph = ModuleGraph()
+    if not files:
+        return graph
+
+    # NOTE(review): both dicts are replaced by the build_module_map() result further down.
+    path_to_module: dict[Path, str] = {}
+    module_to_path: dict[str, Path] = {}
+
+    # Indexes consumed by the language resolvers: absolute path -> node, directory -> nodes.
+    absolute_to_file: dict[Path, FileNode] = {}
+    dir_to_files: dict[Path, list[FileNode]] = {}
+    for node in files.values():
+        absolute_to_file[node.absolute] = node
+        dir_to_files.setdefault(node.absolute.parent, []).append(node)
+
+    # Derive the project root by cutting the relative path off the end of one file's absolute path.
+    sample = next(iter(files.values()))
+    root = Path(sample.absolute.as_posix()[:len(sample.absolute.as_posix()) - len(sample.relative.as_posix())]).resolve()
+
+    from scriber.cache import ScriberCache
+    cache = ScriberCache(config, root)
+
+    module_to_path, path_to_module = build_module_map(files, config.python)
+
+    for rel, file in files.items():
+        # NOTE(review): "react" is missing from this set although the JavaScript branch below
+        # tests for it, so a file tagged "react" never reaches that branch - confirm intent.
+        if file.kind != "code" or file.is_binary or file.language not in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}:
+            continue
+
+        # mtime and size form the cache validity key; files that cannot be stat'ed are skipped.
+        try:
+            stat = file.absolute.stat()
+            mtime_ns = stat.st_mtime_ns
+            size = stat.st_size
+        except OSError:
+            continue
+
+        # cached_data is only used as a "cache hit" marker here; the edges come from get_imports().
+        cached_data = cache.get_file(rel, mtime_ns, size)
+        if cached_data is not None:
+            cached_imports = cache.get_imports(rel)
+            if cached_imports is not None:
+                for target in cached_imports:
+                    # Drop cached targets that are no longer part of the scan.
+                    if target in files:
+                        graph.imports.setdefault(rel, set()).add(target)
+                        graph.imported_by.setdefault(target, set()).add(rel)
+                continue
+
+        # Cache miss: parse the file and collect resolved targets (self-imports are ignored).
+        resolved_set = set()
+
+        if file.language == "python":
+            current_module = path_to_module.get(rel)
+            if current_module:
+                try:
+                    source = file.read_text()
+                except OSError:
+                    # Unreadable file: skip it without recording edges or updating the cache.
+                    continue
+                imports = parse_python_imports(file.absolute, source)
+                for record in imports:
+                    for target in resolve_import_record(
+                        record,
+                        current_file=file,
+                        current_module=current_module,
+                        module_to_path=module_to_path,
+                    ):
+                        if target == rel:
+                            continue
+                        resolved_set.add(target)
+
+        elif file.language in {"javascript", "typescript", "react"}:
+            from scriber.graph.languages.javascript import parse_javascript_imports, resolve_javascript_import
+            try:
+                source = file.read_text()
+            except OSError:
+                continue
+            imports = parse_javascript_imports(source)
+            for spec in imports:
+                for target in resolve_javascript_import(spec, file, absolute_to_file):
+                    if target == rel:
+                        continue
+                    resolved_set.add(target)
+
+        elif file.language == "rust":
+            from scriber.graph.languages.rust import parse_rust_imports, resolve_rust_import
+            try:
+                source = file.read_text()
+            except OSError:
+                continue
+            imports = parse_rust_imports(source)
+            for kind, spec in imports:
+                for target in resolve_rust_import(kind, spec, file, absolute_to_file):
+                    if target == rel:
+                        continue
+                    resolved_set.add(target)
+
+        elif file.language == "go":
+            from scriber.graph.languages.go import parse_go_imports, resolve_go_import
+            try:
+                source = file.read_text()
+            except OSError:
+                continue
+            imports = parse_go_imports(source)
+            for spec in imports:
+                for target in resolve_go_import(spec, file, dir_to_files, root):
+                    if target == rel:
+                        continue
+                    resolved_set.add(target)
+
+        elif file.language in {"c", "cpp"}:
+            from scriber.graph.languages.cpp import parse_cpp_includes, resolve_cpp_include
+            try:
+                source = file.read_text()
+            except OSError:
+                continue
+            imports = parse_cpp_includes(source)
+            for spec in imports:
+                for target in resolve_cpp_include(spec, file, absolute_to_file):
+                    if target == rel:
+                        continue
+                    resolved_set.add(target)
+
+
+        # Record the freshly resolved edges in both directions and remember them in the cache.
+        for target in resolved_set:
+            graph.imports.setdefault(rel, set()).add(target)
+            graph.imported_by.setdefault(target, set()).add(rel)
+
+        cache.set_imports(rel, resolved_set)
+
+    # Guarantee an entry for every scanned file, including files without any edge.
+    for rel in files:
+        graph.imports.setdefault(rel, set())
+        graph.imported_by.setdefault(rel, set())
+
+    cache.save(set(files.keys()))
+    return graph
diff --git a/src/scriber/graph/languages/__init__.py b/src/scriber/graph/languages/__init__.py
new file mode 100644
index 0000000..04bc547
--- /dev/null
+++ b/src/scriber/graph/languages/__init__.py
@@ -0,0 +1 @@
+# Languages package init.
diff --git a/src/scriber/graph/languages/cpp.py b/src/scriber/graph/languages/cpp.py
new file mode 100644
index 0000000..5c19732
--- /dev/null
+++ b/src/scriber/graph/languages/cpp.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from scriber.core.models import FileNode
+
+# Match `#include "header.h"` or `#include <header.h>`
+INCLUDE_RE = re.compile(r'#include\s*["<]([^">]+)[">]')
+
+
+def parse_cpp_includes(source: str) -> list[str]:
+    """Return every include target found in C/C++ *source*, in order of appearance."""
+    targets = (hit.group(1) for hit in INCLUDE_RE.finditer(source))
+    return [target for target in targets if target]
+
+
+def resolve_cpp_include(
+    include_spec: str,
+    current_file: FileNode,
+    absolute_to_file: dict[Path, FileNode]
+) -> set[Path]:
+    """Map a C/C++ include specifier onto at most one project file.
+
+    The directory of the including file is tried first. Failing that, the
+    first non-binary project file whose relative path equals the specifier,
+    or ends with ``/`` followed by the specifier, is returned.
+    """
+    here = current_file.absolute.parent
+
+    # Step 1: a file next to (or relative to) the including file.
+    try:
+        nearby = (here / include_spec).resolve(strict=False)
+    except Exception:
+        nearby = here / include_spec
+
+    local_hit = absolute_to_file.get(nearby)
+    if local_hit and not local_hit.is_binary:
+        return {local_hit.relative}
+
+    # Step 2: any project file whose relative path matches the specifier as a suffix.
+    tail = "/" + include_spec
+    for entry in absolute_to_file.values():
+        if entry.is_binary:
+            continue
+        rel_text = entry.relative.as_posix()
+        if rel_text == include_spec or rel_text.endswith(tail):
+            return {entry.relative}
+
+    return set()
diff --git a/src/scriber/graph/languages/go.py b/src/scriber/graph/languages/go.py
new file mode 100644
index 0000000..25b5fab
--- /dev/null
+++ b/src/scriber/graph/languages/go.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from scriber.core.models import FileNode
+
+
+# Single-statement form `import "path"`; group 1 is the quoted path.
+IMPORT_SINGLE_RE = re.compile(r'\bimport\s+[\'"]([^\'"]+)[\'"]')
+# Parenthesised form `import ( ... )`; group 1 is the text between the parentheses.
+IMPORT_BLOCK_RE = re.compile(r'\bimport\s*\(([^)]+)\)')
+
+
+def parse_go_imports(source: str) -> list[str]:
+    """Collect import paths from Go *source*: single-statement imports first, then import blocks."""
+    found = [hit.group(1) for hit in IMPORT_SINGLE_RE.finditer(source)]
+    for block in IMPORT_BLOCK_RE.finditer(source):
+        for raw_line in block.group(1).splitlines():
+            text = raw_line.strip()
+            # Lines that are entirely a comment contribute nothing.
+            if text.startswith("//"):
+                continue
+            quoted = re.search(r'[\'"]([^\'"]+)[\'"]', text)
+            if quoted:
+                found.append(quoted.group(1))
+    return found
+
+
+def resolve_go_import(import_spec: str, current_file: FileNode, dir_to_files: dict[Path, list[FileNode]], project_root: Path) -> set[Path]:
+    """Resolve a Go import path to the project's Go files of the imported package.
+
+    Only imports that live inside the module declared in ``<project_root>/go.mod``
+    are resolved; everything else (standard library, third-party modules, or a
+    project without a readable ``go.mod``) yields an empty set.
+
+    Args:
+        import_spec: Import path as written in the source file.
+        current_file: The importing file (not used by the resolution itself).
+        dir_to_files: Scanned files grouped by absolute parent directory.
+        project_root: Directory expected to contain ``go.mod``.
+
+    Returns:
+        Relative paths of all files with language ``"go"`` in the target directory.
+    """
+    go_mod_path = project_root / "go.mod"
+    try:
+        content = go_mod_path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError):
+        # Missing or unreadable go.mod: nothing can be resolved.
+        return set()
+
+    declared = re.search(r'^\s*module\s+(\S+)', content, re.MULTILINE)
+    if declared is None:
+        return set()
+    module_name = declared.group(1)
+
+    # Require a path-segment boundary: module "example.com/app" must not
+    # claim the unrelated import "example.com/apppkg".
+    if import_spec != module_name and not import_spec.startswith(module_name + "/"):
+        return set()
+
+    rel_spec = import_spec[len(module_name):].lstrip("/")
+    target_dir = (project_root / rel_spec).resolve()
+    return {node.relative for node in dir_to_files.get(target_dir, []) if node.language == "go"}
diff --git a/src/scriber/graph/languages/javascript.py b/src/scriber/graph/languages/javascript.py
new file mode 100644
index 0000000..9ca43f7
--- /dev/null
+++ b/src/scriber/graph/languages/javascript.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import re
+import os
+from pathlib import Path
+from scriber.core.models import FileNode
+
+
+# Group 1: module specifier of an `import`/`export` statement, with or without a `from` clause.
+# Group 2: module specifier of a `require('...')` call.
+IMPORT_RE = re.compile(
+    r'(?:import|export)\s+(?:[\w*\s{},]*\s+from\s+)?[\'"]([^\'"]+)[\'"]'
+    r'|require\s*\(\s*[\'"]([^\'"]+)[\'"]\s*\)'
+)
+
+
+def parse_javascript_imports(source: str) -> list[str]:
+    """Return the module specifiers of all import/export/require occurrences in *source*."""
+    specs = (hit.group(1) or hit.group(2) for hit in IMPORT_RE.finditer(source))
+    return [spec for spec in specs if spec]
+
+
+def resolve_javascript_import(import_spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]) -> set[Path]:
+    """Resolve a relative JS/TS import to at most one project file.
+
+    Specifiers that do not start with ``.`` are ignored. Probing order: the
+    exact path, the path with each known extension appended, then the index
+    files inside the path treated as a directory. The first non-binary hit wins.
+    """
+    if not import_spec.startswith("."):
+        return set()
+
+    folder = current_file.absolute.parent
+    try:
+        base_path = Path(os.path.abspath(folder / import_spec))
+    except Exception:
+        base_path = (folder / import_spec).resolve(strict=False)
+
+    def _probes():
+        # Generated lazily so later probes are only built when earlier ones miss.
+        yield base_path
+        for suffix in (".ts", ".tsx", ".js", ".jsx", ".d.ts"):
+            yield base_path.with_name(base_path.name + suffix)
+        for index_name in ("index.ts", "index.tsx", "index.js", "index.jsx"):
+            yield base_path / index_name
+
+    for probe in _probes():
+        hit = absolute_to_file.get(probe)
+        if hit and not hit.is_binary:
+            return {hit.relative}
+
+    return set()
diff --git a/src/scriber/graph/languages/python.py b/src/scriber/graph/languages/python.py
new file mode 100644
index 0000000..60af766
--- /dev/null
+++ b/src/scriber/graph/languages/python.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+import ast
+from dataclasses import dataclass
+from pathlib import Path, PurePosixPath
+
+from scriber.core.models import FileNode, PythonConfig
+
+
+@dataclass(frozen=True, slots=True)
+class ImportRecord:
+    """One import statement as produced by ``parse_python_imports``."""
+
+    # "import" for `import x`, "from" for `from x import y`.
+    kind: str
+    # Dotted module name; empty string when a `from` import names no module.
+    module: str
+    # Names listed by a `from` import, excluding "*"; always empty for kind "import".
+    names: tuple[str, ...] = ()
+    # Relative-import level taken from the AST node; 0 for absolute imports.
+    level: int = 0
+
+
+def parse_python_imports(path: Path, source: str) -> list[ImportRecord]:
+    """Extract every import statement from Python *source*.
+
+    Args:
+        path: Location of the file; only used as the filename reported to ``ast.parse``.
+        source: Text of the file.
+
+    Returns:
+        One ``ImportRecord`` per imported module of an ``import`` statement and
+        one per ``from ... import`` statement, found anywhere in the tree.
+        Source that cannot be parsed yields an empty list.
+    """
+    try:
+        tree = ast.parse(source, filename=str(path))
+    except (SyntaxError, ValueError):
+        # ValueError covers interpreters that reject e.g. null bytes in the
+        # source this way; one bad file must not abort the whole graph build.
+        return []
+
+    imports: list[ImportRecord] = []
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                imports.append(ImportRecord(kind="import", module=alias.name, names=(), level=0))
+        elif isinstance(node, ast.ImportFrom):
+            module = node.module or ""
+            # A star import carries no resolvable name; the module itself is still recorded.
+            names = tuple(alias.name for alias in node.names if alias.name != "*")
+            imports.append(ImportRecord(kind="from", module=module, names=names, level=node.level))
+    return imports
+
+
+def _is_under(rel: Path, root: str) -> bool:
+    """Tell whether *rel* lies inside the source root *root* (``""`` and ``"."`` contain everything)."""
+    if root in ("", "."):
+        return True
+    return PurePosixPath(rel.as_posix()).is_relative_to(PurePosixPath(root))
+
+
+def _relative_to_root(rel: Path, root: str) -> Path:
+    """Strip the source root *root* from the front of *rel*; ``""`` and ``"."`` leave it unchanged."""
+    if root in ("", "."):
+        return rel
+    anchored = PurePosixPath(rel.as_posix())
+    return Path(anchored.relative_to(PurePosixPath(root)))
+
+
+def module_name_for_file(file: FileNode, python: PythonConfig) -> str | None:
+    """Return the dotted module name of a Python file, or ``None`` if it has none.
+
+    Source roots are tried from the longest to the shortest (``"."`` last).
+    For files listed in ``python.module_init_files`` the file name itself is
+    dropped, so the result is the package name.
+    """
+    if file.language != "python":
+        return None
+
+    def _specificity(source_root: str) -> int:
+        return 0 if source_root == "." else len(source_root)
+
+    for source_root in sorted(python.source_roots, key=_specificity, reverse=True):
+        if not _is_under(file.relative, source_root):
+            continue
+        inner = _relative_to_root(file.relative, source_root)
+        if inner.suffix not in {".py", ".pyi"}:
+            continue
+        segments = list(inner.with_suffix("").parts)
+        if segments and inner.name in python.module_init_files:
+            segments.pop()
+        if segments:
+            return ".".join(segments)
+    return None
+
+
+def build_module_map(files: dict[Path, FileNode], python: PythonConfig) -> tuple[dict[str, Path], dict[Path, str]]:
+    """Build both lookup directions between relative paths and dotted module names.
+
+    When several files map to the same module name, the first one encountered
+    keeps the ``module -> path`` entry; every file keeps its ``path -> module`` entry.
+    """
+    by_module: dict[str, Path] = {}
+    by_path: dict[Path, str] = {}
+    for rel, file in files.items():
+        name = module_name_for_file(file, python)
+        if name:
+            by_path[rel] = name
+            if name not in by_module:
+                by_module[name] = rel
+    return by_module, by_path
+
+
+def resolve_relative_module(current_module: str, current_is_init: bool, record: ImportRecord) -> str:
+    """Turn a relative import into an absolute dotted module name.
+
+    Level 1 refers to the package of the importing module (the module itself
+    when it is a package ``__init__``); each further level climbs one package
+    higher. Records with level 0 or below are returned unchanged.
+    """
+    if record.level <= 0:
+        return record.module
+
+    if current_is_init:
+        package = current_module
+    else:
+        # Everything before the last dot; empty for a top-level module.
+        package = current_module.rpartition(".")[0]
+
+    segments = package.split(".") if package else []
+    climb = record.level - 1
+    if climb > 0:
+        segments = segments[:-climb] if climb <= len(segments) else []
+    if record.module:
+        segments += record.module.split(".")
+    return ".".join(filter(None, segments))
+
+
+def resolve_import_record(
+    record: ImportRecord,
+    *,
+    current_file: FileNode,
+    current_module: str,
+    module_to_path: dict[str, Path],
+) -> set[Path]:
+    """Resolve one import record to the project files it refers to.
+
+    For ``from`` imports every ``base.name`` and the base module itself are
+    tried. Each dotted candidate is matched against ``module_to_path`` from
+    its full name down to its first segment, keeping the first hit.
+    """
+    current_is_init = current_file.absolute.name == "__init__.py"
+
+    if record.kind == "import":
+        dotted_names = [record.module]
+    else:
+        base = resolve_relative_module(current_module, current_is_init, record) if record.level else record.module
+        if base:
+            dotted_names = [f"{base}.{name}" for name in record.names]
+            dotted_names.append(base)
+        else:
+            dotted_names = list(record.names)
+
+    hits: set[Path] = set()
+    for dotted in dotted_names:
+        if not dotted:
+            continue
+        segments = dotted.split(".")
+        # Longest prefix first: covers `from package import module` as well
+        # as `from package import symbol`.
+        for end in range(len(segments), 0, -1):
+            owner = module_to_path.get(".".join(segments[:end]))
+            if owner is not None:
+                hits.add(owner)
+                break
+    return hits
diff --git a/src/scriber/graph/languages/rust.py b/src/scriber/graph/languages/rust.py
new file mode 100644
index 0000000..14feecc
--- /dev/null
+++ b/src/scriber/graph/languages/rust.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from scriber.core.models import FileNode
+
+
+# `mod name;` declarations; group 1 is the module name.
+MOD_RE = re.compile(r'\bmod\s+(\w+)\s*;')
+# `use path;` statements; group 1 is everything between `use` and the semicolon.
+USE_RE = re.compile(r'\buse\s+([^;]+)\s*;')
+
+
+def parse_rust_imports(source: str) -> list[tuple[str, str]]:
+    """Return ``("mod", name)`` and ``("use", path)`` pairs found in Rust *source*.
+
+    All ``mod`` declarations come first, followed by the ``use`` statements.
+    A braced ``use`` is expanded into one entry per comma-separated member,
+    each prefixed with the text that precedes the opening brace.
+    """
+    found = [("mod", hit.group(1)) for hit in MOD_RE.finditer(source)]
+    for hit in USE_RE.finditer(source):
+        spec = hit.group(1).strip()
+        prefix, brace, group = spec.partition("{")
+        if not brace:
+            found.append(("use", spec))
+            continue
+        prefix = prefix.strip()
+        members = (member.strip() for member in group.replace("}", "").strip().split(","))
+        found.extend(("use", f"{prefix}{member}") for member in members if member)
+    return found
+
+
+def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]) -> set[Path]:
+    """Resolve a Rust ``mod`` declaration or ``use`` path to at most one project file.
+
+    Args:
+        kind: ``"mod"`` for a module declaration; anything else is treated as a ``use`` path.
+        spec: Module name (for ``mod``) or ``::``-separated path (for ``use``).
+        current_file: The file containing the statement.
+        absolute_to_file: Scanned files keyed by absolute path.
+
+    Returns:
+        A set with the relative path of the matching file, or an empty set.
+        Only ``use`` paths starting with ``crate``, ``super`` or ``self`` are resolved.
+    """
+    parent = current_file.absolute.parent
+
+    if kind == "mod":
+        # `mod foo;` is looked up as foo.rs or foo/mod.rs next to the declaring file.
+        for cand in (parent / f"{spec}.rs", parent / spec / "mod.rs"):
+            node = absolute_to_file.get(cand)
+            if node:
+                return {node.relative}
+        return set()
+
+    parts = spec.split("::")
+    head, sub_parts = parts[0], parts[1:]
+
+    if head == "crate":
+        base_dir = _rust_crate_source_dir(parent)
+    elif head == "super":
+        base_dir = parent.parent
+    elif head == "self":
+        base_dir = parent
+    else:
+        return set()
+
+    return _rust_module_file(base_dir, sub_parts, absolute_to_file)
+
+
+def _rust_crate_source_dir(start: Path) -> Path:
+    """Walk up from *start* to the first directory holding ``Cargo.toml`` or ``src``; fall back to *start*."""
+    curr = start
+    while curr != curr.parent:
+        has_src = (curr / "src").exists()
+        if (curr / "Cargo.toml").exists() or has_src:
+            return curr / "src" if has_src else curr
+        curr = curr.parent
+    return start
+
+
+def _rust_module_file(base_dir: Path, sub_parts: list[str], absolute_to_file: dict[Path, FileNode]) -> set[Path]:
+    """Return the file backing the longest prefix of *sub_parts* under *base_dir* (``x.rs`` before ``x/mod.rs``)."""
+    for end in range(len(sub_parts), 0, -1):
+        module_path = base_dir / Path(*sub_parts[:end])
+        for cand in (module_path.with_name(module_path.name + ".rs"), module_path / "mod.rs"):
+            node = absolute_to_file.get(cand)
+            if node:
+                return {node.relative}
+    return set()
diff --git a/src/scriber/native.py b/src/scriber/native.py
new file mode 100644
index 0000000..08b415c
--- /dev/null
+++ b/src/scriber/native.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from typing import Any
+
+# Outcome of the first import attempt made by _load_native(): the loaded module ...
+_NATIVE_MODULE = None
+# ... or the ImportError it raised, which later calls raise again.
+_IMPORT_ERROR = None
+
+
+def _load_native() -> Any:
+    """Import ``scriber._native`` once and memoize the outcome.
+
+    A successful import is cached in ``_NATIVE_MODULE``; a failed one is
+    cached in ``_IMPORT_ERROR`` and raised again on every later call.
+    """
+    global _NATIVE_MODULE, _IMPORT_ERROR
+    if _NATIVE_MODULE is None:
+        if _IMPORT_ERROR is not None:
+            raise _IMPORT_ERROR
+        try:
+            from scriber import _native
+        except ImportError as exc:
+            _IMPORT_ERROR = exc
+            raise exc
+        _NATIVE_MODULE = _native
+    return _NATIVE_MODULE
+
+
+def is_native_available() -> bool:
+    """Report whether the native Rust module ``scriber._native`` can be imported."""
+    try:
+        _load_native()
+    except ImportError:
+        return False
+    return True
+
+
+def require_native() -> Any:
+    """Return the native ``scriber._native`` module after checking its API version.
+
+    Raises:
+        ImportError: The extension cannot be imported; the message (in Polish)
+            explains how to build it.
+        RuntimeError: The module exposes ``native_api_version()`` and it
+            returns something other than 1.
+    """
+    try:
+        native = _load_native()
+        if hasattr(native, "native_api_version") and native.native_api_version() != 1:
+            raise RuntimeError("Niezgodna wersja natywnego backendu Scriber (oczekiwano wersji 1).")
+        return native
+    except ImportError as e:
+        # Message text restored from mis-encoded UTF-8; it stays in Polish on purpose.
+        raise ImportError(
+            "Natywny moduł 'scriber._native' nie jest dostępny.\n"
+            "Upewnij się, że projekt został poprawnie skompilowany "
+            "za pomocą 'uv run maturin develop' lub 'uv sync'."
+        ) from e
diff --git a/src/scriber/pack.py b/src/scriber/pack.py
new file mode 100644
index 0000000..1b9626e
--- /dev/null
+++ b/src/scriber/pack.py
@@ -0,0 +1,3 @@
+from .packer.pack import build_pack, build_and_write_pack
+
+__all__ = ["build_pack", "build_and_write_pack"]
diff --git a/src/scriber/packer/__init__.py b/src/scriber/packer/__init__.py
new file mode 100644
index 0000000..2a38d37
--- /dev/null
+++ b/src/scriber/packer/__init__.py
@@ -0,0 +1,3 @@
+from .pack import build_pack, build_and_write_pack
+
+__all__ = ["build_pack", "build_and_write_pack"]
diff --git a/src/scriber/packer/pack.py b/src/scriber/packer/pack.py
new file mode 100644
index 0000000..2e7011c
--- /dev/null
+++ b/src/scriber/packer/pack.py
@@ -0,0 +1,365 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Callable
+
+from scriber.core.config import apply_overrides, load_config
+from scriber.core.errors import ScriberError
+from scriber.core.models import Candidate, FileNode, ScriberPack, SeedPath
+from scriber.core.root import ensure_inside_root, project_root_from_config, rel_to_root, resolve_config_path
+from scriber.engine.scorer import score_candidates
+from scriber.graph.builder import build_graph
+from scriber.rendering.renderer import render_pack
+from scriber.scanner.files import classify_file, is_text_readable, read_text_lossy
+from scriber.tokens import estimate_tokens
+from scriber.scanner.scan import scan_project
+
+
+def _resolve_input(path_value: str, root: Path, allow_external: bool, path_base: str = "cwd") -> Path:
+    """Turn a user-supplied path into an existing, resolved path.
+
+    Relative inputs are anchored at *root* when ``path_base`` is ``"project"``
+    and at the current working directory otherwise. If the result does not
+    exist, the input is retried relative to *root*.
+
+    Raises:
+        ScriberError: Neither location exists.
+    """
+    candidate = Path(path_value).expanduser()
+    if candidate.is_absolute():
+        candidate = candidate.resolve(strict=False)
+    else:
+        anchor = root if path_base == "project" else Path.cwd()
+        candidate = (anchor / candidate).resolve(strict=False)
+
+    if not candidate.exists():
+        # Convenience for programmatic callers: retry relative to the project root.
+        fallback = (root / path_value).resolve(strict=False)
+        if not fallback.exists():
+            raise ScriberError(f"Input path not found: {path_value}")
+        candidate = fallback
+
+    ensure_inside_root(candidate, root, allow_external)
+    return candidate.resolve()
+
+
+def _ensure_seed_file(path: Path, root: Path, files: dict[Path, FileNode], config) -> FileNode:
+    """Return the node for an explicitly requested file, registering it in *files* if missing.
+
+    A file that the scan did not pick up is classified now; if classification
+    rejects it as well, a generic ``"other"``/``"text"`` node is created so
+    that an explicit seed is never dropped.
+    """
+    rel = rel_to_root(path, root)
+    known = files.get(rel)
+    if known is not None:
+        return known
+
+    node = classify_file(path, root, config)
+    if node is None:
+        # Explicit seeds override ignore rules; binary detection still applies.
+        node = FileNode(
+            absolute=path.resolve(),
+            relative=rel,
+            kind="other",
+            language="text",
+            size_bytes=path.stat().st_size,
+            is_binary=not is_text_readable(path),
+            support_category=None,
+            content_policy="auto",
+        )
+    files[rel] = node
+    return node
+
+
+def _expand_seed(path: Path, root: Path, files: dict[Path, FileNode], config) -> SeedPath:
+    """Describe one seed path together with the project files it stands for.
+
+    A file seed expands to itself. A folder seed expands to every non-binary
+    scanned file below it, sorted by POSIX path.
+
+    Raises:
+        ScriberError: A folder seed contains no such file.
+    """
+    rel = rel_to_root(path, root)
+    if path.is_file():
+        node = _ensure_seed_file(path, root, files, config)
+        return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=False, expanded_files=[node.relative])
+
+    members = sorted(
+        (file_rel for file_rel, node in files.items() if file_rel.is_relative_to(rel) and not node.is_binary),
+        key=lambda item: item.as_posix(),
+    )
+    if not members:
+        raise ScriberError(f"No readable project files found inside seed folder: {rel.as_posix()}")
+    return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=True, expanded_files=members)
+
+
+def _decide_content(candidate: Candidate, *, config, only_tree: bool, budget_left: int | None, is_seed: bool) -> tuple[bool, str | None, str | None, int]:
+    """Decide whether a candidate's file content goes into the pack.
+
+    Returns:
+        ``(include, content, omitted_reason, tokens)``. When ``include`` is
+        false, ``content`` is ``None`` and ``tokens`` is 0.
+    """
+    if only_tree:
+        return False, None, "only-tree mode", 0
+    node = candidate.file
+    if node.is_binary:
+        return False, None, "binary file", 0
+
+    # Seeds are always eligible; everything else must pass the rule for its kind.
+    skip_reason: str | None = None
+    if not is_seed:
+        if node.kind == "code":
+            threshold = config.modules_config.content_min_score
+            meets_bar = candidate.score >= threshold
+            if not meets_bar:
+                skip_reason = f"score below content_min_score={threshold}"
+        elif node.kind == "support":
+            if node.content_policy == "tree_only":
+                skip_reason = "support content policy: tree_only"
+            elif node.content_policy != "full":
+                limit = config.support_content.auto_max_bytes
+                fits = node.size_bytes <= limit
+                if not fits:
+                    skip_reason = f"support file larger than auto_max_bytes={limit}"
+        else:
+            skip_reason = "other file not selected for content"
+
+    if skip_reason is not None:
+        return False, None, skip_reason, 0
+
+    try:
+        text = node.read_text()
+    except OSError as exc:
+        return False, None, f"read error: {exc}", 0
+
+    cost = estimate_tokens(text, config.tokens)
+    over_budget = budget_left is not None and cost > budget_left
+    # The token budget never excludes a seed.
+    if over_budget and not is_seed:
+        return False, None, "token budget exceeded", 0
+    return True, text, None, cost
+
+
+def _apply_content_policy(pack: ScriberPack, config) -> None:
+    """Fill in the content fields of every candidate and the pack's token total.
+
+    In ``"focused"`` mode every file expanded from a seed counts as an explicit
+    seed; in other modes only file seeds do. Explicit seeds do not consume the
+    token budget.
+    """
+    folders_count = pack.mode == "focused"
+    explicit_seed_files = {
+        rel
+        for seed in pack.seed_paths
+        if folders_count or not seed.is_dir
+        for rel in seed.expanded_files
+    }
+
+    budget_left = config.max_tokens if config.max_tokens > 0 else None
+    running_total = 0
+    for candidate in pack.candidates:
+        seeded = candidate.file.relative in explicit_seed_files
+        decision = _decide_content(
+            candidate,
+            config=config,
+            only_tree=pack.only_tree,
+            budget_left=budget_left,
+            is_seed=seeded,
+        )
+        candidate.include_content, candidate.content, candidate.omitted_reason, candidate.token_estimate = decision
+        if not candidate.include_content:
+            continue
+        running_total += candidate.token_estimate
+        if budget_left is not None and not seeded:
+            budget_left = max(0, budget_left - candidate.token_estimate)
+    pack.total_tokens = running_total
+
+
+def build_pack(
+    paths: list[str] | None = None,
+    *,
+    config_path: str | None = None,
+    output: str | None = None,
+    output_format: str | None = None,
+    only_tree: bool | None = None,
+    modules: bool | None = None,
+    support: bool | None = None,
+    max_files: int | None = None,
+    max_tokens: int | None = None,
+    min_score: int | None = None,
+    support_content: str | None = None,
+    progress_callback: Callable[[str], None] | None = None,
+    project: bool | None = None,
+    path_base: str = "project",
+) -> ScriberPack:
+    """Scan the project, score files against the seed paths and assemble a pack.
+
+    The stages are: load config, scan files, expand seeds, build the import
+    graph, score candidates, then apply the content policy. Graph building and
+    scoring use the native module when it is available and the pure-Python
+    implementations otherwise.
+
+    Args:
+        paths: Seed files or folders; defaults to ``["."]``.
+        config_path: Explicit configuration file, passed to ``resolve_config_path``.
+        output, output_format, only_tree, modules, support, max_files,
+        max_tokens, min_score, support_content: Overrides forwarded to
+            ``apply_overrides``.
+        progress_callback: Called with a short status message before each stage.
+        project: When truthy, forces ``"project_snapshot"`` mode.
+        path_base: Forwarded to ``_resolve_input`` to choose the anchor for
+            relative seed paths.
+
+    Returns:
+        The assembled ``ScriberPack``; ``pack.timings`` holds the duration of
+        each stage in seconds.
+    """
+    from time import perf_counter
+    timings = {}
+
+    t_start = perf_counter()
+    paths = paths or ["."]
+    resolved_config = resolve_config_path(paths, config_path)
+    root = project_root_from_config(resolved_config)
+    config = load_config(resolved_config)
+    config = apply_overrides(
+        config,
+        output=output,
+        output_format=output_format,
+        only_tree=only_tree,
+        modules=modules,
+        support=support,
+        max_files=max_files,
+        max_tokens=max_tokens,
+        min_score=min_score,
+        support_content=support_content,
+    )
+    timings["config_load"] = perf_counter() - t_start
+
+    t_scan = perf_counter()
+    if progress_callback: progress_callback("Skanowanie plikow...")
+    from scriber.native import require_native, is_native_available
+    # native_files is only set by the native scanner and is reused by the native graph/scoring calls.
+    native_files = None
+    if is_native_available():
+        from scriber.scanner.scan import scan_project_with_native
+        files, native_files = scan_project_with_native(root, config)
+    else:
+        files = scan_project(root, config)
+    resolved_inputs = [_resolve_input(item, root, config.allow_external_paths, path_base) for item in paths]
+    seeds = [_expand_seed(path, root, files, config) for path in resolved_inputs]
+    timings["scan"] = perf_counter() - t_scan
+
+    # Detect mode: a snapshot is requested explicitly or implied by seeding the project root itself.
+    is_project_snapshot = False
+    if project:
+        is_project_snapshot = True
+    else:
+        for path in resolved_inputs:
+            if path == root:
+                is_project_snapshot = True
+                break
+    mode = "project_snapshot" if is_project_snapshot else "focused"
+
+    # Use native code pack builder if available
+    if is_native_available():
+        native = require_native()
+
+        t_graph = perf_counter()
+        if progress_callback: progress_callback("Budowanie grafu modulow (natywnie)...")
+
+        assert native_files is not None
+
+        edges = native.build_import_graph(
+            str(root),
+            native_files,
+            config.python.source_roots,
+            config.python.module_init_files
+        )
+
+        # Mirror the native edge list into the Python ModuleGraph, in both directions.
+        from scriber.core.models import ModuleGraph
+        graph = ModuleGraph()
+        for edge in edges:
+            # "from" is a Python keyword, so the attribute has to be read via getattr().
+            from_path = Path(getattr(edge, "from"))
+            to_path = Path(edge.to)
+            graph.imports.setdefault(from_path, set()).add(to_path)
+            graph.imported_by.setdefault(to_path, set()).add(from_path)
+
+        timings["graph_build"] = perf_counter() - t_graph
+
+        t_score = perf_counter()
+        if progress_callback: progress_callback("Ocenianie zaleznosci (natywnie)...")
+        scoring = config.modules_config.scoring
+        # Each score falls back to the literal default given here when the config has no entry for it.
+        opts = native.NativePackOptions(
+            mode=mode,
+            max_files=config.max_files,
+            min_score=config.min_score,
+            tree_min_score=config.modules_config.tree_min_score,
+            seed_file_score=scoring.get("seed_file", 100),
+            seed_folder_file_score=scoring.get("seed_folder_file", 100),
+            direct_dependency_score=scoring.get("direct_dependency", 90),
+            reverse_dependency_score=scoring.get("reverse_dependency", 85),
+            same_package_score=scoring.get("same_package", 65),
+            parent_entrypoint_score=scoring.get("parent_entrypoint", 60),
+            related_test_score=scoring.get("related_test", 80),
+            name_similarity_score=scoring.get("name_similarity", 45),
+            support_near_seed_score=scoring.get("support_near_seed", 60),
+            project_config_score=scoring.get("project_config", 55),
+            dependency_file_score=scoring.get("dependency_file", 52),
+            runtime_support_score=scoring.get("runtime_support", 50),
+            documentation_score=scoring.get("documentation", 45),
+            shared_dependency_bonus=scoring.get("shared_dependency_bonus", 10),
+            modules_enabled=config.modules,
+            include_direct_dependencies=config.modules_config.include_direct_dependencies,
+            include_reverse_dependencies=config.modules_config.include_reverse_dependencies,
+            include_same_package=config.modules_config.include_same_package,
+            include_parent_entrypoints=config.modules_config.include_parent_entrypoints,
+            include_tests=config.modules_config.include_tests,
+            include_project_configs=config.modules_config.include_project_configs,
+            depth=config.modules_config.depth,
+            support_enabled=config.support,
+            entrypoint_patterns=config.python.entrypoint_patterns,
+            test_roots=config.python.test_roots,
+        )
+
+        rs_candidates = native.score_candidates_native(
+            native_files,
+            [seed.relative.as_posix() for seed in seeds],
+            edges,
+            opts
+        )
+
+        # Convert native results to Candidate objects; results without a matching scanned file are dropped.
+        candidates = []
+        for rc in rs_candidates:
+            rel = Path(rc.path)
+            file_node = files.get(rel)
+            if file_node:
+                c = Candidate(
+                    file=file_node,
+                    score=rc.score,
+                    reasons=rc.reasons,
+                    reason_summary=rc.reason_summary,
+                    include_content=rc.include_content,
+                    omitted_reason=rc.omitted_reason,
+                )
+                candidates.append(c)
+        timings["scoring"] = perf_counter() - t_score
+    else:
+        # Pure-Python fallback for graph building and scoring.
+        t_graph = perf_counter()
+        if progress_callback: progress_callback("Budowanie grafu modulow...")
+        graph = build_graph(files, config)
+        timings["graph_build"] = perf_counter() - t_graph
+
+        t_score = perf_counter()
+        if progress_callback: progress_callback("Ocenianie zaleznosci...")
+        candidates = score_candidates(files=files, seeds=seeds, graph=graph, config=config, mode=mode)
+        timings["scoring"] = perf_counter() - t_score
+
+    pack = ScriberPack(
+        project_root=root,
+        config_path=resolved_config,
+        seed_paths=seeds,
+        candidates=candidates,
+        graph=graph,
+        only_tree=config.only_tree,
+        output_format=config.format,
+        mode=mode,
+    )
+
+    # Decide per candidate whether its content is included, honouring the token budget.
+    t_content = perf_counter()
+    if progress_callback: progress_callback("Aplikowanie regul zawartosci...")
+    _apply_content_policy(pack, config)
+    timings["content_read"] = perf_counter() - t_content
+
+    pack.timings = timings
+    return pack
+
+
+def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path | None, ScriberPack]:
+    """Build a pack, render it and write it to the configured output.
+
+    Args:
+        paths: Seed paths, forwarded to ``build_pack``.
+        **kwargs: ``build_pack`` keyword arguments, plus ``explain_selection``
+            which is forwarded to ``render_pack`` instead.
+
+    Returns:
+        ``(output_path, pack)``; ``output_path`` is ``None`` when the output
+        is ``-`` and the rendering went to standard output.
+    """
+    explain_selection = kwargs.pop("explain_selection", False)
+    pack = build_pack(paths, **kwargs)
+    # The effective config is rebuilt here only to learn the output target.
+    config_path = resolve_config_path(paths or ["."], kwargs.get("config_path"))
+    config = load_config(config_path)
+    config = apply_overrides(
+        config,
+        output=kwargs.get("output"),
+        output_format=kwargs.get("output_format"),
+        only_tree=kwargs.get("only_tree"),
+        modules=kwargs.get("modules"),
+        support=kwargs.get("support"),
+        max_files=kwargs.get("max_files"),
+        max_tokens=kwargs.get("max_tokens"),
+        min_score=kwargs.get("min_score"),
+        support_content=kwargs.get("support_content"),
+    )
+    progress = kwargs.get("progress_callback")
+    if progress: progress("Renderowanie Markdown...")
+    rendered = render_pack(pack, explain_selection=explain_selection)
+    output = config.output
+    if str(output) == "-":
+        import sys
+        try:
+            # Write bytes so the console encoding cannot reject characters.
+            sys.stdout.buffer.write(rendered.encode("utf-8"))
+            sys.stdout.flush()
+        except (AttributeError, OSError):
+            print(rendered)
+        return None, pack
+    if not output.is_absolute():
+        output = pack.project_root / output
+    output.parent.mkdir(parents=True, exist_ok=True)
+    from scriber.native import is_native_available, require_native
+    if is_native_available():
+        require_native().write_text(str(output), rendered)
+    else:
+        # build_pack can run without the native module, so writing must too.
+        output.write_text(rendered, encoding="utf-8")
+    return output, pack
diff --git a/src/scriber/render.py b/src/scriber/render.py
new file mode 100644
index 0000000..c9eb710
--- /dev/null
+++ b/src/scriber/render.py
@@ -0,0 +1,3 @@
+from .rendering.renderer import render_pack, render_markdown, render_text
+
+__all__ = ["render_pack", "render_markdown", "render_text"]
diff --git a/src/scriber/rendering/__init__.py b/src/scriber/rendering/__init__.py
new file mode 100644
index 0000000..72031fd
--- /dev/null
+++ b/src/scriber/rendering/__init__.py
@@ -0,0 +1,3 @@
+from .renderer import render_pack, render_markdown, render_text
+
+__all__ = ["render_pack", "render_markdown", "render_text"]
diff --git a/src/scriber/rendering/renderer.py b/src/scriber/rendering/renderer.py
new file mode 100644
index 0000000..657cd55
--- /dev/null
+++ b/src/scriber/rendering/renderer.py
@@ -0,0 +1,278 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scriber.core.models import Candidate, ScriberPack
+
+
+def _path(path: Path) -> str:
+ return path.as_posix()
+
+
+def _escape_table(value: str) -> str:
+ return value.replace("|", "\\|").replace("\n", " ")
+
+
+def _content_flag(candidate: Candidate) -> str:
+ if candidate.include_content:
+ return "yes"
+ if candidate.omitted_reason:
+ return f"no: {candidate.omitted_reason}"
+ return "no"
+
+
+def _table(candidates: list[Candidate], explain_selection: bool = False) -> str:
+ if not candidates:
+ return "_None._\n"
+ lines = ["| Score | Content | Path | Reason |", "|---:|---|---|---|"]
+ for candidate in candidates:
+ reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary
+ lines.append(
+ f"| {candidate.score} | {_escape_table(_content_flag(candidate))} | `{_escape_table(_path(candidate.file.relative))}` | {_escape_table(reason)} |"
+ )
+ return "\n".join(lines) + "\n"
+
+
def render_tree(paths: list[Path]) -> str:
    """Render *paths* as a ``tree``-style listing rooted at ``.``.

    Every path is split into its parts and merged into a nested dict, which is
    then walked in name order. Returns just ``"."`` when *paths* is empty.
    """
    tree: dict[str, dict] = {}
    for path in sorted(paths, key=lambda item: item.as_posix()):
        node = tree
        for part in path.parts:
            node = node.setdefault(part, {})

    def walk(node: dict[str, dict], prefix: str = "") -> list[str]:
        lines: list[str] = []
        items = sorted(node.items(), key=lambda item: item[0])
        for index, (name, child) in enumerate(items):
            is_last = index == len(items) - 1
            # The last sibling gets a corner connector, the others a tee.
            branch = "└── " if is_last else "├── "
            lines.append(f"{prefix}{branch}{name}")
            # Below a non-last entry the vertical rule continues; both
            # prefixes are four columns wide so nested levels stay aligned.
            extension = "    " if is_last else "│   "
            lines.extend(walk(child, prefix + extension))
        return lines

    return ".\n" + "\n".join(walk(tree)) if tree else "."
+
+
def render_module_graph(pack: ScriberPack) -> str:
    """Render the import relationships between the files included in *pack*.

    In ``project_snapshot`` mode the output is two ranked lists (at most five
    entries each): files importing the most included files, and files imported
    by the most included files. In every other mode each expanded seed file is
    listed with its ``imports`` / ``imported by`` edges, restricted to included
    files. Returns ``"No module graph available."`` when nothing was rendered.
    """
    included = set(pack.included_paths)
    lines: list[str] = []

    if pack.mode == "project_snapshot":
        import_counts = []
        imported_by_counts = []
        for path in included:
            imports = len(pack.graph.imports.get(path, set()) & included)
            if imports > 0:
                import_counts.append((path, imports))

            imported_by = len(pack.graph.imported_by.get(path, set()) & included)
            if imported_by > 0:
                imported_by_counts.append((path, imported_by))

        # Highest count first; ties broken by path so the output is stable.
        import_counts.sort(key=lambda x: (-x[1], x[0].as_posix()))
        imported_by_counts.sort(key=lambda x: (-x[1], x[0].as_posix()))

        lines.append("Top 5 files with most dependencies:")
        for path, count in import_counts[:5]:
            lines.append(f"- `{path.as_posix()}`: imports {count} included files")

        lines.append("")
        lines.append("Top 5 most imported files:")
        for path, count in imported_by_counts[:5]:
            lines.append(f"- `{path.as_posix()}`: imported by {count} included files")

        return "\n".join(lines).strip() or "No module graph available."

    for seed in pack.seed_paths:
        for seed_file in seed.expanded_files:
            lines.append(seed_file.as_posix())
            imports = sorted(pack.graph.imports.get(seed_file, set()) & included, key=lambda item: item.as_posix())
            imported_by = sorted(pack.graph.imported_by.get(seed_file, set()) & included, key=lambda item: item.as_posix())
            edges = [("imports", item) for item in imports] + [("imported by", item) for item in imported_by]
            for index, (kind, target) in enumerate(edges):
                # Corner connector on the final edge, tee on the others.
                branch = "└──" if index == len(edges) - 1 else "├──"
                lines.append(f"{branch} {kind} {target.as_posix()}")
            if not edges:
                lines.append("└── no included import edges")
            lines.append("")
    return "\n".join(lines).strip() or "No module graph available."
+
+
+def _language_fence(language: str) -> str:
+ if language in {"python", "rust", "javascript", "typescript", "go", "java", "kotlin", "c", "cpp", "toml", "yaml", "json", "markdown", "dockerfile", "ini"}:
+ return language
+ return "text"
+
+
+def _fence_for(content: str) -> str:
+ longest = 0
+ current = 0
+ for char in content:
+ if char == "`":
+ current += 1
+ longest = max(longest, current)
+ else:
+ current = 0
+ return "`" * max(3, longest + 1)
+
+
def render_summary(pack: ScriberPack) -> str:
    """Build the Markdown "Pack summary" section for *pack*.

    Reports the mode, number of seed paths, candidate counts by kind and by
    content inclusion, and the pack's estimated token total.
    """
    candidates = pack.candidates
    code_total = sum(1 for entry in candidates if entry.file.kind == "code")
    support_total = sum(1 for entry in candidates if entry.file.kind == "support")
    with_content = sum(1 for entry in candidates if entry.include_content)
    tree_only = sum(1 for entry in candidates if not entry.include_content)

    bullets = (
        ("Mode", pack.mode),
        ("Seed paths", len(pack.seed_paths)),
        ("Included code files", code_total),
        ("Included support files", support_total),
        ("Content files", with_content),
        ("Tree-only files", tree_only),
        ("Estimated tokens", pack.total_tokens),
    )
    section = ["## Pack summary", ""]
    section.extend(f"- {label}: `{value}`" for label, value in bullets)
    section.append("")  # keeps the trailing newline after the last bullet
    return "\n".join(section)
+
+
def render_summary_text(pack: ScriberPack) -> str:
    """Build the plain-text "PACK SUMMARY" section for *pack*.

    Carries the same figures as the Markdown summary: mode, seed path count,
    candidate counts by kind and by content inclusion, and estimated tokens.
    """
    candidates = pack.candidates
    code_total = sum(1 for entry in candidates if entry.file.kind == "code")
    support_total = sum(1 for entry in candidates if entry.file.kind == "support")
    with_content = sum(1 for entry in candidates if entry.include_content)
    tree_only = sum(1 for entry in candidates if not entry.include_content)

    rows = (
        ("Mode", pack.mode),
        ("Seed paths", len(pack.seed_paths)),
        ("Included code files", code_total),
        ("Included support files", support_total),
        ("Content files", with_content),
        ("Tree-only files", tree_only),
        ("Estimated tokens", pack.total_tokens),
    )
    section = ["PACK SUMMARY", "------------"]
    section.extend(f"{label}: {value}" for label, value in rows)
    section.append("")  # keeps the trailing newline after the last row
    return "\n".join(section)
+
+
def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str:
    """Render *pack* as a Markdown document.

    Sections, in order: pack summary, project header, input paths, one table
    per candidate kind (the "other" table only when non-empty), module graph,
    included project tree and, unless ``pack.only_tree`` is set, the content of
    every candidate.

    Args:
        pack: The assembled pack to render.
        explain_selection: List every selection reason in the tables instead
            of the short summary.

    Returns:
        The Markdown text, ending with a newline.
    """
    code = [candidate for candidate in pack.candidates if candidate.file.kind == "code"]
    support = [candidate for candidate in pack.candidates if candidate.file.kind == "support"]
    other = [candidate for candidate in pack.candidates if candidate.file.kind == "other"]

    try:
        config_display = pack.config_path.relative_to(pack.project_root).as_posix()
    except ValueError:
        # relative_to() raises when the config lives outside the project root;
        # show the path as given instead of aborting the whole render.
        config_display = pack.config_path.as_posix()

    lines: list[str] = [
        "# Scriber 2.0 Pack",
        "",
        render_summary(pack).rstrip(),
        "",
        "## Project",
        "",
        f"Root: `{pack.project_root}`",
        f"Config: `{config_display}`",
        f"Format: `{pack.output_format}`",
        f"Only tree: `{str(pack.only_tree).lower()}`",
        "",
        "## Input paths",
        "",
    ]
    lines.extend(f"- `{_path(seed.relative)}`" for seed in pack.seed_paths)
    lines.extend([
        "",
        "## Included code files",
        "",
        _table(code, explain_selection).rstrip(),
        "",
        "## Included support files",
        "",
        _table(support, explain_selection).rstrip(),
    ])
    if other:
        lines.extend([
            "",
            "## Included other files",
            "",
            _table(other, explain_selection).rstrip(),
        ])
    lines.extend([
        "",
        "## Module graph",
        "",
        "```text",
        render_module_graph(pack),
        "```",
        "",
        "## Included project tree",
        "",
        "```text",
        render_tree(pack.included_paths),
        "```",
    ])

    if not pack.only_tree:
        lines.append("")
        lines.append("## File contents")
        for candidate in pack.candidates:
            lines.append("")
            lines.append(f"### `{_path(candidate.file.relative)}`")
            lines.append("")
            if not candidate.include_content:
                lines.append(f"_Content omitted: {candidate.omitted_reason or 'not selected for content'}._")
                continue
            content = candidate.content or ""
            # Size the fence from the content so embedded backtick runs cannot close it.
            fence = _fence_for(content)
            language = _language_fence(candidate.file.language)
            lines.append(f"{fence}{language}")
            lines.append(content.rstrip("\n"))
            lines.append(fence)

    lines.append("")
    return "\n".join(lines)
+
+
def render_text(pack: ScriberPack, explain_selection: bool = False) -> str:
    """Render *pack* as a plain-text document.

    Sections, in order: pack summary, project header, input paths, the list of
    included files with kind/content/reason, module graph, included project
    tree and, unless ``pack.only_tree`` is set, the content of every candidate.

    Args:
        pack: The assembled pack to render.
        explain_selection: List every selection reason per file instead of the
            short summary.

    Returns:
        The rendered text, ending with a newline.
    """
    try:
        config_display = pack.config_path.relative_to(pack.project_root).as_posix()
    except ValueError:
        # relative_to() raises when the config lives outside the project root;
        # show the path as given instead of aborting the whole render.
        config_display = pack.config_path.as_posix()

    lines: list[str] = [
        "SCRIBER 2.0 PACK",
        "================",
        "",
        render_summary_text(pack).rstrip(),
        "",
        f"PROJECT ROOT: {pack.project_root}",
        f"CONFIG: {config_display}",
        f"FORMAT: {pack.output_format}",
        f"ONLY TREE: {str(pack.only_tree).lower()}",
        "",
        "INPUT PATHS",
    ]
    lines.extend(f"- {_path(seed.relative)}" for seed in pack.seed_paths)
    lines.append("")
    lines.append("INCLUDED FILES")
    for candidate in pack.candidates:
        reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary
        lines.append(f"[{candidate.score:03d}] {_path(candidate.file.relative)}")
        lines.append(f" kind: {candidate.file.kind}")
        lines.append(f" content: {_content_flag(candidate)}")
        lines.append(f" reason: {reason}")
    lines.append("")
    lines.append("MODULE GRAPH")
    lines.append(render_module_graph(pack))
    lines.append("")
    lines.append("INCLUDED PROJECT TREE")
    lines.append(render_tree(pack.included_paths))

    if not pack.only_tree:
        lines.append("")
        lines.append("FILE CONTENTS")
        lines.append("=============")
        for candidate in pack.candidates:
            lines.append("")
            lines.append(f"--- FILE: {_path(candidate.file.relative)} ---")
            if not candidate.include_content:
                lines.append(f"[content omitted: {candidate.omitted_reason or 'not selected for content'}]")
                continue
            lines.append(candidate.content or "")
    lines.append("")
    return "\n".join(lines)
+
+
def render_pack(pack: ScriberPack, explain_selection: bool = False) -> str:
    """Render *pack* in the format named by ``pack.output_format``.

    ``"txt"`` selects the plain-text renderer; every other value falls through
    to Markdown.
    """
    renderer = render_text if pack.output_format == "txt" else render_markdown
    return renderer(pack, explain_selection=explain_selection)
diff --git a/src/scriber/scanner/__init__.py b/src/scriber/scanner/__init__.py
new file mode 100644
index 0000000..9070647
--- /dev/null
+++ b/src/scriber/scanner/__init__.py
@@ -0,0 +1,21 @@
+from .files import (
+ classify_file,
+ is_probably_binary,
+ is_text_readable,
+ language_for,
+ read_text_lossy,
+ support_category,
+ support_content_policy,
+)
+from .scan import scan_project
+
+__all__ = [
+ "classify_file",
+ "is_probably_binary",
+ "is_text_readable",
+ "language_for",
+ "read_text_lossy",
+ "support_category",
+ "support_content_policy",
+ "scan_project",
+]
diff --git a/src/scriber/scanner/files.py b/src/scriber/scanner/files.py
new file mode 100644
index 0000000..f203dde
--- /dev/null
+++ b/src/scriber/scanner/files.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scriber.core.matchers import match_pattern, matches_any
+from scriber.core.models import ContentPolicy, FileKind, FileNode, ScriberConfig
+
# Maps a lower-cased file suffix to the language label stored on FileNode.
# Suffixes absent from this table are labelled "text" by language_for().
LANGUAGE_BY_SUFFIX = {
    suffix: language
    for language, suffixes in (
        ("python", (".py", ".pyi")),
        ("rust", (".rs",)),
        ("javascript", (".js", ".jsx")),
        ("typescript", (".ts", ".tsx")),
        ("go", (".go",)),
        ("java", (".java",)),
        ("kotlin", (".kt",)),
        ("c", (".c",)),
        ("cpp", (".cpp", ".cc", ".cxx")),
        ("c", (".h",)),
        ("cpp", (".hpp", ".hh", ".hxx")),
        ("toml", (".toml",)),
        ("yaml", (".yaml", ".yml")),
        ("json", (".json",)),
        ("markdown", (".md",)),
        ("rst", (".rst",)),
        ("text", (".txt",)),
        ("ini", (".ini", ".cfg")),
        ("lock", (".lock",)),
    )
    for suffix in suffixes
}
+
+
def is_probably_binary(path: Path) -> bool:
    """Report whether *path* looks like a binary file.

    The check is delegated to the native extension. A file the native check
    cannot inspect is reported as binary so callers never try to embed it as
    text. Failure to obtain the native extension itself is not swallowed:
    treating every file as binary would silently hide a broken installation.
    """
    from scriber.native import require_native

    native = require_native()
    try:
        return native.is_probably_binary(str(path))
    except Exception:
        # Best effort: an unreadable or vanished file is treated as binary.
        return True
+
+
def language_for(path: Path) -> str:
    """Return the language label for *path*.

    Any file whose name starts with ``Dockerfile`` is labelled ``dockerfile``;
    otherwise the lower-cased suffix is looked up in ``LANGUAGE_BY_SUFFIX``,
    defaulting to ``text``.
    """
    is_dockerfile = path.name.startswith("Dockerfile")
    if is_dockerfile:
        return "dockerfile"
    suffix = path.suffix.lower()
    return LANGUAGE_BY_SUFFIX.get(suffix, "text")
+
+
def support_category(rel: Path) -> str:
    """Return a human-readable category for a support file.

    Rules are tried top to bottom and the first match wins, so the order below
    is significant: for example every ``*.toml`` file, ``Cargo.toml`` included,
    is reported as "project config" before the dependency-file rule is reached.
    Files matching no rule fall back to "support file".
    """
    posix = rel.as_posix()
    name = rel.name

    project_config_names = {"setup.py", "setup.cfg", "tox.ini", "pytest.ini", "mypy.ini", "ruff.toml", ".ruff.toml"}
    if name == "pyproject.toml" or name.endswith(".toml") or name in project_config_names:
        return "project config"

    dependency_names = {
        "requirements.txt", "poetry.lock", "uv.lock", "Pipfile", "Pipfile.lock",
        "package.json", "package-lock.json", "pnpm-lock.yaml", "yarn.lock",
        "Cargo.toml", "Cargo.lock", "go.mod", "go.sum",
    }
    if name.endswith(".lock") or name in dependency_names or posix.startswith("requirements/"):
        return "dependency file"

    if name.startswith("README") or name in {"CHANGELOG.md", "CONTRIBUTING.md"} or posix.startswith("docs/"):
        return "documentation"

    if name.startswith(("Dockerfile", "docker-compose", "compose")):
        return "runtime support"

    if posix.startswith(".github/workflows/") or name == ".gitlab-ci.yml":
        return "ci support"

    if name.startswith(".env") or posix.startswith(("config/", "settings/")):
        return "runtime config"

    if name in {".pre-commit-config.yaml", "tsconfig.json"} or name.startswith(("vite.config", "webpack.config")):
        return "tooling config"

    return "support file"
+
+
def support_content_policy(rel: Path, config: ScriberConfig) -> ContentPolicy:
    """Pick the content policy for a support file.

    ``tree_only`` patterns are checked before ``full`` patterns, so a path
    matching both is tree-only. Paths matching neither get the configured
    default.
    """
    rel_posix = rel.as_posix()
    content_rules = config.support_content
    for policy, patterns in (("tree_only", content_rules.tree_only), ("full", content_rules.full)):
        if matches_any(rel_posix, patterns):
            return policy
    return config.support_content.default
+
+
def classify_file(path: Path, root: Path, config: ScriberConfig) -> FileNode | None:
    """Classify *path* as a code or support file of the project at *root*.

    Args:
        path: File to classify.
        root: Project root; the node's relative path is computed against it
            after both paths are resolved.
        config: Supplies the hard-ignore, code and support patterns.

    Returns:
        A ``FileNode`` for code files and, when support files are enabled, for
        support files; ``None`` for hard-ignored or unmatched paths.

    Raises:
        ValueError: If the resolved *path* is not located under the resolved
            *root* (raised by ``Path.relative_to``).
    """
    absolute = path.resolve()
    rel = absolute.relative_to(root.resolve())
    rel_s = rel.as_posix()

    if matches_any(rel_s, config.hard_ignore_patterns):
        return None

    kind: FileKind
    category = None
    policy: ContentPolicy = "auto"

    if matches_any(rel_s, config.code_patterns):
        kind = "code"
    elif config.support and matches_any(rel_s, config.support_patterns):
        kind = "support"
        category = support_category(rel)
        policy = support_content_policy(rel, config)
    else:
        return None

    try:
        size = path.stat().st_size
    except OSError:
        # The file may have vanished or be unreadable; record it with size 0.
        size = 0

    return FileNode(
        absolute=absolute,
        relative=rel,
        kind=kind,
        language=language_for(path),
        size_bytes=size,
        # Sniffed only now that the file is known to be kept, so unmatched
        # files no longer cost a read.
        is_binary=is_probably_binary(path),
        support_category=category,
        content_policy=policy,
    )
+
+
def should_hard_ignore(rel: Path, config: ScriberConfig) -> bool:
    """Return whether the project-relative path *rel* matches a hard-ignore pattern."""
    rel_posix = rel.as_posix()
    return matches_any(rel_posix, config.hard_ignore_patterns)
+
+
def is_text_readable(path: Path) -> bool:
    """Return whether *path* can be read as UTF-8 text.

    Files flagged as binary are rejected without being decoded; otherwise the
    whole file is decoded once and any decode or OS error counts as unreadable.
    """
    if is_probably_binary(path):
        return False
    try:
        path.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError):
        return False
    return True
+
+
def read_text_lossy(path: Path) -> str:
    """Read *path* through the native extension's ``read_text`` and return the text."""
    from scriber.native import require_native

    native = require_native()
    return native.read_text(str(path))
+
+
+
diff --git a/src/scriber/scanner/scan.py b/src/scriber/scanner/scan.py
new file mode 100644
index 0000000..e2fa8a4
--- /dev/null
+++ b/src/scriber/scanner/scan.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from pathlib import Path
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.native import require_native
+
+
def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
    """Scan *root* with the native scanner and return only the file map.

    Thin wrapper over ``scan_project_with_native`` that drops the raw native
    records.
    """
    return scan_project_with_native(root, config)[0]
+
+
def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Path, FileNode], list]:
    """Scan *root* with the native scanner, reconciling results with the cache.

    For each native record the cache is queried by relative path, mtime and
    size. A hit rebuilds the ``FileNode`` from the cached fields; a miss builds
    it from the native record and stores it in the cache. The cache is saved
    with the set of paths seen in this scan.

    Returns:
        ``(files, native_files)``: the nodes keyed by relative path, and the
        unmodified list returned by the native scanner.
    """
    root = root.resolve()
    scanner = require_native()

    native_files = scanner.scan_project(
        str(root),
        config.use_gitignore,
        config.hard_ignore_patterns,
        config.code_patterns,
        config.support_patterns,
        config.support_content.full,
        config.support_content.tree_only,
        config.support_content.default,
        config.support,
    )

    files: dict[Path, FileNode] = {}

    from scriber.cache import ScriberCache

    cache = ScriberCache(config, root)
    active_files: set[Path] = set()

    for item in native_files:
        rel = Path(item.relative)
        active_files.add(rel)

        cached = cache.get_file(rel, item.mtime_ns, item.size_bytes)
        if cached is None:
            # Cache miss: trust the native record and remember it for next time.
            node = FileNode(
                absolute=(root / rel).resolve(strict=False),
                relative=rel,
                kind=item.kind,
                language=item.language,
                size_bytes=item.size_bytes,
                is_binary=item.is_binary,
                support_category=item.support_category,
                content_policy=item.content_policy,
            )
            files[node.relative] = node
            payload = {
                "relative": node.relative.as_posix(),
                "kind": node.kind,
                "language": node.language,
                "size_bytes": node.size_bytes,
                "is_binary": node.is_binary,
                "support_category": node.support_category,
                "content_policy": node.content_policy,
            }
            cache.set_file(rel, item.mtime_ns, item.size_bytes, payload)
            continue

        # Cache hit: rebuild the node from the stored fields.
        cached_rel = Path(cached["relative"])
        node = FileNode(
            absolute=(root / cached_rel).resolve(strict=False),
            relative=cached_rel,
            kind=cached["kind"],
            language=cached["language"],
            size_bytes=cached["size_bytes"],
            is_binary=cached["is_binary"],
            support_category=cached["support_category"],
            content_policy=cached["content_policy"],
        )
        files[node.relative] = node

    cache.save(active_files)
    return files, native_files
diff --git a/src/scriber/scanner/scan_py.py b/src/scriber/scanner/scan_py.py
new file mode 100644
index 0000000..2c0ebae
--- /dev/null
+++ b/src/scriber/scanner/scan_py.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from scriber.core.matchers import SimpleGitIgnore
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.scanner.files import classify_file, should_hard_ignore
+
+
def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
    """Walk *root* in pure Python and return classified files keyed by relative path.

    Directories and files matching the hard-ignore patterns or, when enabled,
    the root ``.gitignore`` are skipped; ignored directories are pruned from
    the walk. Each remaining file is looked up in the cache by relative path,
    mtime and size. A hit rebuilds the node from cached fields; a miss
    classifies the file and caches the result when it is kept. Files that
    cannot be stat'ed are skipped.
    """
    root = root.resolve()
    if config.use_gitignore:
        gitignore = SimpleGitIgnore.from_file(root / ".gitignore")
    else:
        gitignore = SimpleGitIgnore([])
    files: dict[Path, FileNode] = {}

    from scriber.cache import ScriberCache

    cache = ScriberCache(config, root)
    active_files: set[Path] = set()

    def is_skipped(rel: Path, is_dir: bool) -> bool:
        if should_hard_ignore(rel, config):
            return True
        return bool(config.use_gitignore and gitignore.ignores(rel.as_posix(), is_dir=is_dir))

    for dirpath, dirnames, filenames in os.walk(root):
        current = Path(dirpath)
        rel_dir = current.relative_to(root)

        # Prune in place so os.walk does not descend into ignored directories.
        # Joining onto Path(".") yields the bare name, so the root needs no special case.
        dirnames[:] = [name for name in dirnames if not is_skipped(rel_dir / name, True)]

        for filename in filenames:
            path = current / filename
            rel = path.relative_to(root)
            if is_skipped(rel, False):
                continue

            try:
                stat = path.stat()
                mtime_ns = stat.st_mtime_ns
                size = stat.st_size
            except OSError:
                continue

            active_files.add(rel)

            cached = cache.get_file(rel, mtime_ns, size)
            if cached is not None:
                cached_rel = Path(cached["relative"])
                node = FileNode(
                    absolute=(root / cached_rel).resolve(strict=False),
                    relative=cached_rel,
                    kind=cached["kind"],
                    language=cached["language"],
                    size_bytes=cached["size_bytes"],
                    is_binary=cached["is_binary"],
                    support_category=cached["support_category"],
                    content_policy=cached["content_policy"],
                )
                files[node.relative] = node
                continue

            node = classify_file(path, root, config)
            if node is None:
                continue
            files[node.relative] = node
            payload = {
                "relative": node.relative.as_posix(),
                "kind": node.kind,
                "language": node.language,
                "size_bytes": node.size_bytes,
                "is_binary": node.is_binary,
                "support_category": node.support_category,
                "content_policy": node.content_policy,
            }
            cache.set_file(rel, mtime_ns, size, payload)

    cache.save(active_files)
    return files
diff --git a/src/scriber/tokens.py b/src/scriber/tokens.py
new file mode 100644
index 0000000..5b83624
--- /dev/null
+++ b/src/scriber/tokens.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from scriber.core.models import TokenConfig
+
+
def estimate_tokens(text: str, config: TokenConfig | None = None) -> int:
    """Estimate how many tokens *text* occupies.

    The estimate is ``len(text)`` divided by a characters-per-token ratio and
    is never less than 1. The ratio is 4 unless *config* selects the
    ``"chars"`` estimator with a positive ``chars_per_token``; a zero or
    negative value falls back to 4 instead of dividing by zero.

    Args:
        text: The text to measure.
        config: Optional token settings; ``None`` uses the default ratio.

    Returns:
        The estimated token count, always an ``int`` >= 1.
    """
    default_ratio = 4
    ratio = default_ratio
    if config is not None and config.estimator == "chars":
        configured = config.chars_per_token
        if configured > 0:
            ratio = configured
    # int() keeps the declared return type even if the ratio is a float.
    return max(1, int(len(text) // ratio))
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..5c141bf
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from scriber.core.models import ScriberConfig
+from scriber.cache import ScriberCache, get_config_hash
+
+
def test_cache_functionality(tmp_path: Path) -> None:
    """File entries and import sets round-trip through the cache, in memory and after a reload."""
    config = ScriberConfig()
    # Ensure cache is enabled
    config.cache.enabled = True
    config.cache.dir = ".scriber/cache"

    rel_path = Path("src/main.py")
    mtime, size = 123456789, 1000
    data = {
        "kind": "code",
        "language": "python",
        "size_bytes": 1000,
        "is_binary": False,
        "support_category": None,
        "content_policy": "auto",
        "absolute": "src/main.py",
        "relative": "src/main.py",
    }
    imports = {Path("src/auth.py"), Path("src/db.py")}

    cache = ScriberCache(config, tmp_path)

    # File metadata: miss first, hit after storing.
    assert cache.get_file(rel_path, mtime, size) is None
    cache.set_file(rel_path, mtime, size, data)
    assert cache.get_file(rel_path, mtime, size) == data

    # Import sets: miss first, hit after storing.
    assert cache.get_imports(rel_path) is None
    cache.set_imports(rel_path, imports)
    assert cache.get_imports(rel_path) == imports

    # Saving must create both cache files on disk.
    cache.save(active_files={rel_path})
    cache_dir = tmp_path / ".scriber/cache"
    assert (cache_dir / "files.json").exists()
    assert (cache_dir / "import_graph.json").exists()

    # A fresh instance must read the same entries back.
    reloaded = ScriberCache(config, tmp_path)
    assert reloaded.get_file(rel_path, mtime, size) == data
    assert reloaded.get_imports(rel_path) == imports
diff --git a/tests/test_config_schema.py b/tests/test_config_schema.py
new file mode 100644
index 0000000..ab377de
--- /dev/null
+++ b/tests/test_config_schema.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from pathlib import Path
+from scriber.core.config import load_config
+
+
def test_config_schema_parsing(tmp_path: Path) -> None:
    """Every section of a fully populated ``[tool.scriber]`` table lands on the loaded config."""
    toml_text = """
[tool.scriber]
format = "txt"
max_tokens = 50000
max_files = 30
only_tree = true
allow_external_paths = true

[tool.scriber.modules]
enabled = false
content_min_score = 40

[tool.scriber.code_files]
patterns = ["**/*.py", "**/*.rs"]

[tool.scriber.support_files]
enabled = true
patterns = ["pyproject.toml", "Dockerfile"]

[tool.scriber.support_files.content]
default = "tree_only"
auto_max_bytes = 20000
full = ["pyproject.toml"]
tree_only = ["Dockerfile"]

[tool.scriber.hard_ignore]
patterns = [".git/**", "node_modules/**"]
""".strip()
    config_file = tmp_path / "pyproject.toml"
    config_file.write_text(toml_text, encoding="utf-8")

    config = load_config(config_file)

    # Top-level options
    assert config.format == "txt"
    assert config.max_tokens == 50000
    assert config.max_files == 30
    assert config.only_tree is True
    assert config.allow_external_paths is True

    # [tool.scriber.modules]
    assert config.modules is False
    assert config.modules_config.enabled is False
    assert config.modules_config.content_min_score == 40

    # [tool.scriber.code_files]
    assert config.code_patterns == ["**/*.py", "**/*.rs"]

    # [tool.scriber.support_files] and its content rules
    assert config.support is True
    assert config.support_patterns == ["pyproject.toml", "Dockerfile"]
    content_rules = config.support_content
    assert content_rules.default == "tree_only"
    assert content_rules.auto_max_bytes == 20000
    assert content_rules.full == ["pyproject.toml"]
    assert content_rules.tree_only == ["Dockerfile"]

    # [tool.scriber.hard_ignore]
    assert config.hard_ignore_patterns == [".git/**", "node_modules/**"]
+
+
def test_validate_config_cli(tmp_path: Path, monkeypatch) -> None:
    """``--validate-config`` exits 0 for a valid config and 1 for an invalid format value."""
    from scriber.cli.main import main

    config_file = tmp_path / "pyproject.toml"
    monkeypatch.chdir(tmp_path)

    # 1. Valid config
    config_file.write_text("[tool.scriber]\nformat = 'md'\n", encoding="utf-8")
    assert main(["--validate-config"]) == 0

    # 2. Invalid config format
    config_file.write_text("[tool.scriber]\nformat = 'invalid'\n", encoding="utf-8")
    assert main(["--validate-config"]) == 1
diff --git a/tests/test_init_config.py b/tests/test_init_config.py
new file mode 100644
index 0000000..59d0e85
--- /dev/null
+++ b/tests/test_init_config.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import pytest
+from pathlib import Path
+from scriber.core.errors import ScriberError
+from scriber.core.init_config import init_project, replace_existing_tool_scriber_block
+
+
def test_replace_existing_block() -> None:
    """Existing ``[tool.scriber*]`` tables are dropped and the new block is appended at the end."""
    original = """
[build-system]
requires = ["setuptools>=61"]

[tool.scriber]
version = "1"

[tool.scriber.code_files]
patterns = ["*.py"]

[tool.pytest.ini_options]
addopts = "-q"
""".strip()

    replacement = """
[tool.scriber]
version = "2"
""".strip()

    wanted = """
[build-system]
requires = ["setuptools>=61"]

[tool.pytest.ini_options]
addopts = "-q"

[tool.scriber]
version = "2"
""".strip() + "\n"

    assert replace_existing_tool_scriber_block(original, replacement) == wanted
+
+
def test_init_project_file_missing(tmp_path: Path) -> None:
    """A missing pyproject.toml is created with a ``[tool.scriber]`` table and its resolved path is returned."""
    target = tmp_path / "pyproject.toml"
    assert not target.exists()

    returned = init_project(str(target))

    assert returned == target.resolve()
    assert target.exists()
    written = target.read_text(encoding="utf-8")
    assert "[tool.scriber]" in written
+
+
def test_init_project_exists_no_scriber(tmp_path: Path) -> None:
    """An existing file without Scriber config keeps its content and gains ``[tool.scriber]``."""
    target = tmp_path / "pyproject.toml"
    target.write_text("[build-system]\n", encoding="utf-8")

    init_project(str(target))

    written = target.read_text(encoding="utf-8")
    for header in ("[build-system]", "[tool.scriber]"):
        assert header in written
+
+
def test_init_project_exists_with_scriber_raises(tmp_path: Path) -> None:
    """Without ``force``, an existing ``[tool.scriber]`` table makes ``init_project`` raise."""
    target = tmp_path / "pyproject.toml"
    target.write_text("[tool.scriber]\nversion = '1'\n", encoding="utf-8")

    expected_message = "Scriber config already exists"
    with pytest.raises(ScriberError, match=expected_message):
        init_project(str(target))
+
+
def test_init_project_exists_with_scriber_force(tmp_path: Path) -> None:
    """With ``force=True`` the old Scriber table is replaced and unrelated tables survive."""
    existing = """
[build-system]
requires = ["setuptools>=61"]

[tool.scriber]
version = '1'
""".strip() + "\n"
    target = tmp_path / "pyproject.toml"
    target.write_text(existing, encoding="utf-8")

    init_project(str(target), force=True)

    written = target.read_text(encoding="utf-8")
    assert "[build-system]" in written
    assert "[tool.scriber]" in written
    assert "version = '1'" not in written  # must be replaced with the default block

    # Ensure there is exactly one [tool.scriber] header in pyproject.toml
    assert written.count("[tool.scriber]") == 1
diff --git a/tests/test_languages.py b/tests/test_languages.py
new file mode 100644
index 0000000..5f53f23
--- /dev/null
+++ b/tests/test_languages.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+from pathlib import Path
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.builder import build_graph
+
+
def test_javascript_typescript_graph(tmp_path: Path) -> None:
    """A relative TypeScript import yields an edge from main.ts to auth.ts, and its reverse."""
    config = ScriberConfig()

    auth_rel = Path("src/auth.ts")
    main_rel = Path("src/main.ts")

    auth_path = tmp_path / auth_rel
    auth_path.parent.mkdir(parents=True, exist_ok=True)
    auth_path.write_text("export class Auth {}", encoding="utf-8")

    main_path = tmp_path / main_rel
    main_path.write_text("import { Auth } from './auth';\nimport 'lodash';", encoding="utf-8")

    def node(rel: Path, absolute: Path) -> FileNode:
        return FileNode(
            absolute=absolute.resolve(),
            relative=rel,
            kind="code",
            language="typescript",
            size_bytes=absolute.stat().st_size,
        )

    files = {
        auth_rel: node(auth_rel, auth_path),
        main_rel: node(main_rel, main_path),
    }

    graph = build_graph(files, config)
    assert auth_rel in graph.imports[main_rel]
    assert main_rel in graph.imported_by[auth_rel]
+
+
def test_rust_graph(tmp_path: Path) -> None:
    """``mod auth`` / ``use crate::auth`` in main.rs yields an edge to auth.rs, and its reverse."""
    config = ScriberConfig()

    (tmp_path / "Cargo.toml").write_text("[package]\nname = 'test'", encoding="utf-8")

    auth_rel = Path("src/auth.rs")
    main_rel = Path("src/main.rs")

    auth_path = tmp_path / auth_rel
    auth_path.parent.mkdir(parents=True, exist_ok=True)
    auth_path.write_text("pub struct Auth;", encoding="utf-8")

    main_path = tmp_path / main_rel
    main_path.write_text("mod auth;\nuse crate::auth::Auth;\nuse super::unrelated;", encoding="utf-8")

    def node(rel: Path, absolute: Path) -> FileNode:
        return FileNode(
            absolute=absolute.resolve(),
            relative=rel,
            kind="code",
            language="rust",
            size_bytes=absolute.stat().st_size,
        )

    files = {
        auth_rel: node(auth_rel, auth_path),
        main_rel: node(main_rel, main_path),
    }

    graph = build_graph(files, config)
    assert auth_rel in graph.imports[main_rel]
    assert main_rel in graph.imported_by[auth_rel]
+
+
def test_go_graph(tmp_path: Path) -> None:
    """A module-qualified Go import yields an edge from cmd/main.go to pkg/db/db.go, and its reverse."""
    config = ScriberConfig()

    (tmp_path / "go.mod").write_text("module github.com/user/project\n", encoding="utf-8")

    db_rel = Path("pkg/db/db.go")
    main_rel = Path("cmd/main.go")

    db_path = tmp_path / db_rel
    db_path.parent.mkdir(parents=True, exist_ok=True)
    db_path.write_text("package db\n", encoding="utf-8")

    main_path = tmp_path / main_rel
    main_path.parent.mkdir(parents=True, exist_ok=True)
    main_path.write_text('package main\nimport "github.com/user/project/pkg/db"\n', encoding="utf-8")

    def node(rel: Path, absolute: Path) -> FileNode:
        return FileNode(
            absolute=absolute.resolve(),
            relative=rel,
            kind="code",
            language="go",
            size_bytes=absolute.stat().st_size,
        )

    files = {
        db_rel: node(db_rel, db_path),
        main_rel: node(main_rel, main_path),
    }

    graph = build_graph(files, config)
    assert db_rel in graph.imports[main_rel]
    assert main_rel in graph.imported_by[db_rel]
+
+
def test_cpp_graph(tmp_path: Path) -> None:
    """Quoted ``#include`` lines in main.cpp yield edges to both headers, and their reverses."""
    config = ScriberConfig()

    header_rel = Path("src/auth.h")
    main_rel = Path("src/main.cpp")
    helper_rel = Path("src/utils/helper.hpp")

    header_path = tmp_path / header_rel
    header_path.parent.mkdir(parents=True, exist_ok=True)
    header_path.write_text("class Auth {};", encoding="utf-8")

    main_path = tmp_path / main_rel
    # NOTE(review): the middle line is a bare `#include` with no target; it looks
    # like a system include whose <header> was lost - confirm.
    main_path.write_text('#include "auth.h"\n#include \n#include "utils/helper.hpp"', encoding="utf-8")

    helper_path = tmp_path / helper_rel
    helper_path.parent.mkdir(parents=True, exist_ok=True)
    helper_path.write_text("void helper();", encoding="utf-8")

    def node(rel: Path, absolute: Path, language: str) -> FileNode:
        return FileNode(
            absolute=absolute.resolve(),
            relative=rel,
            kind="code",
            language=language,
            size_bytes=absolute.stat().st_size,
        )

    files = {
        header_rel: node(header_rel, header_path, "c"),
        main_rel: node(main_rel, main_path, "cpp"),
        helper_rel: node(helper_rel, helper_path, "cpp"),
    }

    graph = build_graph(files, config)
    for included in (header_rel, helper_rel):
        assert included in graph.imports[main_rel]
        assert main_rel in graph.imported_by[included]
+
diff --git a/tests/test_native.py b/tests/test_native.py
new file mode 100644
index 0000000..643d795
--- /dev/null
+++ b/tests/test_native.py
@@ -0,0 +1,422 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scriber.core.models import ScriberConfig
+from scriber.native import is_native_available, require_native
+from scriber.scanner.scan import scan_project as scan_rust
+from scriber.scanner.scan_py import scan_project as scan_python
+
+
+def test_native_module_available() -> None:
+    """The native extension must be reported as available and be loadable."""
+    assert is_native_available()
+    assert require_native() is not None
+
+
+def test_native_read_write(tmp_path: Path) -> None:
+    """Text written by the native writer must read back unchanged, non-ASCII included."""
+    native = require_native()
+    test_file = tmp_path / "test.txt"
+    # Polish diacritics are spelled as escapes so a re-encoding of this file cannot corrupt them.
+    content = "Hello, native Rust world!\nWith some special characters: \u0142\u00f3\u0105d\u017a\u0144\n"
+
+    native.write_text(str(test_file), content)
+    assert test_file.exists()
+    assert native.read_text(str(test_file)) == content
+
+
+def test_native_binary_check(tmp_path: Path) -> None:
+    """A file containing a NUL byte must be reported as binary; plain text must not."""
+    is_binary = require_native().is_probably_binary
+    # Plain UTF-8 text must not be flagged.
+    plain = tmp_path / "normal.txt"
+    plain.write_text("Hello world", encoding="utf-8")
+    assert not is_binary(str(plain))
+
+    # An embedded NUL byte must be flagged.
+    blob = tmp_path / "binary.bin"
+    blob.write_bytes(b"Hello\x00world")
+    assert is_binary(str(blob))
+
+
+def test_native_scan_matches_python_scan(tmp_path: Path) -> None:
+    """The Rust and Python scanners must classify a small project identically."""
+    src_dir = tmp_path / "src"
+    src_dir.mkdir()
+    (src_dir / "main.py").write_text("print('hello')", encoding="utf-8")
+    (src_dir / "helper.py").write_text("import sys", encoding="utf-8")
+    (src_dir / "binary.dat").write_bytes(b"\x00\x01\x02")
+    (tmp_path / "README.md").write_text("# Test Project", encoding="utf-8")
+    (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
+
+    # A hidden directory that the hard-ignore patterns below are meant to exclude.
+    git_dir = tmp_path / ".git"
+    git_dir.mkdir()
+    (git_dir / "config").write_text("git config", encoding="utf-8")
+
+    config = ScriberConfig(
+        use_gitignore=True,
+        code_patterns=["**/*.py"],
+        support_patterns=["pyproject.toml", "README.md", "requirements.txt"],
+        hard_ignore_patterns=[".git/**", "**/binary.dat"],
+    )
+
+    # The gitignore file is picked up because use_gitignore is enabled above.
+    (tmp_path / ".gitignore").write_text("*.pyc\n", encoding="utf-8")
+
+    rust_result = scan_rust(tmp_path, config)
+    python_result = scan_python(tmp_path, config)
+
+    # Both scanners must report exactly the same relative paths.
+    assert set(rust_result.keys()) == set(python_result.keys())
+
+    # Every classification attribute must agree node for node.
+    compared = (
+        "relative", "kind", "language", "size_bytes",
+        "is_binary", "support_category", "content_policy",
+    )
+    for path, rust_node in rust_result.items():
+        py_node = python_result[path]
+        for attr in compared:
+            assert getattr(rust_node, attr) == getattr(py_node, attr)
+
+
+def test_native_no_support(tmp_path: Path) -> None:
+    """With ``support=False`` the native scan must not return any support file."""
+    (tmp_path / "src").mkdir()
+    (tmp_path / "src" / "main.py").write_text("print('hello')", encoding="utf-8")
+    (tmp_path / "README.md").write_text("# Test Project", encoding="utf-8")
+    (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
+
+    config = ScriberConfig(
+        support=False,
+        code_patterns=["**/*.py"],
+        support_patterns=["pyproject.toml", "README.md"],
+    )
+
+    scanned = scan_rust(tmp_path, config)
+
+    # README.md and pyproject.toml match support_patterns, so neither may appear.
+    assert all(node.kind != "support" for node in scanned.values())
+    assert not {Path("README.md"), Path("pyproject.toml")} & set(scanned)
+
+
+def test_native_write_creates_parent_dirs(tmp_path: Path) -> None:
+    """Writing below directories that do not exist yet must still produce the file."""
+    nested_target = tmp_path / "a" / "b" / "out.txt"
+
+    require_native().write_text(str(nested_target), "hello")
+
+    assert nested_target.read_text(encoding="utf-8") == "hello"
+
+
+def write(path: Path, content: str) -> None:
+    path.parent.mkdir(exist_ok=True, parents=True)  # make sure the target directory chain exists
+    path.write_text(content, encoding="utf-8")  # always UTF-8, independent of the locale
+
+
+def make_mixed_project(root: Path) -> None:
+    """Populate ``root`` with Python, Rust and TypeScript sources plus files a scan may skip."""
+    text_files = {
+        "pyproject.toml": "[tool.scriber]\nversion='2'\n", "Cargo.toml": "[package]\nname='x'\n",
+        "Cargo.lock": "# lock\n", "README.md": "# readme\n",
+        "src/main.py": "from .auth import Auth\n", "src/auth.py": "class Auth: pass\n",
+        "src/main.rs": "mod auth;\n", "src/auth.rs": "pub struct Auth;\n",
+        "frontend/main.ts": "import './auth'\n", "frontend/auth.ts": "export const x = 1\n",
+        "node_modules/pkg/index.js": "ignored\n",  # matched by the hard-ignore pattern in make_config()
+        ".gitignore": "*.tmp\n",
+        "ignored.tmp": "ignored\n",
+    }
+    for relative, content in text_files.items():
+        write(root / relative, content)
+    (root / "binary.bin").write_bytes(b"\x00\x01")
+
+
+def make_config() -> ScriberConfig:
+    """Return the scan configuration used together with ``make_mixed_project``."""
+    code = ["**/*.py", "**/*.rs", "**/*.ts"]
+    support = ["pyproject.toml", "README.md", "Cargo.toml", "Cargo.lock"]
+    ignored = ["node_modules/**"]
+    patterns = {"code_patterns": code, "support_patterns": support, "hard_ignore_patterns": ignored}
+    return ScriberConfig(use_gitignore=True, **patterns)
+
+
+def test_native_scan_matches_python_scan_mixed_project(tmp_path: Path) -> None:
+    """Both scanners must report the same set of paths for the mixed project."""
+    make_mixed_project(tmp_path)
+    config = make_config()
+
+    native_paths = set(scan_rust(tmp_path, config).keys())
+    python_paths = set(scan_python(tmp_path, config).keys())
+    assert native_paths == python_paths
+
+
+def test_native_scan_support_false(tmp_path: Path) -> None:
+    """Disabling support files must leave no ``support`` node in the native scan."""
+    make_mixed_project(tmp_path)
+    config = make_config()
+    config.support = False
+
+    kinds = [node.kind for node in scan_rust(tmp_path, config).values()]
+    assert "support" not in kinds
+
+
+def test_native_scan_gitignore(tmp_path: Path) -> None:
+    """Files matched by ``.gitignore`` must be dropped while regular sources are kept."""
+    make_mixed_project(tmp_path)
+    config = make_config()
+    config.use_gitignore = True  # explicit: this is the switch under test
+    rs = scan_rust(tmp_path, config)
+    assert Path("src/main.py") in rs  # guards against a vacuous pass on an empty scan
+    assert Path("ignored.tmp") not in rs
+
+
+def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None:
+    """Native import edges must equal the Python graph for files in the compared languages."""
+    from scriber.graph.builder import build_graph as build_python_graph
+
+    make_mixed_project(tmp_path)
+    config = make_config()
+    python_files = scan_python(tmp_path, config)
+    py_graph = build_python_graph(python_files, config)
+
+    native = require_native()
+    scan_args = (
+        str(tmp_path),
+        config.use_gitignore,
+        config.hard_ignore_patterns,
+        config.code_patterns,
+        config.support_patterns,
+        config.support_content.full,
+        config.support_content.tree_only,
+        config.support_content.default,
+        config.support,
+    )
+    native_files = native.scan_project(*scan_args)
+    edges = native.build_import_graph(
+        str(tmp_path),
+        native_files,
+        config.python.source_roots,
+        config.python.module_init_files,
+    )
+
+    rs_imports = {}
+    for edge in edges:  # ``from`` is a Python keyword, hence the getattr() access
+        rs_imports.setdefault(Path(getattr(edge, "from")), set()).add(Path(edge.to))
+
+    graph_languages = {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}
+    for path, targets in py_graph.imports.items():
+        if python_files[path].language in graph_languages:
+            assert rs_imports.get(path, set()) == targets
+
+
+def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None:
+    """Focused-mode scoring must yield the same candidates and scores in Rust and Python."""
+    from scriber.core.models import SeedPath
+    from scriber.engine.scorer import score_candidates as score_python
+    from scriber.graph.builder import build_graph as build_python_graph
+
+    make_mixed_project(tmp_path)
+    config = make_config()
+    seed_rel = Path("src/main.py")
+
+    # Reference result from the pure-Python pipeline.
+    python_files = scan_python(tmp_path, config)
+    py_graph = build_python_graph(python_files, config)
+    seed = SeedPath(
+        original=seed_rel,
+        absolute=(tmp_path / seed_rel).resolve(),
+        relative=seed_rel,
+        is_dir=False,
+        expanded_files=[seed_rel],
+    )
+    py_candidates = score_python(files=python_files, seeds=[seed], graph=py_graph, config=config, mode="focused")
+
+    # The same project and seed pushed through the native pipeline.
+    native = require_native()
+    native_files = native.scan_project(
+        str(tmp_path),
+        config.use_gitignore,
+        config.hard_ignore_patterns,
+        config.code_patterns,
+        config.support_patterns,
+        config.support_content.full,
+        config.support_content.tree_only,
+        config.support_content.default,
+        config.support,
+    )
+    edges = native.build_import_graph(
+        str(tmp_path),
+        native_files,
+        config.python.source_roots,
+        config.python.module_init_files,
+    )
+
+    modules = config.modules_config
+    score_defaults = {
+        "seed_file": 100,
+        "seed_folder_file": 100,
+        "direct_dependency": 90,
+        "reverse_dependency": 85,
+        "same_package": 65,
+        "parent_entrypoint": 60,
+        "related_test": 80,
+        "name_similarity": 45,
+        "support_near_seed": 60,
+        "project_config": 55,
+        "dependency_file": 52,
+        "runtime_support": 50,
+        "documentation": 45,
+    }
+    opts = native.NativePackOptions(
+        mode="focused",
+        max_files=config.max_files,
+        min_score=config.min_score,
+        tree_min_score=modules.tree_min_score,
+        **{f"{name}_score": modules.scoring.get(name, fallback) for name, fallback in score_defaults.items()},
+        shared_dependency_bonus=modules.scoring.get("shared_dependency_bonus", 10),
+        modules_enabled=config.modules,
+        include_direct_dependencies=modules.include_direct_dependencies,
+        include_reverse_dependencies=modules.include_reverse_dependencies,
+        include_same_package=modules.include_same_package,
+        include_parent_entrypoints=modules.include_parent_entrypoints,
+        include_tests=modules.include_tests,
+        include_project_configs=modules.include_project_configs,
+        depth=modules.depth,
+        support_enabled=config.support,
+        entrypoint_patterns=config.python.entrypoint_patterns,
+        test_roots=config.python.test_roots,
+    )
+    rs_candidates = native.score_candidates_native(native_files, [seed_rel.as_posix()], edges, opts)
+
+    py_map = {c.file.relative.as_posix(): c for c in py_candidates}
+    rs_map = {c.path: c for c in rs_candidates}
+    assert set(py_map.keys()) == set(rs_map.keys())
+    for path, py_c in py_map.items():
+        rs_c = rs_map[path]
+        assert rs_c.kind == py_c.file.kind
+        assert rs_c.score == py_c.score
+
+
+def test_native_render_tree_matches_python() -> None:
+    """Native and Python tree renderers must agree, ignoring surrounding whitespace."""
+    from scriber.rendering.renderer import render_tree as render_python_tree
+
+    paths = [
+        "src/main.py",
+        "src/auth.py",
+        "tests/test_auth.py",
+        "pyproject.toml",
+        "README.md",
+    ]
+
+    expected = render_python_tree([Path(p) for p in paths]).strip()
+    actual = require_native().render_tree(paths).strip()
+
+    assert actual == expected
+
+
+def test_default_toml_and_lock_support(tmp_path: Path) -> None:
+    """Default config must treat any ``*.toml`` and ``*.lock`` file as a support file."""
+    from scriber.core.config import load_config
+    from scriber.scanner.scan import scan_project
+
+    (tmp_path / "src").mkdir()
+    (tmp_path / "src" / "main.py").write_text("print('hello')", encoding="utf-8")
+    (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
+    (tmp_path / "some_random_config.toml").write_text("a = 1", encoding="utf-8")
+    (tmp_path / "some_random_lockfile.lock").write_text("lock", encoding="utf-8")
+
+    # Everything comes from load_config(); only gitignore handling is switched off.
+    config = load_config(tmp_path / "pyproject.toml")
+    config.use_gitignore = False
+
+    # Each glob must be a support pattern and sit in the expected content bucket.
+    for pattern, content_bucket in (
+        ("**/*.toml", config.support_content.full),
+        ("**/*.lock", config.support_content.tree_only),
+    ):
+        assert pattern in config.support_patterns
+        assert pattern in content_bucket
+
+    # Scan the project with that configuration.
+    scanned = scan_project(tmp_path, config)
+
+    # file name -> (support category, content policy)
+    expectations = {
+        "some_random_config.toml": ("project config", "full"),
+        "some_random_lockfile.lock": ("dependency file", "tree_only"),
+    }
+    for name, (category, policy) in expectations.items():
+        assert Path(name) in scanned
+        node = scanned[Path(name)]
+        assert node.kind == "support"
+        assert node.support_category == category
+        assert node.content_policy == policy
+
+
+def test_native_import_complex_python(tmp_path: Path) -> None:
+    """Native import parsing must resolve aliased, comma-separated and parenthesised imports."""
+    src_dir = tmp_path / "src"
+    src_dir.mkdir()
+    for letter in "ABCD":
+        (src_dir / f"{letter.lower()}.py").write_text(f"class {letter}: pass", encoding="utf-8")
+
+    # Covers comma lists, ``as`` aliases, a multi-line parenthesised import and an inline comment.
+    import_test_content = """
+import os, sys
+import math as m, json
+from .a import A as AliasA
+from .b import (
+ B, # some comment here
+ C as AliasC
+)
+from .c import D
+"""
+    (src_dir / "main.py").write_text(import_test_content, encoding="utf-8")
+    (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
+
+    config = ScriberConfig(
+        use_gitignore=False,
+        code_patterns=["**/*.py"],
+        support_patterns=["pyproject.toml"],
+    )
+
+    native = require_native()
+    native_files = native.scan_project(
+        str(tmp_path),
+        config.use_gitignore,
+        config.hard_ignore_patterns,
+        config.code_patterns,
+        config.support_patterns,
+        config.support_content.full,
+        config.support_content.tree_only,
+        config.support_content.default,
+        config.support,
+    )
+    edges = native.build_import_graph(
+        str(tmp_path),
+        native_files,
+        config.python.source_roots,
+        config.python.module_init_files,
+    )
+
+    # Group edge targets by importing file; ``from`` is a keyword, hence getattr().
+    imports = {}
+    for edge in edges:
+        imports.setdefault(Path(getattr(edge, "from")), set()).add(Path(edge.to))
+
+    main_path = Path("src/main.py")
+    assert main_path in imports
+
+    # Only the three relatively imported modules are expected; d.py is never imported.
+    expected_imports = {
+        Path("src/a.py"),
+        Path("src/b.py"),
+        Path("src/c.py"),
+    }
+    assert imports[main_path] == expected_imports
+
+
+
diff --git a/tests/test_processing_modes.py b/tests/test_processing_modes.py
deleted file mode 100644
index a795d26..0000000
--- a/tests/test_processing_modes.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""
-Tests for single-process and multi-process execution modes in Scriber.
-"""
-from pathlib import Path
-from unittest.mock import MagicMock, patch
-
-from src.scriber.core import Scriber
-
-
-def test_single_process_mode_avoids_process_pool(tmp_path: Path):
- """
- Verifies that ProcessPoolExecutor is not used when single_process is True.
- """
- (tmp_path / "test.txt").write_text("hello world")
-
- with patch('src.scriber.core.ProcessPoolExecutor') as mock_executor:
- config = {"single_process": True, "exclude": []}
- scriber = Scriber(root_path=tmp_path, config=config)
- scriber.map_project()
-
- mock_executor.assert_not_called()
- stats = scriber.get_stats()
- assert stats['total_files'] == 1
- assert stats['total_tokens'] > 0
-
-
-def test_multi_process_mode_uses_process_pool(tmp_path: Path):
- """
- Verifies that ProcessPoolExecutor is used by default (single_process is False).
-
- This test uses a more advanced mock to simulate the return of futures
- and ensure the statistics are correctly aggregated from the mocked results.
- """
- (tmp_path / "test.txt").write_text("hello world")
- expected_stats = {"size": 11, "tokens": 2, "lang": "text"}
-
- with patch('src.scriber.core.ProcessPoolExecutor') as MockProcessPoolExecutor, \
- patch('src.scriber.core.as_completed') as mock_as_completed:
- mock_future = MagicMock()
- mock_future.result.return_value = expected_stats
- mock_as_completed.return_value = [mock_future]
-
- mock_executor_instance = MockProcessPoolExecutor.return_value.__enter__.return_value
-
- config = {"single_process": False, "exclude": []}
- scriber = Scriber(root_path=tmp_path, config=config)
- scriber.map_project()
-
- MockProcessPoolExecutor.assert_called_once()
- assert mock_executor_instance.submit.called
- mock_as_completed.assert_called_once()
-
- stats = scriber.get_stats()
- assert stats['total_files'] == 1
- assert stats['total_size_bytes'] == expected_stats['size']
- assert stats['total_tokens'] == expected_stats['tokens']
\ No newline at end of file
diff --git a/tests/test_scriber.py b/tests/test_scriber.py
new file mode 100644
index 0000000..8ddf870
--- /dev/null
+++ b/tests/test_scriber.py
@@ -0,0 +1,222 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from scriber.pack import build_pack
+from scriber.render import render_markdown
+
+
+def write(path: Path, content: str) -> None:
+    path.parent.mkdir(exist_ok=True, parents=True)  # make sure the target directory chain exists
+    path.write_text(content, encoding="utf-8")  # always UTF-8, independent of the locale
+
+
+def make_project(tmp_path: Path) -> Path:
+    """Create a small Python project with a ``[tool.scriber]`` config and return its root."""
+    pyproject_text = """
+[tool.scriber]
+version = "2"
+format = "md"
+output = ".scriber/out.md"
+use_gitignore = false
+max_files = 50
+max_tokens = 100000
+min_score = 30
+
+[tool.scriber.code_files]
+patterns = ["**/*.py"]
+
+[tool.scriber.support_files]
+enabled = true
+patterns = ["pyproject.toml", "README.md", "requirements.txt", "poetry.lock", "Dockerfile"]
+
+[tool.scriber.support_files.content]
+default = "auto"
+full = ["pyproject.toml", "README.md", "requirements.txt", "Dockerfile"]
+tree_only = ["poetry.lock"]
+
+[tool.scriber.modules]
+enabled = true
+depth = 2
+include_direct_dependencies = true
+include_reverse_dependencies = true
+include_tests = true
+include_same_package = true
+include_parent_entrypoints = true
+include_project_configs = true
+content_min_score = 50
+tree_min_score = 30
+
+[tool.scriber.python]
+source_roots = ["src", "."]
+test_roots = ["tests"]
+entrypoint_patterns = ["main.py", "routes.py"]
+
+[tool.scriber.hard_ignore]
+patterns = [".git/**"]
+""".strip()
+    write(tmp_path / "pyproject.toml", pyproject_text + "\n")
+
+    # Support files first, then the package, its importers, a test and an unrelated module.
+    write(tmp_path / "README.md", "# Example\n")
+    write(tmp_path / "requirements.txt", "fastapi\n")
+    write(tmp_path / "poetry.lock", "very large lock in real life\n")
+    write(tmp_path / "Dockerfile", "FROM python:3.12\n")
+    write(tmp_path / "src/app/__init__.py", "")
+    write(tmp_path / "src/app/auth.py", "from .session import Session\nfrom .config import SETTINGS\n\nclass Auth: pass\n")
+    write(tmp_path / "src/app/session.py", "class Session: pass\n")
+    write(tmp_path / "src/app/config.py", "SETTINGS = {}\n")
+    write(tmp_path / "src/app/main.py", "from app.auth import Auth\n")
+    write(tmp_path / "src/api/routes.py", "from app.auth import Auth\n")
+    write(tmp_path / "tests/test_auth.py", "from app.auth import Auth\n\ndef test_auth():\n assert Auth\n")
+    write(tmp_path / "src/app/unrelated.py", "VALUE = 1\n")
+    return tmp_path
+
+
+def test_build_pack_includes_seed_dependencies_reverse_tests_and_support(tmp_path: Path, monkeypatch) -> None:
+    """A focused pack must contain the seed, related code, its test and the support files."""
+    monkeypatch.chdir(make_project(tmp_path))
+
+    pack = build_pack(["src/app/auth.py"], config_path="pyproject.toml")
+    paths = {path.as_posix() for path in pack.included_paths}
+
+    # Seed, its imports, an importer, the matching test and every support file.
+    for expected in (
+        "src/app/auth.py", "src/app/session.py", "src/app/config.py",
+        "src/api/routes.py", "tests/test_auth.py",
+        "pyproject.toml", "README.md", "requirements.txt", "poetry.lock",
+    ):
+        assert expected in paths
+
+    by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates}
+    # The seed gets the top score; directly related code must reach at least 80.
+    assert by_path["src/app/auth.py"].score == 100
+    for related in ("src/app/session.py", "src/api/routes.py", "tests/test_auth.py"):
+        assert by_path[related].score >= 80
+    # The lock file is listed, but its content is withheld by the tree_only policy.
+    lock = by_path["poetry.lock"]
+    assert lock.include_content is False
+    assert "tree_only" in (lock.omitted_reason or "")
+
+
+def test_only_tree_omits_contents(tmp_path: Path, monkeypatch) -> None:
+    """``only_tree=True`` must withhold every file's content from pack and markdown."""
+    monkeypatch.chdir(make_project(tmp_path))
+
+    pack = build_pack(["src/app/auth.py"], config_path="pyproject.toml", only_tree=True)
+    assert pack.only_tree is True
+    assert not any(candidate.include_content is not False for candidate in pack.candidates)
+
+    rendered = render_markdown(pack)
+    # Summary, mode line and module graph stay; only the contents section is dropped.
+    for fragment in ("## Pack summary", "Mode: `focused`", "## Module graph"):
+        assert fragment in rendered
+    assert "## File contents" not in rendered
+
+
+def test_multiple_paths_promote_shared_dependency(tmp_path: Path, monkeypatch) -> None:
+    """A module imported by two seeds must score 100 and carry the shared-seed reason."""
+    monkeypatch.chdir(make_project(tmp_path))
+    write(tmp_path / "src/app/billing.py", "from .config import SETTINGS\n")
+
+    pack = build_pack(["src/app/auth.py", "src/app/billing.py"], config_path="pyproject.toml")
+    by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates}
+    assert "src/app/config.py" in by_path
+    assert by_path["src/app/config.py"].score == 100
+    assert any("shared by multiple seed paths" in reason for reason in by_path["src/app/config.py"].reasons)
+
+
+def test_no_modules_keeps_seed_and_pyproject(tmp_path: Path, monkeypatch) -> None:
+    """With ``modules=False`` the pack keeps the seed and pyproject.toml but not its imports."""
+    monkeypatch.chdir(make_project(tmp_path))
+
+    pack = build_pack(["src/app/auth.py"], config_path="pyproject.toml", modules=False)
+    paths = {path.as_posix() for path in pack.included_paths}
+    assert {"src/app/auth.py", "pyproject.toml"} <= paths
+    # session.py is only imported by the seed, so it must stay out without module expansion.
+    assert "src/app/session.py" not in paths
+
+
+def test_folder_seed_expands_files(tmp_path: Path, monkeypatch) -> None:
+    """Seeding a directory must include the code files that live inside it."""
+    monkeypatch.chdir(make_project(tmp_path))
+
+    pack = build_pack(["src/app"], config_path="pyproject.toml", modules=False)
+    paths = {path.as_posix() for path in pack.included_paths}
+
+    for module in ("auth", "session", "config"):
+        assert f"src/app/{module}.py" in paths
+
+
+def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None:
+    """Packing ``.`` must switch to snapshot mode and score files by their role."""
+    project = make_project(tmp_path)
+    monkeypatch.chdir(project)
+
+    pack = build_pack(["."], config_path="pyproject.toml")
+    assert pack.mode == "project_snapshot"
+
+    by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates}
+
+    # path -> (score, reason summary) expected for each file role
+    expected = {
+        # matches the "main.py" entrypoint pattern from the project config
+        "src/app/main.py": (90, "entrypoint file"),
+        "tests/test_auth.py": (60, "test file"),
+        "src/app/auth.py": (80, "code file"),
+        "README.md": (45, "project support file"),
+    }
+    for path, (score, summary) in expected.items():
+        assert by_path[path].score == score
+        assert by_path[path].reason_summary == summary
+
+    # Seed-relative reasons must not leak into snapshot mode.
+    readme = by_path["README.md"]
+    assert "near" not in readme.reason_summary
+    # Match by substring: an exact ``in`` on the list would pass whenever the phrase is
+    # embedded in a longer reason string, which would make the check vacuous.
+    assert not any("shared by multiple seed paths" in reason for reason in readme.reasons)
+
+
+def test_dry_run_and_open_cli(tmp_path: Path, monkeypatch) -> None:
+    """``--dry-run`` must not write the output file; ``--open`` must open the resolved output."""
+    from scriber.cli.main import main
+
+    project = make_project(tmp_path)
+    monkeypatch.chdir(project)
+    output_file = tmp_path / ".scriber/out.md"
+
+    # A dry run succeeds and leaves the configured output path untouched.
+    code = main(["src/app/auth.py", "--dry-run"])
+    assert code == 0
+    assert not output_file.exists()
+
+    # Record what open_path() receives instead of launching a real viewer.
+    opened = []
+
+    def mock_open_path(path: Path) -> None:
+        opened.append(path)
+
+    monkeypatch.setattr("scriber.core.open_file.open_path", mock_open_path)
+    code = main(["src/app/auth.py", "--open"])
+    assert code == 0
+    assert opened[-1:] == [output_file.resolve()]
+
+
+def test_no_support_excludes_support_files_project_snapshot(tmp_path: Path) -> None:
+    """Seeding ``.`` with ``support=False`` must keep every support file out of the pack."""
+    config_file = make_project(tmp_path) / "pyproject.toml"
+
+    pack = build_pack(["."], config_path=str(config_file), support=False)
+    assert "support" not in [c.file.kind for c in pack.candidates]
+
+
+def test_no_support_excludes_support_files_folder_seed(tmp_path: Path, monkeypatch) -> None:
+    """A folder seed combined with ``support=False`` must not pull in support files."""
+    monkeypatch.chdir(make_project(tmp_path))
+
+    # Seed an actual folder; seeding "." would only repeat the project-snapshot test.
+    pack = build_pack(["src/app"], config_path="pyproject.toml", support=False)
+    paths = {c.file.relative.as_posix() for c in pack.candidates}
+    assert not {"README.md", "pyproject.toml"} & paths
+
diff --git a/tests/test_suite.py b/tests/test_suite.py
deleted file mode 100644
index 197293c..0000000
--- a/tests/test_suite.py
+++ /dev/null
@@ -1,457 +0,0 @@
-"""
-Tests for the main Scriber application, covering both core logic and the CLI.
-"""
-import io
-import json
-from collections import Counter
-from pathlib import Path
-from unittest.mock import MagicMock, patch
-
-import pytest
-import tiktoken
-
-try:
- import tomllib
-except ImportError:
- import tomli as tomllib
-
-from src.scriber.cli import format_bytes
-from src.scriber.cli import main as cli_main
-from src.scriber.config import ScriberConfig
-from src.scriber.core import Scriber
-
-
-def test_direct_import():
- """Tests that the Scriber class can be imported directly from the package."""
- try:
- from src.scriber import Scriber
- except ImportError:
- pytest.fail("Could not import Scriber from src.scriber")
- assert callable(Scriber)
-
-
-# --- Test Core Scriber Functionality ---
-
-class TestCore:
- """Groups tests for the Scriber core logic found in `src.scriber.core`."""
-
- def test_default_exclusion(self, tmp_path: Path):
- """Tests that default patterns like .git/ and __pycache__/ are excluded."""
- (tmp_path / ".git").mkdir()
- (tmp_path / ".git" / "config").touch()
- (tmp_path / "main.py").touch()
- (tmp_path / "__pycache__").mkdir()
- (tmp_path / "__pycache__" / "cache.pyc").touch()
- (tmp_path / "build").mkdir()
- (tmp_path / "build" / "app").touch()
-
- scriber = Scriber(root_path=tmp_path)
- scriber.map_project()
-
- paths = {p.relative_to(tmp_path).as_posix() for p in scriber.mapped_files}
- assert "main.py" in paths
- assert not any(p.startswith('.git/') for p in paths)
- assert not any(p.startswith('__pycache__/') for p in paths)
- assert not any(p.startswith('build/') for p in paths)
-
- def test_directory_only_exclusion(self, tmp_path: Path):
- """Tests that a pattern with a trailing slash only excludes the directory."""
- (tmp_path / "my_app").mkdir()
- (tmp_path / "my_app" / "code.py").touch()
- (tmp_path / "my_app_file").touch()
-
- config = ScriberConfig(exclude=["my_app/"], include=[])
-
- scriber = Scriber(root_path=tmp_path, config=config)
- scriber.map_project()
- paths = {p.name for p in scriber.mapped_files}
-
- assert "my_app_file" in paths
- assert "code.py" not in paths
- assert len(paths) == 1
-
- def test_root_anchored_exclusion(self, tmp_path: Path):
- """Tests that a pattern with a leading slash only excludes at the root."""
- (tmp_path / "src").mkdir()
- (tmp_path / "src" / "config.yml").touch()
- (tmp_path / "config.yml").touch()
- config = ScriberConfig(exclude=["/config.yml"], include=[])
-
- scriber = Scriber(root_path=tmp_path, config=config)
- scriber.map_project()
- paths = {p.relative_to(tmp_path).as_posix() for p in scriber.mapped_files}
-
- assert "src/config.yml" in paths
- assert "config.yml" not in paths
-
- def test_unanchored_exclusion(self, tmp_path: Path):
- """Tests that a pattern without slashes excludes files/dirs anywhere."""
- (tmp_path / "src").mkdir()
- (tmp_path / "src" / "temp.log").touch()
- (tmp_path / "temp.log").touch()
- config = ScriberConfig(exclude=["temp.log"], include=[])
-
- scriber = Scriber(root_path=tmp_path, config=config)
- scriber.map_project()
-
- assert not scriber.mapped_files
-
- def test_gitignore_handling(self, tmp_path: Path):
- """Ensures .gitignore rules are correctly applied when enabled."""
- (tmp_path / "main.py").touch()
- (tmp_path / "ignored.log").touch()
- (tmp_path / ".gitignore").write_text("*.log")
-
- scriber = Scriber(root_path=tmp_path)
- scriber.map_project()
-
- paths = {p.name for p in scriber.mapped_files}
- assert "main.py" in paths
- assert "ignored.log" not in paths
-
- def test_disable_gitignore(self, tmp_path: Path):
- """Ensures .gitignore is ignored when `use_gitignore` is false in the config."""
- (tmp_path / "main.py").touch()
- (tmp_path / "not_ignored.log").touch()
- (tmp_path / ".gitignore").write_text("*.log")
- config = {"use_gitignore": False, "exclude": []}
- (tmp_path / ".scriber.json").write_text(json.dumps(config))
-
- scriber = Scriber(root_path=tmp_path)
- scriber.map_project()
-
- paths = {p.name for p in scriber.mapped_files}
- assert "main.py" in paths
- assert "not_ignored.log" in paths
-
- def test_binary_file_skipping(self, tmp_path: Path):
- """Tests that binary files are detected and correctly skipped."""
- (tmp_path / "app.exe").write_bytes(b"\x4d\x5a\x90\x00\x03\x00\x00\x00")
-
- config = ScriberConfig(include=["app.exe"], exclude=[])
- scriber = Scriber(root_path=tmp_path, config=config)
- scriber.map_project()
-
- assert len(scriber.mapped_files) == 0
- assert scriber.get_stats()['skipped_binary'] == 1
-
- def test_include_patterns(self, tmp_path: Path):
- """Tests that 'include' patterns correctly filter files when provided."""
- (tmp_path / "main.py").touch()
- (tmp_path / "script.js").touch()
- (tmp_path / "style.css").touch()
- (tmp_path / ".scriber.json").write_text(json.dumps({"include": ["*.py", "*.js"]}))
-
- scriber = Scriber(root_path=tmp_path)
- scriber.map_project()
-
- paths = {p.name for p in scriber.mapped_files}
- assert paths == {"main.py", "script.js"}
-
- def test_exclude_map_dictionary(self, tmp_path: Path):
- """Tests that the exclude_map dictionary filter works as intended."""
- (tmp_path / "app.py").touch()
- (tmp_path / "utils_test.py").touch()
- (tmp_path / "script.js").touch()
- (tmp_path / "archive.log").touch()
- (tmp_path / "README.md").touch()
-
- config = ScriberConfig(
- exclude_map={
- "python": ["*_test.py"],
- "global": ["*.log"]
- },
- exclude=[],
- include=[]
- )
- scriber = Scriber(root_path=tmp_path, config=config)
- files = scriber.get_mapped_files()
- mapped_names = {p.name for p in files}
-
- assert "app.py" in mapped_names
- assert "script.js" in mapped_names
- assert "README.md" in mapped_names
- assert "utils_test.py" not in mapped_names
- assert "archive.log" not in mapped_names
- assert len(mapped_names) == 3
-
- def test_hidden_files_are_in_tree_but_content_is_skipped(self, tmp_path: Path):
- """Tests that hidden files appear in the tree but their content is not in the output."""
- (tmp_path / "main.py").write_text("print('hello')")
- lock_content = "some-lock-file-content"
- (tmp_path / "poetry.lock").write_text(lock_content)
- config = {"hidden": ["poetry.lock"], "exclude": []}
- (tmp_path / ".scriber.json").write_text(json.dumps(config))
-
- scriber = Scriber(root_path=tmp_path)
- scriber.map_project()
-
- output_buffer = io.StringIO()
- scriber._write_output(output_buffer, tree_only=False, progress=None, task_id=None)
- output = output_buffer.getvalue()
-
- assert "poetry.lock" in output
- assert "[Content hidden based on configuration]" in output
- assert lock_content not in output
- assert "print('hello')" in output
-
- def test_hidden_files_are_excluded_from_token_count(self, tmp_path: Path):
- """Tests that hidden files contribute to size but not token count."""
- main_py_content = "def main(): pass"
- (tmp_path / "main.py").write_text(main_py_content)
- (tmp_path / "poetry.lock").write_text("some-lock-file-content")
- config = {"hidden": ["poetry.lock"], "exclude": [".scriber.json"]}
- (tmp_path / ".scriber.json").write_text(json.dumps(config))
-
- scriber = Scriber(root_path=tmp_path)
- scriber.map_project()
- stats = scriber.get_stats()
-
- tokenizer = tiktoken.get_encoding("cl100k_base")
- expected_tokens = len(tokenizer.encode(main_py_content))
-
- assert stats["total_files"] == 2
- assert stats["total_tokens"] == expected_tokens
- assert stats["total_size_bytes"] == (
- (tmp_path / "main.py").stat().st_size +
- (tmp_path / "poetry.lock").stat().st_size
- )
-
- def test_init_with_direct_config_object(self, tmp_path: Path):
- """Tests that Scriber can be configured directly with a ScriberConfig object."""
- (tmp_path / "app.py").touch()
- (tmp_path / "data.json").touch()
- direct_config = ScriberConfig(include=["*.py"], exclude=[])
-
- scriber = Scriber(root_path=tmp_path, config=direct_config)
- files = scriber.get_mapped_files()
-
- paths = {p.name for p in files}
- assert paths == {"app.py"}
- assert scriber.config_path_used is None
-
- def test_get_output_as_string(self, tmp_path: Path):
- """Tests that the full project map can be retrieved as a string."""
- (tmp_path / "main.py").write_text("print('test')")
- scriber = Scriber(root_path=tmp_path)
- output_str = scriber.get_output_as_string()
-
- assert isinstance(output_str, str)
- assert "Mapped Folder Structure" in output_str
- assert "main.py" in output_str
- assert "print('test')" in output_str
-
- def test_getters_trigger_map_project_automatically(self, tmp_path: Path):
- """Tests that getter methods automatically call map_project if not already run."""
- (tmp_path / "test.txt").touch()
- scriber = Scriber(root_path=tmp_path)
-
- assert not scriber.mapped_files
- stats = scriber.get_stats()
- assert len(scriber.mapped_files) == 1
- assert stats["total_files"] == 1
-
- def test_core_loads_external_toml_config(self, tmp_path: Path):
- """Tests core logic loads config from an external pyproject.toml via config_path."""
- config_dir = tmp_path / "config"
- config_dir.mkdir()
- toml_path = config_dir / "pyproject.toml"
- toml_path.write_text("[tool.scriber]\ninclude = ['*.py']")
-
- project_dir = tmp_path / "project"
- project_dir.mkdir()
- (project_dir / "app.py").touch()
- (project_dir / "data.json").touch()
-
- scriber = Scriber(root_path=project_dir, config_path=toml_path)
- scriber.map_project()
-
- paths = {p.name for p in scriber.mapped_files}
- assert paths == {"app.py"}
- assert scriber.config_path_used == toml_path
-
- def test_core_handles_nonexistent_config_path(self, tmp_path: Path, capsys):
- """Tests that a warning is printed for a non-existent --config path."""
- non_existent_path = tmp_path / "nonexistent.json"
- Scriber(root_path=tmp_path, config_path=non_existent_path)
- captured = capsys.readouterr()
- assert "Warning: Config file specified by --config not found" in captured.err
-
- def test_tree_representation(self, tmp_path: Path):
- """Checks if the folder tree string is formatted correctly."""
- (tmp_path / "src").mkdir()
- (tmp_path / "src" / "main.py").touch()
- (tmp_path / "README.md").touch()
-
- scriber = Scriber(root_path=tmp_path)
- scriber.map_project()
- tree_str = scriber._get_tree_representation()
-
- expected_lines = [
- tmp_path.name,
- "βββ README.md",
- "βββ src",
- " βββ main.py",
- ]
- actual_lines = tree_str.split('\n')
- # The tree formatting can have subtle whitespace differences, so we check line by line
- assert actual_lines[0] == expected_lines[0]
- assert "README.md" in actual_lines[1]
- assert "src" in actual_lines[2]
- assert "main.py" in actual_lines[3]
-
-
- @pytest.mark.parametrize("filename, expected_lang", [
- ("test.py", "python"),
- ("script.js", "javascript"),
- ("style.css", "css"),
- ("Dockerfile", "dockerfile"),
- ("unknown.xyz", ""),
- ])
- def test_language_detection(self, tmp_path: Path, filename: str, expected_lang: str):
- """Tests the language mapping utility for various file types."""
- scriber = Scriber(root_path=tmp_path)
- lang = scriber._get_language(Path(filename))
- assert lang == expected_lang
-
- def test_multi_root_collection(self, tmp_path: Path):
- """Tests that files from multiple root directories are collected."""
- project_a = tmp_path / "project_a"
- project_a.mkdir()
- (project_a / "a.py").touch()
-
- project_b = tmp_path / "project_b"
- project_b.mkdir()
- (project_b / "b.js").touch()
-
- scriber = Scriber(root_path=[project_a, project_b])
- scriber.map_project()
- mapped_names = {p.name for p in scriber.mapped_files}
-
- assert mapped_names == {"a.py", "b.js"}
- assert len(scriber.mapped_files) == 2
-
- def test_multi_root_tree_and_output(self, tmp_path: Path):
- """Tests tree and output format for multiple roots."""
- project_a = tmp_path / "project_a"
- project_a.mkdir()
- (project_a / "a.py").write_text("print('a')")
-
- project_b = tmp_path / "project_b"
- project_b.mkdir()
- (project_b / "b.js").write_text("console.log('b')")
-
- scriber = Scriber(root_path=[project_a, project_b])
- output = scriber.get_output_as_string()
-
- assert "project_a\nβββ a.py" in output
- assert "project_b\nβββ b.js" in output
- assert f"File: project_a/a.py" in output
- assert f"File: project_b/b.js" in output
-
-# --- Test CLI Functionality ---
-
-class TestCli:
- """Groups tests for the command-line interface in `src.scriber.cli`."""
-
- @patch('src.scriber.cli.run_scriber')
- def test_cli_run_command_is_default(self, mock_run_scriber, mocker):
- """Tests that the 'run' command is triggered by default with no subcommand."""
- mocker.patch('sys.argv', ['scriber'])
- cli_main()
- mock_run_scriber.assert_called_once()
-
- @patch('src.scriber.cli.Scriber')
- def test_cli_arguments_are_passed_correctly(self, mock_scriber, mocker, tmp_path: Path):
- """Tests if CLI arguments are correctly parsed and passed to the Scriber class."""
- mock_instance = MagicMock()
- mock_instance.get_output_as_string.return_value = "Mocked Output"
- mock_instance.config = ScriberConfig(output="default_name.txt")
- mock_instance.get_stats.return_value = {'total_files': 0, 'language_counts': Counter()}
- mock_instance.get_file_count.return_value = 0
- mock_scriber.return_value = mock_instance
- mocker.patch('pyperclip.copy')
-
- project_dir = tmp_path / "project"
- project_dir.mkdir()
- config_file = tmp_path / "config.json"
- config_file.touch()
-
- test_path_str = str(project_dir)
- test_output = "output.txt"
- test_config_str = str(config_file)
-
- mocker.patch('sys.argv', [
- 'scriber', 'run', test_path_str, '--output', test_output, '--config', test_config_str, '--tree-only'
- ])
-
- cli_main()
-
- mock_scriber.assert_called_with(Path(test_path_str).resolve(), config_path=Path(test_config_str))
-
- mock_instance.get_output_as_string.assert_called_once()
- call_kwargs = mock_instance.get_output_as_string.call_args.kwargs
- assert call_kwargs['tree_only'] is True
-
- output_file = project_dir / test_output
- assert output_file.is_file()
- assert output_file.read_text() == "Mocked Output"
-
- @patch('src.scriber.cli.Confirm.ask')
- @patch('src.scriber.cli.Prompt.ask')
- def test_cli_init_command_creates_config(self, mock_prompt, mock_confirm, tmp_path: Path, mocker):
- """Tests the interactive 'init' command for config file creation."""
- mocker.patch('pathlib.Path.cwd', return_value=tmp_path)
- mock_confirm.return_value = False
- mock_prompt.side_effect = ["*.tmp, *.log", "*.py", "", "1"]
-
- mocker.patch('sys.argv', ['scriber', 'init'])
- cli_main()
-
- config_path = tmp_path / ".scriber.json"
- assert config_path.exists()
-
- with open(config_path, "r", encoding="utf-8") as f:
- data = json.load(f)
-
- assert not data['use_gitignore']
- assert data['exclude'] == ['*.tmp', '*.log']
- assert data['include'] == ['*.py']
-
- @patch('src.scriber.cli.Confirm.ask')
- @patch('src.scriber.cli.Prompt.ask')
- def test_cli_init_command_creates_config_in_toml(self, mock_prompt, mock_confirm, tmp_path: Path, mocker):
- """Tests the interactive 'init' command for saving config to pyproject.toml."""
- mocker.patch('pathlib.Path.cwd', return_value=tmp_path)
-
- pyproject_path = tmp_path / "pyproject.toml"
- pyproject_path.write_text("[project]\nname = 'test-project'")
-
- mock_confirm.return_value = True
- mock_prompt.side_effect = ["*.log, .env", "*.py", "*.lock", "2"]
-
- mocker.patch('sys.argv', ['scriber', 'init'])
- cli_main()
-
- assert pyproject_path.exists()
-
- with open(pyproject_path, "rb") as f:
- data = tomllib.load(f)
-
- assert "tool" in data
- assert "scriber" in data["tool"]
- scriber_config = data["tool"]["scriber"]
- assert scriber_config['use_gitignore'] is True
- assert scriber_config['exclude'] == ['*.log', '.env']
- assert scriber_config['include'] == ['*.py']
- assert scriber_config['hidden'] == ['*.lock']
-
- @pytest.mark.parametrize("bytes_val, expected_str", [
- (500, "500 Bytes"),
- (2048, "2.00 KB"),
- (1500000, "1.43 MB"),
- (2 * 1024 * 1024, "2.00 MB"),
- ])
- def test_format_bytes_utility(self, bytes_val: int, expected_str: str):
- """Tests the byte formatting utility function."""
- assert format_bytes(bytes_val) == expected_str
\ No newline at end of file
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
new file mode 100644
index 0000000..fe1e2b2
--- /dev/null
+++ b/tests/test_tokens.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from pathlib import Path
+from scriber.tokens import estimate_tokens
+from scriber.core.models import TokenConfig
+from scriber.core.config import load_config
+
+
+def test_token_estimation_default() -> None:
+ text = "hello world"
+ # No config passed: the expected 2 equals 11 // 4, i.e. len("hello world") floor-divided by a default chars_per_token of 4 (default presumed; confirm in scriber.tokens)
+ assert estimate_tokens(text) == 2
+
+
+def test_token_estimation_custom_config() -> None:
+ text = "hello world"
+ config = TokenConfig(estimator="chars", chars_per_token=2)
+ # Explicit config with 2 chars per token: len("hello world") == 11, and 11 // 2 == 5 (the test expects the remainder to be dropped)
+ assert estimate_tokens(text, config) == 5
+
+
+def test_token_estimation_parsing_from_config(tmp_path: Path) -> None:
+ config_file = tmp_path / "pyproject.toml"
+ config_file.write_text("""
+[tool.scriber.tokens]
+estimator = "chars"
+chars_per_token = 5
+""".strip(), encoding="utf-8")
+
+ config = load_config(config_file)
+ assert config.tokens.estimator == "chars"
+ assert config.tokens.chars_per_token == 5
+
+ text = "hello world"
+ # The token settings parsed from the TOML file are passed to the estimator: len("hello world") == 11, and 11 // 5 == 2
+ assert estimate_tokens(text, config.tokens) == 2