From 4447635352292cdb714ba5ee3e05f2f1553a6c02 Mon Sep 17 00:00:00 2001
From: Bas Schoenmaeckers
Date: Sun, 17 May 2026 16:30:27 +0200
Subject: [PATCH 1/2] Add unicode & bytes c-api support

---
Review notes (commentary only; not part of the commit message or the diff):
* PyBytes_FromStringAndSize: when `bytes` is NULL the buffer is created with
  `Vec::with_capacity` + `set_len`, so its contents are uninitialized (this is
  what the `clippy::uninit_vec` allow is silencing). `len` is converted with
  `as usize` without a sign check in this patch; patch 2/2 adds the check.
* PyUnicode_FromStringAndSize, PyUnicode_AsEncodedString and
  PyUnicode_InternInPlace call `expect`, so invalid UTF-8 input or a non-str
  object panics instead of returning an error from the `with_vm` closure.
* Both `mod tests` blocks are gated by `#[cfg(false)]`.

 crates/capi/src/bytesobject.rs   |  68 ++++++++++++++
 crates/capi/src/lib.rs           |   2 +
 crates/capi/src/object.rs        |   1 +
 crates/capi/src/unicodeobject.rs | 150 +++++++++++++++++++++++++++++++
 4 files changed, 221 insertions(+)
 create mode 100644 crates/capi/src/bytesobject.rs
 create mode 100644 crates/capi/src/unicodeobject.rs

diff --git a/crates/capi/src/bytesobject.rs b/crates/capi/src/bytesobject.rs
new file mode 100644
index 0000000000..4d1768bcef
--- /dev/null
+++ b/crates/capi/src/bytesobject.rs
@@ -0,0 +1,68 @@
+use crate::PyObject;
+use crate::object::define_py_check;
+use crate::pystate::with_vm;
+use core::ffi::c_char;
+use rustpython_vm::builtins::PyBytes;
+
+define_py_check!(fn PyBytes_Check, types.bytes_type);
+define_py_check!(exact fn PyBytes_CheckExact, types.bytes_type);
+
+#[unsafe(no_mangle)]
+#[allow(clippy::uninit_vec)]
+pub unsafe extern "C" fn PyBytes_FromStringAndSize(
+    bytes: *mut c_char,
+    len: isize,
+) -> *mut PyObject {
+    with_vm(|vm| {
+        let data = if bytes.is_null() {
+            let mut data = Vec::with_capacity(len as usize);
+            unsafe { data.set_len(len as usize) };
+            data
+        } else {
+            unsafe { core::slice::from_raw_parts(bytes as *const u8, len as usize) }.to_vec()
+        };
+        vm.ctx.new_bytes(data)
+    })
+}
+
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn PyBytes_Size(bytes: *mut PyObject) -> isize {
+    with_vm(|vm| {
+        let bytes = unsafe { &*bytes }.try_downcast_ref::<PyBytes>(vm)?;
+        Ok(bytes.as_bytes().len())
+    })
+}
+
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn PyBytes_AsString(bytes: *mut PyObject) -> *mut c_char {
+    with_vm(|vm| {
+        let bytes = unsafe { &*bytes }.try_downcast_ref::<PyBytes>(vm)?;
+        Ok(bytes.as_bytes().as_ptr())
+    })
+}
+
+#[cfg(false)]
+mod tests {
+    use pyo3::prelude::*;
+    use pyo3::types::PyBytes;
+
+    #[test]
+    fn test_bytes() {
+        Python::attach(|py| {
+            let bytes = PyBytes::new(py, b"Hello, World!");
+            assert_eq!(bytes.as_bytes(), b"Hello, World!");
+        })
+    }
+
+    #[test]
+    fn test_bytes_uninit() {
+        Python::attach(|py| {
+            let bytes = PyBytes::new_with(py, 13, |data| {
+                data.copy_from_slice(b"Hello, World!");
+                Ok(())
+            })
+            .unwrap();
+            assert_eq!(bytes.as_bytes(), b"Hello, World!");
+        })
+    }
+}
diff --git a/crates/capi/src/lib.rs b/crates/capi/src/lib.rs
index ebabc1521e..9ef0155a28 100644
--- a/crates/capi/src/lib.rs
+++ b/crates/capi/src/lib.rs
@@ -9,12 +9,14 @@ use std::sync::MutexGuard;
 extern crate alloc;
 
 pub mod abstract_;
+pub mod bytesobject;
 pub mod import;
 pub mod object;
 pub mod pyerrors;
 pub mod pylifecycle;
 pub mod pystate;
 pub mod refcount;
+pub mod unicodeobject;
 mod util;
 
 /// Get main interpreter of this process. Will be None if it has not been initialized yet.
diff --git a/crates/capi/src/object.rs b/crates/capi/src/object.rs
index f71475049f..89995f8e90 100644
--- a/crates/capi/src/object.rs
+++ b/crates/capi/src/object.rs
@@ -33,6 +33,7 @@ macro_rules! define_py_check {
     };
 }
 
+pub(crate) use define_py_check;
 define_py_check!(fn PyType_Check, types.type_type);
 define_py_check!(exact fn PyType_CheckExact, types.type_type);
 
diff --git a/crates/capi/src/unicodeobject.rs b/crates/capi/src/unicodeobject.rs
new file mode 100644
index 0000000000..7316de3206
--- /dev/null
+++ b/crates/capi/src/unicodeobject.rs
@@ -0,0 +1,150 @@
+use crate::PyObject;
+use crate::object::define_py_check;
+use crate::pystate::with_vm;
+use core::ffi::{CStr, c_char, c_int};
+use core::ptr::NonNull;
+use core::slice;
+use core::str;
+use rustpython_vm::PyObjectRef;
+use rustpython_vm::builtins::PyStr;
+
+define_py_check!(fn PyUnicode_Check, types.str_type);
+define_py_check!(exact fn PyUnicode_CheckExact, types.str_type);
+
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn PyUnicode_FromStringAndSize(
+    s: *const c_char,
+    len: isize,
+) -> *mut PyObject {
+    with_vm(|vm| {
+        let len: usize = len
+            .try_into()
+            .map_err(|_| vm.new_system_error("length must be non-negative"))?;
+
+        let text = if s.is_null() {
+            if len != 0 {
+                return Err(vm.new_system_error(
+                    "PyUnicode_FromStringAndSize called with null data and non-zero len",
+                ));
+            }
+            ""
+        } else {
+            let bytes = unsafe { slice::from_raw_parts(s.cast::<u8>(), len) };
+            str::from_utf8(bytes).expect("PyUnicode_FromStringAndSize got non-UTF8 data")
+        };
+
+        Ok(vm.ctx.new_str(text))
+    })
+}
+
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn PyUnicode_AsUTF8AndSize(
+    obj: *mut PyObject,
+    size: *mut isize,
+) -> *const c_char {
+    with_vm(|vm| {
+        let unicode = unsafe { &*obj }.try_downcast_ref::<PyStr>(vm)?;
+
+        let str = unicode.to_str().ok_or_else(|| {
+            vm.new_system_error("PyUnicode_AsUTF8AndSize only supports UTF-8 or ASCII strings")
+        })?;
+
+        if size.is_null() {
+            // We do not support null size arguments because the returned string is not NULL terminated.
+            return Err(
+                vm.new_system_error("size argument to PyUnicode_AsUTF8AndSize cannot be null")
+            );
+        }
+
+        unsafe { *size = str.len() as isize };
+        Ok(str.as_ptr())
+    })
+}
+
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn PyUnicode_AsEncodedString(
+    unicode: *mut PyObject,
+    encoding: *const c_char,
+    errors: *const c_char,
+) -> *mut PyObject {
+    with_vm(|vm| {
+        let unicode = unsafe { &*unicode }
+            .try_downcast_ref::<PyStr>(vm)?
+            .to_owned();
+        let encoding = if encoding.is_null() {
+            "utf-8"
+        } else {
+            unsafe { CStr::from_ptr(encoding) }
+                .to_str()
+                .expect("encoding must be valid UTF-8")
+        };
+        let errors = if errors.is_null() {
+            None
+        } else {
+            let errors = unsafe { CStr::from_ptr(errors) }
+                .to_str()
+                .expect("errors must be valid UTF-8");
+            Some(vm.ctx.new_utf8_str(errors))
+        };
+        vm.state
+            .codec_registry
+            .encode_text(unicode, encoding, errors, vm)
+    })
+}
+
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn PyUnicode_InternInPlace(string: *mut *mut PyObject) {
+    with_vm(|vm| {
+        let old_str = unsafe { PyObjectRef::from_raw(NonNull::new_unchecked(*string)) }
+            .downcast_exact::<PyStr>(vm)
+            .expect("PyUnicode_InternInPlace called with non-string object");
+
+        let interned: PyObjectRef = vm.ctx.intern_str(old_str).to_owned().into();
+
+        unsafe { *string = interned.into_raw().as_ptr() }
+    })
+}
+
+#[unsafe(no_mangle)]
+pub unsafe extern "C" fn PyUnicode_EqualToUTF8AndSize(
+    unicode: *mut PyObject,
+    string: *const c_char,
+    size: isize,
+) -> c_int {
+    with_vm(|vm| {
+        let unicode = unsafe { &*unicode }.try_downcast_ref::<PyStr>(vm)?;
+        let result = unsafe {
+            let slice = slice::from_raw_parts(string as _, size as _);
+            str::from_utf8(slice)
+        }
+        .ok()
+        .and_then(|other| Some(unicode.to_str()? == other))
+        .unwrap_or(false);
+
+        Ok(result)
+    })
+}
+
+#[cfg(false)]
+mod tests {
+    use pyo3::intern;
+    use pyo3::prelude::*;
+    use pyo3::types::PyString;
+
+    #[test]
+    fn test_unicode() {
+        Python::attach(|py| {
+            let string = PyString::new(py, "Hello, World!");
+            assert!(string.is_instance_of::<PyString>());
+            assert_eq!(string.to_str().unwrap(), "Hello, World!");
+            assert_eq!(string, "Hello, World!");
+        })
+    }
+
+    #[test]
+    fn test_intern_str() {
+        Python::attach(|py| {
+            let _string = intern!(py, "Hello, World!");
+        })
+    }
+}
From 78ba5e0b37cd54e4239868c1bfdd43572bb2125c Mon Sep 17 00:00:00 2001
From: Bas Schoenmaeckers
Date: Sun, 17 May 2026 17:49:19 +0200
Subject: [PATCH 2/2] Check for negative size

---
Review notes (commentary only; not part of the commit message or the diff):
* Negative sizes are now rejected with `vm.new_system_error` in
  PyBytes_FromStringAndSize and PyUnicode_EqualToUTF8AndSize before any slice
  is built from the raw pointer.
* The NULL-`bytes` path in PyBytes_FromStringAndSize still uses
  `Vec::with_capacity` + `set_len`; only the type of `len` changes here.

 crates/capi/src/bytesobject.rs   | 13 +++++++++----
 crates/capi/src/unicodeobject.rs |  6 +++++-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/crates/capi/src/bytesobject.rs b/crates/capi/src/bytesobject.rs
index 4d1768bcef..1fe535efba 100644
--- a/crates/capi/src/bytesobject.rs
+++ b/crates/capi/src/bytesobject.rs
@@ -14,14 +14,19 @@ pub unsafe extern "C" fn PyBytes_FromStringAndSize(
     len: isize,
 ) -> *mut PyObject {
     with_vm(|vm| {
+        let len = len.try_into().map_err(|_| {
+            vm.new_system_error("Negative size passed to PyBytes_FromStringAndSize")
+        })?;
+
         let data = if bytes.is_null() {
-            let mut data = Vec::with_capacity(len as usize);
-            unsafe { data.set_len(len as usize) };
+            let mut data = Vec::with_capacity(len);
+            unsafe { data.set_len(len) };
             data
         } else {
-            unsafe { core::slice::from_raw_parts(bytes as *const u8, len as usize) }.to_vec()
+            unsafe { core::slice::from_raw_parts(bytes as *const u8, len) }.to_vec()
         };
-        vm.ctx.new_bytes(data)
+
+        Ok(vm.ctx.new_bytes(data))
     })
 }
 
diff --git a/crates/capi/src/unicodeobject.rs b/crates/capi/src/unicodeobject.rs
index 7316de3206..76e0fb0df5 100644
--- a/crates/capi/src/unicodeobject.rs
+++ b/crates/capi/src/unicodeobject.rs
@@ -112,9 +112,13 @@ pub unsafe extern "C" fn PyUnicode_EqualToUTF8AndSize(
     size: isize,
 ) -> c_int {
     with_vm(|vm| {
+        let size = size.try_into().map_err(|_| {
+            vm.new_system_error("Negative size passed to PyUnicode_EqualToUTF8AndSize")
+        })?;
+
         let unicode = unsafe { &*unicode }.try_downcast_ref::<PyStr>(vm)?;
         let result = unsafe {
-            let slice = slice::from_raw_parts(string as _, size as _);
+            let slice = slice::from_raw_parts(string as _, size);
             str::from_utf8(slice)
         }
         .ok()