diff --git a/newsfragments/3677.added.md b/newsfragments/3677.added.md new file mode 100644 index 00000000000..3e6bc56d582 --- /dev/null +++ b/newsfragments/3677.added.md @@ -0,0 +1 @@ +Add `PyString::to_cow`. Add `Py::to_str`, `Py::to_cow`, and `Py::to_string_lossy`, as ways to access Python string data safely beyond the GIL lifetime. diff --git a/src/prelude.rs b/src/prelude.rs index 2a0397c10bd..b5344cb9047 100644 --- a/src/prelude.rs +++ b/src/prelude.rs @@ -32,3 +32,4 @@ pub use crate::wrap_pyfunction; // pub(crate) use crate::types::bytes::PyBytesMethods; // pub(crate) use crate::types::float::PyFloatMethods; // pub(crate) use crate::types::sequence::PySequenceMethods; +// pub(crate) use crate::types::string::PyStringMethods; diff --git a/src/types/mod.rs b/src/types/mod.rs index a7ea5b631f8..8e1d6b00810 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -301,7 +301,7 @@ mod pysuper; pub(crate) mod sequence; pub(crate) mod set; mod slice; -mod string; +pub(crate) mod string; mod traceback; mod tuple; mod typeobject; diff --git a/src/types/string.rs b/src/types/string.rs index 1a6a717b57b..e175239d77d 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -1,7 +1,11 @@ #[cfg(not(Py_LIMITED_API))] use crate::exceptions::PyUnicodeDecodeError; +use crate::ffi_ptr_ext::FfiPtrExt; +use crate::instance::Py2Borrowed; +use crate::types::any::PyAnyMethods; +use crate::types::bytes::PyBytesMethods; use crate::types::PyBytes; -use crate::{ffi, PyAny, PyResult, Python}; +use crate::{ffi, IntoPy, Py, Py2, PyAny, PyResult, Python}; use std::borrow::Cow; use std::os::raw::c_char; use std::str; @@ -165,7 +169,7 @@ impl PyString { pub fn from_object<'p>(src: &'p PyAny, encoding: &str, errors: &str) -> PyResult<&'p PyString> { unsafe { src.py() - .from_owned_ptr_or_err::(ffi::PyUnicode_FromEncodedObject( + .from_owned_ptr_or_err(ffi::PyUnicode_FromEncodedObject( src.as_ptr(), encoding.as_ptr() as *const c_char, errors.as_ptr() as *const c_char, @@ -173,32 +177,32 @@ impl PyString { } } - /// Gets the Python string as a byte slice. + /// Gets the Python string as a Rust UTF-8 string slice. /// /// Returns a `UnicodeEncodeError` if the input is not valid unicode /// (containing unpaired surrogates). - #[inline] pub fn to_str(&self) -> PyResult<&str> { - let utf8_slice = { - cfg_if::cfg_if! { - if #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] { - // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10. - let mut size: ffi::Py_ssize_t = 0; - let data: *const u8 = unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() }; - if data.is_null() { - return Err(crate::PyErr::fetch(self.py())); - } else { - unsafe { std::slice::from_raw_parts(data, size as usize) } - } - } else { - let bytes = unsafe { - self.py().from_owned_ptr_or_err::(ffi::PyUnicode_AsUTF8String(self.as_ptr()))? - }; - bytes.as_bytes() - } - } - }; - Ok(unsafe { std::str::from_utf8_unchecked(utf8_slice) }) + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + { + Py2Borrowed::from_gil_ref(self).to_str() + } + + #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))] + { + let bytes = unsafe { + self.py() + .from_owned_ptr_or_err::(ffi::PyUnicode_AsUTF8String(self.as_ptr())) + }?; + Ok(unsafe { std::str::from_utf8_unchecked(bytes.as_bytes()) }) + } + } + + /// Converts the `PyString` into a Rust string, avoiding copying when possible. + /// + /// Returns a `UnicodeEncodeError` if the input is not valid unicode + /// (containing unpaired surrogates). + pub fn to_cow(&self) -> PyResult> { + Py2Borrowed::from_gil_ref(self).to_cow() } /// Converts the `PyString` into a Rust string. @@ -206,20 +210,7 @@ impl PyString { /// Unpaired surrogates invalid UTF-8 sequences are /// replaced with `U+FFFD REPLACEMENT CHARACTER`. pub fn to_string_lossy(&self) -> Cow<'_, str> { - match self.to_str() { - Ok(s) => Cow::Borrowed(s), - Err(_) => { - let bytes = unsafe { - self.py() - .from_owned_ptr::(ffi::PyUnicode_AsEncodedString( - self.as_ptr(), - b"utf-8\0" as *const _ as _, - b"surrogatepass\0" as *const _ as _, - )) - }; - String::from_utf8_lossy(bytes.as_bytes()) - } - } + Py2Borrowed::from_gil_ref(self).to_string_lossy() } /// Obtains the raw data backing the Python string. @@ -238,6 +229,137 @@ impl PyString { /// expected on the targets where you plan to distribute your software. #[cfg(not(Py_LIMITED_API))] pub unsafe fn data(&self) -> PyResult> { + Py2Borrowed::from_gil_ref(self).data() + } +} + +/// Implementation of functionality for [`PyString`]. +/// +/// These methods are defined for the `Py2<'py, PyString>` smart pointer, so to use method call +/// syntax these methods are separated into a trait, because stable Rust does not yet support +/// `arbitrary_self_types`. +#[doc(alias = "PyString")] +pub(crate) trait PyStringMethods<'py> { + /// Gets the Python string as a Rust UTF-8 string slice. + /// + /// Returns a `UnicodeEncodeError` if the input is not valid unicode + /// (containing unpaired surrogates). + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + fn to_str(&self) -> PyResult<&str>; + + /// Converts the `PyString` into a Rust string, avoiding copying when possible. + /// + /// Returns a `UnicodeEncodeError` if the input is not valid unicode + /// (containing unpaired surrogates). + fn to_cow(&self) -> PyResult>; + + /// Converts the `PyString` into a Rust string. + /// + /// Unpaired surrogates invalid UTF-8 sequences are + /// replaced with `U+FFFD REPLACEMENT CHARACTER`. + fn to_string_lossy(&self) -> Cow<'_, str>; + + /// Obtains the raw data backing the Python string. + /// + /// If the Python string object was created through legacy APIs, its internal storage format + /// will be canonicalized before data is returned. + /// + /// # Safety + /// + /// This function implementation relies on manually decoding a C bitfield. In practice, this + /// works well on common little-endian architectures such as x86_64, where the bitfield has a + /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on + /// x86_64 platforms. + /// + /// By using this API, you accept responsibility for testing that PyStringData behaves as + /// expected on the targets where you plan to distribute your software. + #[cfg(not(Py_LIMITED_API))] + unsafe fn data(&self) -> PyResult>; +} + +impl<'py> PyStringMethods<'py> for Py2<'py, PyString> { + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + fn to_str(&self) -> PyResult<&str> { + Py2Borrowed::from(self).to_str() + } + + fn to_cow(&self) -> PyResult> { + Py2Borrowed::from(self).to_cow() + } + + fn to_string_lossy(&self) -> Cow<'_, str> { + Py2Borrowed::from(self).to_string_lossy() + } + + #[cfg(not(Py_LIMITED_API))] + unsafe fn data(&self) -> PyResult> { + Py2Borrowed::from(self).data() + } +} + +impl<'a> Py2Borrowed<'a, '_, PyString> { + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + #[allow(clippy::wrong_self_convention)] + fn to_str(self) -> PyResult<&'a str> { + // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10. + let mut size: ffi::Py_ssize_t = 0; + let data: *const u8 = + unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() }; + if data.is_null() { + Err(crate::PyErr::fetch(self.py())) + } else { + Ok(unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize)) + }) + } + } + + #[allow(clippy::wrong_self_convention)] + fn to_cow(self) -> PyResult> { + // TODO: this method can probably be deprecated once Python 3.9 support is dropped, + // because all versions then support the more efficient `to_str`. + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + { + self.to_str().map(Cow::Borrowed) + } + + #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))] + { + let bytes = unsafe { + ffi::PyUnicode_AsUTF8String(self.as_ptr()) + .assume_owned_or_err(self.py())? + .downcast_into_unchecked::() + }; + Ok(Cow::Owned( + unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(), + )) + } + } + + #[allow(clippy::wrong_self_convention)] + fn to_string_lossy(self) -> Cow<'a, str> { + let ptr = self.as_ptr(); + let py = self.py(); + + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + if let Ok(s) = self.to_str() { + return Cow::Borrowed(s); + } + + let bytes = unsafe { + ffi::PyUnicode_AsEncodedString( + ptr, + b"utf-8\0".as_ptr().cast(), + b"surrogatepass\0".as_ptr().cast(), + ) + .assume_owned(py) + .downcast_into_unchecked::() + }; + Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned()) + } + + #[cfg(not(Py_LIMITED_API))] + unsafe fn data(self) -> PyResult> { let ptr = self.as_ptr(); #[cfg(not(Py_3_12))] @@ -275,6 +397,60 @@ impl PyString { } } +impl Py { + /// Gets the Python string as a Rust UTF-8 string slice. + /// + /// Returns a `UnicodeEncodeError` if the input is not valid unicode + /// (containing unpaired surrogates). + /// + /// Because `str` objects are immutable, the returned slice is independent of + /// the GIL lifetime. + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> { + self.attach_borrow(py).to_str() + } + + /// Converts the `PyString` into a Rust string, avoiding copying when possible. + /// + /// Returns a `UnicodeEncodeError` if the input is not valid unicode + /// (containing unpaired surrogates). + /// + /// Because `str` objects are immutable, the returned slice is independent of + /// the GIL lifetime. + pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult> { + self.attach_borrow(py).to_cow() + } + + /// Converts the `PyString` into a Rust string. + /// + /// Unpaired surrogates invalid UTF-8 sequences are + /// replaced with `U+FFFD REPLACEMENT CHARACTER`. + /// + /// Because `str` objects are immutable, the returned slice is independent of + /// the GIL lifetime. + pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> { + self.attach_borrow(py).to_string_lossy() + } +} + +impl IntoPy> for Py2<'_, PyString> { + fn into_py(self, _py: Python<'_>) -> Py { + self.into() + } +} + +impl IntoPy> for &Py2<'_, PyString> { + fn into_py(self, _py: Python<'_>) -> Py { + self.clone().into() + } +} + +impl IntoPy> for &'_ Py { + fn into_py(self, py: Python<'_>) -> Py { + self.clone_ref(py) + } +} + #[cfg(test)] mod tests { use super::*; @@ -284,7 +460,7 @@ mod tests { use std::borrow::Cow; #[test] - fn test_to_str_ascii() { + fn test_to_str_utf8() { Python::with_gil(|py| { let s = "ascii 🐈"; let obj: PyObject = PyString::new(py, s).into(); @@ -479,4 +655,42 @@ mod tests { assert_ne!(py_string1.as_ptr(), py_string3.as_ptr()); }); } + + #[test] + fn test_py_to_str_utf8() { + Python::with_gil(|py| { + let s = "ascii 🐈"; + let py_string: Py = PyString::new(py, s).into_py(py); + + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + assert_eq!(s, py_string.to_str(py).unwrap()); + + assert_eq!(s, py_string.to_cow(py).unwrap()); + }) + } + + #[test] + fn test_py_to_str_surrogate() { + Python::with_gil(|py| { + let py_string: Py = + py.eval(r"'\ud800'", None, None).unwrap().extract().unwrap(); + + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + assert!(py_string.to_str(py).is_err()); + + assert!(py_string.to_cow(py).is_err()); + }) + } + + #[test] + fn test_py_to_string_lossy() { + Python::with_gil(|py| { + let py_string: Py = py + .eval(r"'🐈 Hello \ud800World'", None, None) + .unwrap() + .extract() + .unwrap(); + assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World"); + }) + } }