Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement PyStringMethods #3677

Merged
merged 1 commit into from Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions newsfragments/3677.added.md
@@ -0,0 +1 @@
Add `PyString::to_cow`. Add `Py<PyString>::to_str`, `Py<PyString>::to_cow`, and `Py<PyString>::to_string_lossy`, as ways to access Python string data safely beyond the GIL lifetime.
1 change: 1 addition & 0 deletions src/prelude.rs
Expand Up @@ -32,3 +32,4 @@ pub use crate::wrap_pyfunction;
// pub(crate) use crate::types::bytes::PyBytesMethods;
// pub(crate) use crate::types::float::PyFloatMethods;
// pub(crate) use crate::types::sequence::PySequenceMethods;
// pub(crate) use crate::types::string::PyStringMethods;
2 changes: 1 addition & 1 deletion src/types/mod.rs
Expand Up @@ -301,7 +301,7 @@ mod pysuper;
pub(crate) mod sequence;
pub(crate) mod set;
mod slice;
mod string;
pub(crate) mod string;
mod traceback;
mod tuple;
mod typeobject;
292 changes: 253 additions & 39 deletions src/types/string.rs
@@ -1,7 +1,11 @@
#[cfg(not(Py_LIMITED_API))]
use crate::exceptions::PyUnicodeDecodeError;
use crate::ffi_ptr_ext::FfiPtrExt;
use crate::instance::Py2Borrowed;
use crate::types::any::PyAnyMethods;
use crate::types::bytes::PyBytesMethods;
use crate::types::PyBytes;
use crate::{ffi, PyAny, PyResult, Python};
use crate::{ffi, IntoPy, Py, Py2, PyAny, PyResult, Python};
use std::borrow::Cow;
use std::os::raw::c_char;
use std::str;
Expand Down Expand Up @@ -165,61 +169,48 @@ impl PyString {
pub fn from_object<'p>(src: &'p PyAny, encoding: &str, errors: &str) -> PyResult<&'p PyString> {
unsafe {
src.py()
.from_owned_ptr_or_err::<PyString>(ffi::PyUnicode_FromEncodedObject(
.from_owned_ptr_or_err(ffi::PyUnicode_FromEncodedObject(
src.as_ptr(),
encoding.as_ptr() as *const c_char,
errors.as_ptr() as *const c_char,
))
}
}

/// Gets the Python string as a byte slice.
/// Gets the Python string as a Rust UTF-8 string slice.
///
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
#[inline]
pub fn to_str(&self) -> PyResult<&str> {
let utf8_slice = {
cfg_if::cfg_if! {
if #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] {
// PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
let mut size: ffi::Py_ssize_t = 0;
let data: *const u8 = unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
if data.is_null() {
return Err(crate::PyErr::fetch(self.py()));
} else {
unsafe { std::slice::from_raw_parts(data, size as usize) }
}
} else {
let bytes = unsafe {
self.py().from_owned_ptr_or_err::<PyBytes>(ffi::PyUnicode_AsUTF8String(self.as_ptr()))?
};
bytes.as_bytes()
}
}
};
Ok(unsafe { std::str::from_utf8_unchecked(utf8_slice) })
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
{
Py2Borrowed::from_gil_ref(self).to_str()
}

#[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
{
let bytes = unsafe {
self.py()
.from_owned_ptr_or_err::<PyBytes>(ffi::PyUnicode_AsUTF8String(self.as_ptr()))
}?;
Ok(unsafe { std::str::from_utf8_unchecked(bytes.as_bytes()) })
}
}

/// Converts the `PyString` into a Rust string, avoiding copying when possible.
///
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
pub fn to_cow(&self) -> PyResult<Cow<'_, str>> {
Py2Borrowed::from_gil_ref(self).to_cow()
}

/// Converts the `PyString` into a Rust string.
///
/// Unpaired surrogates invalid UTF-8 sequences are
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
pub fn to_string_lossy(&self) -> Cow<'_, str> {
davidhewitt marked this conversation as resolved.
Show resolved Hide resolved
match self.to_str() {
Ok(s) => Cow::Borrowed(s),
Err(_) => {
let bytes = unsafe {
self.py()
.from_owned_ptr::<PyBytes>(ffi::PyUnicode_AsEncodedString(
self.as_ptr(),
b"utf-8\0" as *const _ as _,
b"surrogatepass\0" as *const _ as _,
))
};
String::from_utf8_lossy(bytes.as_bytes())
}
}
Py2Borrowed::from_gil_ref(self).to_string_lossy()
}

/// Obtains the raw data backing the Python string.
Expand All @@ -238,6 +229,137 @@ impl PyString {
/// expected on the targets where you plan to distribute your software.
#[cfg(not(Py_LIMITED_API))]
pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
Py2Borrowed::from_gil_ref(self).data()
}
}

/// Implementation of functionality for [`PyString`].
///
/// These methods are defined for the `Py2<'py, PyString>` smart pointer, so to use method call
/// syntax these methods are separated into a trait, because stable Rust does not yet support
/// `arbitrary_self_types`.
#[doc(alias = "PyString")]
pub(crate) trait PyStringMethods<'py> {
/// Gets the Python string as a Rust UTF-8 string slice.
///
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
fn to_str(&self) -> PyResult<&str>;

/// Converts the `PyString` into a Rust string, avoiding copying when possible.
///
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
fn to_cow(&self) -> PyResult<Cow<'_, str>>;

/// Converts the `PyString` into a Rust string.
///
/// Unpaired surrogates invalid UTF-8 sequences are
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
fn to_string_lossy(&self) -> Cow<'_, str>;

/// Obtains the raw data backing the Python string.
///
/// If the Python string object was created through legacy APIs, its internal storage format
/// will be canonicalized before data is returned.
///
/// # Safety
///
/// This function implementation relies on manually decoding a C bitfield. In practice, this
/// works well on common little-endian architectures such as x86_64, where the bitfield has a
/// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
/// x86_64 platforms.
///
/// By using this API, you accept responsibility for testing that PyStringData behaves as
/// expected on the targets where you plan to distribute your software.
#[cfg(not(Py_LIMITED_API))]
unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
}

impl<'py> PyStringMethods<'py> for Py2<'py, PyString> {
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
fn to_str(&self) -> PyResult<&str> {
Py2Borrowed::from(self).to_str()
}

fn to_cow(&self) -> PyResult<Cow<'_, str>> {
Py2Borrowed::from(self).to_cow()
}

fn to_string_lossy(&self) -> Cow<'_, str> {
Py2Borrowed::from(self).to_string_lossy()
}

#[cfg(not(Py_LIMITED_API))]
unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
Py2Borrowed::from(self).data()
}
}

impl<'a> Py2Borrowed<'a, '_, PyString> {
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
#[allow(clippy::wrong_self_convention)]
fn to_str(self) -> PyResult<&'a str> {
// PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
let mut size: ffi::Py_ssize_t = 0;
let data: *const u8 =
unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
if data.is_null() {
Err(crate::PyErr::fetch(self.py()))
} else {
Ok(unsafe {
std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
})
}
}

#[allow(clippy::wrong_self_convention)]
fn to_cow(self) -> PyResult<Cow<'a, str>> {
// TODO: this method can probably be deprecated once Python 3.9 support is dropped,
// because all versions then support the more efficient `to_str`.
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
{
self.to_str().map(Cow::Borrowed)
}

#[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
{
let bytes = unsafe {
ffi::PyUnicode_AsUTF8String(self.as_ptr())
.assume_owned_or_err(self.py())?
.downcast_into_unchecked::<PyBytes>()
};
Ok(Cow::Owned(
unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
))
}
}

#[allow(clippy::wrong_self_convention)]
fn to_string_lossy(self) -> Cow<'a, str> {
let ptr = self.as_ptr();
let py = self.py();

#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
if let Ok(s) = self.to_str() {
return Cow::Borrowed(s);
}

let bytes = unsafe {
ffi::PyUnicode_AsEncodedString(
ptr,
b"utf-8\0".as_ptr().cast(),
b"surrogatepass\0".as_ptr().cast(),
)
.assume_owned(py)
.downcast_into_unchecked::<PyBytes>()
};
Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
}

#[cfg(not(Py_LIMITED_API))]
unsafe fn data(self) -> PyResult<PyStringData<'a>> {
let ptr = self.as_ptr();

#[cfg(not(Py_3_12))]
Expand Down Expand Up @@ -275,6 +397,60 @@ impl PyString {
}
}

impl Py<PyString> {
/// Gets the Python string as a Rust UTF-8 string slice.
///
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
///
/// Because `str` objects are immutable, the returned slice is independent of
/// the GIL lifetime.
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
self.attach_borrow(py).to_str()
}

/// Converts the `PyString` into a Rust string, avoiding copying when possible.
///
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
///
/// Because `str` objects are immutable, the returned slice is independent of
/// the GIL lifetime.
pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
self.attach_borrow(py).to_cow()
}

/// Converts the `PyString` into a Rust string.
///
/// Unpaired surrogates invalid UTF-8 sequences are
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
///
/// Because `str` objects are immutable, the returned slice is independent of
/// the GIL lifetime.
pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
self.attach_borrow(py).to_string_lossy()
}
}

impl IntoPy<Py<PyString>> for Py2<'_, PyString> {
fn into_py(self, _py: Python<'_>) -> Py<PyString> {
self.into()
}
}

impl IntoPy<Py<PyString>> for &Py2<'_, PyString> {
fn into_py(self, _py: Python<'_>) -> Py<PyString> {
self.clone().into()
}
}

impl IntoPy<Py<PyString>> for &'_ Py<PyString> {
fn into_py(self, py: Python<'_>) -> Py<PyString> {
self.clone_ref(py)
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -284,7 +460,7 @@ mod tests {
use std::borrow::Cow;

#[test]
fn test_to_str_ascii() {
fn test_to_str_utf8() {
Python::with_gil(|py| {
let s = "ascii 🐈";
let obj: PyObject = PyString::new(py, s).into();
Expand Down Expand Up @@ -479,4 +655,42 @@ mod tests {
assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
});
}

#[test]
fn test_py_to_str_utf8() {
Python::with_gil(|py| {
let s = "ascii 🐈";
let py_string: Py<PyString> = PyString::new(py, s).into_py(py);

#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
assert_eq!(s, py_string.to_str(py).unwrap());

assert_eq!(s, py_string.to_cow(py).unwrap());
})
}

#[test]
fn test_py_to_str_surrogate() {
Python::with_gil(|py| {
let py_string: Py<PyString> =
py.eval(r"'\ud800'", None, None).unwrap().extract().unwrap();

#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
assert!(py_string.to_str(py).is_err());

assert!(py_string.to_cow(py).is_err());
})
}

#[test]
fn test_py_to_string_lossy() {
Python::with_gil(|py| {
let py_string: Py<PyString> = py
.eval(r"'🐈 Hello \ud800World'", None, None)
.unwrap()
.extract()
.unwrap();
assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
})
}
}