Skip to content

Commit

Permalink
copr: Migrate scalar function Lpad from TiDB (tikv#7300)
Browse files Browse the repository at this point in the history
Signed-off-by: Qiannan Lyu <lvqiannan@gmail.com>
  • Loading branch information
hsqlu committed Apr 10, 2020
1 parent faa42d2 commit ce111fb
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 83 deletions.
85 changes: 2 additions & 83 deletions components/tidb_query_normal_expr/src/builtin_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ use hex::{self, FromHex};

use tidb_query_datatype;
use tidb_query_datatype::prelude::*;
use tidb_query_shared_expr::conv::i64_to_usize;
use tidb_query_shared_expr::string::validate_target_len_for_pad;
use tikv_util::try_opt_or;

use crate::ScalarFunc;
Expand Down Expand Up @@ -1027,59 +1029,6 @@ impl ScalarFunc {
}
}

// Validates the requested result length of a pad function.
//
// Returns `Some(0)` when `target_len` is 0, which means the pad function should
// return an empty string. Returns `None`, which means the pad function should
// return Null, in any of these three situations:
// 1. `target_len` is negative;
// 2. `target_len` expressed in bytes (`target_len * size_of_type`) is larger
//    than `MAX_BLOB_WIDTH`;
// 3. `target_len` is greater than the length of the input string *and* the
//    pad string is empty.
// Otherwise returns `Some(target_len)`.
#[inline]
fn validate_target_len_for_pad(
    len_unsigned: bool,
    target_len: i64,
    input_len: usize,
    size_of_type: usize,
    pad_empty: bool,
) -> Option<usize> {
    if target_len == 0 {
        return Some(0);
    }

    let (wanted_len, non_negative) = i64_to_usize(target_len, len_unsigned);
    if !non_negative {
        return None;
    }

    // Saturating multiplication keeps a huge length from wrapping around to a
    // small byte width that would slip past the limit.
    let max_width = tidb_query_datatype::MAX_BLOB_WIDTH as usize;
    if wanted_len.saturating_mul(size_of_type) > max_width {
        return None;
    }

    // An empty pad string can never grow the input.
    if pad_empty && wanted_len > input_len {
        return None;
    }

    Some(wanted_len)
}

// Converts an `i64` to a `usize`, returning `(value, is_non_negative)`.
//
// * When `is_unsigned` is true, the bits of `i` are reinterpreted as a `u64`
//   and the flag is always `true`.
// * Otherwise `value` is the absolute value of `i` and the flag tells whether
//   `i >= 0` (so zero counts as non-negative).
//
// # Examples
// ```
// assert_eq!(i64_to_usize(0_i64, false), (0_usize, true));
// assert_eq!(i64_to_usize(1_i64, false), (1_usize, true));
// assert_eq!(i64_to_usize(-1_i64, false), (1_usize, false));
// assert_eq!(i64_to_usize(u64::max_value() as i64, true), (u64::max_value() as usize, true));
// assert_eq!(i64_to_usize(u64::max_value() as i64, false), (1_usize, false));
// ```
#[inline]
fn i64_to_usize(i: i64, is_unsigned: bool) -> (usize, bool) {
    if is_unsigned || i >= 0 {
        // Either the caller declared the bits to be unsigned, or the value is
        // non-negative; in both cases the `u64` bit pattern is the value.
        (i as u64 as usize, true)
    } else {
        // Negating the two's-complement bit pattern as `u64` yields |i| for
        // every negative input, including `i64::min_value()`, without the
        // signed overflow that `-i` would hit and without a special case.
        ((i as u64).wrapping_neg() as usize, false)
    }
}

#[inline]
fn strip_whitespace(input: &[u8]) -> Vec<u8> {
let mut input_copy = Vec::<u8>::with_capacity(input.len());
Expand Down Expand Up @@ -3149,36 +3098,6 @@ mod tests {
}
}

#[test]
fn test_validate_target_len_for_pad() {
    // Lengths interpreted as signed:
    // (target_len, input_len, size_of_type, pad_empty, expected result).
    let signed_cases = vec![
        (0, 10, 1, false, Some(0)),
        (-1, 10, 1, false, None),
        (12, 10, 1, true, None),
        (i64::from(MAX_BLOB_WIDTH) + 1, 10, 1, false, None),
        (i64::from(MAX_BLOB_WIDTH) / 4 + 1, 10, 4, false, None),
        (12, 10, 1, false, Some(12)),
    ];
    for (target_len, input_len, size_of_type, pad_empty, expected) in signed_cases {
        let actual = super::validate_target_len_for_pad(
            false,
            target_len,
            input_len,
            size_of_type,
            pad_empty,
        );
        assert_eq!(actual, expected);
    }

    // Lengths interpreted as unsigned; the `u64` is passed through as its
    // `i64` bit pattern.
    let unsigned_cases = vec![
        (u64::max_value(), 10, 1, false, None),
        (u64::max_value(), 10, 4, false, None),
        (u64::max_value(), 10, 1, true, None),
        (u64::max_value(), 10, 4, true, None),
        (12u64, 10, 4, false, Some(12)),
    ];
    for (target_len, input_len, size_of_type, pad_empty, expected) in unsigned_cases {
        let actual = super::validate_target_len_for_pad(
            true,
            target_len as i64,
            input_len,
            size_of_type,
            pad_empty,
        );
        assert_eq!(actual, expected);
    }
}

fn common_rpad_cases() -> Vec<(Datum, Datum, Datum, Datum)> {
vec![
(
Expand Down
26 changes: 26 additions & 0 deletions components/tidb_query_shared_expr/src/conv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,29 @@ pub fn conv(s: &str, from_base: Int, to_base: Int) -> Option<Bytes> {
None
}
}

// Converts an `i64` to a `usize`, returning `(value, is_non_negative)`.
//
// * When `is_unsigned` is true, the bits of `i` are reinterpreted as a `u64`
//   and the flag is always `true`.
// * Otherwise `value` is the absolute value of `i` and the flag tells whether
//   `i >= 0` (so zero counts as non-negative).
//
// # Examples
// ```
// assert_eq!(i64_to_usize(0_i64, false), (0_usize, true));
// assert_eq!(i64_to_usize(1_i64, false), (1_usize, true));
// assert_eq!(i64_to_usize(-1_i64, false), (1_usize, false));
// assert_eq!(i64_to_usize(u64::max_value() as i64, true), (u64::max_value() as usize, true));
// assert_eq!(i64_to_usize(u64::max_value() as i64, false), (1_usize, false));
// ```
#[inline]
pub fn i64_to_usize(i: i64, is_unsigned: bool) -> (usize, bool) {
    if is_unsigned || i >= 0 {
        // Either the caller declared the bits to be unsigned, or the value is
        // non-negative; in both cases the `u64` bit pattern is the value.
        (i as u64 as usize, true)
    } else {
        // Negating the two's-complement bit pattern as `u64` yields |i| for
        // every negative input, including `i64::min_value()`, without the
        // signed overflow that `-i` would hit and without a special case.
        ((i as u64).wrapping_neg() as usize, false)
    }
}
1 change: 1 addition & 0 deletions components/tidb_query_shared_expr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pub mod conv;
pub mod like;
pub mod miscellaneous;
pub mod rand;
pub mod string;
65 changes: 65 additions & 0 deletions components/tidb_query_shared_expr/src/string.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0.

use super::conv::i64_to_usize;

// Largest result width, in bytes, that a pad function may produce: 16 MiB
// (16_777_216).
// NOTE(review): presumably has to stay equal to
// `tidb_query_datatype::MAX_BLOB_WIDTH` -- confirm.
const MAX_BLOB_WIDTH: i32 = 1 << 24; // FIXME: Should be isize

// Validates the requested result length of a pad function.
//
// Returns `Some(0)` when `target_len` is 0, which means the pad function should
// return an empty string. Returns `None`, which means the pad function should
// return Null, in any of these three situations:
// 1. `target_len` is negative;
// 2. `target_len` expressed in bytes (`target_len * size_of_type`) is larger
//    than `MAX_BLOB_WIDTH`;
// 3. `target_len` is greater than the length of the input string *and* the
//    pad string is empty.
// Otherwise returns `Some(target_len)`.
#[inline]
pub fn validate_target_len_for_pad(
    len_unsigned: bool,
    target_len: i64,
    input_len: usize,
    size_of_type: usize,
    pad_empty: bool,
) -> Option<usize> {
    if target_len == 0 {
        return Some(0);
    }

    let (wanted_len, non_negative) = i64_to_usize(target_len, len_unsigned);
    if !non_negative {
        return None;
    }

    // Saturating multiplication keeps a huge length from wrapping around to a
    // small byte width that would slip past the limit.
    let max_width = MAX_BLOB_WIDTH as usize;
    if wanted_len.saturating_mul(size_of_type) > max_width {
        return None;
    }

    // An empty pad string can never grow the input.
    if pad_empty && wanted_len > input_len {
        return None;
    }

    Some(wanted_len)
}

#[cfg(test)]
mod tests {
    use super::{validate_target_len_for_pad, MAX_BLOB_WIDTH};

    #[test]
    fn test_validate_target_len_for_pad() {
        // Lengths interpreted as signed:
        // (target_len, input_len, size_of_type, pad_empty, expected result).
        let signed_cases = vec![
            (0, 10, 1, false, Some(0)),
            (-1, 10, 1, false, None),
            (12, 10, 1, true, None),
            (i64::from(MAX_BLOB_WIDTH) + 1, 10, 1, false, None),
            (i64::from(MAX_BLOB_WIDTH) / 4 + 1, 10, 4, false, None),
            (12, 10, 1, false, Some(12)),
        ];
        for (target_len, input_len, size_of_type, pad_empty, expected) in signed_cases {
            let actual = validate_target_len_for_pad(
                false,
                target_len,
                input_len,
                size_of_type,
                pad_empty,
            );
            assert_eq!(actual, expected);
        }

        // Lengths interpreted as unsigned; the `u64` is passed through as its
        // `i64` bit pattern.
        let unsigned_cases = vec![
            (u64::max_value(), 10, 1, false, None),
            (u64::max_value(), 10, 4, false, None),
            (u64::max_value(), 10, 1, true, None),
            (u64::max_value(), 10, 4, true, None),
            (12u64, 10, 4, false, Some(12)),
        ];
        for (target_len, input_len, size_of_type, pad_empty, expected) in unsigned_cases {
            let actual = validate_target_len_for_pad(
                true,
                target_len as i64,
                input_len,
                size_of_type,
                pad_empty,
            );
            assert_eq!(actual, expected);
        }
    }
}
103 changes: 103 additions & 0 deletions components/tidb_query_vec_expr/src/impl_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use tidb_query_codegen::rpn_fn;
use tidb_query_common::Result;
use tidb_query_datatype::codec::data_type::*;
use tidb_query_datatype::*;
use tidb_query_shared_expr::string::validate_target_len_for_pad;

const SPACE: u8 = 0o40u8;

Expand Down Expand Up @@ -116,6 +117,33 @@ pub fn rtrim(arg: &Option<Bytes>) -> Result<Option<Bytes>> {
}))
}

// Left-pads `arg` with the bytes of `pad`, repeated as needed, so that the
// result is exactly `len` bytes long. When `arg` is already longer than `len`
// it is truncated to its first `len` bytes instead.
//
// Returns `Ok(None)` (SQL NULL) when any argument is NULL or when
// `validate_target_len_for_pad` rejects the requested length (negative length,
// length above the blob width limit, or padding required with an empty `pad`).
#[rpn_fn]
#[inline]
pub fn lpad(arg: &Option<Bytes>, len: &Option<Int>, pad: &Option<Bytes>) -> Result<Option<Bytes>> {
    let (arg, len, pad) = match (arg, len, pad) {
        (Some(arg), Some(len), Some(pad)) => (arg, *len, pad),
        _ => return Ok(None),
    };

    // `len` is a signed value here, so the "unsigned" flag is `false`: a
    // negative length is then rejected directly as negative, instead of being
    // reinterpreted as a huge unsigned number and rejected only by the blob
    // width check (which relies on `usize` being 64 bits wide).
    let target_len = match validate_target_len_for_pad(false, len, arg.len(), 1, pad.is_empty()) {
        Some(target_len) => target_len,
        None => return Ok(None),
    };

    let padded = match target_len.checked_sub(arg.len()) {
        // `arg` fits: prepend `missing` bytes taken cyclically from `pad`.
        // `pad` cannot be empty when `missing > 0`; validation rejected that.
        Some(missing) => pad
            .iter()
            .cycle()
            .take(missing)
            .chain(arg.iter())
            .copied()
            .collect::<Bytes>(),
        // `target_len < arg.len()`: keep only the leading `target_len` bytes
        // (this also yields the empty string for a target length of 0).
        None => arg[..target_len].to_vec(),
    };
    Ok(Some(padded))
}

#[rpn_fn]
#[inline]
pub fn replace(
Expand Down Expand Up @@ -869,6 +897,81 @@ mod tests {
}
}

#[test]
fn test_lpad() {
    // Builds a non-NULL bytes argument from a byte slice.
    fn bytes(raw: &[u8]) -> Option<Vec<u8>> {
        Some(raw.to_vec())
    }

    // Each case is (arg, len, pad, expected output).
    let cases = vec![
        // Zero length yields the empty string.
        (bytes(b"hello"), Some(0), bytes(b"h"), bytes(b"")),
        // Shorter than the input: truncate.
        (bytes(b"hello"), Some(1), bytes(b"h"), bytes(b"h")),
        // Negative length yields NULL.
        (bytes(b"hello"), Some(-1), bytes(b"h"), None),
        // Empty pad is fine as long as no padding is needed.
        (bytes(b"hello"), Some(3), bytes(b""), bytes(b"hel")),
        (bytes(b"hello"), Some(8), bytes(b""), None),
        // Pad string is repeated cyclically.
        (bytes(b"hello"), Some(8), bytes(b"he"), bytes(b"hehhello")),
        (bytes(b"hello"), Some(9), bytes(b"he"), bytes(b"hehehello")),
        // Exact length: input is returned unchanged.
        (
            bytes(b"hello"),
            Some(5),
            bytes("您好".as_bytes()),
            bytes(b"hello"),
        ),
        (bytes(b"hello"), Some(6), bytes(b""), None),
        // Raw byte input.
        (
            bytes(b"\x61\x76\x5e"),
            Some(2),
            bytes(b"\x35"),
            bytes(b"\x61\x76"),
        ),
        (
            bytes(b"\x61\x76\x5e"),
            Some(5),
            bytes(b"\x35"),
            bytes(b"\x35\x35\x61\x76\x5e"),
        ),
        // Length above the blob width limit yields NULL.
        (
            bytes(b"hello"),
            Some(i64::from(MAX_BLOB_WIDTH) + 1),
            bytes(b"he"),
            None,
        ),
        // NULL arguments yield NULL.
        (None, Some(-1), bytes(b"h"), None),
        (None, None, None, None),
    ];

    for (input, target_len, padding, expected) in cases {
        let actual = RpnFnScalarEvaluator::new()
            .push_param(input)
            .push_param(target_len)
            .push_param(padding)
            .evaluate(ScalarFuncSig::Lpad)
            .unwrap();
        assert_eq!(actual, expected);
    }
}

#[test]
fn test_replace() {
let cases = vec![
Expand Down
1 change: 1 addition & 0 deletions components/tidb_query_vec_expr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result<RpnFnMeta> {
ScalarFuncSig::HexStrArg => hex_str_arg_fn_meta(),
ScalarFuncSig::LTrim => ltrim_fn_meta(),
ScalarFuncSig::RTrim => rtrim_fn_meta(),
ScalarFuncSig::Lpad => lpad_fn_meta(),
ScalarFuncSig::Trim1Arg => trim_1_arg_fn_meta(),
ScalarFuncSig::Replace => replace_fn_meta(),
ScalarFuncSig::Left => left_fn_meta(),
Expand Down

0 comments on commit ce111fb

Please sign in to comment.