diff --git a/Project.toml b/Project.toml
index 97a7362..b534846 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,8 +4,10 @@ authors = ["Daniel Rizk && Contributors"]
 version = "0.2.1"
 
 [deps]
+StringEncodings = "69024149-9ee7-55f6-a4c4-859efe599b68"
 
 [compat]
+StringEncodings = "0.3"
 julia = "1.6"
 
 [extras]
diff --git a/README.md b/README.md
index 0f1beb9..dfa1045 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ TidierStrings.jl currently supports:
 - `str_detect()`
 - `str_replace()`
 - `str_replace_all()`
+- `str_replace_missing()`
 - `str_removal_all()`
 - `str_remove()`
 - `str_count()`
@@ -57,6 +58,12 @@ TidierStrings.jl currently supports:
 - `str_starts()`
 - `str_ends()`
 - `str_which()`
+- `str_flatten()`
+- `str_flatten_comma()`
+- `str_locate()`
+- `str_locate_all()`
+- `str_conv()`
+- `str_like()`
 - `word()`
 
 ## Examples
diff --git a/docs/src/index.md b/docs/src/index.md
index 749dcad..ad45cc0 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -10,6 +10,7 @@ This package includes:
 - `str_detect()`
 - `str_replace()`
 - `str_replace_all()`
+- `str_replace_missing()`
 - `str_removal_all()`
 - `str_remove()`
 - `str_count()`
@@ -28,4 +29,10 @@ This package includes:
 - `str_starts()`
 - `str_ends()`
 - `str_which()`
+- `str_flatten()`
+- `str_flatten_comma()`
+- `str_locate()`
+- `str_locate_all()`
+- `str_conv()`
+- `str_like()`
 - `word()`
diff --git a/src/TidierStrings.jl b/src/TidierStrings.jl
index 9219f45..575ad4d 100644
--- a/src/TidierStrings.jl
+++ b/src/TidierStrings.jl
@@ -1,7 +1,10 @@
 module TidierStrings
 
+using StringEncodings
+
 export str_detect, str_replace, str_replace_all, str_remove_all, str_remove, str_count, str_squish, str_equal, str_to_upper, str_to_lower, str_split, str_subset,
-    str_to_title, str_to_sentence, str_dup, str_length, str_width, str_trim, str_unique, word, str_starts, str_ends, str_which
+    str_to_title, str_to_sentence, str_dup, str_length, str_width, str_trim, str_unique, word, str_starts, str_ends, str_which, str_flatten, str_flatten_comma,
+    str_locate, str_locate_all, str_conv, str_replace_missing, str_like
 
 include("strings_docstrings.jl")
 
@@ -35,6 +38,111 @@ function str_detect(column, pattern::Union{String, Regex})
     end
 end
 
+"""
+$docstring_str_locate
+"""
+function str_locate(string::AbstractString, pattern::Union{AbstractString,Regex})
+    regex_pattern = pattern isa Regex ? pattern : Regex(pattern)
+    m = Base.match(regex_pattern, string)
+
+    # No match: keep the `(NaN, NaN)` sentinel so callers can test it with `isnan`.
+    m === nothing && return (NaN, NaN)
+
+    # `m.offset` is the index of the first matched character; the inclusive end is the
+    # index of the last matched character (one before the start for an empty match).
+    return (m.offset, prevind(string, m.offset + ncodeunits(m.match)))
+end
+
+"""
+$docstring_str_locate_all
+"""
+function str_locate_all(string::AbstractString, pattern::Union{AbstractString,Regex})
+    regex_pattern = pattern isa Regex ? pattern : Regex(pattern)
+
+    # Typed accumulator: the result is a `Vector{Tuple{Int,Int}}` even with no matches.
+    spans = Tuple{Int,Int}[]
+    for m in eachmatch(regex_pattern, string)
+        # (index of the first matched character, index of the last matched character)
+        push!(spans, (m.offset, prevind(string, m.offset + ncodeunits(m.match))))
+    end
+    return spans
+end
+
+"""
+$docstring_str_flatten
+"""
+function str_flatten(string::AbstractVector, collapse::AbstractString="", last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
+    items = missing_rm ? skipmissing(string) : string
+
+    # `join` already handles empty and one-element input and always returns a `String`;
+    # its three-argument form places `last` between the final two items.
+    return isnothing(last) ? join(items, collapse) : join(items, collapse, last)
+end
+
+"""
+$docstring_str_flatten_comma
+"""
+function str_flatten_comma(string::AbstractVector, last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
+    return str_flatten(string, ", ", last, missing_rm=missing_rm)
+end
+
+"""
+$docstring_str_conv
+"""
+function str_conv(string::Union{String,Vector{UInt8}}, encoding::String)
+    encoder = StringEncodings.Encoding(encoding)
+    if isa(string, Vector{UInt8})
+        return StringEncodings.decode(string, encoder)
+    else
+        # Round-trip the text through `encoding`: encode it to bytes, then decode them.
+        byte_array = StringEncodings.encode(string, encoder)
+        return StringEncodings.decode(byte_array, encoder)
+    end
+end
+
+"""
+$docstring_str_replace_missing
+"""
+function str_replace_missing(string::AbstractVector{<:Union{Missing,AbstractString}}, replacement::AbstractString="missing")
+    return [ismissing(s) ? replacement : s for s in string]
+end
+
+"""
+$docstring_str_like
+"""
+function str_like(string::AbstractVector{<:AbstractString}, pattern::AbstractString; ignore_case::Bool = true)
+    # Translate the SQL LIKE pattern one character at a time so that escapes
+    # (`\%`, `\_`) and regex metacharacters are handled before wildcards are expanded.
+    io = IOBuffer()
+    print(io, '^')
+    escaped = false
+    for c in pattern
+        if !escaped && c == '\\'
+            escaped = true
+            continue
+        elseif !escaped && c == '%'
+            print(io, ".*")
+        elseif !escaped && c == '_'
+            print(io, '.')
+        elseif isascii(c) && !isletter(c) && !isdigit(c)
+            # Backslash-quote ASCII punctuation so the regex engine matches it literally.
+            print(io, '\\', c)
+        else
+            print(io, c)
+        end
+        escaped = false
+    end
+    # A trailing, unpaired backslash stands for itself.
+    escaped && print(io, "\\\\")
+    print(io, '$')
+
+    # The "s" flag lets the wildcards span newlines; "i" makes the match case-insensitive.
+    regex_pattern = Regex(String(take!(io)), ignore_case ? "is" : "s")
+
+    # Apply the pattern to each string in the input vector
+    return [occursin(regex_pattern, str) for str in string]
+end
+
 """
 $docstring_str_starts
 """
diff --git a/src/strings_docstrings.jl b/src/strings_docstrings.jl
index 307ec8a..04259bd 100644
--- a/src/strings_docstrings.jl
+++ b/src/strings_docstrings.jl
@@ -32,6 +32,67 @@ true
 ```
 """
 
+const docstring_str_flatten =
+"""
+    str_flatten(string::AbstractVector, collapse::AbstractString="", last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
+
+Flatten a string vector into a single string.
+
+Arguments
+- `string`: Input vector of strings (or characters).
+- `collapse`: The string to insert between each string in the input vector. Default is `""`.
+- `last`: The string to insert between the last two elements instead of `collapse`. Default is `nothing`.
+- `missing_rm`: Remove `Missing` values from the input vector. Default is `false`.
+
+Returns
+A flattened string.
+
+Examples
+```jldoctest
+julia> str_flatten(["a", "b", "c"])
+"abc"
+
+julia> str_flatten(["a", "b", "c", "d"])
+"abcd"
+
+julia> str_flatten(['a', 'b', 'c'], "-")
+"a-b-c"
+
+julia> str_flatten(['a', 'b', 'c'], ", ")
+"a, b, c"
+
+julia> str_flatten(['a', 'b', 'c'], ", ", " and ")
+"a, b and c"
+```
+"""
+
+const docstring_str_flatten_comma =
+"""
+    str_flatten_comma(string::AbstractVector, last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
+
+Flatten a string vector into a single string, separated by commas.
+
+Arguments
+- `string`: Input vector of strings (or characters).
+- `last`: The string to insert between the last two elements instead of `", "`. Default is `nothing`.
+- `missing_rm`: Remove `Missing` values from the input vector. Default is `false`.
+
+Returns
+A flattened string.
+
+Examples
+```jldoctest
+julia> str_flatten_comma(['a', 'b', 'c'])
+"a, b, c"
+
+julia> str_flatten_comma(['a', 'b'])
+"a, b"
+
+julia> str_flatten_comma(['a', 'b'], " and ")
+"a and b"
+```
+"""
+
 const docstring_str_replace =
 """
     str_replace(column::String, pattern::Union{String, Regex}, replacement::String)
@@ -535,4 +596,135 @@ Int64[]
 julia> str_which(["apple", "banana", "pear", "pineapple"], "a", negate=true) # []
 Int64[]
 ```
-"""
\ No newline at end of file
+"""
+
+const docstring_str_locate =
+"""
+    str_locate(string::AbstractString, pattern::Union{AbstractString, Regex})
+
+Returns the start and end index of the first occurrence of a pattern in a string.
+
+Arguments
+- `string`: Input string.
+- `pattern`: The pattern to search for. Can be a string or a regular expression.
+
+Returns
+A tuple `(start, end)` where `start` is the index of the first matched character and `end` is the index of the last matched character, or `(NaN, NaN)` if there is no match.
+
+Examples
+```jldoctest
+julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate(fruit[1], "e")
+(5, 5)
+
+julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate(fruit[2], "a")
+(2, 2)
+
+julia> str_locate("banana", "ana")
+(2, 4)
+```
+"""
+
+const docstring_str_locate_all =
+"""
+    str_locate_all(string::AbstractString, pattern::Union{AbstractString, Regex})
+
+Returns the indices of all occurrences of a pattern in a string.
+
+Arguments
+- `string`: Input string.
+- `pattern`: The pattern to search for. Can be a string or a regular expression.
+
+Returns
+A vector of tuples `(start, end)` where `start` is the index of the first matched character and `end` is the index of the last matched character.
+
+Examples
+```jldoctest
+julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate_all(fruit[1], "e")
+1-element Vector{Tuple{Int64, Int64}}:
+ (5, 5)
+
+julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate_all(fruit[2], "a")
+3-element Vector{Tuple{Int64, Int64}}:
+ (2, 2)
+ (4, 4)
+ (6, 6)
+```
+"""
+
+const docstring_str_replace_missing =
+"""
+    str_replace_missing(string::AbstractVector{<:Union{Missing,AbstractString}}, replacement::AbstractString="missing")
+
+Replaces missing values in a vector with a specified string.
+
+Arguments
+- `string`: Input vector of strings, possibly containing `missing`.
+- `replacement`: The string to replace missing values with. Default is "missing".
+
+Returns
+The vector of strings with missing values replaced.
+
+Examples
+```jldoctest
+julia> str_replace_missing(["apple", missing, "pear", "pineapple"])
+4-element Vector{String}:
+ "apple"
+ "missing"
+ "pear"
+ "pineapple"
+```
+"""
+
+const docstring_str_conv =
+"""
+    str_conv(string::Union{String,Vector{UInt8}}, encoding::String)
+
+Converts a string to a different encoding.
+
+Arguments
+- `string`: Input string, or a vector of bytes to decode.
+- `encoding`: A String that specifies the encoding to use.
+
+Returns
+The converted string.
+
+Examples
+```jldoctest
+julia> str_conv("Hello, World!", "UTF-8")
+"Hello, World!"
+
+julia> str_conv("Hello, World!", "ASCII")
+"Hello, World!"
+
+julia> str_conv("Héllo, Wörld!", "ISO-8859-1")
+"Héllo, Wörld!"
+
+julia> str_conv([0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x2C, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21], "UTF-8")
+"Hello, world!"
+```
+"""
+
+const docstring_str_like =
+"""
+    str_like(string::AbstractVector{<:AbstractString}, pattern::AbstractString; ignore_case::Bool = true)
+
+Detect a pattern in each string of the input vector using SQL-like pattern matching.
+
+Arguments
+- `string`: Input vector of strings.
+- `pattern`: A SQL LIKE pattern. `%` matches any sequence of characters, `_` matches exactly one character, and a backslash before either makes it literal. All other characters match themselves.
+- `ignore_case`: Whether to ignore case when matching. Default is `true`.
+
+Returns
+A vector of booleans indicating if the string matches the pattern.
+
+Examples
+```jldoctest
+julia> str_like(["Hello", "world", "HELLO", "WORLD"], "H_llo")
+4-element Vector{Bool}:
+ 1
+ 0
+ 1
+ 0
+```
+"""