Skip to content

Commit

Permalink
Merge pull request #17 from cecoeco/main
Browse files Browse the repository at this point in the history
  • Loading branch information
drizk1 committed May 30, 2024
2 parents e7e0004 + 3a53db0 commit 8d77d5f
Show file tree
Hide file tree
Showing 5 changed files with 309 additions and 2 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ authors = ["Daniel Rizk && Contributors"]
version = "0.2.1"

[deps]
StringEncodings = "69024149-9ee7-55f6-a4c4-859efe599b68"

[compat]
StringEncodings = "0.3"
julia = "1.6"

[extras]
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ TidierStrings.jl currently supports:
- `str_detect()`
- `str_replace()`
- `str_replace_all()`
- `str_replace_missing()`
- `str_remove_all()`
- `str_remove()`
- `str_count()`
Expand All @@ -57,6 +58,12 @@ TidierStrings.jl currently supports:
- `str_starts()`
- `str_ends()`
- `str_which()`
- `str_flatten()`
- `str_flatten_comma()`
- `str_locate()`
- `str_locate_all()`
- `str_conv()`
- `str_like()`
- `word()`

## Examples
Expand Down
7 changes: 7 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ This package includes:
- `str_detect()`
- `str_replace()`
- `str_replace_all()`
- `str_replace_missing()`
- `str_remove_all()`
- `str_remove()`
- `str_count()`
Expand All @@ -28,4 +29,10 @@ This package includes:
- `str_starts()`
- `str_ends()`
- `str_which()`
- `str_flatten()`
- `str_flatten_comma()`
- `str_locate()`
- `str_locate_all()`
- `str_conv()`
- `str_like()`
- `word()`
108 changes: 107 additions & 1 deletion src/TidierStrings.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
module TidierStrings

using StringEncodings

export str_detect, str_replace, str_replace_all, str_remove_all, str_remove, str_count, str_squish, str_equal, str_to_upper, str_to_lower, str_split, str_subset,
str_to_title, str_to_sentence, str_dup, str_length, str_width, str_trim, str_unique, word, str_starts, str_ends, str_which
str_to_title, str_to_sentence, str_dup, str_length, str_width, str_trim, str_unique, word, str_starts, str_ends, str_which, str_flatten, str_flatten_comma,
str_locate, str_locate_all, str_conv, str_replace_missing, str_like

include("strings_docstrings.jl")

Expand Down Expand Up @@ -35,6 +38,109 @@ function str_detect(column, pattern::Union{String, Regex})
end
end

"""
$docstring_str_locate
"""
function str_locate(string::AbstractString, pattern::Union{AbstractString,Regex})
    # Locate the first match of `pattern` in `string`.
    #
    # `pattern` may be a `Regex` or a string; a string is compiled as a regular
    # expression. Returns `(start, stop)`, the Julia string indices of the first
    # and last character of the match. An empty match yields `stop == start - 1`.
    # When nothing matches, `(NaN, NaN)` is returned (kept for compatibility).
    regex_pattern = pattern isa Regex ? pattern : Regex(pattern)

    m = Base.match(regex_pattern, string)
    if m === nothing
        return (NaN, NaN)
    end

    # `m.offset` is a single integer (the start of the match), so the end has to
    # be derived from the matched substring. `lastindex` is used instead of the
    # byte length so the result is a valid character index for non-ASCII text.
    start = m.offset
    stop = start + lastindex(m.match) - 1
    return (start, stop)
end

"""
$docstring_str_locate_all
"""
function str_locate_all(string::AbstractString, pattern::Union{AbstractString,Regex})
    # Locate every non-overlapping match of `pattern` in `string`.
    #
    # `pattern` may be a `Regex` or a string; a string is compiled as a regular
    # expression. Returns a vector of `(start, stop)` tuples holding the Julia
    # string indices of the first and last character of each match. The vector
    # is empty when nothing matches.
    regex_pattern = pattern isa Regex ? pattern : Regex(pattern)

    # `m.offset` is only the start index; the end is computed from the matched
    # substring. The typed comprehension keeps the element type concrete even
    # when there are no matches.
    return Tuple{Int,Int}[
        (m.offset, m.offset + lastindex(m.match) - 1)
        for m in eachmatch(regex_pattern, string)
    ]
end

"""
$docstring_str_flatten
"""
function str_flatten(string::AbstractVector, collapse::AbstractString="", last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
    # Flatten the elements of `string` into one `String`.
    #
    # - `collapse`: separator placed between consecutive elements.
    # - `last`: optional separator used between the final two elements instead
    #   of `collapse` (e.g. " and "); `nothing` means `collapse` is used throughout.
    # - `missing_rm`: when `true`, `missing` elements are skipped. When `false`,
    #   a `missing` element is rendered as the text "missing".
    #
    # The result is always a `String`: "" for empty input, and the printed form
    # of the single element for a one-element vector (previously the raw element
    # was returned, e.g. a `Char`).
    items = missing_rm ? skipmissing(string) : string

    # Base's `join` already implements the "different last separator" rule,
    # including the empty, one-element and two-element cases.
    if isnothing(last)
        return join(items, collapse)
    else
        return join(items, collapse, last)
    end
end

"""
$docstring_str_flatten_comma
"""
function str_flatten_comma(string::AbstractVector, last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
    # Comma-separated flattening is `str_flatten` with a fixed ", " separator;
    # `last` and `missing_rm` are forwarded unchanged.
    separator = ", "
    flattened = str_flatten(string, separator, last; missing_rm=missing_rm)
    return flattened
end

"""
$docstring_str_conv
"""
function str_conv(string::Union{String,Vector{UInt8}}, encoding::String)
    # Decode `string` using the named `encoding` and return the resulting text.
    #
    # A byte vector is decoded as given. A `String` is first encoded into bytes
    # with the same encoding and then decoded again (a round trip through the
    # requested encoding).
    target = StringEncodings.Encoding(encoding)
    raw_bytes = string isa Vector{UInt8} ? string : StringEncodings.encode(string, target)
    return StringEncodings.decode(raw_bytes, target)
end

"""
$docstring_str_replace_missing
"""
function str_replace_missing(string::AbstractVector{<:Union{Missing,AbstractString}}, replacement::AbstractString="missing")
    # Replace every `missing` element of `string` with `replacement`
    # (default "missing") and return a new vector; the input is not modified.
    #
    # The element type is constrained with `<:` because Julia type parameters
    # are invariant: without it a plain `Vector{String}` (no missing values)
    # would not match this method at all.
    return coalesce.(string, replacement)
end

"""
$docstring_str_like
"""
function str_like(string::AbstractVector{<:AbstractString}, pattern::AbstractString; ignore_case::Bool = true)
    # Match each element of `string` against an SQL LIKE `pattern`.
    #
    # Pattern syntax:
    #   `%`   matches any run of characters (including none)
    #   `_`   matches exactly one character
    #   `\%`, `\_`, `\\` match a literal `%`, `_`, `\`
    # Every other character matches itself literally, so regex metacharacters
    # such as `.` or `(` carry no special meaning. The pattern must match the
    # whole string. Matching is case-insensitive unless `ignore_case=false`.
    # Returns a `Vector{Bool}` with one entry per input string.
    regex_meta = "\\^\$.|?*+()[]{}"
    buf = IOBuffer()

    # Write one character so that the regex engine treats it literally.
    emit_literal(ch) = (ch in regex_meta && print(buf, '\\'); print(buf, ch))

    # Translate in a single pass so escapes are resolved *before* wildcards;
    # doing it with successive `replace` calls loses the escape information.
    escaped = false
    for ch in pattern
        if escaped
            emit_literal(ch)
            escaped = false
        elseif ch == '\\'
            escaped = true
        elseif ch == '%'
            print(buf, ".*")
        elseif ch == '_'
            print(buf, '.')
        else
            emit_literal(ch)
        end
    end
    # A trailing lone backslash has nothing to escape; treat it as a literal.
    escaped && emit_literal('\\')

    regex_flags = ignore_case ? "i" : ""
    regex_pattern = Regex("^" * String(take!(buf)) * "\$", regex_flags)

    return [occursin(regex_pattern, str) for str in string]
end

"""
$docstring_str_starts
"""
Expand Down
187 changes: 186 additions & 1 deletion src/strings_docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,67 @@ true
```
"""

const docstring_str_flatten =
"""
str_flatten(string::AbstractVector, collapse::AbstractString="", last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
Flatten a string vector into a single string.
Arguments
- `string`: Input vector of strings to flatten.
- `collapse`: The string to insert between each string in the input vector. Default is `""`.
- `last`: Optional separator to use between the last two elements instead of `collapse` (for example `" and "`). Default is `nothing`, which uses `collapse` throughout.
- `missing_rm`: Remove `Missing` values from the input vector. Default is `false`.
Returns
A flattened string.
Examples
```jldoctest
julia> str_flatten(["a", "b", "c"])
"abc"
julia> str_flatten(["a", "b", "c", "d"])
"abcd"
julia> str_flatten(['a', 'b', 'c'], "-")
"a-b-c"
julia> str_flatten(['a', 'b', 'c'], ", ")
"a, b, c"
julia> str_flatten(['a', 'b', 'c'], ", ", " and ")
"a, b and c"
```
"""

const docstring_str_flatten_comma =
"""
str_flatten_comma(string::AbstractVector, last::Union{Nothing,AbstractString}=nothing; missing_rm::Bool=false)
Flatten a string vector into a single string, separated by commas.
Arguments
- `string`: Input vector of strings to flatten.
- `last`: Optional separator to use between the last two elements instead of `", "` (for example `" and "`). Default is `nothing`, which uses `", "` throughout.
- `missing_rm`: Remove `Missing` values from the input vector. Default is `false`.
Returns
A flattened string.
Examples
```jldoctest
julia> str_flatten_comma(['a', 'b', 'c'])
"a, b, c"
julia> str_flatten_comma(['a', 'b'])
"a, b"
julia> str_flatten_comma(['a', 'b'], " and ")
"a and b"
```
"""

const docstring_str_replace =
"""
str_replace(column::String, pattern::Union{String, Regex}, replacement::String)
Expand Down Expand Up @@ -535,4 +596,128 @@ Int64[]
julia> str_which(["apple", "banana", "pear", "pineapple"], "a", negate=true) # []
Int64[]
```
"""
"""

const docstring_str_locate =
"""
str_locate(string::AbstractString, pattern::Union{AbstractString, Regex})
Returns the start and end positions of the first occurrence of a pattern in a string.
Arguments
- `string`: Input string.
- `pattern`: The pattern to search for. Can be a string or a regular expression.
Returns
A tuple `(start, end)` where `start` is the position at the start of the match and `end` is the position of the end. If the pattern is not found, `(NaN, NaN)` is returned.
Examples
```jldoctest
julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate(fruit[1], "e")
(5, 5)
julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate(fruit[2], "a")
(2, 2)
```
"""
const docstring_str_locate_all =
"""
str_locate_all(string::AbstractString, pattern::Union{AbstractString, Regex})
Returns the indices of all occurrences of a pattern in a string.
Arguments
- `string`: Input string.
- `pattern`: The pattern to search for. Can be a string or a regular expression.
Returns
A vector of tuples `(start, end)` where `start` is the position at the start of the match and `end` is the position of the end. The vector is empty if the pattern is not found.
Examples
```jldoctest
julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate_all(fruit[1], "e")
1-element Vector{Tuple{Int64, Int64}}:
(5, 5)
julia> fruit = ["apple", "banana", "pear", "pineapple"]; str_locate_all(fruit[2], "a")
3-element Vector{Tuple{Int64, Int64}}:
(2, 2)
(4, 4)
(6, 6)
```
"""

const docstring_str_replace_missing =
"""
str_replace_missing(string::AbstractVector{Union{Missing,String}}, replacement::String="missing")
Replaces missing values in a vector with a specified string.
Arguments
- `string`: Input vector of strings.
- `replacement`: The string to replace missing values with. Default is "missing".
Returns
The vector of strings with missing values replaced.
Examples
```jldoctest
julia> str_replace_missing(["apple", missing, "pear", "pineapple"])
4-element Vector{String}:
"apple"
"missing"
"pear"
"pineapple"
```
"""

const docstring_str_conv =
"""
str_conv(string::Union{String,Vector{UInt8}}, encoding::String)
Converts a string to a different encoding.
Arguments
- `string`: Input string.
- `encoding`: A String that specifies the encoding to use.
Returns
The converted string.
Examples
```jldoctest
julia> str_conv("Hello, World!", "UTF-8")
"Hello, World!"
julia> str_conv("Hello, World!", "ASCII")
"Hello, World!"
julia> str_conv("Héllo, Wörld!", "ISO-8859-1")
"Héllo, Wörld!"
julia> str_conv([0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x2C, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21], "UTF-8")
"Hello, world!"
```
"""

const docstring_str_like =
"""
str_like(string::AbstractVector{String}, pattern::String; ignore_case::Bool = true)
Detect a pattern in each string of the input vector using SQL-like pattern matching.
Arguments
- `string`: Input vector of strings.
- `pattern`: The SQL LIKE pattern to match against the whole string. `%` matches any number of characters and `_` matches exactly one character.
- `ignore_case`: Whether to ignore case when matching. Default is `true`.
Returns
A vector of booleans indicating if the string matches the pattern.
```jldoctest
julia> str_like(["Hello", "world", "HELLO", "WORLD"], "H_llo")
4-element Vector{Bool}:
1
0
1
0
```
"""

0 comments on commit 8d77d5f

Please sign in to comment.