diff --git a/Project.toml b/Project.toml
index b28f06a..8ae89eb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierFiles"
 uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db"
 authors = ["Daniel Rizk and contributors"]
-version = "0.1.2"
+version = "0.1.3"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
diff --git a/README.md b/README.md
index ee04c42..3f16adb 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,6 @@ The path can be a file available either locally or on the web.
 ```julia
 read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv", skip = 2, n_max = 3, col_select = ["ID", "Score"], missingstring = ["4"])
 ```
-
 ```
 3×2 DataFrame
  Row │ ID       Score
      │ Int64?   Int64?
 ─────┼──────────────────
@@ -80,4 +79,20 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin
    1 │       3     77
    2 │ missing     85
    3 │       5     95
+```
+
+Read multiple files by passing paths as a vector.
+```julia
+path = "https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv"
+read_csv([path, path], skip=3)
+```
+```
+4×3 DataFrame
+ Row │ ID     Name     Score
+     │ Int64  String7  Int64
+─────┼───────────────────────
+   1 │     4  David       85
+   2 │     5  Eva         95
+   3 │     4  David       85
+   4 │     5  Eva         95
 ```
\ No newline at end of file
diff --git a/docs/examples/UserGuide/delim.jl b/docs/examples/UserGuide/delim.jl
index b73cebd..d4232de 100644
--- a/docs/examples/UserGuide/delim.jl
+++ b/docs/examples/UserGuide/delim.jl
@@ -16,7 +16,7 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin
 
 #These functions read a delimited file (CSV, TSV, or custom delimiter) into a DataFrame. The arguments are:
 
-# - `file`: Path to the file or a URL.
+# - `file`: Path to the file or a URL, or a vector of paths/URLs.
 # - `delim`: Field delimiter. Default is ',' for `read_csv`, '\t' for `read_tsv` and `read_delim`.
 # - `col_names`: Use first row as column names. Can be `true`, `false`, or an array of strings. Default is `true`.
 # - `skip`: Number of lines to skip before reading data. Default is 0.
diff --git a/docs/examples/UserGuide/parquet.jl b/docs/examples/UserGuide/parquet.jl
index aab487c..b3e0788 100644
--- a/docs/examples/UserGuide/parquet.jl
+++ b/docs/examples/UserGuide/parquet.jl
@@ -4,7 +4,7 @@
 
 # This function reads a Parquet (.parquet) file into a DataFrame. The arguments are:
 
-# - `path`: The path to the .parquet file.
+# - `path`: Path to the .parquet file or a URL, or a vector of paths/URLs.
 # - `col_names`: Indicates if the first row of the file is used as column names. Default is `true`.
 # - `skip`: Number of initial rows to skip before reading data. Default is 0.
 # - `n_max`: Maximum number of rows to read. Default is `Inf` (read all rows).
diff --git a/docs/src/index.md b/docs/src/index.md
index 26e00ee..9ac5db8 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -68,7 +68,6 @@ The path can be a file available either locally or on the web.
 ```julia
 read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv", skip = 2, n_max = 3, col_select = ["ID", "Score"], missingstring = ["4"])
 ```
-
 ```
 3×2 DataFrame
  Row │ ID       Score
      │ Int64?   Int64?
 ─────┼──────────────────
@@ -77,4 +76,20 @@ read_csv("https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testin
    1 │       3     77
    2 │ missing     85
    3 │       5     95
+```
+
+Read multiple files by passing paths as a vector.
+```julia
+path = "https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv"
+read_csv([path, path], skip=3)
+```
+```
+4×3 DataFrame
+ Row │ ID     Name     Score
+     │ Int64  String7  Int64
+─────┼───────────────────────
+   1 │     4  David       85
+   2 │     5  Eva         95
+   3 │     4  David       85
+   4 │     5  Eva         95
 ```
\ No newline at end of file
diff --git a/src/TidierFiles.jl b/src/TidierFiles.jl
index fbc8b48..d9255fa 100644
--- a/src/TidierFiles.jl
+++ b/src/TidierFiles.jl
@@ -27,7 +27,7 @@ include("arrow_files.jl")
 """
 $docstring_read_csv
 """
-function read_csv(file;
+function read_csv(files;
                  delim=',',
                  col_names=true,
                  skip=0,
@@ -36,19 +36,22 @@ function read_csv(file;
                  comment=nothing,
                  missingstring="",
                  escape_double=true,
-                 ntasks::Int = Threads.nthreads(), # Default ntasks value
-                 num_threads::Union{Int, Nothing}=nothing) # Optional num_threads
+                 ntasks::Int = Threads.nthreads(),
+                 num_threads::Union{Int, Nothing}=nothing)
 
-    # Use num_threads if provided, otherwise stick with ntasks
+    # Normalize input to always be a vector of files
+    file_list = (typeof(files) <: AbstractString) ? [files] : files
+
+    # Use num_threads if provided, otherwise use ntasks
     effective_ntasks = isnothing(num_threads) ? ntasks : num_threads
-
-    # Convert n_max from Inf to Nothing for compatibility with CSV.File's limit argument
+
+    # Convert n_max from Inf to Nothing for compatibility
     limit = isinf(n_max) ? nothing : Int(n_max)
 
-    # Calculate skipto and header correctly
+    # Calculate skipto and header
     skipto = skip + (col_names === true ? 1 : 0)
 
-    # Prepare arguments for CSV.read, including the effective number of tasks to use
+    # Prepare CSV reading options
     read_options = (
         delim = delim,
         header = col_names === true ? 1 : 0,
@@ -64,64 +67,68 @@ function read_csv(file;
         ntasks = effective_ntasks > 1
     )
 
+    # Initialize an empty DataFrame
+    final_df = DataFrame()
 
-    # Filter options to remove any set to `nothing`
-    # clean_options = Dict{Symbol,Any}(filter(p -> !isnothing(p[2]), read_options))
-
-    # Check if the file is a URL and read accordingly
-    if startswith(file, "http://") || startswith(file, "https://")
-        # Fetch the content from the URL
-        response = HTTP.get(file)
-
-        # Ensure the request was successful
-        if response.status != 200
-            error("Failed to fetch the CSV file: HTTP status code ", response.status)
+    # Loop over files
+    for file in file_list
+        if startswith(file, "http://") || startswith(file, "https://")
+            # Fetch the content from the URL
+            response = HTTP.get(file)
+            if response.status != 200
+                error("Failed to fetch the CSV file: HTTP status code ", response.status)
+            end
+            # Read the CSV data
+            df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
+        else
+            # Read from a local file
+            df = CSV.File(file; read_options...) |> DataFrame
         end
-        # Read the CSV data from the fetched content using cleaned options
-        df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
-    else
-        # Read from a local file using cleaned options
-        df = CSV.File(file; read_options...) |> DataFrame
+        # Concatenate the read DataFrame to the final DataFrame
+        final_df = isempty(final_df) ? df : vcat(final_df, df, cols=:union)
     end
-    return df
+    return final_df
 end
 
 """
 $docstring_read_delim
 """
-function read_delim(file;
-                    delim='\t',
-                    decimal = '.',
-                    col_names=true,
-                    skip=0,
-                    n_max=Inf,
-                    groupmark=nothing,
-                    col_select=nothing,
-                    comment=nothing,
-                    missingstring="",
-                    escape_double=true,
-                    ntasks::Int = Threads.nthreads(), # Default ntasks value
-                    num_threads::Union{Int, Nothing}=nothing) # Optional num_threads
-
-    # Use num_threads if provided, otherwise stick with ntasks
+function read_delim(files;
+                    delim='\t',
+                    decimal='.',
+                    col_names=true,
+                    skip=0,
+                    n_max=Inf,
+                    groupmark=nothing,
+                    col_select=nothing,
+                    comment=nothing,
+                    missingstring="",
+                    escape_double=true,
+                    ntasks::Int = Threads.nthreads(),
+                    num_threads::Union{Int, Nothing}=nothing)
+
+    # Normalize input to always be a vector of files
+    file_list = (typeof(files) <: AbstractString) ? [files] : files
+
+    # Use num_threads if provided, otherwise use ntasks
     effective_ntasks = isnothing(num_threads) ? ntasks : num_threads
-
-    # Convert n_max from Inf to Nothing for compatibility with CSV.File's limit argument
+
+    # Convert n_max from Inf to Nothing for compatibility
     limit = isinf(n_max) ? nothing : Int(n_max)
 
-    # Calculate skipto and header correctly
+    # Calculate skipto and header
     skipto = skip + (col_names === true ? 1 : 0)
 
-    # Prepare arguments for CSV.read, including the effective number of tasks to use
+    # Prepare CSV reading options
     read_options = (
         delim = delim,
         decimal = decimal,
         header = col_names === true ? 1 : 0,
         skipto = skipto + 1,
         select = col_select,
-        groupmark=groupmark,
+        groupmark = groupmark,
         footerskip = 0,
         limit = limit,
         comment = comment,
@@ -131,32 +138,37 @@ function read_delim(file;
         normalizenames = false,
         ntasks = effective_ntasks > 1
     )
-    # Filter options to remove any set to `nothing`
-    # clean_options = Dict{Symbol,Any}(filter(p -> !isnothing(p[2]), read_options))
-
-    # Read the file into a DataFrame
-    if startswith(file, "http://") || startswith(file, "https://")
-        # Fetch the content from the URL
-        response = HTTP.get(file)
-
-        # Ensure the request was successful
-        if response.status != 200
-            error("Failed to fetch the delim file: HTTP status code ", response.status)
+
+    # Initialize an empty DataFrame
+    final_df = DataFrame()
+
+    # Loop over files
+    for file in file_list
+        if startswith(file, "http://") || startswith(file, "https://")
+            # Fetch the content from the URL
+            response = HTTP.get(file)
+            if response.status != 200
+                error("Failed to fetch the delim file: HTTP status code ", response.status)
+            end
+            # Read the CSV data
+            df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
+        else
+            # Read from a local file
+            df = CSV.File(file; read_options...) |> DataFrame
         end
-        # Read the CSV data from the fetched content using cleaned options
-        df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
-    else
-        # Read from a local file using cleaned options
-        df = CSV.File(file; read_options...) |> DataFrame
+        # Concatenate the read DataFrame to the final DataFrame
+        final_df = isempty(final_df) ? df : vcat(final_df, df, cols=:union)
     end
-    return df
+
+    return final_df
 end
 
+
 """
 $docstring_read_tsv
 """
-function read_tsv(file;
+function read_tsv(files;
                  delim='\t',
                  col_names=true,
                  skip=0,
                  n_max=Inf,
                  col_select=nothing,
@@ -165,19 +177,22 @@ function read_tsv(file;
                  comment=nothing,
                  missingstring="",
                  escape_double=true,
-                 ntasks::Int = Threads.nthreads(), # Default ntasks value
-                 num_threads::Union{Int, Nothing}=nothing) # Optional num_threads
-
-    # Use num_threads if provided, otherwise stick with ntasks
+                 ntasks::Int = Threads.nthreads(),
+                 num_threads::Union{Int, Nothing}=nothing)
+
+    # Normalize input to always be a vector of files
+    file_list = (typeof(files) <: AbstractString) ? [files] : files
+
+    # Use num_threads if provided, otherwise use ntasks
     effective_ntasks = isnothing(num_threads) ? ntasks : num_threads
-
-    # Convert n_max from Inf to Nothing for compatibility with CSV.File's limit argument
+
+    # Convert n_max from Inf to Nothing for compatibility
     limit = isinf(n_max) ? nothing : Int(n_max)
 
-    # Calculate skipto and header correctly
+    # Calculate skipto and header
     skipto = skip + (col_names === true ? 1 : 0)
 
-    # Prepare arguments for CSV.read, including the effective number of tasks to use
+    # Prepare CSV reading options
     read_options = (
         delim = delim,
         header = col_names === true ? 1 : 0,
         skipto = skipto + 1,
         select = col_select,
         footerskip = 0,
         limit = limit,
         comment = comment,
         missingstring = missingstring,
         escapechar = escape_double ? '"' : '\\',
         quotechar = '"',
@@ -192,30 +207,37 @@ function read_tsv(file;
         normalizenames = false,
         ntasks = effective_ntasks > 1
     )
-    # Read the TSV file into a DataFrame
-    if startswith(file, "http://") || startswith(file, "https://")
-        # Fetch the content from the URL
-        response = HTTP.get(file)
-
-        # Ensure the request was successful
-        if response.status != 200
-            error("Failed to fetch the TSV file: HTTP status code ", response.status)
+
+    # Initialize an empty DataFrame
+    final_df = DataFrame()
+
+    # Loop over files
+    for file in file_list
+        if startswith(file, "http://") || startswith(file, "https://")
+            # Fetch the content from the URL
+            response = HTTP.get(file)
+            if response.status != 200
+                error("Failed to fetch the TSV file: HTTP status code ", response.status)
+            end
+            # Read the CSV data
+            df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
+        else
+            # Read from a local file
+            df = CSV.File(file; read_options...) |> DataFrame
        end
-        # Read the CSV data from the fetched content using cleaned options
-        df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
-    else
-        # Read from a local file using cleaned options
-        df = CSV.File(file; read_options...) |> DataFrame
+        # Concatenate the read DataFrame to the final DataFrame
+        final_df = isempty(final_df) ? df : vcat(final_df, df, cols=:union)
     end
-    return df
+
+    return final_df
 end
 
 #"""
 #$docstring_read_csv2
 #"""
-function read_csv2(file;
+function read_csv2(files;
                   delim=';',
                   decimal = ',',
                   col_names=true,
                   skip=0,
                   n_max=Inf,
                   col_select=nothing,
@@ -226,19 +248,22 @@ function read_csv2(file;
                   comment=nothing,
                   missingstring="",
                   escape_double=true,
-                  ntasks::Int = Threads.nthreads(), # Default ntasks value
-                  num_threads::Union{Int, Nothing}=nothing) # Optional num_threads
+                  ntasks::Int = Threads.nthreads(),
+                  num_threads::Union{Int, Nothing}=nothing)
+
+    # Normalize input to always be a vector of files
+    file_list = (typeof(files) <: AbstractString) ? [files] : files
 
-    # Use num_threads if provided, otherwise stick with ntasks
+    # Use num_threads if provided, otherwise use ntasks
     effective_ntasks = isnothing(num_threads) ? ntasks : num_threads
-
-    # Convert n_max from Inf to Nothing for compatibility with CSV.File's limit argument
+
+    # Convert n_max from Inf to Nothing for compatibility
     limit = isinf(n_max) ? nothing : Int(n_max)
 
-    # Calculate skipto and header correctly
+    # Calculate skipto and header
     skipto = skip + (col_names === true ? 1 : 0)
 
-    # Prepare arguments for CSV.read, including the effective number of tasks to use
+    # Prepare CSV reading options
     read_options = (
         delim = delim,
         decimal = decimal,
@@ -256,28 +281,29 @@ function read_csv2(file;
         ntasks = effective_ntasks > 1
     )
 
+    # Initialize an empty DataFrame
+    final_df = DataFrame()
 
-    # Filter options to remove any set to `nothing`
-    # clean_options = Dict{Symbol,Any}(filter(p -> !isnothing(p[2]), read_options))
-
-    # Check if the file is a URL and read accordingly
-    if startswith(file, "http://") || startswith(file, "https://")
-        # Fetch the content from the URL
-        response = HTTP.get(file)
-
-        # Ensure the request was successful
-        if response.status != 200
-            error("Failed to fetch the CSV file: HTTP status code ", response.status)
+    # Loop over files
+    for file in file_list
+        if startswith(file, "http://") || startswith(file, "https://")
+            # Fetch the content from the URL
+            response = HTTP.get(file)
+            if response.status != 200
+                error("Failed to fetch the CSV file: HTTP status code ", response.status)
+            end
+            # Read the CSV data
+            df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
+        else
+            # Read from a local file
+            df = CSV.File(file; read_options...) |> DataFrame
         end
-        # Read the CSV data from the fetched content using cleaned options
-        df = CSV.File(IOBuffer(response.body); read_options...) |> DataFrame
-    else
-        # Read from a local file using cleaned options
-        df = CSV.File(file; read_options...) |> DataFrame
+        # Concatenate the read DataFrame to the final DataFrame
+        final_df = isempty(final_df) ? df : vcat(final_df, df, cols=:union)
     end
-    return df
+    return final_df
 end
diff --git a/src/docstrings.jl b/src/docstrings.jl
index ad1df95..17b4cc6 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -6,7 +6,7 @@ const docstring_read_csv =
 Reads a CSV file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options.
 
 # Arguments
-`file`: Path to the CSV file or a URL to a CSV file.
+`file`: Path to the CSV file or a URL, or a vector of paths/URLs.
 `delim`: The character delimiting fields in the file. Default is ','.
 `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true.
 `skip`: Number of initial lines to skip before reading data. Default is 0.
@@ -41,7 +41,7 @@ const docstring_read_tsv =
 Reads a TSV file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options.
 
 # Arguments
-`file`: Path to the TSV file or a URL to a TSV file.
+`file`: Path to the TSV file or a URL, or a vector of paths/URLs.
 `delim`: The character delimiting fields in the file. Default is ','.
 `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true.
 `skip`: Number of initial lines to skip before reading data. Default is 0.
@@ -77,7 +77,7 @@ const docstring_read_delim =
 Reads a delimited file or URL into a DataFrame, with options to specify delimiter, column names, and other CSV parsing options.
 
 # Arguments
-`file`: Path to the CSV file or a URL to a CSV file.
+`file`: Path to the CSV file or a URL, or a vector of paths/URLs.
 `delim`: The character delimiting fields in the file. Default is ','.
 `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true.
 `skip`: Number of initial lines to skip before reading data. Default is 0.
@@ -575,6 +575,7 @@ Write a DataFrame to an Parquet (.parquet) file.
 Arguments
 -`df`: The DataFrame to be written to a file.
 -`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten.
+
 # Examples
 ```jldoctest
 julia> df = DataFrame(AA=["Par", "quet"], AB=[10.1, 10.2]);
 
 julia> write_parquet(df, "test.parquet");
@@ -585,15 +586,15 @@
 
 const docstring_read_parquet =
 """
-    read_parquet(df, path)
-Read a Paquet File (.parquet) to a DataFrame..
+    read_parquet(path)
+Read a Parquet file (.parquet) into a DataFrame.
 Arguments
--`df`: The DataFrame to be written to a file.
--`path`: String as path where the .dta file will be created. If a file at this path already exists, it will be overwritten.
+-`path`: Path to the .parquet file or a URL, or a vector of paths/URLs to read.
 `col_names`: Indicates if the first row of the CSV is used as column names. Can be true, false, or an array of strings. Default is true.
 `skip`: Number of initial lines to skip before reading data. Default is 0.
 `n_max`: Maximum number of rows to read. Default is Inf (read all rows).
 -`col_select`: Optional vector of symbols or strings to select which columns to load.
+
 # Examples
 ```jldoctest
 julia> df = DataFrame(AA=["Par", "quet"], AB=[10.1, 10.2]);
diff --git a/src/parquet_files.jl b/src/parquet_files.jl
index 84b002c..51fe3bb 100644
--- a/src/parquet_files.jl
+++ b/src/parquet_files.jl
@@ -1,57 +1,70 @@
 """
 $docstring_read_parquet
 """
-function read_parquet(data_file;
+function read_parquet(files;
                      col_select=nothing,
                      skip=0,
                      n_max=Inf,
-                     col_names=true) # Handle column names display
-    # Determine if the file is a local file or a URL
-    if startswith(data_file, "http://") || startswith(data_file, "https://")
-        # Fetch the content from the URL
-        response = HTTP.get(data_file)
+                     col_names=true)
 
-        # Ensure the request was successful
-        if response.status != 200
-            error("Failed to fetch the Parquet file: HTTP status code ", response.status)
+    # Normalize input to always be a vector of files
+    file_list = (typeof(files) <: AbstractString) ? [files] : files
+
+    # Initialize an empty DataFrame
+    final_df = DataFrame()
+
+    # Loop over files
+    for data_file in file_list
+        # Determine if the file is a local file or a URL
+        if startswith(data_file, "http://") || startswith(data_file, "https://")
+            # Fetch the content from the URL
+            response = HTTP.get(data_file)
+
+            # Ensure the request was successful
+            if response.status != 200
+                error("Failed to fetch the Parquet file: HTTP status code ", response.status)
+            end
+
+            # Use the content fetched from the URL as an IOBuffer for reading
+            file_to_read = IOBuffer(response.body)
+        else
+            # Use the local file path
+            file_to_read = data_file
         end
-        # Use the content fetched from the URL as an IOBuffer for reading
-        file_to_read = IOBuffer(response.body)
-    else
-        # Use the local file path
-        file_to_read = data_file
-    end
+        # Open the dataset
+        ds = Parquet2.Dataset(file_to_read)
+        df = DataFrame(ds; copycols=false) # Load the entire dataset initially
 
-    # Open the dataset
-    ds = Parquet2.Dataset(file_to_read)
-    df = DataFrame(ds; copycols=false) # Load the entire dataset initially
+        # Apply column selection if provided
+        if !isnothing(col_select)
+            # Ensure column names are in the correct format
+            col_select = [typeof(c) === Symbol ? string(c) : c for c in col_select]
+            df = select(df, col_select)
+        end
 
-    # Apply column selection if provided
-    if !isnothing(col_select)
-        # Ensure column names are in the correct format
-        col_select = [typeof(c) === Symbol ? string(c) : c for c in col_select]
-        df = select(df, col_select)
-    end
+        # Apply skip and limit
+        if skip > 0 || !isinf(n_max)
+            start_idx = max(1, skip + 1)
+            end_idx = !isinf(n_max) ? start_idx + n_max - 1 : nrow(df)
+            df = df[start_idx:min(end_idx, nrow(df)), :]
+        end
 
-    # Apply skip and limit
-    if skip > 0 || !isinf(n_max)
-        start_idx = max(1, skip + 1)
-        end_idx = !isinf(n_max) ? start_idx + n_max - 1 : nrow(df)
-        df = df[start_idx:min(end_idx, nrow(df)), :]
-    end
+        # If column names should not be displayed as headers
+        if !col_names
+            # Create a DataFrame with the original column names as the first row
+            col_names_df = DataFrame([transpose(names(df))], [:ColumnNames])
+            # Concatenate the DataFrame with column names as the first row
+            df = vcat(col_names_df, df)
+            # Rename columns to generic names
+            rename!(df, Symbol.(:Column, 1:ncol(df)))
+        end
 
-    # If column names should not be displayed as headers
-    if !col_names
-        # Create a DataFrame with the original column names as the first row
-        col_names_df = DataFrame([transpose(names(df))], [:ColumnNames])
-        # Concatenate the DataFrame with column names as the first row
-        df = vcat(col_names_df, df)
-        # Rename columns to generic names
-        rename!(df, Symbol.(:Column, 1:ncol(df)))
+        # Concatenate the read DataFrame to the final DataFrame
+        final_df = isempty(final_df) ? df : vcat(final_df, df, cols=:union)
     end
 
-    return df
+    return final_df
 end
 
 """
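A quick usage sketch of the vector-of-paths behavior introduced above, assuming TidierFiles 0.1.3 with the functions as patched in this diff. The URL is the repository's own test file; the local .parquet paths are hypothetical placeholders:

```julia
using TidierFiles

url = "https://raw.githubusercontent.com/TidierOrg/TidierFiles.jl/main/testing_files/csvtest.csv"

# A single path still returns one DataFrame, exactly as in 0.1.2
df_one = read_csv(url, skip = 2, n_max = 3)

# A vector of paths reads each file and stacks the results with
# vcat(...; cols = :union), so files whose columns differ still combine,
# with absent columns filled in as `missing`
df_many = read_csv([url, url], skip = 3)

# The same convention applies to read_tsv, read_delim, and read_parquet;
# these parquet paths are hypothetical, for illustration only
# read_parquet(["data/part1.parquet", "data/part2.parquet"])
```

Because each file is parsed with the same `read_options`, per-file arguments such as `skip` and `n_max` apply to every file in the vector individually, not once to the concatenated result.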