diff --git a/Project.toml b/Project.toml
index ea32cb27..283696b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,6 +6,7 @@ version = "0.8.0"
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
@@ -17,10 +18,12 @@
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+TulipaIO = "7b3808b7-0819-42d4-885c-978ba173db11"
 [compat]
 CSV = "0.10"
 DataFrames = "1"
+DuckDB = "0.10"
 Graphs = "1.8"
 HiGHS = "1"
 JuMP = "1"
diff --git a/docs/src/tutorials.md b/docs/src/tutorials.md
index 7ece43d5..79b631c4 100644
--- a/docs/src/tutorials.md
+++ b/docs/src/tutorials.md
@@ -91,7 +91,11 @@
 using TulipaEnergyModel
 input_dir = "../../test/inputs/Tiny" # hide
 # input_dir should be the path to Tiny
-table_tree = create_input_dataframes_from_csv_folder(input_dir)
+connection = create_connection_and_import_from_csv_folder(input_dir)
+```
+
+```@example manual
+table_tree = create_input_dataframes(connection)
 ```
 
 The `table_tree` contains all tables in the folder, which are then processed into the internal structures below:
diff --git a/src/TulipaEnergyModel.jl b/src/TulipaEnergyModel.jl
index 7286bbce..19622aee 100644
--- a/src/TulipaEnergyModel.jl
+++ b/src/TulipaEnergyModel.jl
@@ -5,7 +5,9 @@ module TulipaEnergyModel
 ## Data
 using CSV: CSV
 using DataFrames: DataFrames, DataFrame
+using DuckDB: DuckDB, DBInterface
 using TOML: TOML
+using TulipaIO: TulipaIO
 
 ## Graph
 using Graphs: Graphs, SimpleDiGraph
diff --git a/src/input-schemas.jl b/src/input-schemas.jl
index a24bc858..ebafe98b 100644
--- a/src/input-schemas.jl
+++ b/src/input-schemas.jl
@@ -121,16 +121,16 @@ const schemas = (
 )
 
 const schema_per_file = OrderedDict(
-    "assets-timeframe-partitions.csv" => schemas.assets.timeframe_partition,
-    "assets-data.csv" => schemas.assets.data,
-    "assets-timeframe-profiles.csv" => schemas.assets.profiles_reference,
-    "assets-rep-periods-profiles.csv" => schemas.assets.profiles_reference,
-    "assets-rep-periods-partitions.csv" => schemas.assets.rep_periods_partition,
-    "flows-data.csv" => schemas.flows.data,
-    "flows-rep-periods-profiles.csv" => schemas.flows.profiles_reference,
-    "flows-rep-periods-partitions.csv" => schemas.flows.rep_periods_partition,
-    "profiles-timeframe-.csv" => schemas.timeframe.profiles_data,
-    "profiles-rep-periods-.csv" => schemas.rep_periods.profiles_data,
-    "rep-periods-data.csv" => schemas.rep_periods.data,
-    "rep-periods-mapping.csv" => schemas.rep_periods.mapping,
+    "assets_timeframe_partitions" => schemas.assets.timeframe_partition,
+    "assets_data" => schemas.assets.data,
+    "assets_timeframe_profiles" => schemas.assets.profiles_reference,
+    "assets_rep_periods_profiles" => schemas.assets.profiles_reference,
+    "assets_rep_periods_partitions" => schemas.assets.rep_periods_partition,
+    "flows_data" => schemas.flows.data,
+    "flows_rep_periods_profiles" => schemas.flows.profiles_reference,
+    "flows_rep_periods_partitions" => schemas.flows.rep_periods_partition,
+    "profiles_timeframe_" => schemas.timeframe.profiles_data,
+    "profiles_rep_periods_" => schemas.rep_periods.profiles_data,
+    "rep_periods_data" => schemas.rep_periods.data,
+    "rep_periods_mapping" => schemas.rep_periods.mapping,
 )
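The tutorial change above splits the old single-call CSV loading into two steps. A minimal sketch of the resulting workflow, assuming `input_dir` points at a folder of Tulipa CSV files such as the `Tiny` test set:

```julia
using TulipaEnergyModel

input_dir = "test/inputs/Tiny"  # assumption: any folder of Tulipa CSV files

# Step 1: import every CSV in the folder into an in-memory DuckDB database.
connection = create_connection_and_import_from_csv_folder(input_dir)

# Step 2: build the DataFrame-based `TableTree` from the imported tables.
table_tree = create_input_dataframes(connection)
```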
diff --git a/src/io.jl b/src/io.jl
index b5e86c89..ba325066 100644
--- a/src/io.jl
+++ b/src/io.jl
@@ -1,5 +1,7 @@
 export create_energy_problem_from_csv_folder,
     create_input_dataframes_from_csv_folder,
+    create_connection_and_import_from_csv_folder,
+    create_input_dataframes,
     create_internal_structures,
     save_solution_to_file,
     compute_assets_partitions!,
@@ -7,7 +9,6 @@ export create_energy_problem_from_csv_folder,
 """
     energy_problem = create_energy_problem_from_csv_folder(input_folder; strict = false)
-
 Returns the [`TulipaEnergyModel.EnergyProblem`](@ref) reading all data from CSV files
 in the `input_folder`.
 This is a wrapper around `create_graph_and_representative_periods_from_csv_folder` that creates
@@ -15,8 +16,8 @@ the `EnergyProblem` structure.
 Set `strict = true` to error if assets are missing from partition data.
 """
 function create_energy_problem_from_csv_folder(input_folder::AbstractString; strict = false)
-    table_tree = create_input_dataframes_from_csv_folder(input_folder; strict = strict)
-    return EnergyProblem(table_tree)
+    connection = create_connection_and_import_from_csv_folder(input_folder)
+    return EnergyProblem(connection; strict = strict)
 end
 
 """
@@ -25,61 +26,110 @@ end
 Returns the `table_tree::TableTree` structure that holds all data.
 Set `strict = true` to error if assets are missing from partition data.
 
-The following files are expected to exist in the input folder:
-
-  - `assets-timeframe-partitions.csv`: Following the schema `schemas.assets.timeframe_partition`.
-  - `assets-data.csv`: Following the schema `schemas.assets.data`.
-  - `assets-timeframe-profiles.csv`: Following the schema `schemas.assets.profiles_reference`.
-  - `assets-rep-periods-profiles.csv`: Following the schema `schemas.assets.profiles_reference`.
-  - `assets-rep-periods-partitions.csv`: Following the schema `schemas.assets.rep_periods_partition`.
-  - `flows-data.csv`: Following the schema `schemas.flows.data`.
-  - `flows-rep-periods-profiles.csv`: Following the schema `schemas.flows.profiles_reference`.
-  - `flows-rep-periods-partitions.csv`: Following the schema `schemas.flows.rep_periods_partition`.
-  - `profiles-timeframe-.csv`: Following the schema `schemas.timeframe.profiles_data`.
-  - `profiles-rep-periods-.csv`: Following the schema `schemas.rep_periods.profiles_data`.
-  - `rep-periods-data.csv`: Following the schema `schemas.rep_periods.data`.
-  - `rep-periods-mapping.csv`: Following the schema `schemas.rep_periods.mapping`.
+This is a convenience function calling [`create_connection_and_import_from_csv_folder`](@ref) and
+[`create_input_dataframes`](@ref).
 """
 function create_input_dataframes_from_csv_folder(input_folder::AbstractString; strict = false)
-    df_assets_data = read_csv_with_implicit_schema(input_folder, "assets-data.csv")
-    df_flows_data = read_csv_with_implicit_schema(input_folder, "flows-data.csv")
-    df_rep_periods = read_csv_with_implicit_schema(input_folder, "rep-periods-data.csv")
-    df_rp_mapping = read_csv_with_implicit_schema(input_folder, "rep-periods-mapping.csv")
+    connection = create_connection_and_import_from_csv_folder(input_folder)
+
+    return create_input_dataframes(connection; strict = strict)
+end
+
+"""
+    connection = create_connection_and_import_from_csv_folder(input_folder)
+
+Creates a DuckDB connection and reads the CSVs in the `input_folder` into the DB.
+The names of the tables will be the names of the files, except that `-` will be converted
+into `_`, and the extension will be ignored.
+"""
+function create_connection_and_import_from_csv_folder(input_folder::AbstractString)
+    connection = DBInterface.connect(DuckDB.DB)
+
+    for filename in readdir(input_folder)
+        if !endswith(".csv")(filename)
+            continue
+        end
+        table_name, _ = splitext(filename)
+        table_name = replace(table_name, "-" => "_")
+        TulipaIO.create_tbl(connection, joinpath(input_folder, filename); name = table_name)
+    end
+
+    return connection
+end
+
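A quick illustration of the naming rule in the docstring above (a sketch, not part of the diff): `assets-data.csv` is imported as the table `assets_data`, which can be verified with the same `SHOW TABLES` query the package itself uses further down.

```julia
using TulipaEnergyModel
using DuckDB: DBInterface

connection = create_connection_and_import_from_csv_folder("test/inputs/Tiny")

# Dashes become underscores and the `.csv` extension is dropped,
# so `assets-data.csv` shows up as `assets_data`.
for row in DBInterface.execute(connection, "SHOW TABLES")
    println(row.name)
end
```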
+""" +function create_connection_and_import_from_csv_folder(input_folder::AbstractString) + connection = DBInterface.connect(DuckDB.DB) + + for filename in readdir(input_folder) + if !endswith(".csv")(filename) + continue + end + table_name, _ = splitext(filename) + table_name = replace(table_name, "-" => "_") + TulipaIO.create_tbl(connection, joinpath(input_folder, filename); name = table_name) + end + + return connection +end + +""" + table_tree = create_input_dataframes(connection) - period_types = ["rep-periods", "timeframe"] +Returns the `table_tree::TableTree` structure that holds all data using a DB `connection` that +has loaded all the relevant tables. +Set `strict = true` to error if assets are missing from partition data. + +The following tables are expected to exist in the DB. + +> !!! warn +> +> The schemas are currently being ignored, see issue +[#636](https://github.com/TulipaEnergy/TulipaEnergyModel.jl/issues/636) for more information. + + _ `assets_timeframe_partitions`: Following the schema `schemas.assets.timeframe_partition`. + _ `assets_data`: Following the schema `schemas.assets.data`. + _ `assets_timeframe_profiles`: Following the schema `schemas.assets.profiles_reference`. + _ `assets_rep_periods_profiles`: Following the schema `schemas.assets.profiles_reference`. + _ `assets_rep_periods_partitions`: Following the schema `schemas.assets.rep_periods_partition`. + _ `flows_data`: Following the schema `schemas.flows.data`. + _ `flows_rep_periods_profiles`: Following the schema `schemas.flows.profiles_reference`. + _ `flows_rep_periods_partitions`: Following the schema `schemas.flows.rep_periods_partition`. + _ `profiles_timeframe_`: Following the schema `schemas.timeframe.profiles_data`. + _ `profiles_rep_periods_`: Following the schema `schemas.rep_periods.profiles_data`. + _ `rep_periods_data`: Following the schema `schemas.rep_periods.data`. + _ `rep_periods_mapping`: Following the schema `schemas.rep_periods.mapping`. +""" +function create_input_dataframes(connection::DuckDB.DB; strict = false) + function read_table(table_name) + schema = get_schema(table_name) + df = DataFrame(DBInterface.execute(connection, "SELECT * FROM $table_name")) + # enforcing schema to match what Tulipa expects; DuckDB string -> symbol, int -> string + for (key, value) in schema + if value <: Union{Missing,Symbol} + df[!, key] = [ismissing(x) ? x : Symbol(x) for x in df[!, key]] + end + if value <: Union{Missing,String} && !(eltype(df[!, key]) <: Union{Missing,String}) + df[!, key] = [ismissing(x) ? 
+    df_assets_data = read_table("assets_data")
+    df_flows_data = read_table("flows_data")
+    df_rep_periods = read_table("rep_periods_data")
+    df_rp_mapping = read_table("rep_periods_mapping")
     dfs_assets_profiles = Dict(
-        period_type =>
-            read_csv_with_implicit_schema(input_folder, "assets-$period_type-profiles.csv") for
-        period_type in period_types
+        period_type => read_table("assets_$(period_type)_profiles") for period_type in PERIOD_TYPES
     )
-    df_flows_profiles =
-        read_csv_with_implicit_schema(input_folder, "flows-rep-periods-profiles.csv")
+    df_flows_profiles = read_table("flows_rep_periods_profiles")
     dfs_assets_partitions = Dict(
-        period_type =>
-            read_csv_with_implicit_schema(input_folder, "assets-$period_type-partitions.csv")
-        for period_type in period_types
+        period_type => read_table("assets_$(period_type)_partitions") for
+        period_type in PERIOD_TYPES
     )
-    df_flows_partitions =
-        read_csv_with_implicit_schema(input_folder, "flows-rep-periods-partitions.csv")
+    df_flows_partitions = read_table("flows_rep_periods_partitions")
 
+    tables_list = DBInterface.execute(connection, "SHOW TABLES")
     dfs_profiles = Dict(
         period_type => Dict(
             begin
-                regex = "profiles-$(period_type)-(.*).csv"
-                # Sanitized key: Spaces and dashes convert to underscore
-                key = Symbol(replace(match(Regex(regex), filename)[1], r"[ -]" => "_"))
-                value = read_csv_with_implicit_schema(input_folder, filename)
+                regex = "profiles_$(period_type)_(.*)"
+                key = Symbol(match(Regex(regex), row.name)[1])
+                value = read_table(row.name)
                 key => value
-            end for filename in readdir(input_folder) if
-            startswith("profiles-$period_type-")(filename)
-        ) for period_type in period_types
+            end for row in tables_list if startswith("profiles_$period_type")(row.name)
+        ) for period_type in PERIOD_TYPES
     )
 
     # Error if partition data is missing assets (if strict)
     if strict
         missing_assets =
-            setdiff(df_assets_data[!, :name], dfs_assets_partitions["rep-periods"][!, :asset])
+            setdiff(df_assets_data[!, :name], dfs_assets_partitions[:rep_periods][!, :asset])
         if length(missing_assets) > 0
             msg = "Error: Partition data missing for these assets: \n"
             for a in missing_assets
@@ -199,7 +249,7 @@ function create_internal_structures(table_tree::TableTree)
     for a in MetaGraphsNext.labels(graph)
         compute_assets_partitions!(
             graph[a].rep_periods_partitions,
-            table_tree.partitions.assets["rep-periods"],
+            table_tree.partitions.assets[:rep_periods],
             a,
             representative_periods,
         )
@@ -220,7 +270,7 @@ function create_internal_structures(table_tree::TableTree)
         if row.is_seasonal
             # Search for this row in the table_tree.partitions.assets and error if it is not found
             found = false
-            for partition_row in eachrow(table_tree.partitions.assets["timeframe"])
+            for partition_row in eachrow(table_tree.partitions.assets[:timeframe])
                 if row.name == partition_row.asset
                     graph[row.name].timeframe_partitions = _parse_rp_partition(
                         Val(partition_row.specification),
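The `dfs_profiles` comprehension earlier in this hunk discovers profile tables by name instead of by listing files. A sketch of the key extraction for one hypothetical table name:

```julia
period_type = :rep_periods
name = "profiles_rep_periods_availability"  # hypothetical table name

# `startswith(prefix)` and the interpolated `Regex` mirror the comprehension above.
if startswith("profiles_$period_type")(name)
    key = Symbol(match(Regex("profiles_$(period_type)_(.*)"), name)[1])
    @assert key == :availability
end
```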
@@ -238,11 +288,11 @@
         end
     end
 
-    for asset_profile_row in eachrow(table_tree.profiles.assets["rep-periods"]) # row = asset, profile_type, profile_name
+    for asset_profile_row in eachrow(table_tree.profiles.assets[:rep_periods]) # row = asset, profile_type, profile_name
         gp = DataFrames.groupby( # 3. group by RP
             filter(
                 :profile_name => ==(asset_profile_row.profile_name), # 2. Filter profile_name
-                table_tree.profiles.data["rep-periods"][asset_profile_row.profile_type]; # 1. Get the profile of given type
+                table_tree.profiles.data[:rep_periods][asset_profile_row.profile_type]; # 1. Get the profile of given type
                 view = true,
             ),
             :rep_period,
@@ -259,7 +309,7 @@
         gp = DataFrames.groupby(
             filter(
                 :profile_name => ==(flow_profile_row.profile_name),
-                table_tree.profiles.data["rep-periods"][flow_profile_row.profile_type];
+                table_tree.profiles.data[:rep_periods][flow_profile_row.profile_type];
                 view = true,
             ),
             :rep_period;
@@ -272,10 +322,10 @@
         end
     end
 
-    for asset_profile_row in eachrow(table_tree.profiles.assets["timeframe"]) # row = asset, profile_type, profile_name
+    for asset_profile_row in eachrow(table_tree.profiles.assets[:timeframe]) # row = asset, profile_type, profile_name
         df = filter(
             :profile_name => ==(asset_profile_row.profile_name), # 2. Filter profile_name
-            table_tree.profiles.data["timeframe"][asset_profile_row.profile_type]; # 1. Get the profile of given type
+            table_tree.profiles.data[:timeframe][asset_profile_row.profile_type]; # 1. Get the profile of given type
             view = true,
         )
         graph[asset_profile_row.asset].timeframe_profiles[asset_profile_row.profile_type] = df.value
@@ -284,41 +334,18 @@
     return graph, representative_periods, timeframe
 end
 
-"""
-    read_csv_with_schema(file_path, schema; csvargs...)
-
-Reads the csv at `file_path` and validates the data using the `schema`.
-It assumes that the file's header is at the second row.
-The first row of the file contains some metadata information that is not used.
-Additional keywords arguments can be passed to `CSV.read`.
-"""
-function read_csv_with_schema(file_path, schema; csvargs...)
-    df = CSV.read(file_path, DataFrame; header = 2, types = schema, strict = true, csvargs...)
-
-    return df
-end
-
-"""
-    read_csv_with_implicit_schema(dir, filename; csvargs...)
-
-Reads the csv at direcory `dir` named `filename` and validates the data using a schema
-based on `filename`.
-The function [`read_csv_with_schema`](@ref) reads the file.
-Additional keywords arguments can be passed to `CSV.read`.
-"""
-function read_csv_with_implicit_schema(dir, filename; csvargs...)
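`get_schema` keeps the old implicit-schema lookup alive for table names: exact keys hit `schema_per_file` directly, `profiles_*` names fall back to the generic profile schemas, and anything else errors, which the updated test below relies on. A sketch of the three paths (the availability table name is illustrative):

```julia
using Test
using TulipaEnergyModel

TulipaEnergyModel.get_schema("assets_data")                        # schemas.assets.data
TulipaEnergyModel.get_schema("profiles_rep_periods_availability")  # schemas.rep_periods.profiles_data
@test_throws ErrorException TulipaEnergyModel.get_schema("bad_assets_data")
```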
-    schema = if haskey(schema_per_file, filename)
-        schema_per_file[filename]
+function get_schema(tablename)
+    if haskey(schema_per_file, tablename)
+        return schema_per_file[tablename]
     else
-        if startswith("profiles-timeframe")(filename)
-            schema_per_file["profiles-timeframe-.csv"]
-        elseif startswith("profiles-rep-periods")(filename)
-            schema_per_file["profiles-rep-periods-.csv"]
+        if startswith("profiles_timeframe")(tablename)
+            return schema_per_file["profiles_timeframe_"]
+        elseif startswith("profiles_rep_periods")(tablename)
+            return schema_per_file["profiles_rep_periods_"]
         else
-            error("No implicit schema for file $filename")
+            error("No implicit schema for table named $tablename")
         end
     end
-    read_csv_with_schema(joinpath(dir, filename), schema)
 end
 
 """
diff --git a/src/structures.jl b/src/structures.jl
index 484d3cd3..2506aa1c 100644
--- a/src/structures.jl
+++ b/src/structures.jl
@@ -4,7 +4,9 @@ export GraphAssetData,
 const TimestepsBlock = UnitRange{Int}
 const PeriodsBlock = UnitRange{Int}
 
-const PeriodType = String
+const PeriodType = Symbol
+const PERIOD_TYPES = [:rep_periods, :timeframe]
+
 const TableNodeStatic = @NamedTuple{assets::DataFrame, flows::DataFrame}
 const TableNodeProfiles = @NamedTuple{
     assets::Dict{PeriodType,DataFrame},
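With `PeriodType` now a `Symbol`, every per-period dictionary in the table tree is keyed by the two `PERIOD_TYPES` entries. A minimal sketch of the new access pattern (the dictionary contents are placeholders):

```julia
using DataFrames

PERIOD_TYPES = [:rep_periods, :timeframe]

partitions = Dict(period_type => DataFrame() for period_type in PERIOD_TYPES)
partitions[:rep_periods]  # was partitions["rep-periods"] before this change
```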
""" - function EnergyProblem(dfs_input) - graph, representative_periods, timeframe = create_internal_structures(dfs_input) + function EnergyProblem(connection; strict = false) + table_tree = create_input_dataframes(connection; strict = strict) + graph, representative_periods, timeframe = create_internal_structures(table_tree) constraints_partitions = compute_constraints_partitions(graph, representative_periods) return new( - dfs_input, + connection, + table_tree, graph, representative_periods, constraints_partitions, diff --git a/test/Project.toml b/test/Project.toml index 78074d8b..33a0d49a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,6 +3,7 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" Cbc = "9961bab8-2fa3-5c5a-9d89-47fab24efd76" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1" GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b" diff --git a/test/runtests.jl b/test/runtests.jl index cf82e36c..30a3b58a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,7 @@ using CSV using Cbc using DataFrames +using DuckDB: DBInterface using GLPK using Graphs using HiGHS diff --git a/test/test-io.jl b/test/test-io.jl index 10aac015..6b54016b 100644 --- a/test/test-io.jl +++ b/test/test-io.jl @@ -1,10 +1,6 @@ @testset "Input validation" begin - @testset "Make sure that input validation fails for bad files" begin - dir = joinpath(INPUT_FOLDER, "Tiny") - @test_throws ArgumentError TulipaEnergyModel.read_csv_with_schema( - joinpath(dir, "bad-assets-data.csv"), - TulipaEnergyModel.schemas.assets.data, - ) + @testset "Test that missing schemas throw correctly" begin + @test_throws ErrorException TulipaEnergyModel.get_schema("bad_assets_data") end @testset "Check missing asset partition if strict" begin @@ -146,3 +142,22 @@ end graph, rps, tf = create_internal_structures(table_tree) @test graph[missing_asset].timeframe_partitions == [i:i for i in 1:tf.num_periods] end + +@testset "Test that non-csv files are ignored when reading csv from a folder" begin + dir = mktempdir() + for (root, _, files) in walkdir(joinpath(INPUT_FOLDER, "Norse")) + for file in files + cp(joinpath(root, file), joinpath(dir, file)) + end + end + + connection1 = create_connection_and_import_from_csv_folder(dir) + open(joinpath(dir, "some-file.xyz"), "w") do io + println(io, "nothing here") + end + connection2 = create_connection_and_import_from_csv_folder(dir) + + table_list(con) = [x.name for x in DBInterface.execute(con, "SHOW TABLES")] + + @test table_list(connection1) == table_list(connection2) +end