Read CSV using TulipaIO and DuckDB and create table_tree from DB (#634)
Co-authored-by: Suvayu Ali <suvayu@users.noreply.github.com>
abelsiqueira and suvayu committed May 14, 2024
1 parent 055996c commit 292dbdd
Showing 9 changed files with 162 additions and 104 deletions.
3 changes: 3 additions & 0 deletions Project.toml
@@ -6,6 +6,7 @@ version = "0.8.0"
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
@@ -17,10 +18,12 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
TulipaIO = "7b3808b7-0819-42d4-885c-978ba173db11"

[compat]
CSV = "0.10"
DataFrames = "1"
DuckDB = "0.10"
Graphs = "1.8"
HiGHS = "1"
JuMP = "1"
6 changes: 5 additions & 1 deletion docs/src/tutorials.md
@@ -91,7 +91,11 @@ using TulipaEnergyModel
input_dir = "../../test/inputs/Tiny" # hide
# input_dir should be the path to Tiny
table_tree = create_input_dataframes_from_csv_folder(input_dir)
connection = create_connection_and_import_from_csv_folder(input_dir)
```

```@example manual
table_tree = create_input_dataframes(connection)
```
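If you want to check what was loaded, you can query the tables directly through the connection. This is only a sanity check, not part of the modeling workflow; `SHOW TABLES` is plain DuckDB SQL:

```julia
using DataFrames
using DuckDB: DBInterface

# List the tables TulipaIO created from the CSV files in input_dir.
DataFrame(DBInterface.execute(connection, "SHOW TABLES"))
```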

The `table_tree` contains all tables in the folder, which are then processed into the internal structures below:
2 changes: 2 additions & 0 deletions src/TulipaEnergyModel.jl
@@ -5,7 +5,9 @@ module TulipaEnergyModel
## Data
using CSV: CSV
using DataFrames: DataFrames, DataFrame
using DuckDB: DuckDB, DBInterface
using TOML: TOML
using TulipaIO: TulipaIO

## Graph
using Graphs: Graphs, SimpleDiGraph
24 changes: 12 additions & 12 deletions src/input-schemas.jl
@@ -121,16 +121,16 @@ const schemas = (
)

const schema_per_file = OrderedDict(
"assets-timeframe-partitions.csv" => schemas.assets.timeframe_partition,
"assets-data.csv" => schemas.assets.data,
"assets-timeframe-profiles.csv" => schemas.assets.profiles_reference,
"assets-rep-periods-profiles.csv" => schemas.assets.profiles_reference,
"assets-rep-periods-partitions.csv" => schemas.assets.rep_periods_partition,
"flows-data.csv" => schemas.flows.data,
"flows-rep-periods-profiles.csv" => schemas.flows.profiles_reference,
"flows-rep-periods-partitions.csv" => schemas.flows.rep_periods_partition,
"profiles-timeframe-<type>.csv" => schemas.timeframe.profiles_data,
"profiles-rep-periods-<type>.csv" => schemas.rep_periods.profiles_data,
"rep-periods-data.csv" => schemas.rep_periods.data,
"rep-periods-mapping.csv" => schemas.rep_periods.mapping,
"assets_timeframe_partitions" => schemas.assets.timeframe_partition,
"assets_data" => schemas.assets.data,
"assets_timeframe_profiles" => schemas.assets.profiles_reference,
"assets_rep_periods_profiles" => schemas.assets.profiles_reference,
"assets_rep_periods_partitions" => schemas.assets.rep_periods_partition,
"flows_data" => schemas.flows.data,
"flows_rep_periods_profiles" => schemas.flows.profiles_reference,
"flows_rep_periods_partitions" => schemas.flows.rep_periods_partition,
"profiles_timeframe_<type>" => schemas.timeframe.profiles_data,
"profiles_rep_periods_<type>" => schemas.rep_periods.profiles_data,
"rep_periods_data" => schemas.rep_periods.data,
"rep_periods_mapping" => schemas.rep_periods.mapping,
)
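Since the keys now match the DB table names, concrete tables resolve with a plain dictionary lookup, while the `<type>` placeholders are matched by prefix (see `get_schema` in `src/io.jl` below). A minimal sketch of both cases; the `availability` profile type is only an example:

```julia
# Concrete table name: direct lookup.
schema_per_file["assets_data"]  # schemas.assets.data

# Profile tables embed a <type> suffix, so they are matched by prefix instead.
startswith("profiles_rep_periods")("profiles_rep_periods_availability")  # true
```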
183 changes: 105 additions & 78 deletions src/io.jl
@@ -1,22 +1,23 @@
export create_energy_problem_from_csv_folder,
create_input_dataframes_from_csv_folder,
create_connection_and_import_from_csv_folder,
create_input_dataframes,
create_internal_structures,
save_solution_to_file,
compute_assets_partitions!,
compute_flows_partitions!

"""
energy_problem = create_energy_problem_from_csv_folder(input_folder; strict = false)
Returns the [`TulipaEnergyModel.EnergyProblem`](@ref) reading all data from CSV files
in the `input_folder`.
This is a wrapper around `create_graph_and_representative_periods_from_csv_folder` that creates
the `EnergyProblem` structure.
Set `strict = true` to error if assets are missing from partition data.
"""
function create_energy_problem_from_csv_folder(input_folder::AbstractString; strict = false)
table_tree = create_input_dataframes_from_csv_folder(input_folder; strict = strict)
return EnergyProblem(table_tree)
connection = create_connection_and_import_from_csv_folder(input_folder)
return EnergyProblem(connection; strict = strict)
end
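For reference, a sketch of the rewritten entry point from a user's perspective; the folder path below is hypothetical:

```julia
using TulipaEnergyModel

# Reads every CSV in the folder into DuckDB, then builds the problem from the DB.
energy_problem = create_energy_problem_from_csv_folder("inputs/Tiny"; strict = true)
```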

"""
@@ -25,61 +26,110 @@ end
Returns the `table_tree::TableTree` structure that holds all data.
Set `strict = true` to error if assets are missing from partition data.
The following files are expected to exist in the input folder:
- `assets-timeframe-partitions.csv`: Following the schema `schemas.assets.timeframe_partition`.
- `assets-data.csv`: Following the schema `schemas.assets.data`.
- `assets-timeframe-profiles.csv`: Following the schema `schemas.assets.profiles_reference`.
- `assets-rep-periods-profiles.csv`: Following the schema `schemas.assets.profiles_reference`.
- `assets-rep-periods-partitions.csv`: Following the schema `schemas.assets.rep_periods_partition`.
- `flows-data.csv`: Following the schema `schemas.flows.data`.
- `flows-rep-periods-profiles.csv`: Following the schema `schemas.flows.profiles_reference`.
- `flows-rep-periods-partitions.csv`: Following the schema `schemas.flows.rep_periods_partition`.
- `profiles-timeframe-<type>.csv`: Following the schema `schemas.timeframe.profiles_data`.
- `profiles-rep-periods-<type>.csv`: Following the schema `schemas.rep_periods.profiles_data`.
- `rep-periods-data.csv`: Following the schema `schemas.rep_periods.data`.
- `rep-periods-mapping.csv`: Following the schema `schemas.rep_periods.mapping`.
This is a convenience function calling [`create_connection_and_import_from_csv_folder`](@ref) and
[`create_input_dataframes`](@ref).
"""
function create_input_dataframes_from_csv_folder(input_folder::AbstractString; strict = false)
df_assets_data = read_csv_with_implicit_schema(input_folder, "assets-data.csv")
df_flows_data = read_csv_with_implicit_schema(input_folder, "flows-data.csv")
df_rep_periods = read_csv_with_implicit_schema(input_folder, "rep-periods-data.csv")
df_rp_mapping = read_csv_with_implicit_schema(input_folder, "rep-periods-mapping.csv")
connection = create_connection_and_import_from_csv_folder(input_folder)

return create_input_dataframes(connection; strict = strict)
end

"""
connection = create_connection_and_import_from_csv_folder(input_folder)
Creates a DuckDB connection and reads the CSVs in the `input_folder` into the DB.
The names of the tables will be the names of the files, except that `-` will be converted
into `_`, and the extension will be ignored.
"""
function create_connection_and_import_from_csv_folder(input_folder::AbstractString)
connection = DBInterface.connect(DuckDB.DB)

for filename in readdir(input_folder)
if !endswith(".csv")(filename)
continue
end
table_name, _ = splitext(filename)
table_name = replace(table_name, "-" => "_")
TulipaIO.create_tbl(connection, joinpath(input_folder, filename); name = table_name)
end

return connection
end
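A short sketch of the naming convention this function applies; the file and folder names are hypothetical:

```julia
using DuckDB: DBInterface

# "assets-data.csv"                 -> table assets_data
# "profiles-rep-periods-demand.csv" -> table profiles_rep_periods_demand
connection = create_connection_and_import_from_csv_folder("my-input-folder")
DBInterface.execute(connection, "SELECT * FROM assets_data")
```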

"""
table_tree = create_input_dataframes(connection)
period_types = ["rep-periods", "timeframe"]
Returns the `table_tree::TableTree` structure that holds all data using a DB `connection` that
has loaded all the relevant tables.
Set `strict = true` to error if assets are missing from partition data.
The following tables are expected to exist in the DB.
!!! warning

    The schemas are currently being ignored, see issue
    [#636](https://github.com/TulipaEnergy/TulipaEnergyModel.jl/issues/636) for more information.
- `assets_timeframe_partitions`: Following the schema `schemas.assets.timeframe_partition`.
- `assets_data`: Following the schema `schemas.assets.data`.
- `assets_timeframe_profiles`: Following the schema `schemas.assets.profiles_reference`.
- `assets_rep_periods_profiles`: Following the schema `schemas.assets.profiles_reference`.
- `assets_rep_periods_partitions`: Following the schema `schemas.assets.rep_periods_partition`.
- `flows_data`: Following the schema `schemas.flows.data`.
- `flows_rep_periods_profiles`: Following the schema `schemas.flows.profiles_reference`.
- `flows_rep_periods_partitions`: Following the schema `schemas.flows.rep_periods_partition`.
- `profiles_timeframe_<type>`: Following the schema `schemas.timeframe.profiles_data`.
- `profiles_rep_periods_<type>`: Following the schema `schemas.rep_periods.profiles_data`.
- `rep_periods_data`: Following the schema `schemas.rep_periods.data`.
- `rep_periods_mapping`: Following the schema `schemas.rep_periods.mapping`.
"""
function create_input_dataframes(connection::DuckDB.DB; strict = false)
function read_table(table_name)
schema = get_schema(table_name)
df = DataFrame(DBInterface.execute(connection, "SELECT * FROM $table_name"))
# enforcing schema to match what Tulipa expects; DuckDB string -> symbol, int -> string
for (key, value) in schema
if value <: Union{Missing,Symbol}
df[!, key] = [ismissing(x) ? x : Symbol(x) for x in df[!, key]]
end
if value <: Union{Missing,String} && !(eltype(df[!, key]) <: Union{Missing,String})
df[!, key] = [ismissing(x) ? x : string(x) for x in df[!, key]]
end
end
return df
end
df_assets_data = read_table("assets_data")
df_flows_data = read_table("flows_data")
df_rep_periods = read_table("rep_periods_data")
df_rp_mapping = read_table("rep_periods_mapping")

dfs_assets_profiles = Dict(
period_type =>
read_csv_with_implicit_schema(input_folder, "assets-$period_type-profiles.csv") for
period_type in period_types
period_type => read_table("assets_$(period_type)_profiles") for period_type in PERIOD_TYPES
)
df_flows_profiles =
read_csv_with_implicit_schema(input_folder, "flows-rep-periods-profiles.csv")
df_flows_profiles = read_table("flows_rep_periods_profiles")
dfs_assets_partitions = Dict(
period_type =>
read_csv_with_implicit_schema(input_folder, "assets-$period_type-partitions.csv")
for period_type in period_types
period_type => read_table("assets_$(period_type)_partitions") for
period_type in PERIOD_TYPES
)
df_flows_partitions =
read_csv_with_implicit_schema(input_folder, "flows-rep-periods-partitions.csv")
df_flows_partitions = read_table("flows_rep_periods_partitions")

tables_list = DBInterface.execute(connection, "SHOW TABLES")
dfs_profiles = Dict(
period_type => Dict(
begin
regex = "profiles-$(period_type)-(.*).csv"
# Sanitized key: Spaces and dashes convert to underscore
key = Symbol(replace(match(Regex(regex), filename)[1], r"[ -]" => "_"))
value = read_csv_with_implicit_schema(input_folder, filename)
regex = "profiles_$(period_type)_(.*)"
key = Symbol(match(Regex(regex), row.name)[1])
value = read_table(row.name)
key => value
end for filename in readdir(input_folder) if
startswith("profiles-$period_type-")(filename)
) for period_type in period_types
end for row in tables_list if startswith("profiles_$period_type")(row.name)
) for period_type in PERIOD_TYPES
)

# Error if partition data is missing assets (if strict)
if strict
missing_assets =
setdiff(df_assets_data[!, :name], dfs_assets_partitions["rep-periods"][!, :asset])
setdiff(df_assets_data[!, :name], dfs_assets_partitions[:rep_periods][!, :asset])
if length(missing_assets) > 0
msg = "Error: Partition data missing for these assets: \n"
for a in missing_assets
Expand Down Expand Up @@ -199,7 +249,7 @@ function create_internal_structures(table_tree::TableTree)
for a in MetaGraphsNext.labels(graph)
compute_assets_partitions!(
graph[a].rep_periods_partitions,
table_tree.partitions.assets["rep-periods"],
table_tree.partitions.assets[:rep_periods],
a,
representative_periods,
)
@@ -220,7 +270,7 @@ function create_internal_structures(table_tree::TableTree)
if row.is_seasonal
# Search for this row in the table_tree.partitions.assets and error if it is not found
found = false
for partition_row in eachrow(table_tree.partitions.assets["timeframe"])
for partition_row in eachrow(table_tree.partitions.assets[:timeframe])
if row.name == partition_row.asset
graph[row.name].timeframe_partitions = _parse_rp_partition(
Val(partition_row.specification),
@@ -238,11 +288,11 @@ end
end
end

for asset_profile_row in eachrow(table_tree.profiles.assets["rep-periods"]) # row = asset, profile_type, profile_name
for asset_profile_row in eachrow(table_tree.profiles.assets[:rep_periods]) # row = asset, profile_type, profile_name
gp = DataFrames.groupby( # 3. group by RP
filter(
:profile_name => ==(asset_profile_row.profile_name), # 2. Filter profile_name
table_tree.profiles.data["rep-periods"][asset_profile_row.profile_type]; # 1. Get the profile of given type
table_tree.profiles.data[:rep_periods][asset_profile_row.profile_type]; # 1. Get the profile of given type
view = true,
),
:rep_period,
@@ -259,7 +309,7 @@ function create_internal_structures(table_tree::TableTree)
gp = DataFrames.groupby(
filter(
:profile_name => ==(flow_profile_row.profile_name),
table_tree.profiles.data["rep-periods"][flow_profile_row.profile_type];
table_tree.profiles.data[:rep_periods][flow_profile_row.profile_type];
view = true,
),
:rep_period;
@@ -272,10 +322,10 @@ end
end
end

for asset_profile_row in eachrow(table_tree.profiles.assets["timeframe"]) # row = asset, profile_type, profile_name
for asset_profile_row in eachrow(table_tree.profiles.assets[:timeframe]) # row = asset, profile_type, profile_name
df = filter(
:profile_name => ==(asset_profile_row.profile_name), # 2. Filter profile_name
table_tree.profiles.data["timeframe"][asset_profile_row.profile_type]; # 1. Get the profile of given type
table_tree.profiles.data[:timeframe][asset_profile_row.profile_type]; # 1. Get the profile of given type
view = true,
)
graph[asset_profile_row.asset].timeframe_profiles[asset_profile_row.profile_type] = df.value
@@ -284,41 +334,18 @@ end
return graph, representative_periods, timeframe
end

"""
read_csv_with_schema(file_path, schema; csvargs...)
Reads the CSV at `file_path` and validates the data using the `schema`.
It assumes that the file's header is at the second row.
The first row of the file contains some metadata information that is not used.
Additional keywords arguments can be passed to `CSV.read`.
"""
function read_csv_with_schema(file_path, schema; csvargs...)
df = CSV.read(file_path, DataFrame; header = 2, types = schema, strict = true, csvargs...)

return df
end

"""
read_csv_with_implicit_schema(dir, filename; csvargs...)
Reads the CSV in directory `dir` named `filename` and validates the data using a schema
based on `filename`.
The function [`read_csv_with_schema`](@ref) reads the file.
Additional keywords arguments can be passed to `CSV.read`.
"""
function read_csv_with_implicit_schema(dir, filename; csvargs...)
schema = if haskey(schema_per_file, filename)
schema_per_file[filename]
function get_schema(tablename)
if haskey(schema_per_file, tablename)
return schema_per_file[tablename]
else
if startswith("profiles-timeframe")(filename)
schema_per_file["profiles-timeframe-<type>.csv"]
elseif startswith("profiles-rep-periods")(filename)
schema_per_file["profiles-rep-periods-<type>.csv"]
if startswith("profiles_timeframe")(tablename)
return schema_per_file["profiles_timeframe_<type>"]
elseif startswith("profiles_rep_periods")(tablename)
return schema_per_file["profiles_rep_periods_<type>"]
else
error("No implicit schema for file $filename")
error("No implicit schema for table named $tablename")
end
end
read_csv_with_schema(joinpath(dir, filename), schema)
end

"""
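The column coercion inside `read_table` above is what reconciles DuckDB's inferred column types with the types Tulipa expects: strings become `Symbol`s and integers become `String`s wherever the schema says so. A self-contained sketch of the same loop, using a hypothetical two-column schema:

```julia
using DataFrames

# Hypothetical schema: `name` should be Symbol-valued, `id` String-valued.
schema = Dict(:name => Union{Missing,Symbol}, :id => Union{Missing,String})
df = DataFrame(name = ["wind", missing], id = [1, 2])  # as DuckDB might return them

for (key, value) in schema
    if value <: Union{Missing,Symbol}
        # DuckDB reads these as strings; Tulipa wants symbols.
        df[!, key] = [ismissing(x) ? x : Symbol(x) for x in df[!, key]]
    elseif value <: Union{Missing,String} && !(eltype(df[!, key]) <: Union{Missing,String})
        # DuckDB may infer integers for columns Tulipa treats as strings.
        df[!, key] = [ismissing(x) ? x : string(x) for x in df[!, key]]
    end
end
```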
19 changes: 12 additions & 7 deletions src/structures.jl
@@ -4,7 +4,9 @@ export GraphAssetData,
const TimestepsBlock = UnitRange{Int}
const PeriodsBlock = UnitRange{Int}

const PeriodType = String
const PeriodType = Symbol
const PERIOD_TYPES = [:rep_periods, :timeframe]

const TableNodeStatic = @NamedTuple{assets::DataFrame, flows::DataFrame}
const TableNodeProfiles = @NamedTuple{
assets::Dict{PeriodType,DataFrame},
@@ -254,6 +256,7 @@ It hides the complexity behind the energy problem, making the usage more friendly.
See the [basic example tutorial](@ref basic-example) to see how these can be used.
"""
mutable struct EnergyProblem
db_connection::DuckDB.DB
table_tree::TableTree
graph::MetaGraph{
Int,
@@ -279,17 +282,19 @@ mutable struct EnergyProblem
time_solve_model::Float64

"""
EnergyProblem(dfs_input)
EnergyProblem(connection)
Constructs a new EnergyProblem object from the input dataframes.
This will call [`create_internal_structures`](@ref).
Constructs a new EnergyProblem object using the `connection`.
This will call relevant functions to generate all input that is required for the model creation.
"""
function EnergyProblem(dfs_input)
graph, representative_periods, timeframe = create_internal_structures(dfs_input)
function EnergyProblem(connection; strict = false)
table_tree = create_input_dataframes(connection; strict = strict)
graph, representative_periods, timeframe = create_internal_structures(table_tree)
constraints_partitions = compute_constraints_partitions(graph, representative_periods)

return new(
dfs_input,
connection,
table_tree,
graph,
representative_periods,
constraints_partitions,
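Taken together, a hedged sketch of the two equivalent ways to build an `EnergyProblem` after this change; the path is hypothetical:

```julia
using TulipaEnergyModel

# One shot: folder -> DuckDB -> EnergyProblem.
energy_problem = create_energy_problem_from_csv_folder("inputs/Tiny")

# Or in two steps, keeping the connection around for your own queries.
connection = create_connection_and_import_from_csv_folder("inputs/Tiny")
energy_problem = EnergyProblem(connection; strict = false)
```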