# Indexing Dev

In [1]:
using Pkg; Pkg.activate("/home/roscar/work/cfgrib-project/cfgrib.jl")

[32m[1mActivating[22m[39m environment at `~/work/cfgrib-project/cfgrib.jl/Project.toml`


In [None]:
using GRIB
using DataStructures
using Dates

In [None]:
using cfgrib

In [None]:
# Sample GRIB file read by the cells below (absolute local path).
grib_path = "/home/roscar/work/cfgrib-project/cfgrib.jl/tests/sample-data/era5-levels-members.grib"

In [17]:
# Reference instant used as the default `epoch` of `from_grib_date_time`.
# Declared `const` so that functions reading this global see a fixed type.
const DEFAULT_EPOCH = DateTime(1970, 1, 1, 0, 0)

1970-01-01T00:00:00

## Basic Indexing

In [None]:
"""
    FileIndex

Mutable record holding the index of one GRIB file.

The zero-argument constructor leaves every field uninitialised; the fields are
filled in afterwards by the outer constructor and the `...!` helper functions.
"""
mutable struct FileIndex
    # Version tag that `index_path!` mixes into the index file name.
    allowed_protocol_version::VersionNumber

    # Path of the GRIB file and path of its index file.
    grib_path::String
    index_path::String

    # Header keys the index is built over.
    index_keys::Vector{String}
    # Entries written by `from_gribfile!`.
    offsets::Array
    # Distinct values per index key, written by `get_header_values!`.
    header_values::OrderedDict{String, Array}

    # Incomplete initialisation: no field is set here.
    function FileIndex()
        return new()
    end
end

In [None]:
"""
    FileIndex(grib_path::String, index_keys::Array{String, 1})

Create the index of the GRIB file at `grib_path` over `index_keys`.

When the index file named by `index_path!` already exists it is loaded with
`from_indexfile!`; otherwise the GRIB file is scanned with `from_gribfile!` and the
distinct header values are gathered with `get_header_values!`.
"""
function FileIndex(grib_path::String, index_keys::Array{String, 1})
    index = FileIndex()
    index.grib_path = grib_path
    index.index_keys = index_keys
    index.allowed_protocol_version = v"0.0.0"

    # The index file name is derived from the three fields set above.
    index_path!(index)

    # No index file on disk: build everything from the GRIB file itself.
    if !isfile(index.index_path)
        from_gribfile!(index)
        get_header_values!(index)
        return index
    end

    from_indexfile!(index)
    return index
end

In [None]:
"""
    index_path!(index::FileIndex)

Compute the index file path of `index`, store it in `index.index_path` and return it.

The index file is named `.<grib file name>.<hash>.idx` and lives in the same
directory as the GRIB file. `<hash>` is the hexadecimal `hash` of all index keys
concatenated with `allowed_protocol_version`.
"""
function index_path!(index::FileIndex)
    # Hash input: every key back to back, followed by the printed version.
    hashed_text = join(index.index_keys) * string(index.allowed_protocol_version)
    index_keys_hash = string(hash(hashed_text), base=16)

    # Put the leading dot on the file name only. Prefixing the whole path turns an
    # absolute path such as "/data/a.grib" into the relative "./data/a.grib.<hash>.idx".
    grib_dir, grib_name = splitdir(index.grib_path)
    index.index_path = joinpath(grib_dir, ".$grib_name.$index_keys_hash.idx")

    return index.index_path
end

In [None]:
# Placeholder for writing `index` to its index file: always throws the string
# "unimplemented".
save_indexfile(index::FileIndex) = throw("unimplemented")

# Placeholder for loading `index` from its index file: always throws the string
# "unimplemented".
from_indexfile!(index::FileIndex) = throw("unimplemented")

In [9]:
"""
    from_grib_date_time(message, date_key="dataDate", time_key="dataTime", epoch=DEFAULT_EPOCH)

Return the number of whole seconds between `epoch` and the date/time stored in
`message`.

`message[date_key]` is decoded as the integer `YYYYMMDD` and `message[time_key]` as
the integer `HHMM`. `message` can be any object that supports `message[key]`.
"""
function from_grib_date_time(
    message, date_key="dataDate", time_key="dataTime", epoch=DEFAULT_EPOCH
)
    # Plain indexing, like the other cells of this notebook (`message[key]`).
    packed_date = message[date_key]
    packed_time = message[time_key]

    # Peel the decimal fields off the packed integers.
    year, month_day = divrem(packed_date, 10000)
    month, day = divrem(month_day, 100)
    hour, minute = divrem(packed_time, 100)

    data_datetime = DateTime(year, month, day, hour, minute)

    return Dates.value(Dates.Second(data_datetime - epoch))
end

from_grib_date_time (generic function with 4 methods)

In [10]:
# Evaluating the bare name displays the function and its method count.
ismissing

ismissing (generic function with 2 methods)

In [18]:
"""
    from_gribfile!(index::FileIndex)

Scan every message of the GRIB file at `index.grib_path`, store the collected
`header => offset` pairs in `index.offsets` and return them.

`header` is a `NamedTuple` over `index.index_keys`: a key the message lacks becomes
`missing`, an array value becomes a tuple, and a present `"time"` value is replaced
by `from_grib_date_time(message)`. The first message at a given offset is stored as
the bare offset, later ones as `(offset, repeat_count)`. A message whose header
equals an earlier one overwrites that earlier entry.
"""
function from_gribfile!(index::FileIndex)
    key_names = index.index_keys
    key_count = length(key_names)
    HeaderTuple = NamedTuple{Tuple(Symbol.(key_names))}

    entries = OrderedDict()
    repeats = Dict{Int, Int}()

    #  TODO: Time function to see if it is worth optimising
    #  based on gribfile.nmessages w/ known-length arrays
    #  more, or if I/O overhead too large
    GribFile(index.grib_path) do gribfile
        for message in gribfile
            fields = Vector{Any}(undef, key_count)
            for (slot, key) in enumerate(key_names)
                field = missing
                if haskey(message, key)
                    field = message[key]
                end
                # Arrays are turned into tuples before being stored in the header.
                if field isa Array
                    field = Tuple(field)
                end
                if key == "time" && !ismissing(field)
                    field = from_grib_date_time(message)
                end

                fields[slot] = field
            end

            msg_offset = Int(message["offset"])
            if haskey(repeats, msg_offset)
                # Offset seen before: record which repeat this is.
                repeats[msg_offset] += 1
                location = (msg_offset, repeats[msg_offset])
            else
                repeats[msg_offset] = 0
                location = msg_offset
            end

            entries[HeaderTuple(fields)] = location
        end
    end

    index.offsets = collect(pairs(entries))
    return index.offsets
end

from_gribfile! (generic function with 1 method)

In [19]:
"""
    get_header_values!(index::FileIndex)

Gather, for every key in `index.index_keys`, the distinct values found in the
headers of `index.offsets`; store the result in `index.header_values` and return it.
"""
function get_header_values!(index::FileIndex)
    distinct_values = OrderedDict{String, Array}()

    for key in index.index_keys
        field = Symbol(key)
        # The first element of each `index.offsets` entry is the header tuple.
        column = [first(entry)[field] for entry in index.offsets]
        distinct_values[key] = unique(column)
    end

    index.header_values = distinct_values
    return distinct_values
end

get_header_values! (generic function with 1 method)

In [21]:
# Index `grib_path` over every key in `cfgrib.ALL_KEYS`; the trailing `;` hides the output.
i = FileIndex(grib_path, cfgrib.ALL_KEYS);

In [23]:
# Distinct "time" values; `from_gribfile!` stores them as seconds relative to `DEFAULT_EPOCH`.
i.header_values["time"]

4-element Array{Int64,1}:
 1483228800
 1483272000
 1483315200
 1483358400

## Debugging

In [12]:
# Debug copy of the local setup in `from_gribfile!`, kept as globals so the
# following cells can fill and inspect them.
offsets = OrderedDict()
count_offsets = Dict{Int, Int}()

# Index over every key listed in `cfgrib.ALL_KEYS`.
index_keys = cfgrib.ALL_KEYS
index_key_count = length(index_keys)
index_key_symbols = Tuple(Symbol.(index_keys))
# NamedTuple type whose field names are the index keys; the trailing `;` hides the output.
HeaderTuple = NamedTuple{index_key_symbols};

In [13]:
# Debug version of the scan in `from_gribfile!`: same bookkeeping, but the "time"
# key gets no special treatment, so `offsets` ends up with the raw header values.
GribFile(grib_path) do gribfile
    for message in gribfile
        fields = Vector{Any}(undef, index_key_count)
        for (slot, key) in enumerate(index_keys)
            field = missing
            if haskey(message, key)
                field = message[key]
            end
            # Arrays are turned into tuples before being stored in the header.
            if field isa Array
                field = Tuple(field)
            end

            fields[slot] = field
        end

        msg_offset = Int(message["offset"])
        if haskey(count_offsets, msg_offset)
            # Offset seen before: record which repeat this is.
            count_offsets[msg_offset] += 1
            location = (msg_offset, count_offsets[msg_offset])
        else
            count_offsets[msg_offset] = 0
            location = msg_offset
        end

        offsets[HeaderTuple(fields)] = location
    end
end

In [14]:
# Distinct values of every index key across the header tuples stored in `offsets`.
header_values = OrderedDict{String, Array}()
for key in index_keys
    # Iterating the dict yields `header => offset` pairs; the header comes first.
    header_values[key] = unique([first(entry)[Symbol(key)] for entry in offsets])
end

In [15]:
# Header tuples (the keys of `offsets`); the trailing `;` hides the output.
[offset for offset in keys(offsets)];

In [16]:
# "time" field of each header tuple. The debug scan stores `message["time"]` as read,
# without passing it through `from_grib_date_time`.
[offset[:time] for offset in keys(offsets)]

160-element Array{Int64,1}:
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    ⋮
 1200
 1200
 1200
 1200
 1200
 1200
 1200
 1200
 1200
 1200
 1200
 1200

In [17]:
# Opened without a `do` block, so nothing in this cell closes the handle.
f = GribFile(grib_path)

GribFile /home/roscar/work/cfgrib-project/cfgrib.jl/tests/sample-data/era5-levels-members.grib at position 0 in mode r

In [18]:
# Construct a message from the open file `f`.
# NOTE(review): presumably reads at the file's current position; confirm against GRIB.jl.
m = Message(f)

date     gridType       stepRange typeOfLevel       level shortName name
20170101 regular_ll     0         isobaricInhPa     500   z         Geopotential

In [19]:
# Look up the "time" key on the message.
m["time"]

0

1970-01-01T00:00:00

ErrorException: syntax: invalid "::" syntax

In [28]:
# Same lookup again; the recorded run printed key names repeatedly and ended in a
# StackOverflowError.
m["time"]

time
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataD

Excessive output truncated after 524291 bytes.

dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
dataDate
d

StackOverflowError: StackOverflowError:

In [23]:
"""
    from_grib_date_time(message, date_key="dataDate", time_key="dataTime", epoch=DEFAULT_EPOCH)

Return the number of whole seconds between `epoch` and the date/time stored in
`message`.

`message[date_key]` is decoded as the integer `YYYYMMDD` and `message[time_key]` as
the integer `HHMM`. `message` can be any object that supports `message[key]`.
"""
function from_grib_date_time(
    message, date_key="dataDate", time_key="dataTime", epoch=DEFAULT_EPOCH
)
    # Plain indexing, like the other cells of this notebook (`message[key]`).
    packed_date = message[date_key]
    packed_time = message[time_key]

    # Peel the decimal fields off the packed integers.
    year, month_day = divrem(packed_date, 10000)
    month, day = divrem(month_day, 100)
    hour, minute = divrem(packed_time, 100)

    data_datetime = DateTime(year, month, day, hour, minute)

    return Dates.value(Dates.Second(data_datetime - epoch))
end

from_grib_date_time (generic function with 4 methods)

In [123]:
# Convert the message's date/time with the default keys and epoch; the recorded run
# ended in a StackOverflowError.
t = from_grib_date_time(m)

StackOverflowError: StackOverflowError:

## Subindex

In [4]:
using cfgrib

┌ Info: Precompiling cfgrib [cb67cb4b-e5c8-45d4-aff0-3ae3657ca610]
└ @ Base loading.jl:1273


In [5]:
# Build the index through the package-qualified `cfgrib.FileIndex`; the trailing `;`
# hides the output.
index = cfgrib.FileIndex(grib_path, cfgrib.ALL_KEYS);