Skip to content

Commit

Permalink
Merge pull request #105 from WojciechMula/custom-save-load
Browse files Browse the repository at this point in the history
Custom save load
  • Loading branch information
WojciechMula committed Dec 20, 2018
2 parents e3ef36c + 8c5de6c commit bd8a541
Show file tree
Hide file tree
Showing 29 changed files with 1,282 additions and 38 deletions.
38 changes: 28 additions & 10 deletions Automaton.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "Automaton.h"
#include "slist.h"
#include "src/custompickle/save/automaton_save.h"

static PyTypeObject automaton_type;

Expand Down Expand Up @@ -82,24 +83,40 @@ check_key_type(const int store) {
} // switch
}

/*
 * Allocate a new Automaton object and put it into its default, empty state:
 * kind EMPTY, store STORE_ANY, key type KEY_STRING, no words, no root node.
 *
 * Returns the new object cast to PyObject*, or NULL when the PyObject_New
 * call yields NULL.
 *
 * The parameter list is spelled (void) so that the definition is a real
 * prototype and matches the declaration in Automaton.h.
 */
static PyObject*
automaton_create(void) {

    Automaton* automaton;

    automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type);
    if (UNLIKELY(automaton == NULL)) {
        return NULL;
    }

    // default configuration of a freshly created automaton
    automaton->kind = EMPTY;
    automaton->store = STORE_ANY;
    automaton->key_type = KEY_STRING;
    automaton->count = 0;
    automaton->longest_word = 0;

    // NOTE(review): stats.version intentionally differs from version;
    // presumably this marks the cached statistics as out of date -- confirm.
    automaton->version = 0;
    automaton->stats.version = -1;

    // no trie nodes yet
    automaton->root = NULL;

    return (PyObject*)automaton;
}

static PyObject*
automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs) {
Automaton* automaton = NULL;
Automaton* automaton;
int key_type;
int store;

automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type);
automaton = (Automaton*)automaton_create();
if (UNLIKELY(automaton == NULL))
return NULL;

// commons settings
automaton->version = 0;
automaton->stats.version = -1;
automaton->count = 0;
automaton->longest_word = 0;
automaton->kind = EMPTY;
automaton->root = NULL;

if (UNLIKELY(PyTuple_Size(args) == 7)) {

Expand Down Expand Up @@ -336,7 +353,7 @@ automaton_remove_word_aux(PyObject* self, PyObject* args, PyObject** value) {
*value = trie_remove_word(automaton, input.word, input.wordlen);
destroy_input(&input);

if (UNLIKELY(PyErr_Occurred())) {
if (UNLIKELY(PyErr_Occurred() != NULL)) {
return MEMORY_ERROR;
} else {
return (*value != NULL) ? TRUE : FALSE;
Expand Down Expand Up @@ -1278,6 +1295,7 @@ PyMethodDef automaton_methods[] = {
method(dump, METH_NOARGS),
method(__reduce__, METH_VARARGS),
method(__sizeof__, METH_VARARGS),
method(save, METH_VARARGS),

{NULL, NULL, 0, NULL}
};
Expand Down
15 changes: 15 additions & 0 deletions Automaton.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,31 @@ typedef enum {
} AutomatonKind;


// Checks an integer that is meant to be an AutomatonKind value; the
// definition is not part of this header.
// NOTE(review): presumably returns true only for valid AutomatonKind
// enumerators -- confirm against the definition.
static bool
check_kind(const int kind);


// Selects what kind of values an automaton stores (exposed as the read-only
// ``store`` attribute according to the README).
typedef enum {
STORE_INTS = 10, // NOTE(review): presumably integer values -- confirm
STORE_LENGTH = 20, // NOTE(review): presumably the key length is the value -- confirm
STORE_ANY = 30 // arbitrary Python objects (per README); default set by automaton_create
} KeysStore;


// Checks an integer that is meant to be a KeysStore value; the definition
// is not part of this header.
// NOTE(review): presumably returns true only for valid KeysStore
// enumerators -- confirm against the definition.
static bool
check_store(const int store);


// Selects the type of keys an automaton works with.
typedef enum {
KEY_STRING = 100, // default set by automaton_create
KEY_SEQUENCE = 200 // NOTE(review): presumably keys are sequences of integers -- confirm
} KeyType;


// Checks an integer that is meant to be a KeyType value; the definition
// is not part of this header.
// NOTE(review): presumably returns true only for valid KeyType
// enumerators -- confirm against the definition.
static bool
check_key_type(const int key_type);


struct Input {
Py_ssize_t wordlen;
TRIE_LETTER_TYPE* word;
Expand Down Expand Up @@ -77,6 +89,9 @@ automaton_unpickle(
PyObject* values
);

/* Creates an empty automaton with default settings (kind EMPTY, store
   STORE_ANY, key type KEY_STRING); returns NULL when allocation fails. */
static PyObject*
automaton_create(void);

/* __init__ */
static PyObject*
automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs);
Expand Down
9 changes: 1 addition & 8 deletions Automaton_pickle.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Pickling (automaton___reduce__):
size of pickled data is calculated. If it is small enough
(less than given threshold), all data is saved in a single
byte array. Otherwise, data is saved in several byte arrays.
In either case, the format of the byte array is the same:
* the first 8 bytes hold the number of nodes stored in this
chunk of memory
Expand Down Expand Up @@ -61,19 +61,12 @@ typedef struct DumpState {
} DumpState;


// We save all of TrieNode's fields except the last one, which is a pointer
// to an array, because that array is stored right after the node.
#define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(TrieNode**))
// Size of a single node pointer as stored in the pickled data.
#define PICKLE_POINTER_SIZE (sizeof(TrieNode*))
// Size of the node counter kept in each chunk of pickled data.
#define PICKLE_CHUNK_COUNTER_SIZE (sizeof(Py_ssize_t))

// Number of bytes a single node takes in the pickled data: the fixed-size
// part of the node plus node->n pointer-sized slots for the array that is
// stored right after it.
static size_t
get_pickled_size(TrieNode* node) {
    size_t array_bytes;

    ASSERT(node != NULL);

    array_bytes = node->n * PICKLE_POINTER_SIZE;
    return PICKLE_TRIENODE_SIZE + array_bytes;
}


// replace fail with pairs (fail, id)
static int
pickle_dump_replace_fail_with_id(TrieNode* node, const int depth, void* extra) {
Expand Down
75 changes: 75 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,81 @@ The Automaton class has the following attributes:
``store`` [readonly]
Return the type of values stored in the Automaton as specified at creation.


Saving and loading automaton
----------------------------

There is support for two methods of saving and loading an automaton:

* the standard ``pickle`` protocol,
* custom ``save`` and ``load`` methods.

While pickling is more convenient to use, it has quite high memory
requirements. The ``save``/``load`` methods try to overcome this
problem.

.. warning::

    Neither the pickle format nor the save format is safe. Although
    there are a few sanity checks, they are not sufficient to detect
    all possible input errors.


Pickle
======

.. code:: python

    import ahocorasick
    import pickle

    # build automaton
    A = ahocorasick.Automaton()
    # ... A.add_data, A.make_automaton

    # save current state
    with open(path, 'wb') as f:
        pickle.dump(A, f)

    # load saved state
    with open(path, 'rb') as f:
        B = pickle.load(f)

Save/load methods
=================

.. code:: python

    import ahocorasick
    import pickle

    # build automaton
    A = ahocorasick.Automaton()
    # ... A.add_data, A.make_automaton

    # save current state
    A.save(path, pickle.dumps)

    # load saved state
    B = ahocorasick.load(path, pickle.loads)

The Automaton method ``save`` requires the ``path`` of the file in which the
data will be stored. If the automaton type is ``STORE_ANY``, i.e. the values
associated with words are arbitrary Python objects, then ``save`` also requires
a second argument, a callable. The callable serializes a Python object into
bytes; in the example above we use the standard pickle ``dumps`` function.

The module-level function ``load`` also requires the ``path`` of a file with
previously saved data. Because the ``store`` attribute of the automaton is not
known before the data is loaded, the second argument - a callable - is always
required. The callable must convert a given bytes object back into the Python
value that will be stored in the automaton. Similarly, the standard
``pickle.loads`` function can be passed.


Other Automaton methods
-----------------------

Expand Down
6 changes: 6 additions & 0 deletions allsources.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include "src/custompickle/custompickle.c"
#include "src/custompickle/pyhelpers.c"
#include "src/custompickle/save/savebuffer.c"
#include "src/custompickle/save/automaton_save.c"
#include "src/custompickle/load/loadbuffer.c"
#include "src/custompickle/load/module_automaton_load.c"
2 changes: 2 additions & 0 deletions common.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@
# define UNLIKELY(x) __builtin_expect(x, 0)
# define ALWAYS_INLINE __attribute__((always_inline))
# define PURE __attribute__((pure))
# define UNUSED __attribute__((unused))
#else
/* Fallback branch (no __builtin_expect): plain pass-through of the condition.
   The expansion is parenthesized so the macro always behaves as a single
   operand, e.g. !UNLIKELY(a == b) negates the whole comparison. */
# define LIKELY(x) (x)
# define UNLIKELY(x) (x)
# define ALWAYS_INLINE
# define PURE
# define UNUSED
#endif

#ifdef DEBUG
Expand Down
4 changes: 4 additions & 0 deletions pyahocorasick.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "Automaton.h"
#include "AutomatonSearchIter.h"
#include "AutomatonItemsIter.h"
#include "src/custompickle/load/module_automaton_load.h"

/* code */
#include "utils.c"
Expand All @@ -29,6 +30,7 @@
#ifdef PYCALLS_INJECT_FAULTS
#include "src/pycallfault/pycallfault.c"
#endif
#include "allsources.c"


#define ahocorasick_doc \
Expand All @@ -40,6 +42,8 @@
/* Method table for the module-level functions. */
static PyMethodDef ahocorasick_module_methods[] = {
    /* "load" is implemented by module_automaton_load */
    {
        "load",
        module_automaton_load,
        METH_VARARGS,
        module_automaton_load_doc
    },

    /* sentinel entry terminating the table */
    {NULL, NULL, 0, NULL}
};

Expand Down
1 change: 1 addition & 0 deletions runtest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ function handle_valgrind
function run_mallocfaults
{
# obtain max allocation number
unset ALLOC_FAIL
run_unittests

local MINID=0
Expand Down
12 changes: 12 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ def get_long_description():
'src/pickle/pickle.h',
'src/pickle/pickle_data.h',
'src/pickle/pickle_data.c',
'src/custompickle/custompickle.h',
'src/custompickle/custompickle.c',
'src/custompickle/pyhelpers.h',
'src/custompickle/pyhelpers.c',
'src/custompickle/save/automaton_save.h',
'src/custompickle/save/automaton_save.c',
'src/custompickle/save/savebuffer.h',
'src/custompickle/save/savebuffer.c',
'src/custompickle/load/module_automaton_load.h',
'src/custompickle/load/module_automaton_load.c',
'src/custompickle/load/loadbuffer.h',
'src/custompickle/load/loadbuffer.c',
'src/pycallfault/pycallfault.h',
'src/pycallfault/pycallfault.c',
],
Expand Down
52 changes: 52 additions & 0 deletions src/custompickle/custompickle.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "custompickle.h"
#include "../../Automaton.h"


// Marker copied into both the header and the footer of the saved data:
// a 13-character signature followed by a 3-character format version.
// The array holds exactly 16 characters and is NOT NUL-terminated.
static const char CUSTOMPICKLE_MAGICK[16] = {
    // signature: "pyahocorasick"
    'p', 'y', 'a', 'h', 'o',
    'c', 'o', 'r', 'a', 's', 'i', 'c', 'k',

    // format version: "001"
    '0', '0', '1'
};


/*
 * Fill `header` with the magic marker and the automaton-level properties
 * (kind, store, key type, number of words, longest word) of `automaton`.
 *
 * Both pointers must be non-NULL (checked with ASSERT).
 */
void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton) {

    ASSERT(header != NULL);
    ASSERT(automaton != NULL);

    // Clear the whole structure first: the compiler may insert padding
    // between the fields, and that padding would otherwise keep whatever
    // bytes the caller's storage held. NOTE(review): the header is presumably
    // written out byte-for-byte by the save code (not visible here); zeroing
    // keeps such output deterministic and free of stray memory contents.
    memset(header, 0, sizeof(*header));

    memcpy(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK));
    header->data.kind = automaton->kind;
    header->data.store = automaton->store;
    header->data.key_type = automaton->key_type;
    header->data.words_count = automaton->count;
    header->data.longest_word = automaton->longest_word;
}


/*
 * Fill `footer` with the given node count and the magic marker.
 * `footer` must be non-NULL (checked with ASSERT).
 */
void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodes_count) {

    ASSERT(footer != NULL);

    footer->nodes_count = nodes_count;
    memcpy(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK));
}

/*
 * Check a header: the magic marker must match CUSTOMPICKLE_MAGICK and the
 * store, kind and key type fields must each pass their check_* function.
 *
 * Returns 1 when every check passes, 0 otherwise. The checks run in the
 * order listed and stop at the first failure.
 */
int custompickle_validate_header(CustompickleHeader* header) {
    const int magick_matches =
        (memcmp(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) == 0);

    return magick_matches
        && check_store(header->data.store)
        && check_kind(header->data.kind)
        && check_key_type(header->data.key_type);
}


/*
 * Check a footer: returns 1 when its magic marker equals
 * CUSTOMPICKLE_MAGICK, 0 otherwise. The node count is not inspected.
 */
int custompickle_validate_footer(CustompickleFooter* footer) {
    if (memcmp(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) != 0) {
        return 0;
    }

    return 1;
}
29 changes: 29 additions & 0 deletions src/custompickle/custompickle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include "../../Automaton.h"

// Automaton-level properties kept in the custom-pickle header.
// custompickle_initialize_header copies each field from the Automaton.
typedef struct AutomatonData {
AutomatonKind kind; // from automaton->kind
KeysStore store; // from automaton->store
KeyType key_type; // from automaton->key_type
size_t words_count; // from automaton->count
int longest_word; // from automaton->longest_word
} AutomatonData;


// Header of the custom-pickle data: magic marker followed by the automaton
// properties. Checked by custompickle_validate_header.
typedef struct CustompickleHeader {
char magick[16]; // CUSTOMPICKLE_MAGICK
AutomatonData data; // kind, store, key type and counters of the automaton
} CustompickleHeader;


// Footer of the custom-pickle data: node count followed by the magic marker.
// Checked by custompickle_validate_footer (marker only).
typedef struct CustompickleFooter {
size_t nodes_count; // value passed to custompickle_initialize_footer
char magick[16]; // CUSTOMPICKLE_MAGICK
} CustompickleFooter;


// Fills `header` with the magic marker and the properties of `automaton`.
void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton);
// Fills `footer` with the magic marker and the given number of nodes.
void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodescount);
// Returns non-zero when the header's marker, store, kind and key type are all valid.
int custompickle_validate_header(CustompickleHeader* header);
// Returns non-zero when the footer's marker matches.
int custompickle_validate_footer(CustompickleFooter* footer);

0 comments on commit bd8a541

Please sign in to comment.