From 7f405d15464044a7aaa6810379aeabe03d30c507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20Mu=C5=82a?= Date: Wed, 19 Dec 2018 18:07:31 +0100 Subject: [PATCH] Custom pickle mechanism The 'save' method of an automaton and the module-level 'load' function allow an automaton to be saved to a file and loaded back from it. Saving has almost no memory footprint; loading does have one. This mechanism is complementary to pickling. Pickling suffers from high memory consumption; the save/load pair tries to overcome this limitation. --- Automaton.c | 38 ++- Automaton.h | 15 + Automaton_pickle.c | 9 +- allsources.c | 6 + common.h | 2 + pyahocorasick.c | 4 + setup.py | 12 + src/custompickle/custompickle.c | 52 ++++ src/custompickle/custompickle.h | 29 ++ src/custompickle/load/loadbuffer.c | 152 ++++++++++ src/custompickle/load/loadbuffer.h | 43 +++ src/custompickle/load/module_automaton_load.c | 281 ++++++++++++++++++ src/custompickle/load/module_automaton_load.h | 7 + src/custompickle/pyhelpers.c | 61 ++++ src/custompickle/pyhelpers.h | 10 + src/custompickle/save/automaton_save.c | 144 +++++++++ src/custompickle/save/automaton_save.h | 10 + src/custompickle/save/savebuffer.c | 114 +++++++ src/custompickle/save/savebuffer.h | 34 +++ src/pickle/pickle.h | 1 + src/pickle/pickle_data.c | 1 - src/pycallfault/pycallfault.c | 2 +- src/pycallfault/pycallfault.h | 6 +- tests/pickle_stresstest.py | 63 +++- trienode.c | 14 +- unittests.py | 64 ++++ utils.c | 5 +- 27 files changed, 1143 insertions(+), 36 deletions(-) create mode 100644 allsources.c create mode 100644 src/custompickle/custompickle.c create mode 100644 src/custompickle/custompickle.h create mode 100644 src/custompickle/load/loadbuffer.c create mode 100644 src/custompickle/load/loadbuffer.h create mode 100644 src/custompickle/load/module_automaton_load.c create mode 100644 src/custompickle/load/module_automaton_load.h create mode 100644 src/custompickle/pyhelpers.c create mode 100644 src/custompickle/pyhelpers.h create mode 100644 src/custompickle/save/automaton_save.c 
create mode 100644 src/custompickle/save/automaton_save.h create mode 100644 src/custompickle/save/savebuffer.c create mode 100644 src/custompickle/save/savebuffer.h diff --git a/Automaton.c b/Automaton.c index 3b04f1f..b9774b8 100644 --- a/Automaton.c +++ b/Automaton.c @@ -11,6 +11,7 @@ #include "Automaton.h" #include "slist.h" +#include "src/custompickle/save/automaton_save.h" static PyTypeObject automaton_type; @@ -82,24 +83,40 @@ check_key_type(const int store) { } // switch } +static PyObject* +automaton_create() { + + Automaton* automaton; + + automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type); + if (UNLIKELY(automaton == NULL)) { + return NULL; + } + + automaton->kind = EMPTY; + automaton->store = STORE_ANY; + automaton->key_type = KEY_STRING; + automaton->count = 0; + automaton->longest_word = 0; + + automaton->version = 0; + automaton->stats.version = -1; + + automaton->root = NULL; + + return (PyObject*)automaton; +} static PyObject* automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs) { - Automaton* automaton = NULL; + Automaton* automaton; int key_type; int store; - automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type); + automaton = (Automaton*)automaton_create(); if (UNLIKELY(automaton == NULL)) return NULL; - // commons settings - automaton->version = 0; - automaton->stats.version = -1; - automaton->count = 0; - automaton->longest_word = 0; - automaton->kind = EMPTY; - automaton->root = NULL; if (UNLIKELY(PyTuple_Size(args) == 7)) { @@ -336,7 +353,7 @@ automaton_remove_word_aux(PyObject* self, PyObject* args, PyObject** value) { *value = trie_remove_word(automaton, input.word, input.wordlen); destroy_input(&input); - if (UNLIKELY(PyErr_Occurred())) { + if (UNLIKELY(PyErr_Occurred() != NULL)) { return MEMORY_ERROR; } else { return (*value != NULL) ? 
TRUE : FALSE; @@ -1278,6 +1295,7 @@ PyMethodDef automaton_methods[] = { method(dump, METH_NOARGS), method(__reduce__, METH_VARARGS), method(__sizeof__, METH_VARARGS), + method(save, METH_VARARGS), {NULL, NULL, 0, NULL} }; diff --git a/Automaton.h b/Automaton.h index bd99f29..22b0d58 100644 --- a/Automaton.h +++ b/Automaton.h @@ -20,6 +20,10 @@ typedef enum { } AutomatonKind; +static bool +check_kind(const int kind); + + typedef enum { STORE_INTS = 10, STORE_LENGTH = 20, @@ -27,12 +31,20 @@ typedef enum { } KeysStore; +static bool +check_store(const int store); + + typedef enum { KEY_STRING = 100, KEY_SEQUENCE = 200 } KeyType; +static bool +check_key_type(const int key_type); + + struct Input { Py_ssize_t wordlen; TRIE_LETTER_TYPE* word; @@ -77,6 +89,9 @@ automaton_unpickle( PyObject* values ); +static PyObject* +automaton_create(void); + /* __init__ */ static PyObject* automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs); diff --git a/Automaton_pickle.c b/Automaton_pickle.c index 6c0908b..ecd8c6f 100644 --- a/Automaton_pickle.c +++ b/Automaton_pickle.c @@ -23,7 +23,7 @@ Pickling (automaton___reduce__): size of pickled data is calculated. If it is small enough (less than given threshold), all data is saved in a single byte array. Otherwise, data is saved in several byte arrays. 
- + In either case, the format of byte array is the same: * 8 first bytes is number of nodes stored in this chunk of memory @@ -61,19 +61,12 @@ typedef struct DumpState { } DumpState; -// We save all TrieNode's fields except the last one, which is a pointer to array, -// as we're store that array just after the node -#define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(TrieNode**)) -#define PICKLE_POINTER_SIZE (sizeof(TrieNode*)) -#define PICKLE_CHUNK_COUNTER_SIZE (sizeof(Py_ssize_t)) - static size_t get_pickled_size(TrieNode* node) { ASSERT(node != NULL); return PICKLE_TRIENODE_SIZE + node->n * PICKLE_POINTER_SIZE; } - // replace fail with pairs (fail, id) static int pickle_dump_replace_fail_with_id(TrieNode* node, const int depth, void* extra) { diff --git a/allsources.c b/allsources.c new file mode 100644 index 0000000..d97e2fa --- /dev/null +++ b/allsources.c @@ -0,0 +1,6 @@ +#include "src/custompickle/custompickle.c" +#include "src/custompickle/pyhelpers.c" +#include "src/custompickle/save/savebuffer.c" +#include "src/custompickle/save/automaton_save.c" +#include "src/custompickle/load/loadbuffer.c" +#include "src/custompickle/load/module_automaton_load.c" diff --git a/common.h b/common.h index 4c7b4d1..097de42 100644 --- a/common.h +++ b/common.h @@ -68,11 +68,13 @@ # define UNLIKELY(x) __builtin_expect(x, 0) # define ALWAYS_INLINE __attribute__((always_inline)) # define PURE __attribute__((pure)) +# define UNUSED __attribute__((unused)) #else # define LIKELY(x) x # define UNLIKELY(x) x # define ALWAYS_INLINE # define PURE +# define UNUSED #endif #ifdef DEBUG diff --git a/pyahocorasick.c b/pyahocorasick.c index 0ccdb4c..6dcb223 100644 --- a/pyahocorasick.c +++ b/pyahocorasick.c @@ -17,6 +17,7 @@ #include "Automaton.h" #include "AutomatonSearchIter.h" #include "AutomatonItemsIter.h" +#include "src/custompickle/load/module_automaton_load.h" /* code */ #include "utils.c" @@ -29,6 +30,7 @@ #ifdef PYCALLS_INJECT_FAULTS #include "src/pycallfault/pycallfault.c" 
#endif +#include "allsources.c" #define ahocorasick_doc \ @@ -40,6 +42,8 @@ static PyMethodDef ahocorasick_module_methods[] = { + {"load", module_automaton_load, METH_VARARGS, module_automaton_load_doc}, + {NULL, NULL, 0, NULL} }; diff --git a/setup.py b/setup.py index 860e2b8..9f68337 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,18 @@ def get_long_description(): 'src/pickle/pickle.h', 'src/pickle/pickle_data.h', 'src/pickle/pickle_data.c', + 'src/custompickle/custompickle.h', + 'src/custompickle/custompickle.c', + 'src/custompickle/pyhelpers.h', + 'src/custompickle/pyhelpers.c', + 'src/custompickle/save/automaton_save.h', + 'src/custompickle/save/automaton_save.c', + 'src/custompickle/save/savebuffer.h', + 'src/custompickle/save/savebuffer.c', + 'src/custompickle/load/module_automaton_load.h', + 'src/custompickle/load/module_automaton_load.c', + 'src/custompickle/load/loadbuffer.h', + 'src/custompickle/load/loadbuffer.c', 'src/pycallfault/pycallfault.h', 'src/pycallfault/pycallfault.c', ], diff --git a/src/custompickle/custompickle.c b/src/custompickle/custompickle.c new file mode 100644 index 0000000..63eff37 --- /dev/null +++ b/src/custompickle/custompickle.c @@ -0,0 +1,52 @@ +#include "custompickle.h" +#include "../../Automaton.h" + + +static const char CUSTOMPICKLE_MAGICK[16] = { + 'p', 'y', 'a', 'h', 'o', 'c', 'o', 'r', 'a', 's', 'i', 'c', 'k', // signature + '0', '0', '1' // format version +}; + + +void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton) { + + ASSERT(header != NULL); + ASSERT(automaton != NULL); + + memcpy(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)); + header->data.kind = automaton->kind; + header->data.store = automaton->store; + header->data.key_type = automaton->key_type; + header->data.words_count = automaton->count; + header->data.longest_word = automaton->longest_word; +} + + +void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodes_count) { + + ASSERT(footer 
!= NULL); + + memcpy(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)); + footer->nodes_count = nodes_count; +} + +int custompickle_validate_header(CustompickleHeader* header) { + if (memcmp(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) != 0) + return false; + + if (!check_store(header->data.store)) + return false; + + if (!check_kind(header->data.kind)) + return false; + + if (!check_key_type(header->data.key_type)) + return false; + + return true; +} + + +int custompickle_validate_footer(CustompickleFooter* footer) { + return (memcmp(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) == 0); +} diff --git a/src/custompickle/custompickle.h b/src/custompickle/custompickle.h new file mode 100644 index 0000000..da53fcf --- /dev/null +++ b/src/custompickle/custompickle.h @@ -0,0 +1,29 @@ +#pragma once + +#include "../../Automaton.h" + +typedef struct AutomatonData { + AutomatonKind kind; + KeysStore store; + KeyType key_type; + size_t words_count; + int longest_word; +} AutomatonData; + + +typedef struct CustompickleHeader { + char magick[16]; // CUSTOMPICKLE_MAGICK + AutomatonData data; +} CustompickleHeader; + + +typedef struct CustompickleFooter { + size_t nodes_count; + char magick[16]; // CUSTOMPICKLE_MAGICK +} CustompickleFooter; + + +void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton); +void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodescount); +int custompickle_validate_header(CustompickleHeader* header); +int custompickle_validate_footer(CustompickleFooter* footer); diff --git a/src/custompickle/load/loadbuffer.c b/src/custompickle/load/loadbuffer.c new file mode 100644 index 0000000..6753b52 --- /dev/null +++ b/src/custompickle/load/loadbuffer.c @@ -0,0 +1,152 @@ +#include "loadbuffer.h" + + +int +loadbuffer_open(LoadBuffer* input, const char* path, PyObject* deserializer) { + + ASSERT(input != NULL); + ASSERT(path != NULL); + + input->file = NULL; + 
input->lookup = NULL; + input->size = 0; + input->capacity = 0; + input->deserializer = deserializer; + + input->file = fopen(path, "rb"); + if (UNLIKELY(input->file == NULL)) { + PyErr_SetFromErrno(PyExc_IOError); + return 0; + } + + return 1; +} + +int +loadbuffer_load(LoadBuffer* input, char* buffer, size_t size) { + + size_t read; + + ASSERT(input != NULL); + ASSERT(buffer != NULL); + + if (UNLIKELY(size == 0)) { + PyErr_SetString(PyExc_ValueError, "logic error: tried to read 0 bytes"); + return 0; + } + + read = fread(buffer, 1, size, input->file); + if (read != size) { + PyErr_SetFromErrno(PyExc_IOError); + return 0; + } + + return 1; +} + +int +loadbuffer_init(LoadBuffer* input, CustompickleHeader* header, CustompickleFooter* footer) { + + long pos; + int ret; + + ASSERT(input != NULL); + ASSERT(header != NULL); + ASSERT(footer != NULL); + + ret = loadbuffer_loadinto(input, header, CustompickleHeader); + if (UNLIKELY(!ret)) { + return 0; + } + + pos = ftell(input->file); + if (UNLIKELY(pos < 0)) { + PyErr_SetFromErrno(PyExc_IOError); + return 0; + } + + ret = fseek(input->file, -sizeof(CustompickleFooter), SEEK_END); + if (UNLIKELY(ret < 0)) { + PyErr_SetFromErrno(PyExc_IOError); + return 0; + } + + ret = loadbuffer_loadinto(input, footer, CustompickleFooter); + if (UNLIKELY(!ret)) { + return 0; + } + + ret = fseek(input->file, pos, SEEK_SET); + if (UNLIKELY(ret < 0)) { + PyErr_SetFromErrno(PyExc_IOError); + return 0; + } + + if (UNLIKELY(!custompickle_validate_header(header))) { + PyErr_Format(PyExc_ValueError, "invalid header"); + return 0; + } + + if (UNLIKELY(!custompickle_validate_footer(footer))) { + PyErr_Format(PyExc_ValueError, "invalid footer"); + return 0; + } + + input->store = header->data.store; + input->kind = header->data.kind; + input->size = 0; + input->capacity = footer->nodes_count; + input->lookup = (AddressPair*)memory_alloc(sizeof(AddressPair) * input->capacity); + if (UNLIKELY(input->lookup == NULL)) { + PyErr_NoMemory(); + return 0; 
+ } + + return 1; +} + +void +loadbuffer_invalidate(LoadBuffer* input) { + + ASSERT(input != NULL); + + input->size = 0; +} + +void +loadbuffer_close(LoadBuffer* input) { + + TrieNode* node; + size_t i; + + if (input->file != NULL) { + fclose(input->file); + } + + if (input->lookup) { + for (i=0; i < input->size; i++) { + node = input->lookup[i].current; + + if (node->eow && input->store == STORE_ANY) { + Py_DECREF(node->output.object); + } + + trienode_free(node); + } + + memory_free(input->lookup); + } +} + + +void +loadbuffer_dump(LoadBuffer* input, FILE* out) { + + AddressPair* pair; + size_t i; + + for (i=0; i < input->size; i++) { + pair = &(input->lookup[i]); + fprintf(out, "%p -> %p\n", pair->original, pair->current); + } +} diff --git a/src/custompickle/load/loadbuffer.h b/src/custompickle/load/loadbuffer.h new file mode 100644 index 0000000..15d4ae6 --- /dev/null +++ b/src/custompickle/load/loadbuffer.h @@ -0,0 +1,43 @@ +#pragma once + +#include <stdio.h> + +#include "../../../trienode.h" +#include "../custompickle.h" + +typedef struct AddressPair { + TrieNode* original; + TrieNode* current; +} AddressPair; + + +typedef struct LoadBuffer { + PyObject* deserializer; + FILE* file; + KeysStore store; + AutomatonKind kind; + AddressPair* lookup; + size_t size; + size_t capacity; +} LoadBuffer; + +int +loadbuffer_open(LoadBuffer* input, const char* path, PyObject* deserializer); + +int +loadbuffer_load(LoadBuffer* input, char* output, size_t size); + +#define loadbuffer_loadinto(input, variable, type) \ + loadbuffer_load(input, (char*)(variable), sizeof(type)) + +int +loadbuffer_init(LoadBuffer* input, CustompickleHeader* header, CustompickleFooter* footer); + +void +loadbuffer_invalidate(LoadBuffer* input); + +void +loadbuffer_close(LoadBuffer* input); + +void +loadbuffer_dump(LoadBuffer* input, FILE* out); diff --git a/src/custompickle/load/module_automaton_load.c b/src/custompickle/load/module_automaton_load.c new file mode 100644 index 0000000..bf14cd1 --- /dev/null 
+++ b/src/custompickle/load/module_automaton_load.c @@ -0,0 +1,281 @@ +#include "module_automaton_load.h" + +#include "../../../Automaton.h" +#include "loadbuffer.h" + + +// --- public ----------------------------------------------------------- + +static bool +automaton_load_impl(Automaton* automaton, const char* path, PyObject* deserializer); + +PyObject* +module_automaton_load(PyObject* module, PyObject* args) { + + SaveLoadParameters params; + Automaton* automaton; + int ret; + + automaton = (Automaton*)automaton_create(); + if (UNLIKELY(automaton == NULL)) { + return NULL; + } + + if (UNLIKELY(!automaton_save_load_parse_args(automaton->store, args, &params))) { + Py_DECREF(automaton); + return NULL; + } + + ret = automaton_load_impl(automaton, PyBytes_AsString(params.path), params.callback); + Py_DECREF(params.path); + + if (LIKELY(ret)) + return (PyObject*)automaton; + else + return NULL; +} + +// ----private ---------------------------------------------------------- + +static bool +automaton_load_node(LoadBuffer* input); + +static TrieNode* +automaton_load_fixup_pointers(LoadBuffer* input); + +static bool +automaton_load_impl(Automaton* automaton, const char* path, PyObject* deserializer) { + + TrieNode* root; + LoadBuffer input; + CustompickleHeader header; + CustompickleFooter footer; + size_t i; + + if (!loadbuffer_open(&input, path, deserializer)) { + return false; + } + + if (!loadbuffer_init(&input, &header, &footer)) { + goto exception; + } + + if (header.data.kind == TRIE || header.data.kind == AHOCORASICK) { + for (i=0; i < input.capacity; i++) { + if (UNLIKELY(!automaton_load_node(&input))) { + goto exception; + } + } + + root = automaton_load_fixup_pointers(&input); + if (UNLIKELY(root == NULL)) { + goto exception; + } + } else if (header.data.kind == EMPTY) { + + root = NULL; + + } else { + PyErr_SetString(PyExc_ValueError, "automaton kind save in file is invalid"); + goto exception; + } + + loadbuffer_close(&input); + + // setup object + 
automaton->kind = header.data.kind; + automaton->store = header.data.store; + automaton->key_type = header.data.key_type; + automaton->count = header.data.words_count; + automaton->longest_word = header.data.longest_word; + automaton->version = 0; + automaton->stats.version = -1; + automaton->root = root; + + return true; + +exception: + loadbuffer_close(&input); + return false; +} + +static bool +automaton_load_node(LoadBuffer* input) { + + PyObject* bytes; // XXX: it might be reused (i.e. be part of input) + PyObject* object; + TrieNode* original; + TrieNode* node; + size_t size; + int ret; + + // 1. get original address of upcoming node + ret = loadbuffer_loadinto(input, &original, TrieNode*); + if (UNLIKELY(!ret)) { + return false; + } + + // 2. load node data + node = (TrieNode*)memory_alloc(sizeof(TrieNode)); + if (UNLIKELY(node == NULL)) { + PyErr_NoMemory(); + return false; + } + + ret = loadbuffer_load(input, (char*)node, PICKLE_TRIENODE_SIZE); + if (UNLIKELY(!ret)) { + memory_free(node); + return false; + } + + node->next = NULL; + + // 3. load next pointers + if (node->n > 0) { + size = sizeof(TrieNode*) * node->n; + node->next = (TrieNode**)memory_alloc(size); + if (UNLIKELY(node->next == NULL)) { + PyErr_NoMemory(); + goto exception; + } + + ret = loadbuffer_load(input, (char*)(node->next), size); + if (UNLIKELY(!ret)) { + goto exception; + } + } + + // 4. 
load custom python object + if (node->eow && input->store == STORE_ANY) { + size = (size_t)(node->output.integer); + bytes = F(PyBytes_FromStringAndSize)(NULL, size); + if (UNLIKELY(bytes == NULL)) { + goto exception; + } + + ret = loadbuffer_load(input, PyBytes_AS_STRING(bytes), size); + if (UNLIKELY(!ret)) { + Py_DECREF(bytes); + goto exception; + } + + object = F(PyObject_CallFunction)(input->deserializer, "O", bytes); + if (UNLIKELY(object == NULL)) { + Py_DECREF(bytes); + goto exception; + } + + node->output.object = object; + Py_DECREF(bytes); + } + + input->lookup[input->size].original = original; + input->lookup[input->size].current = node; + input->size += 1; + + return true; + +exception: + memory_safefree(node->next); + memory_free(node); + + return false; +} + + +static int +addresspair_cmp(const void* a, const void *b) { + const TrieNode* Aptr; + const TrieNode* Bptr; + uintptr_t A; + uintptr_t B; + + Aptr = ((AddressPair*)a)->original; + Bptr = ((AddressPair*)b)->original; + + A = (uintptr_t)Aptr; + B = (uintptr_t)Bptr; + + if (A < B) { + return -1; + } else if (A > B) { + return +1; + } else { + return 0; + } +} + + +static TrieNode* +lookup_address(LoadBuffer* input, TrieNode* original) { + + AddressPair* pair; + + pair = (AddressPair*)bsearch(&original, + input->lookup, + input->size, + sizeof(AddressPair), + addresspair_cmp); + + if (LIKELY(pair != NULL)) { + return pair->current; + } else { + return NULL; + } +} + + +static bool +automaton_load_fixup_node(LoadBuffer* input, TrieNode* node) { + + size_t i; + + if (input->kind == AHOCORASICK && node->fail != NULL) { + node->fail = lookup_address(input, node->fail); + if (UNLIKELY(node->fail == NULL)) { + return false; + } + } + + if (node->n > 0) { + for (i=0; i < node->n; i++) { + node->next[i] = lookup_address(input, node->next[i]); + if (UNLIKELY(node->next[i] == NULL)) { + return false; + } + } + } + + + return true; +} + + +static TrieNode* +automaton_load_fixup_pointers(LoadBuffer* input) { + 
+ TrieNode* root; + TrieNode* node; + size_t i; + + ASSERT(input != NULL); + + // 1. root is the first node stored in the array + root = input->lookup[0].current; + + // 2. sort array to make it bsearch-able + qsort(input->lookup, input->size, sizeof(AddressPair), addresspair_cmp); + + // 3. convert all next and fail pointers to current pointers + for (i=0; i < input->size; i++) { + node = input->lookup[i].current; + if (UNLIKELY(!automaton_load_fixup_node(input, node))) { + PyErr_Format(PyExc_ValueError, "Detected malformed pointer during unpickling node %lu", i); + return NULL; + } + } + + loadbuffer_invalidate(input); + + return root; +} diff --git a/src/custompickle/load/module_automaton_load.h b/src/custompickle/load/module_automaton_load.h new file mode 100644 index 0000000..2d7bd0c --- /dev/null +++ b/src/custompickle/load/module_automaton_load.h @@ -0,0 +1,7 @@ +#pragma once + +#define module_automaton_load_doc \ + "Load automaton from a file" + +PyObject* +module_automaton_load(PyObject* module, PyObject* args); diff --git a/src/custompickle/pyhelpers.c b/src/custompickle/pyhelpers.c new file mode 100644 index 0000000..32a74fd --- /dev/null +++ b/src/custompickle/pyhelpers.c @@ -0,0 +1,61 @@ +#include "pyhelpers.h" + +bool +automaton_save_load_parse_args(KeysStore store, PyObject* args, SaveLoadParameters* result) { + + PyObject* string; + + if (store == STORE_ANY) { + if (PyTuple_GET_SIZE(args) != 2) { + PyErr_SetString(PyExc_ValueError, "expected exactly two arguments"); + return false; + } + } else { + if (PyTuple_GET_SIZE(args) != 1) { + PyErr_SetString(PyExc_ValueError, "expected exactly one argument"); + return false; + } + } + + string = F(PyTuple_GetItem)(args, 0); + if (UNLIKELY(string == NULL)) { + return false; + } + +#if defined(PY3K) + if (UNLIKELY(!F(PyUnicode_Check)(string))) { + PyErr_SetString(PyExc_TypeError, "the first argument must be a string"); + return false; + } +#else + if (UNLIKELY(!F(PyString_Check)(string))) { + 
PyErr_SetString(PyExc_TypeError, "the first argument must be a string"); + return false; + } +#endif + + if (store == STORE_ANY) { + result->callback = F(PyTuple_GetItem)(args, 1); + if (UNLIKELY(result->callback == NULL)) { + return false; + } + + if (UNLIKELY(!F(PyCallable_Check)(result->callback))) { + PyErr_SetString(PyExc_TypeError, "the second argument must be a callable object"); + return false; + } + } + +#if defined(PY3K) + result->path = F(PyUnicode_AsUTF8String)(string); +#else + result->path = string; + Py_INCREF(string); +#endif + if (UNLIKELY(result->path == NULL)) { + return false; + } + + return true; +} + diff --git a/src/custompickle/pyhelpers.h b/src/custompickle/pyhelpers.h new file mode 100644 index 0000000..d79ac8b --- /dev/null +++ b/src/custompickle/pyhelpers.h @@ -0,0 +1,10 @@ +#pragma once + +typedef struct SaveLoadParameters { + PyObject* path; + PyObject* callback; +} SaveLoadParameters; + +bool +automaton_save_load_parse_args(KeysStore store, PyObject* args, SaveLoadParameters* result); + diff --git a/src/custompickle/save/automaton_save.c b/src/custompickle/save/automaton_save.c new file mode 100644 index 0000000..ac78862 --- /dev/null +++ b/src/custompickle/save/automaton_save.c @@ -0,0 +1,144 @@ +#include "automaton_save.h" + +#include "../custompickle.h" +#include "../pyhelpers.h" +#include "savebuffer.h" + + +// --- public ----------------------------------------------------------- + +static bool +automaton_save_impl(Automaton* automaton, const char* path, PyObject* serializer); + +PyObject* +automaton_save(PyObject* self, PyObject* args) { + + SaveLoadParameters params; + Automaton* automaton; + int ret; + + automaton = (Automaton*)self; + + if (UNLIKELY(!automaton_save_load_parse_args(automaton->store, args, &params))) { + return NULL; + } + + ret = automaton_save_impl(automaton, PyBytes_AsString(params.path), params.callback); + Py_DECREF(params.path); + + if (LIKELY(ret)) + Py_RETURN_NONE; + else + return NULL; +} + +// --- private 
---------------------------------------------------------- + +static int +automaton_save_node(TrieNode* node, const int depth, void* extra); + +static bool +automaton_save_impl(Automaton* automaton, const char* path, PyObject* serializer) { + + CustompickleHeader header; + CustompickleFooter footer; + SaveBuffer output; + int ret; + + ret = savebuffer_init(&output, + serializer, + automaton->store, + path, + SAVEBUFFER_DEFAULT_SIZE); + if (!ret) + return false; + + custompickle_initialize_header(&header, automaton); + + // 1. save header + savebuffer_store(&output, (const char*)&header, sizeof(header)); + + // 2. save nodes + if (automaton->kind != EMPTY) { + trie_traverse(automaton->root, automaton_save_node, &output); + if (UNLIKELY(PyErr_Occurred() != NULL)) { + goto exception; + } + } + + // 3. save footer + custompickle_initialize_footer(&footer, output.nodes_count); + savebuffer_store(&output, (const char*)&footer, sizeof(footer)); + + savebuffer_finalize(&output); + + return true; + +exception: + savebuffer_finalize(&output); + + return false; +} + + +static int +automaton_save_node(TrieNode* node, const int depth, void* extra) { + + SaveBuffer* output; + TrieNode* dump; + TrieNode** arr; + unsigned i; + char* buffer; + PyObject* bytes; + + output = (SaveBuffer*)extra; + + // 1. save actual address of node + savebuffer_store_pointer(output, (void*)node); + + // 2. obtain buffer + buffer = savebuffer_acquire(output, get_pickled_size(node)); + ASSERT(buffer != NULL); // XXX: may fail if node->n is huge + dump = (TrieNode*)(buffer); + + // we do not save the last pointer in array + arr = (TrieNode**)(buffer + PICKLE_TRIENODE_SIZE); + + if (output->store != STORE_ANY) + dump->output.integer = node->output.integer; + + dump->n = node->n; + dump->eow = node->eow; + dump->letter = node->letter; + dump->fail = node->fail; + + // 4. save array of pointers + for (i=0; i < node->n; i++) { + arr[i] = node->next[i]; + } + + // 5. 
pickle python value associated with word + if (node->eow && output->store == STORE_ANY) { + bytes = F(PyObject_CallFunction)(output->serializer, "O", node->output.object); + if (UNLIKELY(bytes == NULL)) { + return 0; + } + + if (UNLIKELY(!F(PyBytes_CheckExact)(bytes))) { + PyErr_SetString(PyExc_TypeError, "serializer must return bytes object"); + return 0; + } + + // 1. save the size of buffer + *(size_t*)(&dump->output.integer) = PyBytes_GET_SIZE(bytes); + + // 2. save the content of buffer + savebuffer_store(output, PyBytes_AS_STRING(bytes), PyBytes_GET_SIZE(bytes)); + + Py_DECREF(bytes); + } + + output->nodes_count += 1; + + return 1; +} diff --git a/src/custompickle/save/automaton_save.h b/src/custompickle/save/automaton_save.h new file mode 100644 index 0000000..87bfedd --- /dev/null +++ b/src/custompickle/save/automaton_save.h @@ -0,0 +1,10 @@ +#pragma once + +#include "../../../common.h" + +#define automaton_save_doc \ + "Save content of automaton in an on-disc file" + +PyObject* +automaton_save(PyObject* self, PyObject* args); + diff --git a/src/custompickle/save/savebuffer.c b/src/custompickle/save/savebuffer.c new file mode 100644 index 0000000..e44c551 --- /dev/null +++ b/src/custompickle/save/savebuffer.c @@ -0,0 +1,114 @@ +#include "savebuffer.h" + +bool +savebuffer_init(SaveBuffer* output, PyObject* serializer, KeysStore store, const char* path, size_t capacity) { + + output->store = store; + output->file = NULL; + output->buffer = NULL; + output->size = 0; + output->capacity = capacity; + output->serializer = serializer; + output->nodes_count = 0; + + if (PICKLE_SIZE_T_SIZE < sizeof(PyObject*)) { + // XXX: this must be reworked, likely to module level + PyErr_SetString(PyExc_SystemError, "unable to save data due to technical reasons"); + return false; + } + + if (UNLIKELY(store == STORE_ANY && serializer == NULL)) { + PyErr_SetString(PyExc_ValueError, "for automatons with STORE_ANY serializer must be given"); + return false; + } + + output->buffer = 
(char*)memory_alloc(capacity); + if (UNLIKELY(output->buffer == NULL)) { + PyErr_NoMemory(); + return false; + } + + output->file = fopen(path, "wb"); + if (output->file == NULL) { + memory_free(output->buffer); + output->buffer = NULL; + PyErr_SetFromErrno(PyExc_IOError); + return false; + } + + return true; +} + + +void +savebuffer_flush(SaveBuffer* output) { + if (output->size != fwrite(output->buffer, 1, output->size, output->file)) { + PyErr_SetFromErrno(PyExc_IOError); + } + + output->size = 0; +} + + +char* +savebuffer_acquire(SaveBuffer* output, size_t request) { + + char* ptr; + + if (UNLIKELY(request > output->capacity)) { + return NULL; + } + + if (UNLIKELY(output->size + request > output->capacity)) { + savebuffer_flush(output); + } + + ptr = output->buffer + output->size; + output->size += request; + + return ptr; +} + + +void +savebuffer_store(SaveBuffer* output, const char* data, size_t size) { + + if (UNLIKELY(size > output->capacity)) { + savebuffer_flush(output); + if (fwrite(data, 1, size, output->file) != size) { + PyErr_SetFromErrno(PyExc_IOError); + } + return; + } + + if (UNLIKELY(output->size + size >= output->capacity)) { + savebuffer_flush(output); + } + + memcpy(output->buffer + output->size, data, size); + output->size += size; +} + + +void +savebuffer_store_pointer(SaveBuffer* save, void* ptr) { + char* buf; + + buf = savebuffer_acquire(save, sizeof(void*)); + *((void**)buf) = ptr; +} + + +void +savebuffer_finalize(SaveBuffer* output) { + + if (output->buffer != NULL && output->file != NULL && output->size > 0) { + savebuffer_flush(output); + } + + xfree(output->buffer); + + if (output->file != NULL) { + fclose(output->file); + } +} diff --git a/src/custompickle/save/savebuffer.h b/src/custompickle/save/savebuffer.h new file mode 100644 index 0000000..4d1ce6d --- /dev/null +++ b/src/custompickle/save/savebuffer.h @@ -0,0 +1,34 @@ +#pragma once + +#include "../../../Automaton.h" + +#define SAVEBUFFER_DEFAULT_SIZE (32 * 1024lu) + +typedef 
struct SaveBuffer { + KeysStore store; + FILE* file; + char* buffer; + size_t size; + size_t capacity; + + PyObject* serializer; + size_t nodes_count; ///< the total number of stored nodes +} SaveBuffer; + +bool +savebuffer_init(SaveBuffer* save, PyObject* serializer, KeysStore store, const char* path, size_t capacity); + +void +savebuffer_flush(SaveBuffer* save); + +char* +savebuffer_acquire(SaveBuffer* save, size_t request); + +void +savebuffer_store(SaveBuffer* save, const char* data, size_t size); + +void +savebuffer_store_pointer(SaveBuffer* save, void* ptr); + +void +savebuffer_finalize(SaveBuffer* save); diff --git a/src/pickle/pickle.h b/src/pickle/pickle.h index 833e60d..90c3a97 100644 --- a/src/pickle/pickle.h +++ b/src/pickle/pickle.h @@ -6,4 +6,5 @@ // as we're store that array just after the node #define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(TrieNode**)) #define PICKLE_POINTER_SIZE (sizeof(TrieNode*)) +#define PICKLE_SIZE_T_SIZE (sizeof(size_t)) #define PICKLE_CHUNK_COUNTER_SIZE (sizeof(Py_ssize_t)) diff --git a/src/pickle/pickle_data.c b/src/pickle/pickle_data.c index a86fd9b..f0e1308 100644 --- a/src/pickle/pickle_data.c +++ b/src/pickle/pickle_data.c @@ -75,7 +75,6 @@ pickle_data__shrink_last_buffer(PickleData* data) { bytes = F(PyList_GetItem)(data->bytes_list, last_idx); if (UNLIKELY(bytes == NULL)) { - puts("HERE?"); return false; } diff --git a/src/pycallfault/pycallfault.c b/src/pycallfault/pycallfault.c index 4a720c3..c137afb 100644 --- a/src/pycallfault/pycallfault.c +++ b/src/pycallfault/pycallfault.c @@ -43,7 +43,7 @@ int check_and_set_error(void) { PyErr_NoMemory(); return 1; } - + return 0; } diff --git a/src/pycallfault/pycallfault.h b/src/pycallfault/pycallfault.h index bb1e8c1..cd24322 100644 --- a/src/pycallfault/pycallfault.h +++ b/src/pycallfault/pycallfault.h @@ -7,7 +7,7 @@ void initialize_pycallfault(void); -// --- python function wrappers ----------------------------------------- +// --- python function wrappers 
----------------------------------------- int check(void); int check_and_set_error(void); @@ -38,8 +38,6 @@ int check_and_set_error(void); #define PyBytes_CheckExact_custom(arg) (check() ? 0 : PyBytes_CheckExact(arg)) -#define PyCallable_Check_custom(arg) (check() ? 0 : PyCallable_Check(arg)) - #define PyNumber_Check_custom(arg) (check() ? 0 : PyNumber_Check(arg)) #define PyTuple_Check_custom(arg) (check() ? 0 : PyTuple_Check(arg)) @@ -52,6 +50,8 @@ int check_and_set_error(void); #define PyUnicode_FromKindAndData_custom(...) (check_and_set_error() ? NULL : PyUnicode_FromKindAndData(__VA_ARGS__)) +#define PyUnicode_AsUTF8String_custom(...) (check_and_set_error() ? NULL : PyUnicode_AsUTF8String(__VA_ARGS__)) + #define PyBytes_FromStringAndSize_custom(...) (check_and_set_error() ? NULL : PyBytes_FromStringAndSize(__VA_ARGS__)) #endif // PYCALLFAULT_H_ diff --git a/tests/pickle_stresstest.py b/tests/pickle_stresstest.py index 9d70bee..c46367b 100644 --- a/tests/pickle_stresstest.py +++ b/tests/pickle_stresstest.py @@ -30,23 +30,38 @@ def __init__(self, options): def run(self): self.A = ahocorasick.Automaton() - if self.options.compare and not self.options.save: + if self.options.compare and (not self.options.pickle and not self.options.save): self.generate_words() - if self.options.save: + if self.options.pickle or self.options.save: self.add_words() + + if self.options.pickle: t1 = time.time() self.pickle() t2 = time.time() print(" time: %0.2fs" % (t2 - t1)) self.A.clear() - if self.options.load: + if self.options.save: + t1 = time.time() + self.save() + t2 = time.time() + print(" time: %0.2fs" % (t2 - t1)) + self.A.clear() + + if self.options.unpickle: t1 = time.time() self.unpickle() t2 = time.time() print(" time: %0.2fs" % (t2 - t1)) + if self.options.load: + t1 = time.time() + self.load() + t2 = time.time() + print(" time: %0.2fs" % (t2 - t1)) + if self.options.compare: self.compare() @@ -129,7 +144,8 @@ def read(self): def pickle(self): path = 
self.options.picklepath - print("Saving automaton in %s" % path) + print("Pickling automaton in %s" % path) + with open(path, 'wb') as f: pickle.dump(self.A, f) @@ -140,11 +156,30 @@ def pickle(self): def unpickle(self): path = self.options.picklepath - print("Loading automaton from %s" % path) + print("Unpickling automaton from %s" % path) with open(path, 'rb') as f: self.A = pickle.load(f) + def save(self): + path = self.options.picklepath + + print("Saving automaton in %s" % path) + + self.A.save(path, pickle.dumps); + + size = os.path.getsize(path) + print(" file size is %s" % format_size(size)) + + + def load(self): + path = self.options.picklepath + + print("Loading automaton from %s" % path) + + self.A = ahocorasick.load(path, pickle.loads) + + def compare(self): print("Comparing added words with restored automaton") @@ -187,15 +222,26 @@ def parse_args(): ) parser.add_option( - "-p", "--pickle", dest='save', action='store_true', default=False, + "-p", "--pickle", dest='pickle', action='store_true', default=False, help="perform pickle operation on generated/loaded words" ) parser.add_option( - "-u", "--unpickle", dest='load', action='store_true', default=False, + "-u", "--unpickle", dest='unpickle', action='store_true', default=False, help="perform unpickle operation on previously pickled data" ) + parser.add_option( + "-s", "--save", dest='save', action='store_true', default=False, + help="perform save operation on generated/loaded words" + ) + + parser.add_option( + "-l", "--load", dest='load', action='store_true', default=False, + help="perform load operation on previously saved data" + ) + + parser.add_option( "-c", "--compare", action='store_true', default=False, help="compare generated/loaded words with unpickled data" @@ -231,6 +277,9 @@ def parse_args(): if not (options.file_gz or options.random): raise parser.error("pass --random or --file-gz option") + if (options.pickle or options.unpickle) and (options.save or options.load): + raise 
parser.error("use separately --pickle/--unpickle and --save/--load") + return options diff --git a/trienode.c b/trienode.c index 2ce2183..5b16572 100644 --- a/trienode.c +++ b/trienode.c @@ -155,7 +155,7 @@ void trienode_dump_layout() { #endif -static void +UNUSED static void trienode_dump_to_file(TrieNode* node, FILE* f) { unsigned i; @@ -173,10 +173,14 @@ trienode_dump_to_file(TrieNode* node, FILE* f) { fprintf(f, "- fail: %p\n", node->fail); if (node->n > 0) { - fprintf(f, "- %d next: [%p", node->n, node->next[0]); - for (i=1; i < node->n; i++) - fprintf(f, ", %p", node->next[i]); - fprintf(f, "]\n"); + if (node->next == NULL) { + fprintf(f, "- %d next: %p\n", node->n, node->next); + } else { + fprintf(f, "- %d next: [%p", node->n, node->next[0]); + for (i=1; i < node->n; i++) + fprintf(f, ", %p", node->next[i]); + fprintf(f, "]\n"); + } } } diff --git a/unittests.py b/unittests.py index 7c87359..0cec48c 100644 --- a/unittests.py +++ b/unittests.py @@ -1422,6 +1422,70 @@ def test_case1(self): pass +class TestLoadSave(TestAutomatonBase): + + def test_save__invalid_number_of_arguments(self): + A = self.add_words_and_make_automaton(); + with self.assertRaisesRegex(ValueError, "expected exactly two arguments"): + A.save() + + + def test_save__invalid_argument_1(self): + A = self.add_words_and_make_automaton(); + with self.assertRaisesRegex(TypeError, "the first argument must be a string"): + A.save(None, None) + + + def test_save__invalid_argument_2(self): + A = self.add_words_and_make_automaton(); + with self.assertRaisesRegex(TypeError, "the second argument must be a callable object"): + A.save("/dev/shm/test.dump", None) + + + def test_save(self): + import pickle + + A = self.add_words_and_make_automaton(); + def serializer(obj): + return pickle.dumps(obj) + + path = "/dev/shm/test.dump" + A.save(path, serializer) + + + def test_save_and_load(self): + import pickle + + A = self.add_words_and_make_automaton(); + def serializer(obj): + bytes = pickle.dumps(obj) + 
return bytes + + path = "/dev/shm/test.dump" + A.save(path, serializer) + + def deserializer(bytes): + return pickle.loads(bytes) + + B = ahocorasick.load(path, deserializer) + + self.compare_automatons(A, B) + + + def compare_automatons(self, A, B): + if print_dumps: + print([x for x in B.items()]) + print([x for x in A.items()]) + + self.assertEqual(len(A), len(B)) + + for item in zip(A.items(), B.items()): + (AK, AV), (BK, BV) = item + + self.assertEqual(AK, BK) + self.assertEqual(AV, BV) + + if __name__ == '__main__': unittest.main() diff --git a/utils.c b/utils.c index 86e32b1..5fd0663 100644 --- a/utils.c +++ b/utils.c @@ -68,7 +68,7 @@ void* memory_alloc(ssize_t size) { if (alloc_dump) fprintf(debug_file, "A %d %p %ld\n", alloc_num, res, size); #endif - +# return res; } @@ -88,6 +88,9 @@ void xfree(void* ptr) { } +#define memory_safefree xfree + + #if !defined(PY3K) || !defined(AHOCORASICK_UNICODE) // define when pymod_get_string makes a copy of string # define INPUT_KEEPS_COPY