Skip to content

Commit

Permalink
Custom pickle mechanism
Browse files Browse the repository at this point in the history
The automaton's 'save' method and the module's 'load' function
allow saving an automaton to a file and loading it back. Saving
has almost no memory footprint; loading does have one.

This mechanism is complementary to pickling. Pickling
suffers from high memory consumption; the save/load pair
tries to overcome this limitation.
  • Loading branch information
WojciechMula committed Dec 20, 2018
1 parent b28b362 commit 7f405d1
Show file tree
Hide file tree
Showing 27 changed files with 1,143 additions and 36 deletions.
38 changes: 28 additions & 10 deletions Automaton.c
Expand Up @@ -11,6 +11,7 @@

#include "Automaton.h"
#include "slist.h"
#include "src/custompickle/save/automaton_save.h"

static PyTypeObject automaton_type;

Expand Down Expand Up @@ -82,24 +83,40 @@ check_key_type(const int store) {
} // switch
}

/*
 * Allocate an Automaton object and put it into its default, empty state.
 *
 * Returns the new object cast to PyObject*, or NULL when the allocation
 * call returned NULL.
 *
 * Defaults set here: kind EMPTY, store STORE_ANY, key type KEY_STRING,
 * zero words, longest word 0, no trie root. Callers (e.g. automaton_new)
 * are expected to override store/key_type as needed.
 *
 * The parameter list is spelled `(void)` so the definition is a real
 * prototype and matches the declaration in Automaton.h.
 */
static PyObject*
automaton_create(void) {

    Automaton* automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type);
    if (UNLIKELY(automaton == NULL)) {
        return NULL;
    }

    // configuration
    automaton->kind = EMPTY;
    automaton->store = STORE_ANY;
    automaton->key_type = KEY_STRING;

    // contents
    automaton->count = 0;
    automaton->longest_word = 0;
    automaton->root = NULL;

    // NOTE(review): stats.version differs from version on purpose, presumably
    // so cached statistics are treated as stale -- confirm against the stats code.
    automaton->version = 0;
    automaton->stats.version = -1;

    return (PyObject*)automaton;
}

static PyObject*
automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs) {
Automaton* automaton = NULL;
Automaton* automaton;
int key_type;
int store;

automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type);
automaton = (Automaton*)automaton_create();
if (UNLIKELY(automaton == NULL))
return NULL;

// commons settings
automaton->version = 0;
automaton->stats.version = -1;
automaton->count = 0;
automaton->longest_word = 0;
automaton->kind = EMPTY;
automaton->root = NULL;

if (UNLIKELY(PyTuple_Size(args) == 7)) {

Expand Down Expand Up @@ -336,7 +353,7 @@ automaton_remove_word_aux(PyObject* self, PyObject* args, PyObject** value) {
*value = trie_remove_word(automaton, input.word, input.wordlen);
destroy_input(&input);

if (UNLIKELY(PyErr_Occurred())) {
if (UNLIKELY(PyErr_Occurred() != NULL)) {
return MEMORY_ERROR;
} else {
return (*value != NULL) ? TRUE : FALSE;
Expand Down Expand Up @@ -1278,6 +1295,7 @@ PyMethodDef automaton_methods[] = {
method(dump, METH_NOARGS),
method(__reduce__, METH_VARARGS),
method(__sizeof__, METH_VARARGS),
method(save, METH_VARARGS),

{NULL, NULL, 0, NULL}
};
Expand Down
15 changes: 15 additions & 0 deletions Automaton.h
Expand Up @@ -20,19 +20,31 @@ typedef enum {
} AutomatonKind;


// Forward declaration: reports whether `kind` is an accepted AutomatonKind
// value. custompickle_validate_header() rejects a header when this returns false.
static bool
check_kind(const int kind);


// Storage mode of an automaton (the `store` field); automaton_create()
// defaults it to STORE_ANY.
// NOTE(review): the names suggest "integers", "key length" and "any object"
// as the stored values -- confirm against the code that consumes `store`.
// The numeric values are written into CustompickleHeader, so do not renumber.
typedef enum {
STORE_INTS = 10,
STORE_LENGTH = 20,
STORE_ANY = 30
} KeysStore;


// Forward declaration: reports whether `store` is an accepted KeysStore
// value. custompickle_validate_header() rejects a header when this returns false.
static bool
check_store(const int store);


// Kind of keys an automaton accepts (the `key_type` field); automaton_create()
// defaults it to KEY_STRING.
// NOTE(review): KEY_SEQUENCE presumably means a sequence of integers -- confirm.
// The numeric values are written into CustompickleHeader, so do not renumber.
typedef enum {
KEY_STRING = 100,
KEY_SEQUENCE = 200
} KeyType;


// Forward declaration: reports whether `key_type` is an accepted KeyType
// value. custompickle_validate_header() rejects a header when this returns false.
static bool
check_key_type(const int key_type);


struct Input {
Py_ssize_t wordlen;
TRIE_LETTER_TYPE* word;
Expand Down Expand Up @@ -77,6 +89,9 @@ automaton_unpickle(
PyObject* values
);

// Allocates an Automaton initialised to the empty default state
// (EMPTY / STORE_ANY / KEY_STRING, no words, no root).
// Returns NULL when the allocation fails.
static PyObject*
automaton_create(void);

/* __init__: obtains a default automaton from automaton_create() and then
   configures it from `args`; returns NULL if the allocation failed. */
static PyObject*
automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs);
Expand Down
9 changes: 1 addition & 8 deletions Automaton_pickle.c
Expand Up @@ -23,7 +23,7 @@ Pickling (automaton___reduce__):
size of pickled data is calculated. If it is small enough
(less than given threshold), all data is saved in a single
byte array. Otherwise, data is saved in several byte arrays.
In either case, the format of byte array is the same:
* the first 8 bytes hold the number of nodes stored in this
chunk of memory
Expand Down Expand Up @@ -61,19 +61,12 @@ typedef struct DumpState {
} DumpState;


// We save all of TrieNode's fields except the last one, which is a pointer
// to an array, because that array is stored right after the node.
#define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(TrieNode**))
// Size of one entry of the pointer array that follows a node;
// get_pickled_size() adds node->n of these.
#define PICKLE_POINTER_SIZE (sizeof(TrieNode*))
// NOTE(review): presumably the size of the node counter stored at the start
// of each chunk -- confirm against the dump/load code.
#define PICKLE_CHUNK_COUNTER_SIZE (sizeof(Py_ssize_t))

// Bytes needed to pickle a single node: the fixed part of TrieNode plus
// one pointer-sized slot for each of its node->n array entries.
// `node` must not be NULL.
static size_t
get_pickled_size(TrieNode* node) {
    size_t pointers_size;

    ASSERT(node != NULL);

    pointers_size = node->n * PICKLE_POINTER_SIZE;
    return PICKLE_TRIENODE_SIZE + pointers_size;
}


// replace fail with pairs (fail, id)
static int
pickle_dump_replace_fail_with_id(TrieNode* node, const int depth, void* extra) {
Expand Down
6 changes: 6 additions & 0 deletions allsources.c
@@ -0,0 +1,6 @@
#include "src/custompickle/custompickle.c"
#include "src/custompickle/pyhelpers.c"
#include "src/custompickle/save/savebuffer.c"
#include "src/custompickle/save/automaton_save.c"
#include "src/custompickle/load/loadbuffer.c"
#include "src/custompickle/load/module_automaton_load.c"
2 changes: 2 additions & 0 deletions common.h
Expand Up @@ -68,11 +68,13 @@
# define UNLIKELY(x) __builtin_expect(x, 0)
# define ALWAYS_INLINE __attribute__((always_inline))
# define PURE __attribute__((pure))
# define UNUSED __attribute__((unused))
#else
# define LIKELY(x) x
# define UNLIKELY(x) x
# define ALWAYS_INLINE
# define PURE
# define UNUSED
#endif

#ifdef DEBUG
Expand Down
4 changes: 4 additions & 0 deletions pyahocorasick.c
Expand Up @@ -17,6 +17,7 @@
#include "Automaton.h"
#include "AutomatonSearchIter.h"
#include "AutomatonItemsIter.h"
#include "src/custompickle/load/module_automaton_load.h"

/* code */
#include "utils.c"
Expand All @@ -29,6 +30,7 @@
#ifdef PYCALLS_INJECT_FAULTS
#include "src/pycallfault/pycallfault.c"
#endif
#include "allsources.c"


#define ahocorasick_doc \
Expand All @@ -40,6 +42,8 @@
/* Method table of the module itself (as opposed to the Automaton type). */
static PyMethodDef ahocorasick_module_methods[] = {
    /* load: implemented by module_automaton_load */
    {
        "load",
        module_automaton_load,
        METH_VARARGS,
        module_automaton_load_doc
    },

    /* sentinel entry terminating the table */
    {NULL, NULL, 0, NULL}
};

Expand Down
12 changes: 12 additions & 0 deletions setup.py
Expand Up @@ -65,6 +65,18 @@ def get_long_description():
'src/pickle/pickle.h',
'src/pickle/pickle_data.h',
'src/pickle/pickle_data.c',
'src/custompickle/custompickle.h',
'src/custompickle/custompickle.c',
'src/custompickle/pyhelpers.h',
'src/custompickle/pyhelpers.c',
'src/custompickle/save/automaton_save.h',
'src/custompickle/save/automaton_save.c',
'src/custompickle/save/savebuffer.h',
'src/custompickle/save/savebuffer.c',
'src/custompickle/load/module_automaton_load.h',
'src/custompickle/load/module_automaton_load.c',
'src/custompickle/load/loadbuffer.h',
'src/custompickle/load/loadbuffer.c',
'src/pycallfault/pycallfault.h',
'src/pycallfault/pycallfault.c',
],
Expand Down
52 changes: 52 additions & 0 deletions src/custompickle/custompickle.c
@@ -0,0 +1,52 @@
#include "custompickle.h"
#include "../../Automaton.h"


// Signature copied into both CustompickleHeader and CustompickleFooter:
// 13 signature characters followed by a 3-character format version.
// The array is exactly 16 bytes with NO terminating NUL, so it must be
// handled with memcpy/memcmp and sizeof, never with the str* functions.
static const char CUSTOMPICKLE_MAGICK[16] = {
'p', 'y', 'a', 'h', 'o', 'c', 'o', 'r', 'a', 's', 'i', 'c', 'k', // signature
'0', '0', '1' // format version
};


/*
 * Fill `header` with the file signature and the settings of `automaton`.
 *
 * header    - structure to initialise; must not be NULL
 * automaton - source of kind, store, key_type, count and longest_word;
 *             must not be NULL
 */
void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton) {

    ASSERT(header != NULL);
    ASSERT(automaton != NULL);

    // Zero the whole structure first: AutomatonData mixes int-sized and
    // size_t-sized members, so the compiler may insert padding. Without this
    // those bytes would be indeterminate, which matters if the header is
    // written out as raw bytes (non-reproducible output, stray memory content).
    memset(header, 0, sizeof(*header));

    memcpy(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK));

    header->data.kind         = automaton->kind;
    header->data.store        = automaton->store;
    header->data.key_type     = automaton->key_type;
    header->data.words_count  = automaton->count;
    header->data.longest_word = automaton->longest_word;
}


/*
 * Fill `footer` with the node counter and the file signature.
 *
 * footer      - structure to initialise; must not be NULL
 * nodes_count - value stored verbatim in footer->nodes_count
 */
void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodes_count) {

    ASSERT(footer != NULL);

    footer->nodes_count = nodes_count;
    memcpy(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK));
}

/*
 * Check that `header` carries the expected signature and that its store,
 * kind and key_type fields hold accepted values.
 *
 * Returns non-zero (true) when every check passes, zero (false) otherwise.
 * Checks run in the order signature, store, kind, key_type and stop at the
 * first failure.
 */
int custompickle_validate_header(CustompickleHeader* header) {
    return memcmp(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) == 0
        && check_store(header->data.store)
        && check_kind(header->data.kind)
        && check_key_type(header->data.key_type);
}


/*
 * Check that `footer` carries the expected signature.
 *
 * Returns 1 when footer->magick equals CUSTOMPICKLE_MAGICK, 0 otherwise.
 * The nodes_count field is not inspected.
 */
int custompickle_validate_footer(CustompickleFooter* footer) {
    const int difference = memcmp(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK));

    return difference == 0;
}
29 changes: 29 additions & 0 deletions src/custompickle/custompickle.h
@@ -0,0 +1,29 @@
#pragma once

#include "../../Automaton.h"

// Automaton settings carried inside CustompickleHeader.
// custompickle_initialize_header() copies each field from the Automaton;
// custompickle_validate_header() checks kind, store and key_type.
// NOTE(review): field order and types define the struct layout -- presumably
// also the on-disk layout -- so do not reorder without bumping the format version.
typedef struct AutomatonData {
AutomatonKind kind;
KeysStore store;
KeyType key_type;
size_t words_count; // copied from automaton->count
int longest_word;
} AutomatonData;


// Header record: the 16-byte signature followed by the automaton settings.
// Filled by custompickle_initialize_header(), checked by
// custompickle_validate_header().
typedef struct CustompickleHeader {
char magick[16]; // CUSTOMPICKLE_MAGICK
AutomatonData data;
} CustompickleHeader;

// Footer record: a node counter followed by the 16-byte signature.
// Filled by custompickle_initialize_footer(); custompickle_validate_footer()
// checks only the signature.
typedef struct CustompickleFooter {
size_t nodes_count;
char magick[16]; // CUSTOMPICKLE_MAGICK
} CustompickleFooter;


// Fill `header` with the signature and the settings of `automaton`.
// Both pointers must be non-NULL.
void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton);

// Fill `footer` with `nodes_count` and the signature. `footer` must be non-NULL.
void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodes_count);

// Return non-zero when `header` has the expected signature and accepted
// kind/store/key_type values, zero otherwise.
int custompickle_validate_header(CustompickleHeader* header);

// Return non-zero when `footer` has the expected signature, zero otherwise.
int custompickle_validate_footer(CustompickleFooter* footer);

0 comments on commit 7f405d1

Please sign in to comment.