Skip to content

Commit

Permalink
Merge pull request rime#661 from WhiredPlanck/memory
Browse files Browse the repository at this point in the history
Tweak to Save Memory Consumption When Compiling Dictionaries
  • Loading branch information
lotem committed Jun 11, 2023
2 parents 77e8a5c + c616687 commit 1c43fe5
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -12,3 +12,4 @@ env.bat
node_modules/
*~
.*.swp
.cache/
14 changes: 8 additions & 6 deletions src/rime/dict/dict_compiler.cc
Expand Up @@ -236,22 +236,24 @@ bool DictCompiler::BuildTable(int table_index,
for (const auto& s : collector.syllabary) {
syllable_to_id[s] = syllable_id++;
}
for (RawDictEntry& r : collector.entries) {
for (const auto& r : collector.entries) {
Code code;
for (const auto& s : r.raw_code) {
for (const auto& s : r->raw_code) {
code.push_back(syllable_to_id[s]);
}
DictEntryList* ls = vocabulary.LocateEntries(code);
auto ls = vocabulary.LocateEntries(code);
if (!ls) {
LOG(ERROR) << "Error locating entries in vocabulary.";
continue;
}
auto e = New<DictEntry>();
auto e = New<ShortDictEntry>();
e->code.swap(code);
e->text.swap(r.text);
e->weight = log(r.weight > 0 ? r.weight : DBL_EPSILON);
e->text.swap(r->text);
e->weight = log(r->weight > 0 ? r->weight : DBL_EPSILON);
ls->push_back(e);
}
// release memory in time to reduce peak memory usage
vector<of<RawDictEntry>>().swap(collector.entries);
if (settings->sort_order() != "original") {
vocabulary.SortHomophones();
}
Expand Down
10 changes: 5 additions & 5 deletions src/rime/dict/entry_collector.cc
Expand Up @@ -206,7 +206,7 @@ void EntryCollector::CreateEntry(const string &word,
words[e.text][code_str] += e.weight;
total_weight[e.text] += e.weight;
}
entries.push_back(e);
entries.emplace_back(New<RawDictEntry>(e));
++num_entries;
}

Expand Down Expand Up @@ -240,10 +240,10 @@ void EntryCollector::Dump(const string& file_name) const {
out << "# - " << syllable << std::endl;
}
out << std::endl;
for (const RawDictEntry& e : entries) {
out << e.text << '\t'
<< e.raw_code.ToString() << '\t'
<< e.weight << std::endl;
for (const auto &e : entries) {
out << e->text << '\t'
<< e->raw_code.ToString() << '\t'
<< e->weight << std::endl;
}
out.close();
}
Expand Down
2 changes: 1 addition & 1 deletion src/rime/dict/entry_collector.h
Expand Up @@ -35,7 +35,7 @@ class EntryCollector : public PhraseCollector {
public:
Syllabary syllabary;
bool build_syllabary = true;
vector<RawDictEntry> entries;
vector<of<RawDictEntry>> entries;
size_t num_entries = 0;
ReverseLookupTable stems;

Expand Down
6 changes: 3 additions & 3 deletions src/rime/dict/table.cc
Expand Up @@ -518,7 +518,7 @@ table::TailIndex* Table::BuildTailIndex(const Code& prefix,
return index;
}

Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
Array<table::Entry>* Table::BuildEntryArray(const ShortDictEntryList& entries) {
auto array = CreateArray<table::Entry>(entries.size());
if (!array) {
return NULL;
Expand All @@ -531,7 +531,7 @@ Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
return array;
}

bool Table::BuildEntryList(const DictEntryList& src,
bool Table::BuildEntryList(const ShortDictEntryList& src,
List<table::Entry>* dest) {
if (!dest)
return false;
Expand All @@ -549,7 +549,7 @@ bool Table::BuildEntryList(const DictEntryList& src,
return true;
}

bool Table::BuildEntry(const DictEntry& dict_entry, table::Entry* entry) {
bool Table::BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry) {
if (!entry)
return false;
if (!AddString(dict_entry.text, &entry->text, dict_entry.weight)) {
Expand Down
6 changes: 3 additions & 3 deletions src/rime/dict/table.h
Expand Up @@ -166,9 +166,9 @@ class Table : public MappedFile {
const Vocabulary& vocabulary);
bool BuildPhraseIndex(Code code, const Vocabulary& vocabulary,
map<string, int>* index_data);
Array<table::Entry>* BuildEntryArray(const DictEntryList& entries);
bool BuildEntryList(const DictEntryList& src, List<table::Entry>* dest);
bool BuildEntry(const DictEntry& dict_entry, table::Entry* entry);
Array<table::Entry>* BuildEntryArray(const ShortDictEntryList& entries);
bool BuildEntryList(const ShortDictEntryList& src, List<table::Entry>* dest);
bool BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry);

string GetString(const table::StringType& x);
bool AddString(const string& src, table::StringType* dest,
Expand Down
45 changes: 38 additions & 7 deletions src/rime/dict/vocabulary.cc
Expand Up @@ -5,6 +5,7 @@
// 2011-07-24 GONG Chen <chen.sst@gmail.com>
//
#include <algorithm>
#include <iterator>
#include <sstream>
#include <utility>
#include <rime/dict/vocabulary.h>
Expand Down Expand Up @@ -59,6 +60,18 @@ string Code::ToString() const {
return stream.str();
}

// Produces the compact ShortDictEntry counterpart of this entry by copying
// its text, code and weight; all other DictEntry fields are dropped.
//
// Deliberately not marked `inline`: the member is declared in the header but
// defined only in this .cc file, and an inline function has to be defined in
// every translation unit that uses it, so callers in other files could be
// left without a definition at link time.
ShortDictEntry DictEntry::ToShort() const {
  // Member-wise assignment instead of `{text, code, weight}`: the brace form
  // relies on ShortDictEntry being an aggregate, which no longer holds in
  // C++20 for a class with a user-declared (even defaulted) constructor.
  ShortDictEntry result;
  result.text = text;
  result.code = code;
  result.weight = weight;
  return result;
}

// Strict ordering used when sorting entries that share the same code:
// the entry with the larger weight comes first.
// Entries of equal weight compare as equivalent in both directions; there is
// intentionally no tie-break on text, which spares a string comparison.
bool ShortDictEntry::operator<(const ShortDictEntry& other) const {
  return weight > other.weight;
}

bool DictEntry::operator< (const DictEntry& other) const {
// Sort different entries sharing the same code by weight desc.
if (weight != other.weight)
Expand All @@ -72,16 +85,34 @@ inline bool dereference_less(const T& a, const T& b) {
return *a < *b;
}

template <typename C>
inline void sort(C &container) {
std::sort(std::begin(container), std::end(container), dereference_less<typename C::value_type>);
}

// Sorts up to `count` elements of `container`, beginning at index `start`,
// ordering pointer-like elements by the values they point to.
//   container  random-access container of dereferenceable elements
//   start      index of the first element to sort; if it is at or past the
//              end of the container, nothing is done
//   count      number of elements to sort; clamped to what remains after
//              `start`, so an over-large value simply sorts to the end
template <typename C>
inline void sort_range(C& container, size_t start, size_t count) {
  const size_t total = container.size();
  if (start >= total)
    return;
  using Element = typename C::value_type;
  // Clamp against the remaining length instead of testing `start + count`,
  // which wraps around for very large counts and would yield an iterator
  // outside the container.
  const size_t length = std::min(count, total - start);
  const auto first = std::begin(container) + start;
  std::sort(first, first + length,
            [](const Element& a, const Element& b) { return *a < *b; });
}

// Sorts the whole list in place. Delegates to the file-level sort() helper,
// which compares the pointed-to ShortDictEntry objects with their operator<
// (heavier weight first).
void ShortDictEntryList::Sort() {
sort(*this);
}

// Sorts in place the `count` entries starting at index `start`. Delegates to
// the file-level sort_range() helper, which does nothing when `start` is at
// or past the end and stops at the end of the list when the range overshoots.
void ShortDictEntryList::SortRange(size_t start, size_t count) {
sort_range(*this, start, count);
}

void DictEntryList::Sort() {
std::sort(begin(), end(), dereference_less<DictEntryList::value_type>);
sort(*this);
}

void DictEntryList::SortRange(size_t start, size_t count) {
if (start >= size())
return;
iterator i(begin() + start);
iterator j(start + count >= size() ? end() : i + count);
std::sort(i, j, dereference_less<DictEntryList::value_type>);
sort_range(*this, start, count);
}

void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
Expand All @@ -96,7 +127,7 @@ void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
}
}

DictEntryList* Vocabulary::LocateEntries(const Code& code) {
ShortDictEntryList* Vocabulary::LocateEntries(const Code& code) {
Vocabulary* v = this;
size_t n = code.size();
for (size_t i = 0; i < n; ++i) {
Expand Down
24 changes: 20 additions & 4 deletions src/rime/dict/vocabulary.h
Expand Up @@ -30,20 +30,36 @@ class Code : public vector<SyllableId> {
string ToString() const;
};

// Reduced form of DictEntry that keeps only text, code and weight.
// It is the element type stored in VocabularyPage::entries.
// NOTE(review): the member order appears load-bearing, since
// DictEntry::ToShort() brace-initializes this struct as {text, code, weight};
// keep the two in sync when reordering.
struct ShortDictEntry {
string text;
Code code; // multi-syllable code from prism
double weight = 0.0;

ShortDictEntry() = default;
// Orders by weight, larger weight first; equal weights compare equivalent.
bool operator< (const ShortDictEntry& other) const;
};

struct DictEntry {
string text;
string comment;
string preedit;
double weight = 0.0;
int commit_count = 0;
Code code; // multi-syllable code from prism
string custom_code; // user defined code
double weight = 0.0;
int commit_count = 0;
int remaining_code_length = 0;

DictEntry() = default;
ShortDictEntry ToShort() const;
bool operator< (const DictEntry& other) const;
};

// A vector of ShortDictEntry handles with sorting helpers.
// NOTE(review): `of<>` is a project alias not shown here; presumably a
// shared-pointer type. Confirm against its definition.
class ShortDictEntryList : public vector<of<ShortDictEntry>> {
public:
// Sorts all entries by comparing the pointed-to ShortDictEntry objects.
void Sort();
// Sorts the `count` entries starting at index `start`; a start index at or
// past the end is a no-op.
void SortRange(size_t start, size_t count);
};

class DictEntryList : public vector<of<DictEntry>> {
public:
void Sort();
Expand All @@ -64,13 +80,13 @@ class DictEntryFilterBinder {
class Vocabulary;

struct VocabularyPage {
DictEntryList entries;
ShortDictEntryList entries;
an<Vocabulary> next_level;
};

class Vocabulary : public map<int, VocabularyPage> {
public:
DictEntryList* LocateEntries(const Code& code);
ShortDictEntryList* LocateEntries(const Code& code);
void SortHomophones();
};

Expand Down
18 changes: 9 additions & 9 deletions test/table_test.cc
Expand Up @@ -44,34 +44,34 @@ rime::the<rime::Table> RimeTableTest::table_;

void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
rime::Vocabulary& voc) {
auto d = rime::New<rime::DictEntry>();
auto d = rime::New<rime::ShortDictEntry>();
syll.insert("0");
// no entries for '0', however
syll.insert("1");
d->code.push_back(1);
d->text = "yi";
d->weight = 1.0;
voc[1].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("2");
d->code.back() = 2;
d->text = "er";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "liang";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "lia";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("3");
d->code.back() = 3;
d->text = "san";
voc[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "sa";
voc[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("4");
auto lv2 = rime::New<rime::Vocabulary>();
voc[1].next_level = lv2;
Expand All @@ -84,11 +84,11 @@ void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
d->code.push_back(3);
d->text = "yi-er-san";
(*lv3)[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->code.push_back(4);
d->text = "yi-er-san-si";
(*lv4)[-1].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->code.resize(3);
d->code.push_back(2);
d->code.push_back(1);
Expand Down

0 comments on commit 1c43fe5

Please sign in to comment.