-
Notifications
You must be signed in to change notification settings - Fork 1.8k
/
scorer.h
112 lines (84 loc) · 3.02 KB
/
scorer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#ifndef SCORER_H_
#define SCORER_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lm/enumerate_vocab.hh"
#include "lm/virtual_interface.hh"
#include "lm/word_index.hh"
#include "util/string_piece.hh"
#include "path_trie.h"
const double OOV_SCORE = -1000.0;
const std::string START_TOKEN = "<s>";
const std::string UNK_TOKEN = "<unk>";
const std::string END_TOKEN = "</s>";
// Implement a callback to retrive the dictionary of language model.
class RetriveStrEnumerateVocab : public lm::EnumerateVocab {
public:
RetriveStrEnumerateVocab() {}
void Add(lm::WordIndex index, const StringPiece &str) {
vocabulary.push_back(std::string(str.data(), str.length()));
}
std::vector<std::string> vocabulary;
};
/* External scorer to query score for n-gram or sentence, including language
* model scoring and word insertion.
*
* Example:
* Scorer scorer(alpha, beta, "path_of_language_model");
* scorer.get_log_cond_prob({ "WORD1", "WORD2", "WORD3" });
* scorer.get_sent_log_prob({ "WORD1", "WORD2", "WORD3" });
*/
class Scorer {
public:
Scorer(double alpha,
double beta,
const std::string &lm_path,
const std::vector<std::string> &vocabulary);
~Scorer();
double get_log_cond_prob(const std::vector<std::string> &words);
double get_sent_log_prob(const std::vector<std::string> &words);
// return the max order
size_t get_max_order() const { return max_order_; }
// return the dictionary size of language model
size_t get_dict_size() const { return dict_size_; }
// retrun true if the language model is character based
bool is_character_based() const { return is_character_based_; }
// reset params alpha & beta
void reset_params(float alpha, float beta);
// make ngram for a given prefix
std::vector<std::string> make_ngram(PathTrie *prefix);
// trransform the labels in index to the vector of words (word based lm) or
// the vector of characters (character based lm)
std::vector<std::string> split_labels(const std::vector<int> &labels);
// language model weight
double alpha;
// word insertion weight
double beta;
// pointer to the dictionary of FST
void *dictionary;
protected:
// necessary setup: load language model, set char map, fill FST's dictionary
void setup(const std::string &lm_path,
const std::vector<std::string> &vocab_list);
// load language model from given path
void load_lm(const std::string &lm_path);
// fill dictionary for FST
void fill_dictionary(bool add_space);
// set char map
void set_char_map(const std::vector<std::string> &char_list);
double get_log_prob(const std::vector<std::string> &words);
// translate the vector in index to string
std::string vec2str(const std::vector<int> &input);
private:
void *language_model_;
bool is_character_based_;
size_t max_order_;
size_t dict_size_;
int SPACE_ID_;
std::vector<std::string> char_list_;
std::unordered_map<std::string, int> char_map_;
std::vector<std::string> vocabulary_;
};
#endif // SCORER_H_