## 前缀树

### 工具

#### 定义对象和方法

In [1]:
"前缀树"
Base.@kwdef mutable struct PrefixTree
    isend::Bool = false
    children = Dict{Char,PrefixTree}()
end

"给前缀树增加单词"
function add_node!(node::PrefixTree, word::String)::Nothing
    for c in word
        children = node.children
        haskey(children, c) || (children[c] = PrefixTree())
        node = children[c]
    end
    node.isend = true
    return nothing
end

"在字符串里匹配字典单词"
function search_valid_word(node::PrefixTree, word::String)
    res, n, word = String[], length(word), collect(word)
    for i in 1:n
        # 检索 word[i:end]
        dict = node
        for j in i:n
            haskey(dict.children, word[j]) || break # 不存在到该位置的路径
            dict = dict.children[word[j]] # 切换到该节点
            dict.isend && push!(res, join(word[i:j]))
        end
    end
    res
end

search_valid_word

#### 测试

In [2]:
# Small hand-picked vocabulary used to smoke-test the trie.
words = [
    "伊格尔·茨维塔诺维奇",
    "克罗地亚",
    "伊万·哈谢克",
    "伊莎貝拉_(帕爾馬郡主)",
    "克里斯·麦克尼尔利",
    "克罗地亚王国",
    "克罗泽群岛",
    "伊格内修斯·库图·阿昌庞",
];
dict_zh = PrefixTree()
# Insert every vocabulary entry into the freshly created trie.
foreach(entry -> add_node!(dict_zh, entry), words)

In [3]:
# Sample question containing two of the dictionary words inserted above.
question = "克罗地亚足球运动员伊格尔·茨维塔诺维奇的出生国家西南边的地方叫什么"
search_valid_word(dict_zh, question)

2-element Vector{String}:
 "克罗地亚"
 "伊格尔·茨维塔诺维奇"

### 读取数据

In [4]:
;cd ../data

/home/rex/work_space/7 others/ccks/CCKS-mKGQA/data


In [32]:
# Chinese triples, entities and relations.
# `read(path, String)` opens and closes the file itself, so no handle is left open.
txts = rstrip(read("extract/triple_zh.txt", String))
# One tab-separated (head, relation, tail) triple per line.
zh_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
# Only head entities (first column) are collected; to include tail entities too:
# zh_objs = unique!(vcat(first.(zh_triples), last.(zh_triples)))
zh_objs = unique(first.(zh_triples))
zh_rels = unique(triple[2] for triple in zh_triples)

# English triples, entities and relations (same layout as the Chinese file).
txts = rstrip(read("extract/triple_en.txt", String))
en_triples = [NTuple{3,String}(split(txt, '\t')) for txt in split(txts, '\n')]
# en_objs = unique!(vcat(first.(en_triples), last.(en_triples)))
en_objs = unique(first.(en_triples))
en_rels = unique(triple[2] for triple in en_triples)

# ILLs alignment file: each line is a tab-separated pair, stored as first column => second.
txts = rstrip(read("extract/ILLs(zh-en).txt", String))
ILLs = Dict{String,String}(split(txt, '\t') for txt in split(txts, '\n'))

# Training data: question text => the 4-field tuples listed under it.
train_data = Dict{String, Vector{NTuple{4, String}}}()
open("extract/train_data.txt", "r") do io
    # NOTE(review): 14077 is a hard-coded record count — presumably the number of
    # records in this particular file; confirm if the file changes.
    for _ in 1:14077
        que = readline(io)
        # The character at index 2 of the header line is parsed as the number of
        # tuple lines that follow; the question text starts at index 5.
        # NOTE(review): this assumes a single-digit count and a 4-byte ASCII prefix — confirm.
        ind, que = parse(Int, que[2]), que[5:end]
        train_data[que] = [Tuple(split(readline(io), '\t')) for _ in 1:ind]
        readline(io)  # skip the line that follows each record (presumably a separator)
    end
end

### 字典树初始化及测试

In [50]:
# Chinese vocabulary: the non-ASCII head entities of the Chinese triples plus the
# values of the ILLs alignment table, with underscores turned into spaces.
zh_words = map(
    w -> replace(w, '_' => ' '),
    vcat(filter(!isascii, zh_objs), collect(values(ILLs))),
)
# English vocabulary: the ASCII head entities of both triple sets plus the keys of
# the ILLs alignment table, with underscores turned into spaces.
en_words = map(
    w -> replace(w, '_' => ' '),
    vcat(filter(isascii, zh_objs), filter(isascii, en_objs), collect(keys(ILLs))),
)

84157-element Vector{String}:
 "Microsoft Office"
 "Apple TV"
 "Eminem"
 "PS/2"
 "WNBA"
 "Eidos"
 "EMac"
 "Let It Be"
 "HTC Touch Pro"
 "E.ON"
 "Bunga Citra Lestari"
 "HTC Touch Pro2"
 "Vodacom"
 ⋮
 "Paragonah, Utah"
 "Hoyos de Miguel Muñoz"
 "Super Star (S.H.E album)"
 "Lampedusa"
 "Czech Technical University in Prague"
 "Parowan, Utah"
 "Sun Wensheng"
 "Burgin, Kentucky"
 "Tapley Seaton"
 "Salva Kiir Mayardit"
 "Digital container format"
 "Menard, Texas"

In [46]:
# Build one prefix tree per vocabulary.
dict_zh = PrefixTree()
foreach(entry -> add_node!(dict_zh, entry), zh_words)
dict_en = PrefixTree()
foreach(entry -> add_node!(dict_en, entry), en_words)

In [47]:
# "出生" (birth) occurs in the data as a tail entity, e.g. in the triple:
# 馬紹爾城	settlementType	出生
# The vocabulary is built from head entities only, which is presumably why it is
# not among the matches below.
question = "克罗地亚足球运动员伊格尔·茨维塔诺维奇的出生国家西南边的地方叫什么"
search_valid_word(dict_zh, question)

2-element Vector{String}:
 "克罗地亚"
 "伊格尔·茨维塔诺维奇"

In [48]:
# English question with an embedded Chinese entity, searched with the Chinese trie.
search_valid_word(dict_zh, "which format does the cause of 夜櫻作戰, a japanese biological warfare plan during world war ii, belong to?")

1-element Vector{String}:
 "夜櫻作戰"

In [49]:
# French question with an embedded English entity, searched with the English trie.
# Dictionary words nested inside a longer match are reported as well.
question = "Savez-vous qui est l’héritier de Le chef de Second Czechoslovak Republic, une république de 1938-1939 en europe centrale/orientale?"
search_valid_word(dict_en, question)

2-element Vector{String}:
 "Second Czechoslovak Republic"
 "Republic"

### 实战

In [52]:
# Questions paired with an entity string: one tab-separated (question, entity) pair per line.
# `read(path, String)` opens and closes the file itself, so no handle is left open.
txts = rstrip(read("extract/train_questions.txt", String))
NER_data = [NTuple{2,String}(split(txt, '\t')) for txt in split(txts, '\n')]

13770-element Vector{Tuple{String, String}}:
 ("who is the developer of the sponsor of Call of Duty World League?", "Call_of_Duty_World_League")
 ("where did the one that is after election of United States Senate special election in Massachusetts, 1962, a wahl, study?", "United_States_Senate_special_election_in_Massachusetts,_1962")
 ("where does the event involved in Kingdom of Slavonia occur?", "Kingdom_of_Slavonia")
 ("Daratt的所属国家的立法机构属于哪个王朝", "Daratt")
 ("Où Le descendant de 易卜拉欣帕夏, an ottoman politician and general (1789-1848), est-il mort?", "易卜拉欣帕夏")
 ("who is the spouse of the leader of Singaporean general election, 2015?", "Singaporean_general_election,_2015")
 ("who managed the tenant of the stadium that Costa Rica national football team 2010, a sportseizoen van een voetbalcompetitie, takes place?", "Costa_Rica_national_football_team_2010")
 ("which country does the alma mater of Wang Shu belong to?", "Wang_Shu")
 ("which religion does the one that is after election of Guinea

In [54]:
# For each question, keep the first unambiguous trie hit: the Chinese trie is
# consulted first and the English trie only if the Chinese one did not return
# exactly one candidate. Questions with no unambiguous hit are dropped.
sols = NTuple{2,String}[]
for (que, _) in NER_data
    for trie in (dict_zh, dict_en)
        hits = search_valid_word(trie, que)
        if length(hits) == 1
            push!(sols, (que, first(hits)))
            break
        end
    end
end

In [58]:
# Example of a false positive: matching starts at any character offset and has no
# word-boundary check, so a dictionary word that is a prefix of a longer token in
# the question is reported (here "Dara" out of "Daratt").
search_valid_word(dict_zh, "Daratt的所属国家的立法机构属于哪个王朝")

1-element Vector{String}:
 "Dara"

In [59]:
# Number of questions for which one of the tries returned exactly one candidate.
length(sols)

7980