33import bpe_module .learn_BPE as learn_BPE
44import bpe_module .apply_BPE as apply_BPE
55
# Command-line interface: declare the options, parse them, and expose the
# parsed values as module-level names used by the rest of the script.
parser = argparse.ArgumentParser()
# -final_voca_size and -final_voca_threshold may not be passed together.
group = parser.add_mutually_exclusive_group()

parser.add_argument(
    '-train_path',
    help="Multiple documents path",
    required=True,
    nargs='+',
)
parser.add_argument(
    '-voca_out_path',
    help="Vocabulary_path",
    required=True,
)
parser.add_argument(
    '-bpe_out_path',
    help="Multile BPE_applied path",
    required=True,
    nargs='+',
)
# For faster training, words at or below a given frequency are left out of
# BPE learning.
parser.add_argument(
    '-train_voca_threshold',
    help="Vocabulary threshold(word frequency) for BPE learn (default 1)",
    type=int,
    default=1,
)
parser.add_argument(
    '-num_merges',
    help="# Merge",
    required=True,
    type=int,
)
parser.add_argument(
    '-multi_proc',
    help="# Process (default 1), (-1: use all process)",
    type=int,
    default=1,
)
group.add_argument(
    '-final_voca_size',
    help="Final voca size (default 0), Must use either -final_voca_size or -final_voca_threshold",
    type=int,
    default=0,
)
group.add_argument(
    '-final_voca_threshold',
    help="Final voca threshold(word frequency) (default 0), Must use either -final_voca_size or -final_voca_threshold. ",
    type=int,
    default=0,
)

args = parser.parse_args()

# Both cut-offs default to 0, so "both are 0" means no usable cut-off was
# supplied; parser.error() prints the usage message and exits.
if args.final_voca_size == 0 and args.final_voca_threshold == 0:
    parser.error("Must use either -final_voca_size or -final_voca_threshold.")

# argparse already converted the numeric options through type=int, so the
# values are used as-is without a second int() cast.
train_path = args.train_path
voca_out_path = args.voca_out_path
bpe_out_path = args.bpe_out_path
train_voca_threshold = args.train_voca_threshold
num_merges = args.num_merges
multi_proc = args.multi_proc
final_voca_size = args.final_voca_size
final_voca_threshold = args.final_voca_threshold

if multi_proc == -1:
    # -1 asks for every available CPU. os.cpu_count() returns None when the
    # count cannot be determined, so fall back to a single process.
    multi_proc = os.cpu_count() or 1
57102 voca_path = voca_out_path ,
58103 new_voca_path = voca_out_path ,
59104 final_voca_num = final_voca_size ,
105+ final_voca_threshold = final_voca_threshold ,
60106 space_symbol = '</w>'
61107 )
0 commit comments