Skip to content

Commit 435ca97

Browse files
committed
argparse
1 parent 0084b62 commit 435ca97

File tree

4 files changed

+84
-28
lines changed

4 files changed

+84
-28
lines changed

README.md

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,22 @@ Byte Pair Encoding (BPE)
1515
* learn BPE from document
1616
```
1717
python bpe_learn.py
18-
-train_path 1_document 2_document ... K_document
19-
-voca_out_path voca_path/voca_file_name
20-
-bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document
21-
-train_voca_threshold 1
22-
-final_voca_size 30000
23-
-num_merges 30000
24-
-multi_proc=-1
25-
26-
multi_proc: -1(use all process), 1(not use)
18+
-train_path 1_document 2_document ... K_document
19+
-voca_out_path voca_path/voca_file_name
20+
-bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document
21+
-train_voca_threshold 1
22+
-num_merges 30000
23+
-multi_proc=-1 (-1: use all processes, 1: do not use multiprocessing)
24+
-final_voca_size 30000 or -final_voca_threshold 50
2725
```
2826

2927
* apply BPE to document
3028
```
3129
python bpe_apply.py
32-
-data_path 1_document 2_document ... K_document
33-
-voca_path voca_path/voca_file_name
34-
-bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document
30+
-data_path 1_document 2_document ... K_document
31+
-voca_path voca_path/voca_file_name
32+
-bpe_out_path 1_BPE_document 2_BPE_document ... K_BPE_document
3533
```
3634

37-
## dataset/
38-
* WMT17 example: http://data.statmt.org/wmt17/translation-task/preprocessed/
39-
4035
## Reference
4136
* https://lovit.github.io/nlp/2018/04/02/wpm/

bpe_apply.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,23 @@
33
import bpe_module.apply_BPE as apply_BPE
44

55
parser = argparse.ArgumentParser(description='file path')
6-
parser.add_argument('-data_path', required=True, nargs='+')
7-
parser.add_argument('-voca_path', required=True)
8-
parser.add_argument('-bpe_out_path', required=True, nargs='+')
6+
parser.add_argument(
7+
'-data_path',
8+
help="Multiple documents path",
9+
required=True,
10+
nargs='+'
11+
)
12+
parser.add_argument(
13+
'-voca_path',
14+
help="Vocabulary for BPE apply",
15+
required=True
16+
)
17+
parser.add_argument(
18+
'-bpe_out_path',
19+
help="Multile BPE_applied path",
20+
required=True,
21+
nargs='+'
22+
)
923

1024
args = parser.parse_args()
1125

bpe_learn.py

Lines changed: 55 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,69 @@
33
import bpe_module.learn_BPE as learn_BPE
44
import bpe_module.apply_BPE as apply_BPE
55

6-
parser = argparse.ArgumentParser(description='file path')
7-
parser.add_argument('-train_path', required=True, nargs='+')
8-
parser.add_argument('-voca_out_path', required=True)
9-
parser.add_argument('-bpe_out_path', required=True, nargs='+')
10-
parser.add_argument('-train_voca_threshold', required=True) # 빠른 학습을 위해 일정 빈도수 이하의 단어는 bpe learn에 참여시키지 않음.
11-
parser.add_argument('-final_voca_size', required=True)
12-
parser.add_argument('-num_merges', required=True)
13-
parser.add_argument('-multi_proc', required=True)
6+
parser = argparse.ArgumentParser()
7+
group = parser.add_mutually_exclusive_group()
148

9+
parser.add_argument(
10+
'-train_path',
11+
help="Multiple documents path",
12+
required=True,
13+
nargs='+'
14+
)
15+
parser.add_argument(
16+
'-voca_out_path',
17+
help="Vocabulary_path",
18+
required=True
19+
)
20+
parser.add_argument(
21+
'-bpe_out_path',
22+
help="Multile BPE_applied path",
23+
required=True,
24+
nargs='+'
25+
)
26+
parser.add_argument(
27+
'-train_voca_threshold',
28+
help="Vocabulary threshold(word frequency) for BPE learn (default 1)",
29+
type=int,
30+
default=1
31+
) # To speed up training, words at or below a given frequency are excluded from BPE learning.
32+
parser.add_argument(
33+
'-num_merges',
34+
help="# Merge",
35+
required=True,
36+
type=int
37+
)
38+
parser.add_argument(
39+
'-multi_proc',
40+
help="# Process (default 1), (-1: use all process)",
41+
type=int,
42+
default=1
43+
)
44+
group.add_argument(
45+
'-final_voca_size',
46+
help="Final voca size (default 0), Must use either -final_voca_size or -final_voca_threshold",
47+
type=int,
48+
default=0
49+
)
50+
group.add_argument(
51+
'-final_voca_threshold',
52+
help="Final voca threshold(word frequency) (default 0), Must use either -final_voca_size or -final_voca_threshold. ",
53+
type=int,
54+
default=0
55+
)
1556
args = parser.parse_args()
57+
if args.final_voca_size == 0 and args.final_voca_threshold == 0:
58+
parser.error("Must use either -final_voca_size or -final_voca_threshold.")
59+
1660

1761
train_path = args.train_path
1862
voca_out_path = args.voca_out_path
1963
bpe_out_path = args.bpe_out_path
2064
train_voca_threshold = int(args.train_voca_threshold)
21-
final_voca_size = int(args.final_voca_size)
2265
num_merges = int(args.num_merges)
2366
multi_proc = int(args.multi_proc)
67+
final_voca_size = int(args.final_voca_size)
68+
final_voca_threshold = int(args.final_voca_threshold)
2469

2570
if multi_proc == -1:
2671
multi_proc = os.cpu_count()
@@ -57,5 +102,6 @@
57102
voca_path=voca_out_path,
58103
new_voca_path=voca_out_path,
59104
final_voca_num=final_voca_size,
105+
final_voca_threshold=final_voca_threshold,
60106
space_symbol='</w>'
61107
)

bpe_module/apply_BPE.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def _apply_bpe(path, out_path, space_symbol='</w>', sorted_voca={}):
9898
o.close()
9999

100100

101-
def apply_bpe(path_list, out_list, voca_path, new_voca_path=None, final_voca_threshold=1, final_voca_num=None, space_symbol='</w>'):
101+
def apply_bpe(path_list, out_list, voca_path, new_voca_path=None, final_voca_threshold=1, final_voca_num=0, space_symbol='</w>'):
102102
# final_voca_threshold: threshold a vocabulary entry must meet to be included in the final vocabulary
103103
print('apply bpe')
104104

@@ -139,6 +139,7 @@ def apply_bpe(path_list, out_list, voca_path, new_voca_path=None, final_voca_thr
139139
new_sorted_voca = get_vocabulary(bpe_path_list)[:final_voca_num]
140140
else:
141141
new_sorted_voca = get_vocabulary(bpe_path_list)
142+
new_sorted_voca = [(word, int(freq)) for (word, freq) in new_sorted_voca if int(freq) >= final_voca_threshold]
142143

143144
save_voca(new_voca_path, new_sorted_voca)
144145
print(new_voca_path, "data size:", len(new_sorted_voca))

0 commit comments

Comments
 (0)