-
Notifications
You must be signed in to change notification settings - Fork 7
/
prepro.py
80 lines (71 loc) · 2.94 KB
/
prepro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
#/usr/bin/python2
from __future__ import print_function
from hyperparams import Hyperparams as hp
import tensorflow as tf
import numpy as np
import codecs
import os, code
#import regex
from collections import Counter
def allNum(word):
    '''Tells whether a token consists solely of ASCII digits.
    Args:
      word: A string. The token to inspect.
    Returns:
      True if every character lies in the range '0'..'9' (an empty string
      therefore yields True), False otherwise.
    '''
    return all('0' <= ch <= '9' for ch in word)
def make_vocab_src(fpath, fname):
    '''Constructs the source vocabulary.

    Every input line is split on the `</d>` marker. Each resulting segment
    is split on a double tab; when a segment has more than one field, its
    second field is tokenized on whitespace and the tokens are counted.

    Args:
      fpath: A string. Input file path.
      fname: A string. Output file name.

    Writes vocabulary line by line to `preprocessed/fname`: first the four
    special tokens <PAD>, <UNK>, <S>, </S> with a count of 1000000000, then
    one `word<TAB>count` line per token, most frequent first. Tokens made up
    only of digits are left out.
    '''
    words = []
    # The context manager closes the corpus handle even if parsing fails;
    # iterating the handle avoids loading the whole file into memory.
    with codecs.open(fpath, 'r', 'utf-8') as fin:
        for line in fin:
            for content_emotion in line.split("</d>"):
                fields = content_emotion.split("\t\t")
                if len(fields) > 1:
                    words.extend(fields[1].split())
    word2cnt = Counter(words)
    if not os.path.exists('preprocessed'):
        os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        # most_common is called with an explicit length, as before, so the
        # ordering of equally frequent words stays exactly what it was.
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            if not allNum(word):
                fout.write(u"{}\t{}\n".format(word, cnt))
def make_vocab_tgt(fpath, fname, fname2):
    '''Constructs the target vocabulary and the speaker vocabulary.

    Every input line is split on the `</d>` marker. The text before the
    first marker, with whitespace runs collapsed to single spaces, is
    counted as a speaker entry; the text after the first marker (up to the
    next marker, if any) is tokenized on whitespace and counted as words.
    Lines that do not contain the marker (e.g. blank lines) are skipped.

    Args:
      fpath: A string. Input file path.
      fname: A string. Output file name for the word vocabulary.
      fname2: A string. Output file name for the speaker vocabulary.

    Writes the word vocabulary line by line to `preprocessed/fname`,
    preceded by the special tokens <PAD>, <UNK>, <S>, </S>, and the speaker
    vocabulary to `preprocessed/fname2`, preceded by the special entry
    <newer>. Special entries get a count of 1000000000; the remaining
    entries are written as `entry<TAB>count`, most frequent first. Entries
    made up only of digits (or empty) are left out.
    '''
    words = []
    speakers = []
    # The context manager closes the corpus handle even if parsing fails.
    with codecs.open(fpath, 'r', 'utf-8') as fin:
        for line in fin:
            parts = line.split(u"</d>")
            if len(parts) < 2:
                # No marker on this line: there is no word segment to read,
                # so skip it instead of failing with an IndexError.
                continue
            words.extend(parts[1].split())
            speakers.append(' '.join(parts[0].split()))
    word2cnt = Counter(words)
    speaker2cnt = Counter(speakers)
    if not os.path.exists('preprocessed'):
        os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        # Explicit length kept so ties are ordered exactly as before.
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            if not allNum(word):
                fout.write(u"{}\t{}\n".format(word, cnt))
    with codecs.open('preprocessed/{}'.format(fname2), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n".format("<newer>"))
        for word, cnt in speaker2cnt.most_common(len(speaker2cnt)):
            if not allNum(word):
                fout.write(u"{}\t{}\n".format(word, cnt))
if __name__ == '__main__':
    # Source side: word counts from the training source file go to
    # preprocessed/de.vocab.tsv.
    make_vocab_src(fpath=hp.source_train, fname="de.vocab.tsv")
    # Target side: word counts go to preprocessed/en.vocab.tsv and the
    # speaker counts to preprocessed/speaker.name.tsv.
    make_vocab_tgt(
        fpath=hp.target_train,
        fname="en.vocab.tsv",
        fname2="speaker.name.tsv",
    )
    print("Done")