-
Notifications
You must be signed in to change notification settings - Fork 52
/
cleaners.py
176 lines (152 loc) · 7.2 KB
/
cleaners.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import re
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
# from text.sanskrit import devanagari_to_ipa
# from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
# from text.thai import num_to_thai, latin_to_thai
# from text.shanghainese import shanghainese_to_ipa
# from text.cantonese import cantonese_to_ipa
# from text.ngu_dialect import ngu_dialect_to_ipa
def japanese_cleaners(text):
    '''Pipeline for Japanese text.

    Passes `text` through `japanese_to_romaji_with_accent` and appends a
    '.' when the converted string ends in an ASCII letter (A-Z or a-z).
    An empty conversion result is returned as-is.
    '''
    text = japanese_to_romaji_with_accent(text)
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match('[A-Za-z]', text[-1]):
        text += '.'
    return text
def japanese_cleaners2(text):
    '''Variant of `japanese_cleaners` with two extra substitutions.

    Runs `japanese_cleaners` on `text`, then rewrites every 'ts' as 'ʦ'
    and, after that, every '...' as '…'.
    '''
    romaji = japanese_cleaners(text)
    romaji = romaji.replace('ts', 'ʦ')
    return romaji.replace('...', '…')
def korean_cleaners(text):
    '''Pipeline for Korean text.

    Applies `latin_to_hangul`, `number_to_hangul` and `divide_hangul`, in
    that order, then appends a '.' when the result ends in a character in
    the range U+3131-U+3163. An empty result is returned as-is.

    NOTE(review): the three hangul helpers are not imported in the import
    block visible at the top of this file -- confirm where they are
    expected to come from.
    '''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    text = divide_hangul(text)
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match('[\u3131-\u3163]', text[-1]):
        text += '.'
    return text
def chinese_cleaners(text):
    '''Pipeline for Chinese text.

    Applies `number_to_chinese`, `chinese_to_bopomofo` and
    `latin_to_bopomofo`, in that order, then appends '。' when the result
    ends in one of the marks ˉ ˊ ˇ ˋ ˙. An empty result is returned as-is.
    '''
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match('[ˉˊˇˋ˙]', text[-1]):
        text += '。'
    return text
def zh_ja_mixture_cleaners(text):
    '''Pipeline for text mixing tagged Chinese and Japanese segments.

    Segments written as [ZH]...[ZH] are converted with `chinese_to_romaji`.
    Segments written as [JA]...[JA] are converted with
    `japanese_to_romaji_with_accent`, followed by the substitutions
    'ts' -> 'ʦ', 'u' -> 'ɯ' and '...' -> '…'. Every converted segment is
    followed by one separator space. Text outside the tags is left as-is.

    Finally, one trailing space (if present) is removed and a '.' is
    appended when the result ends in A-Z, a-z or one of ɯ ɹ ə ɥ → ↓ ↑.
    An empty result is returned as-is.
    '''
    # Collect all tagged segments up front, before any replacement is made.
    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    for chinese_text in chinese_texts:
        # [4:-4] strips the four-character opening and closing tags.
        cleaned_text = chinese_to_romaji(chinese_text[4:-4])
        text = text.replace(chinese_text, cleaned_text+' ', 1)
    for japanese_text in japanese_texts:
        cleaned_text = japanese_to_romaji_with_accent(
            japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
        text = text.replace(japanese_text, cleaned_text+' ', 1)
    # Remove only the separator space left by a final tagged segment. An
    # unconditional text[:-1] would cut a real character off input that ends
    # in untagged text.
    if text.endswith(' '):
        text = text[:-1]
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
        text += '.'
    return text
def sanskrit_cleaners(text):
    '''Pipeline for Sanskrit text.

    Replaces every '॥' with '।' and every 'ॐ' with 'ओम्', then appends
    ' ।' unless the text already ends in '।'. An empty string is returned
    as-is.
    '''
    text = text.replace('॥', '।').replace('ॐ', 'ओम्')
    # endswith() is safe on '', where text[-1] would raise IndexError; the
    # `text and` check keeps empty input empty instead of emitting a lone
    # terminator.
    if text and not text.endswith('।'):
        text += ' ।'
    return text
def cjks_cleaners(text):
    '''Pipeline for text mixing tagged ZH / JA / KO / SA / EN segments.

    Each [XX]...[XX] segment is replaced by its converted form followed by
    one separator space, using:

    * [ZH] -> `chinese_to_lazy_ipa`
    * [JA] -> `japanese_to_ipa`
    * [KO] -> `korean_to_lazy_ipa`
    * [SA] -> `devanagari_to_ipa`
    * [EN] -> `english_to_lazy_ipa`

    Text outside the tags is left as-is. Finally, one trailing space (if
    present) is removed and a '.' is appended unless the result already
    ends in one of . , ! ? - … ~. An empty result is returned as-is.

    NOTE(review): `korean_to_lazy_ipa` and `devanagari_to_ipa` are not
    imported in the import block visible at the top of this file (the
    sanskrit import is commented out) -- confirm before relying on the
    [KO] and [SA] branches.
    '''
    # Collect all tagged segments up front, before any replacement is made.
    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
    sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
    for chinese_text in chinese_texts:
        # [4:-4] strips the four-character opening and closing tags.
        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
        text = text.replace(chinese_text, cleaned_text+' ', 1)
    for japanese_text in japanese_texts:
        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
        text = text.replace(japanese_text, cleaned_text+' ', 1)
    for korean_text in korean_texts:
        cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
        text = text.replace(korean_text, cleaned_text+' ', 1)
    for sanskrit_text in sanskrit_texts:
        cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
        text = text.replace(sanskrit_text, cleaned_text+' ', 1)
    for english_text in english_texts:
        cleaned_text = english_to_lazy_ipa(english_text[4:-4])
        text = text.replace(english_text, cleaned_text+' ', 1)
    # Remove only the separator space left by a final tagged segment. An
    # unconditional text[:-1] would cut a real character off input that ends
    # in untagged text.
    if text.endswith(' '):
        text = text[:-1]
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def cjke_cleaners(text):
    '''Pipeline for text mixing tagged ZH / JA / KO / EN segments.

    Each [XX]...[XX] segment is replaced by its converted form followed by
    one separator space, using:

    * [ZH] -> `chinese_to_lazy_ipa`, then 'ʧ' -> 'tʃ', 'ʦ' -> 'ts',
      'ɥan' -> 'ɥæn'
    * [JA] -> `japanese_to_ipa`, then 'ʧ' -> 'tʃ', 'ʦ' -> 'ts',
      'ɥan' -> 'ɥæn', 'ʥ' -> 'dz'
    * [KO] -> `korean_to_ipa`
    * [EN] -> `english_to_ipa2`, then 'ɑ' -> 'a', 'ɔ' -> 'o', 'ɛ' -> 'e',
      'ɪ' -> 'i', 'ʊ' -> 'u'

    Text outside the tags is left as-is. Finally, one trailing space (if
    present) is removed and a '.' is appended unless the result already
    ends in one of . , ! ? - … ~. An empty result is returned as-is.

    NOTE(review): `korean_to_ipa` is not imported in the import block
    visible at the top of this file -- confirm before relying on the [KO]
    branch.
    '''
    # Collect all tagged segments up front, before any replacement is made.
    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
    for chinese_text in chinese_texts:
        # [4:-4] strips the four-character opening and closing tags.
        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
        cleaned_text = cleaned_text.replace(
            'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
        text = text.replace(chinese_text, cleaned_text+' ', 1)
    for japanese_text in japanese_texts:
        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
        cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
            'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
        text = text.replace(japanese_text, cleaned_text+' ', 1)
    for korean_text in korean_texts:
        cleaned_text = korean_to_ipa(korean_text[4:-4])
        text = text.replace(korean_text, cleaned_text+' ', 1)
    for english_text in english_texts:
        cleaned_text = english_to_ipa2(english_text[4:-4])
        cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
            'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
        text = text.replace(english_text, cleaned_text+' ', 1)
    # Remove only the separator space left by a final tagged segment. An
    # unconditional text[:-1] would cut a real character off input that ends
    # in untagged text.
    if text.endswith(' '):
        text = text[:-1]
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def cjke_cleaners2(text):
    '''Pipeline for text mixing tagged ZH / JA / KO / EN segments.

    Each [XX]...[XX] segment is replaced by its converted form followed by
    one separator space, using:

    * [ZH] -> `chinese_to_ipa`
    * [JA] -> `japanese_to_ipa2`
    * [KO] -> `korean_to_ipa`
    * [EN] -> `english_to_ipa2`

    Text outside the tags is left as-is. Finally, one trailing space (if
    present) is removed and a '.' is appended unless the result already
    ends in one of . , ! ? - … ~. An empty result is returned as-is.

    NOTE(review): `korean_to_ipa` is not imported in the import block
    visible at the top of this file -- confirm before relying on the [KO]
    branch.
    '''
    # Collect all tagged segments up front, before any replacement is made.
    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
    for chinese_text in chinese_texts:
        # [4:-4] strips the four-character opening and closing tags.
        cleaned_text = chinese_to_ipa(chinese_text[4:-4])
        text = text.replace(chinese_text, cleaned_text+' ', 1)
    for japanese_text in japanese_texts:
        cleaned_text = japanese_to_ipa2(japanese_text[4:-4])
        text = text.replace(japanese_text, cleaned_text+' ', 1)
    for korean_text in korean_texts:
        cleaned_text = korean_to_ipa(korean_text[4:-4])
        text = text.replace(korean_text, cleaned_text+' ', 1)
    for english_text in english_texts:
        cleaned_text = english_to_ipa2(english_text[4:-4])
        text = text.replace(english_text, cleaned_text+' ', 1)
    # Remove only the separator space left by a final tagged segment. An
    # unconditional text[:-1] would cut a real character off input that ends
    # in untagged text.
    if text.endswith(' '):
        text = text[:-1]
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def thai_cleaners(text):
    '''Pipeline for Thai text: `num_to_thai`, then `latin_to_thai`.

    NOTE(review): the import that would provide both helpers is commented
    out at the top of this file -- confirm they are available before use.
    '''
    return latin_to_thai(num_to_thai(text))
def shanghainese_cleaners(text):
    '''Pipeline for Shanghainese text.

    Converts `text` with `shanghainese_to_ipa` and appends a '.' unless
    the result already ends in one of . , ! ? - … ~. An empty result is
    returned as-is.

    NOTE(review): the import that would provide `shanghainese_to_ipa` is
    commented out at the top of this file -- confirm it is available
    before use.
    '''
    text = shanghainese_to_ipa(text)
    # `text and` guards the text[-1] lookup, which raises IndexError on ''.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def chinese_dialect_cleaners(text):
    '''Pipeline for text mixing tagged segments of several languages.

    Tagged segments are rewritten in this order, each converted segment
    being followed by one space:

    * [MD]...[MD] -> `chinese_to_ipa2(segment)`
    * [TW]...[TW] -> `chinese_to_ipa2(segment, True)`
    * [JA]...[JA] -> `japanese_to_ipa3(segment)` with 'Q' -> 'ʔ'
    * [SH]...[SH] -> `shanghainese_to_ipa(segment)` with its digit and
      small-capital substitutions
    * [GD]...[GD] -> `cantonese_to_ipa(segment)`
    * [EN]...[EN] -> `english_to_lazy_ipa2(segment)`
    * any remaining [XX]...[XX] pair of two capital letters ->
      `ngu_dialect_to_ipa(segment, 'XX')` with its affricate substitutions

    Trailing whitespace is then removed and a '.' is appended unless the
    text is empty or already ends in one of . , ! ? - … ~.

    NOTE(review): the imports for the shanghainese, cantonese and
    ngu_dialect helpers are commented out at the top of this file --
    confirm they are available before relying on those branches.
    '''
    def convert_md(match):
        return chinese_to_ipa2(match.group(1))+' '

    def convert_tw(match):
        return chinese_to_ipa2(match.group(1), True)+' '

    def convert_ja(match):
        return japanese_to_ipa3(match.group(1)).replace('Q', 'ʔ')+' '

    def convert_sh(match):
        ipa = shanghainese_to_ipa(match.group(1))
        # Applied one after another, in this exact order.
        for old, new in (('1', '˥˧'), ('5', '˧˧˦'), ('6', '˩˩˧'), ('7', '˥'),
                         ('8', '˩˨'), ('ᴀ', 'ɐ'), ('ᴇ', 'e')):
            ipa = ipa.replace(old, new)
        return ipa+' '

    def convert_gd(match):
        return cantonese_to_ipa(match.group(1))+' '

    def convert_en(match):
        return english_to_lazy_ipa2(match.group(1))+' '

    def convert_other(match):
        # group(1) is the two-letter tag, group(2) the enclosed text.
        ipa = ngu_dialect_to_ipa(match.group(2), match.group(1))
        for old, new in (('ʣ', 'dz'), ('ʥ', 'dʑ'), ('ʦ', 'ts'), ('ʨ', 'tɕ')):
            ipa = ipa.replace(old, new)
        return ipa+' '

    # Order matters: the generic two-letter pattern must run last so that it
    # only sees tags the specific patterns did not consume.
    substitutions = (
        (r'\[MD\](.*?)\[MD\]', convert_md),
        (r'\[TW\](.*?)\[TW\]', convert_tw),
        (r'\[JA\](.*?)\[JA\]', convert_ja),
        (r'\[SH\](.*?)\[SH\]', convert_sh),
        (r'\[GD\](.*?)\[GD\]', convert_gd),
        (r'\[EN\](.*?)\[EN\]', convert_en),
        (r'\[([A-Z]{2})\](.*?)\[\1\]', convert_other),
    )
    for pattern, convert in substitutions:
        text = re.sub(pattern, convert, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)