# unicode.py
# coding: utf-8
from functools import lru_cache
from charset_normalizer.constant import UNICODE_RANGES_ZIP, UNICODE_RANGES_NAMES, UNICODE_SECONDARY_RANGE_KEYWORD
class UnicodeRangeIdentify:
    """
    Static helpers that associate characters with unicode range names and
    derive simple statistics (punctuation ratio, accent ratio, suspicious
    range mixes) from words.
    """

    SUSPICIOUS_RANGES_CACHE = dict()

    # Signs treated as punctuation by is_punc() in addition to the ranges whose
    # name contains "Punctuation" or "Forms". Built once instead of per call.
    _EXTRA_PUNC_SIGNS = frozenset('º¯—–‒‐⁃«‹?!;.:^$¥*»£¹¿~ª؟©±¡{}[]|½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒™℠¬‼⁇❝❞¶⁋√↑↓�¤©`¨')

    @staticmethod
    @lru_cache(maxsize=8192)
    def find_letter_type(letter):
        """
        Associate a single character with a range name from the unicode table
        :param str letter: Shall be a unique char
        :return: Associated unicode range designation, None if no range contains it
        :rtype: Union[str, None]
        :raises IOError: If letter is not exactly one character long
        """
        if len(letter) != 1:
            raise IOError('Trying to associate multiple char <{}> to a single unicode range'.format(letter))

        # The code point does not change while scanning the ranges.
        code_point = ord(letter)

        for u_name, u_range in UNICODE_RANGES_ZIP.items():
            if code_point in u_range:
                return u_name

        return None

    @staticmethod
    @lru_cache(maxsize=8192)
    def is_accentuated(letter):
        """
        Verify if a latin letter is accentuated, unicode point of view.
        The check is purely numeric: code point between 192 and 383 inclusive.
        :param str letter: Letter to check
        :return: True if accentuated, else False
        :rtype: bool
        :raises IOError: If letter is not exactly one character long
        """
        if len(letter) != 1:
            raise IOError('Trying to determine accentuated state of multiple char <{}>'.format(letter))

        return 192 <= ord(letter) <= 383

    @staticmethod
    @lru_cache(maxsize=512)
    def get_range_id(range_name):
        """
        Give the position of a range name inside UNICODE_RANGES_NAMES
        :param str range_name: Unicode range name
        :return: Index of the range name
        :rtype: int
        :raises ValueError: If the range name is unknown (is_range_secondary relies on it)
        """
        return UNICODE_RANGES_NAMES.index(range_name)

    @staticmethod
    @lru_cache(maxsize=8192)
    def is_latin(letter):
        """
        Verify if a letter is Latin based, i.e. its range name contains 'Latin'
        :param str letter: Single character
        :return: True if the character belongs to a Latin range, else False
        :rtype: bool
        """
        return 'Latin' in (UnicodeRangeIdentify.find_letter_type(letter) or '')

    @staticmethod
    @lru_cache(maxsize=8192)
    def is_punc(letter):
        """
        Verify if a letter is a sort of punctuation sign.
        Whitespace always qualifies. Otherwise the character must belong to a
        known range and either that range name contains 'Punctuation' or
        'Forms', or the character is one of the extra signs listed on the class.
        :param str letter: Single character
        :return: True if considered punctuation, else False
        :rtype: bool
        """
        if letter.isspace():
            return True

        r_name = UnicodeRangeIdentify.find_letter_type(letter)

        if r_name is None:
            return False

        return (
            'Punctuation' in r_name
            or 'Forms' in r_name
            or letter in UnicodeRangeIdentify._EXTRA_PUNC_SIGNS
        )

    @staticmethod
    @lru_cache(maxsize=8192)
    def is_cjk(letter):
        """
        Verify if a letter is part of a CJK unicode range
        :param str letter: Single character
        :return: True if the range name contains 'CJK', else False
        :rtype: bool
        """
        return 'CJK' in (UnicodeRangeIdentify.find_letter_type(letter) or '')

    @staticmethod
    def unravel_suspicious_ranges(str_len, encountered_unicode_range_occurrences):
        """
        Sum the occurrences of the unicode ranges that look out of place.
        A range is a candidate when it is neither latin, general punctuation,
        symbols and punctuation nor CJK, or when it is 'latin extended' /
        'latin-1 supplement'. A candidate is counted only if it represents
        less than 9% of str_len and none of the exemptions below applies.
        :param int str_len: Length of the analysed string
        :param dict encountered_unicode_range_occurrences: Range name -> number of occurrences
        :return: Number of characters that belong to a suspicious range
        :rtype: int
        """
        # Nothing can be suspicious in an empty string; also avoids dividing by zero.
        if str_len <= 0:
            return 0

        range_count = len(encountered_unicode_range_occurrences)
        suspicious_count = 0

        for range_name, occurrences in encountered_unicode_range_occurrences.items():
            lowered = range_name.lower()

            is_common = (
                'latin' in lowered
                or 'general punctuation' in lowered
                or 'symbols and punctuation' in lowered
                or 'cjk' in lowered
            )
            is_latin_extra = 'latin extended' in lowered or 'latin-1 supplement' in lowered

            if is_common and not is_latin_extra:
                continue

            if occurrences / str_len >= 0.09:
                continue

            # Latin-1 supplement alongside at most one other range is tolerated.
            if range_count <= 2 and 'latin-1 supplement' in lowered:
                continue

            # Halfwidth/fullwidth forms are tolerated when a CJK range is present.
            if 'halfwidth and fullwidth forms' in lowered and any(
                    'CJK' in el for el in encountered_unicode_range_occurrences):
                continue

            if 'hiragana' in lowered or 'katakana' in lowered:
                continue

            suspicious_count += occurrences

        return suspicious_count

    @staticmethod
    @lru_cache(maxsize=8192)
    def is_suspiciously_successive_range(range_name_a, range_name_b):
        """
        Verify if range B encountered just after range A is considered suspicious
        :param str range_name_a: Unicode range A
        :param str range_name_b: Unicode range B
        :return: True if suspicious else False
        :rtype: bool
        """
        if range_name_a is None or range_name_b is None:
            return True

        if range_name_a == range_name_b:
            return False

        if 'Latin' in range_name_a and 'Latin' in range_name_b:
            return False

        # Ranges sharing at least one word in their name are considered related.
        if set(range_name_a.split()) & set(range_name_b.split()):
            return False

        kana = ('Katakana', 'Hiragana')

        if range_name_a in kana and ('CJK' in range_name_b or range_name_b in kana):
            return False

        if 'CJK' in range_name_a and range_name_b in kana:
            return False

        return True

    @staticmethod
    def classification(word):
        """
        Count how many characters of a word fall in each unicode range.
        Characters without a known range are counted under 'Unknown'.
        :param str word: Word without any whitespace
        :return: Range name -> number of characters
        :rtype: dict
        :raises IOError: If the word contains a whitespace character
        """
        cla_ = dict()

        for el in word:
            if el.isspace():
                raise IOError('Classification should not be invoked with sentences !')

            u_name = UnicodeRangeIdentify.find_letter_type(el)

            if u_name is None:
                u_name = 'Unknown'

            cla_[u_name] = cla_.get(u_name, 0) + 1

        return cla_

    @staticmethod
    @lru_cache(maxsize=512)
    def is_range_secondary(u_range):
        """
        Determine if a unicode range name is not a primary range by search specific keyword in range name.
        A range name that get_range_id does not know is considered secondary.
        :param str u_range: Unicode range name
        :return: True if secondary else False
        :rtype: bool
        """
        try:
            UnicodeRangeIdentify.get_range_id(u_range)
        except ValueError:
            return True

        return any(keyword in u_range for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)

    @staticmethod
    def part_punc(word):
        """
        Determine how much of the word is composed of punc sign
        :param str word:
        :return: Ratio special letter VS len of the word, 0.0 for an empty word
        :rtype: float
        """
        if not word:
            return 0.0

        return sum(1 for el in word if UnicodeRangeIdentify.is_punc(el)) / len(word)

    @staticmethod
    def part_accent(word):
        """
        Determine how much of the word is composed of accentuated letter
        :param str word:
        :return: Ratio accentuated letter VS len of the word, 0.0 for an empty word
        :rtype: float
        """
        if not word:
            return 0.0

        return sum(1 for el in word if UnicodeRangeIdentify.is_accentuated(el)) / len(word)

    @staticmethod
    def word_to_range_list(word):
        """
        :param str word:
        :return: Produce a list containing for each letter in word it's unicode range name
        :rtype: list[str]
        """
        return [UnicodeRangeIdentify.find_letter_type(el) for el in word]

    @staticmethod
    def word_to_range_continue(word):
        """
        Group consecutive characters whose ranges are not suspicious successors.
        When a character extends the current group, the group takes the range
        name of that latest character.
        :param str word:
        :return: List of tuple (unicode range with occ) continuously encountered in a word
        :rtype: list[tuple[str, int]]
        """
        runs = list()

        for el in word:
            u_name = UnicodeRangeIdentify.find_letter_type(el)

            if runs and not UnicodeRangeIdentify.is_suspiciously_successive_range(u_name, runs[-1][0]):
                runs[-1] = (u_name, runs[-1][1] + 1)
            else:
                runs.append((u_name, 1))

        return runs

    @staticmethod
    def part_lonely_range(word):
        """
        Determine how much of the word is made of isolated characters, meaning
        groups of length one as produced by word_to_range_continue.
        :param str word:
        :return: Ratio lonely groups VS len of the word, 0.0 for an empty word
        :rtype: float
        """
        if not word:
            return 0.0

        lonely = sum(
            1 for _, u_occ_cont in UnicodeRangeIdentify.word_to_range_continue(word)
            if u_occ_cont == 1
        )

        return lonely / len(word)

    @staticmethod
    def list_by_range(letters):
        """
        Sort letters by unicode range in a dict.
        A letter joins the first existing group whose range is not a suspicious
        successor of its own range; otherwise it goes under its own range name.
        :param list[str] letters:
        :return: Letters by unicode range
        :rtype: dict
        """
        by_ranges = dict()

        for letter in letters:
            u_range = UnicodeRangeIdentify.find_letter_type(letter)

            for range_name in by_ranges:
                if not UnicodeRangeIdentify.is_suspiciously_successive_range(range_name, u_range):
                    by_ranges[range_name].append(letter)
                    break
            else:
                # No compatible group found: file the letter under its own range.
                by_ranges.setdefault(u_range, list()).append(letter)

        return by_ranges