/
wiki_trans_filter.py
122 lines (103 loc) · 3.26 KB
/
wiki_trans_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from pathlib import Path
from difflib import SequenceMatcher
from utils.normalize import strip_accents
from utils.transliterate import transliterations
# Directory holding the raw transliteration data. The path is relative, so it
# is resolved against the working directory the script is started from.
data_dir = Path("..") / "data" / "transliteration" / "raw"

# Input file of candidate pairs that the script at the bottom of this module
# reads line by line.
file_name = data_dir.joinpath("en_hi_pairs1.txt")
def check_transliterations(hin, eng):
    """
    Score an English word against the transliterations generated for a
    Hindi word.

    The candidates come from ``transliterations(hin)``. Similarity is the
    ``difflib.SequenceMatcher`` ratio between ``eng`` and each candidate,
    and only the best ratio is considered.

    Returns:
        1 if ``eng`` is one of the candidates, or the best ratio is
          greater than 0.85.
        2 if the best ratio is less than 0.5 (also the result when there
          are no candidates, since the best ratio then defaults to 0).
        0 otherwise, i.e. the best ratio is between 0.5 and 0.85 inclusive.
    """
    candidates = transliterations(hin)

    # Exact hit: no need to compute any similarity scores.
    if eng in candidates:
        return 1

    best = max(
        (SequenceMatcher(None, eng, cand).ratio() for cand in candidates),
        default=0,
    )

    if best > 0.85:
        return 1
    return 2 if best < 0.5 else 0
def filter_pair(hin, eng):
    """
    Decide whether ``eng`` is a valid transliteration of ``hin`` using the
    generated transliterations, after two cheap length-based shortcuts.

    Returns:
        0 when ``hin`` is longer than 11 characters (the pair is not
          checked at all).
        2 when the two words differ in length by more than 3.
        Otherwise whatever ``check_transliterations(hin, eng)`` returns.
    """
    hin_len = len(hin)

    # The transliteration generator is slow on long words, so they are not
    # evaluated.
    # NOTE(review): the original comment said "11 or more", but the check
    # only skips words of length 12 and above -- confirm which is intended.
    if hin_len > 11:
        return 0

    # Empirically, most correct transliterations differ in length by at
    # most 3, so anything further apart is rejected outright.
    length_gap = hin_len - len(eng)
    if not -3 <= length_gap <= 3:
        return 2

    return check_transliterations(hin, eng)
# English word ending -> Hindi word endings accepted as its transliteration.
#
# Values are tuples so that one English ending can map to several Hindi
# spellings. The previous dict literal listed "ine" twice, which made Python
# silently keep only the last value and drop the first; both are kept here.
# Built once at import time instead of on every call.
#
# Candidates that were present but commented out (not active):
#   "ine": "इन", "in": "िन", "and": "ैंड", "in": "इन", "ic": "िक"
_ENDING_PAIRS = {
    "ar": ("ड़",),
    "ex": ("ेक्स",),
    "ord": ("ोर्ड",),
    "ix": ("िक्स",),
    "ox": ("ॉक्स",),
    "sor": ("जर",),
    "sa": ("सा",),
    "ide": ("ाइड",),
    "ism": ("िज्म",),
    "ra": ("ड़ा",),
    "and": ("ैण्ड",),
    "phere": ("फीयर",),
    "xon": ("क्सन",),
    "ru": ("रु",),
    "pus": ("पस",),
    "nge": ("ंज",),
    "ine": ("िन", "ाइन"),
    "xar": ("सर",),
    "ura": ("ूरा",),
    "me": ("ेम",),
    "ery": ("री",),
    "ite": ("ाइट",),
    "pur": ("पुर",),
    "anda": ("न्द",),
    "ist": ("िस्ट",),
    "ene": ("िन",),
    "ays": ("ेज़",),
}


def filter_pair2(hin, eng):
    """
    Check whether ``eng`` is a valid transliteration of ``hin`` based only
    on the endings of the two words.

    Args:
        hin: the Hindi word (note: first argument).
        eng: the English word.

    Returns:
        1 if ``eng`` ends with one of the known English endings and ``hin``
        ends with one of the Hindi endings paired with it; 0 otherwise
        (including when either word is empty).
    """
    for eng_ending, hin_endings in _ENDING_PAIRS.items():
        # str.endswith accepts a tuple and matches if any suffix matches.
        if eng.endswith(eng_ending) and hin.endswith(hin_endings):
            return 1
    return 0
# Classify every line of the raw pair file and append the line to the output
# file selected by the flag that filter_pair returns:
#   0 -> false.txt, 1 -> true.txt, 2 -> v_false.txt
# Each input line is expected to have the form "<english>|<hindi>"; a
# non-blank line that does not split into exactly two fields raises
# ValueError.
#
# Files are opened with an explicit encoding so the Hindi text is handled the
# same way on every platform (assumes the data files are UTF-8 -- confirm).
# The three outputs are opened once, in append mode, rather than once per
# input line, and no handle reuses the name of the input handle.
with open(file_name, "r", encoding="utf-8") as pairs_in, \
        open(data_dir / "false.txt", "a", encoding="utf-8") as false_out, \
        open(data_dir / "true.txt", "a", encoding="utf-8") as true_out, \
        open(data_dir / "v_false.txt", "a", encoding="utf-8") as v_false_out:
    outputs = {0: false_out, 1: true_out, 2: v_false_out}
    for pair in pairs_in:
        stripped = pair.strip()
        # Blank lines (e.g. a trailing empty line) carry no pair; skip them
        # instead of failing on the unpacking below.
        if not stripped:
            continue
        eng, hin = stripped.split("|")
        eng = strip_accents(eng)
        flag = filter_pair(hin, eng)
        # Alternative ending-based check:
        # flag = filter_pair2(hin, eng)
        out = outputs.get(flag)
        if out is not None:
            # The last input line may lack a newline; add one so appended
            # records never run together.
            out.write(pair if pair.endswith("\n") else pair + "\n")