-
Notifications
You must be signed in to change notification settings - Fork 0
/
Similarity_Functions.py
80 lines (70 loc) · 2.44 KB
/
Similarity_Functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import textdistance
from abydos import distance
#Every function below scores a pair of multi-attribute entities by comparing their attributes and returns the tuple (A, B, score)
#A and B are entities and t is an integer threshold
def HammingDistanceFunction(A,B,t):
    """Return the mean Hamming distance between the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Threshold. Kept for interface compatibility; not used here.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``textdistance.hamming`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, attribute in enumerate(A):
        total += textdistance.hamming(attribute, B[index])
    return (A, B, total / attribute_count)
#A and B are entities and t is an integer threshold
def LevenshteinDistanceFunction(A,B,t, filters_vector,features):
    """Return the mean Levenshtein distance between the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Threshold. Kept for interface compatibility; not used here.
        filters_vector: Not used by this function.
        features: Not used by this function.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``textdistance.levenshtein`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, attribute in enumerate(A):
        total += textdistance.levenshtein(attribute, B[index])
    return (A, B, total / attribute_count)
#Normalized version of Levenshtein Distance
#A and B are entities and t is a normalized threshold
def NormalizedLevenshteinDistanceFunction(A,B,t):
    """Return the mean normalized Levenshtein similarity of A and B.

    Each attribute pair contributes ``(longest - edits) / longest``, where
    ``edits`` is ``textdistance.levenshtein`` of the two values and
    ``longest`` is the length of the longer value.

    Args:
        A: First entity, a sequence of attribute values that support ``len``.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Normalized threshold. Kept for interface compatibility; not used
            here.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the per-attribute average,
        or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, left in enumerate(A):
        right = B[index]
        longest = max(len(left), len(right))
        if longest == 0:
            # Both values are empty, so the ratio would be 0/0. Two empty
            # values are identical, hence full similarity.
            total += 1.0
            continue
        edits = textdistance.levenshtein(left, right)
        total += (longest - edits) / longest
    return (A, B, total / attribute_count)
#A and B are entities and t is a normalized threshold
def JaccardFunction(A,B,t):
    """Return the mean Jaccard score between the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Normalized threshold. Kept for interface compatibility; not used
            here.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``textdistance.jaccard`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, attribute in enumerate(A):
        total += textdistance.jaccard(attribute, B[index])
    return (A, B, total / attribute_count)
#A and B are entities and t is a normalized threshold
def CosineFunction(A,B,t):
    """Return the mean cosine score between the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Normalized threshold. Kept for interface compatibility; not used
            here.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``textdistance.cosine`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, attribute in enumerate(A):
        total += textdistance.cosine(attribute, B[index])
    return (A, B, total / attribute_count)
#Sorensen-Dice similarity coefficient
#A and B are entities and t is a normalized threshold
def SorensenCoefficient(A,B,t, filters_vector,features):
    """Return the mean Sorensen-Dice score between the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Normalized threshold. Kept for interface compatibility; not used
            here.
        filters_vector: Not used by this function.
        features: Not used by this function.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``textdistance.sorensen`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, attribute in enumerate(A):
        total += textdistance.sorensen(attribute, B[index])
    return (A, B, total / attribute_count)
#A and B are entities and t is a normalized threshold
def JaroFunction(A,B,t):
    """Return the mean Jaro score between the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Normalized threshold. Kept for interface compatibility; not used
            here.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``textdistance.jaro`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, attribute in enumerate(A):
        total += textdistance.jaro(attribute, B[index])
    return (A, B, total / attribute_count)
#A and B are entities and t is a normalized threshold
def BraunBlanquetFunction(A,B,t):
    """Return the mean Braun-Blanquet similarity of the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Normalized threshold. Kept for interface compatibility; not used
            here.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``distance.BraunBlanquet().sim`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero
        # (this also skips building the measure object).
        return (A, B, 0.0)
    # One measure instance is reused for every attribute pair.
    measure = distance.BraunBlanquet()
    total = 0
    for index, attribute in enumerate(A):
        total += measure.sim(attribute, B[index])
    return (A, B, total / attribute_count)
#A and B are entities and t is a normalized threshold
def OverlapFunction(A,B,t, filters_vector,features):
    """Return the mean overlap score between the attributes of A and B.

    Args:
        A: First entity, a sequence of attribute values.
        B: Second entity; it is indexed at every position that exists in A,
            so it must be at least as long as A.
        t: Normalized threshold. Kept for interface compatibility; not used
            here.
        filters_vector: Not used by this function.
        features: Not used by this function.

    Returns:
        The tuple ``(A, B, score)``. ``score`` is the sum of
        ``textdistance.overlap`` over the attribute pairs divided by
        ``len(A)``, or ``0.0`` when A has no attributes.

    Raises:
        IndexError: If B has fewer attributes than A.
    """
    attribute_count = len(A)
    if not attribute_count:
        # Nothing to compare; return early instead of dividing by zero.
        return (A, B, 0.0)
    total = 0
    for index, attribute in enumerate(A):
        # Compare attribute with attribute, like the sibling functions do;
        # the previous code passed the whole entities on every iteration.
        total += textdistance.overlap(attribute, B[index])
    return (A, B, total / attribute_count)