This repository has been archived by the owner on Oct 8, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 16
/
WordNet.scala
301 lines (267 loc) · 7.88 KB
/
WordNet.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
/*
*
* WordNet for Scala and Spark
*
* Afshin Sadeghi
*
* Inspired by:
* WordNet::Similarity by Ted Pedersen
* and https://github.com/sujitpal/scalcium
* and ws4j
* and the NLTK project
*/
package net.sansa_stack.ml.common.nlp.wordnet
import java.io.Serializable
import net.sf.extjwnl.data.{PointerType, PointerUtils, Word}
import net.sf.extjwnl.dictionary.Dictionary
import scala.collection.JavaConverters._
import scala.collection.breakOut
import scala.collection.mutable.ArrayBuffer
/**
 * Singleton holder for the WordNet dictionary.
 *
 * `dict` is obtained once from `Dictionary.getDefaultResourceInstance` when this
 * object is first initialised; the `WordNet` class reads it through `WordNet.dict`
 * instead of keeping its own copy.
 */
object WordNet {
  val dict: Dictionary = Dictionary.getDefaultResourceInstance()
}
/**
 * Basic WordNet services: synset lookup, lemma names, relation traversal
 * (hypernyms, hyponyms, meronyms, ...) and hypernym-path utilities.
 *
 * The dictionary itself lives in the `WordNet` object, not in instances of this
 * class, so it is not part of the serialised state of a `WordNet` instance.
 */
class WordNet extends Serializable {

  // Public mutable field kept for backward compatibility. No method of this class
  // reads or writes it; it is unrelated to the `maxDepth(synset)` method below.
  var maxDepth = 0

  /**
   * Returns the WordNet dictionary instance used in the package.
   *
   * @return the shared dictionary held by the `WordNet` object
   */
  def getDict: Dictionary = WordNet.dict

  /**
   * Returns all synsets of a lemma, across every part of speech.
   *
   * @param lemma : String
   * @return : List[Synset], empty if the lemma is unknown for every part of speech
   */
  def getSynsets(lemma: String): List[Synset] =
    net.sf.extjwnl.data.POS.getAllPOS.asScala.toList
      .flatMap(pos => getSynsets(lemma, pos))

  /**
   * Returns a single synset given a lemma, a part of speech and a sense index.
   * Returns an empty list if the lemma does not exist in WordNet for that part of
   * speech, or if `sid` is not a valid index into the lemma's sense list.
   *
   * @param lemma : String
   * @param pos   : POS
   * @param sid   : zero-based position in the sense list of the index word
   * @return : List[Synset] with at most one element
   */
  def getSynset(lemma: String, pos: POS, sid: Int): List[Synset] =
    Option(WordNet.dict.getIndexWord(pos, lemma))
      // `lift` yields None for negative or too-large indices instead of throwing
      .flatMap(indexWord => indexWord.getSenses().asScala.lift(sid))
      .toList

  /**
   * Returns all synsets of a lemma for one part of speech.
   * Returns an empty list if the lemma does not exist in WordNet for that part of
   * speech.
   *
   * @param lemma : String
   * @param pos   : POS
   * @return List[Synset]
   */
  def getSynsets(lemma: String, pos: POS): List[Synset] =
    Option(WordNet.dict.getIndexWord(pos, lemma))
      .map(indexWord => indexWord.getSenses.asScala.toList)
      .getOrElse(Nil)

  /**
   * Gets the lemma of every word in a synset.
   *
   * @param synset :Synset
   * @return : List[String]
   */
  def lemmaNames(synset: Synset): List[String] =
    synset.getWords.asScala.map(_.getLemma).toList

  /**
   * Synsets reachable from `synset` through HYPONYM pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def hyponyms(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.HYPONYM)

  /**
   * Synsets reachable from `synset` through HYPERNYM pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def hypernyms(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.HYPERNYM)

  /**
   * Synsets reachable from `synset` through PART_MERONYM pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def partMeronyms(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.PART_MERONYM)

  /**
   * Synsets reachable from `synset` through PART_HOLONYM pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def partHolonyms(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.PART_HOLONYM)

  /**
   * Synsets reachable from `synset` through SUBSTANCE_MERONYM pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def substanceMeronyms(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.SUBSTANCE_MERONYM)

  /**
   * Synsets reachable from `synset` through SUBSTANCE_HOLONYM pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def substanceHolonyms(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.SUBSTANCE_HOLONYM)

  /**
   * Synsets reachable from `synset` through MEMBER_HOLONYM pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def memberHolonyms(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.MEMBER_HOLONYM)

  /**
   * Synsets reachable from `synset` through ENTAILMENT pointers.
   *
   * @param synset :Synset
   * @return : List[Synset]
   */
  def entailments(synset: Synset): List[Synset] =
    relatedSynsets(synset, PointerType.ENTAILMENT)

  /**
   * Gets the synsets that are targets of the pointers of the given type
   * leaving `synset`. Pointer targets that are not synsets are skipped.
   *
   * @param synset :Synset
   * @param ptr    : PointerType
   * @return : List[Synset]
   */
  def relatedSynsets(synset: Synset, ptr: PointerType): List[Synset] =
    synset.getPointers(ptr).asScala.iterator
      .map(_.getTarget)
      // type pattern instead of asInstanceOf: no ClassCastException on other targets
      .collect { case target: Synset => target }
      .toList

  /**
   * Returns every path of the hypernym tree of a synset, one inner list per path,
   * as produced by `PointerUtils.getHypernymTree`.
   *
   * NOTE(review): the other methods of this class treat the last element of a
   * path as the root and `size - 1` as the depth, i.e. they assume each path
   * starts at `synset` itself -- confirm against the extJWNL documentation.
   *
   * @param synset :Synset
   * @return : List[List[Synset]]
   */
  def getAllHypernyms(synset: Synset): List[List[Synset]] =
    PointerUtils
      .getHypernymTree(synset)
      .toList
      .asScala
      .map(nodeList => nodeList.asScala.map(node => node.getSynset).toList)
      .toList

  /**
   * Returns the distinct root hypernyms of a synset, i.e. the last synset of
   * each hypernym path. Empty paths contribute nothing.
   *
   * @param synset : Synset
   * @return : List[Synset]
   */
  def rootHypernyms(synset: Synset): List[Synset] =
    getAllHypernyms(synset).flatMap(_.lastOption).distinct

  /**
   * Gets the lowest common hypernym of two synsets.
   *
   * @param synset1 : Synset
   * @param synset2 : Synset
   * @return : List[Synset] holding the common hypernym, or empty if the hypernym
   *         paths of the two synsets share no synset
   */
  def lowestCommonHypernym(synset1: Synset, synset2: Synset): List[Synset] =
    lch(getAllHypernyms(synset1), getAllHypernyms(synset2))

  /**
   * Gets the shortest path length from a synset to one of its hypernyms.
   *
   * @param synset1  : Synset
   * @param hypernym : Synset
   * @return : the smallest position of `hypernym` over all hypernym paths of
   *         `synset1`, or -1 if no path contains it
   */
  def shortestHypernymPathLength(synset1: Synset, hypernym: Synset): Int = {
    // indexOf is -1 for paths that do not contain the hypernym; drop those
    val distances = getAllHypernyms(synset1).map(_.indexOf(hypernym)).filter(_ >= 0)
    if (distances.isEmpty) -1 else distances.min
  }

  /**
   * Returns the lowest common hypernym of two sets of hypernym paths.
   *
   * For every (path1, path2) pair the first synset of path2 that also occurs in
   * path1 is a candidate; the candidate with the smallest position in its path2
   * wins (the first such candidate on ties).
   *
   * @param paths1 : List[List[Synset]]
   * @param paths2 : List[List[Synset]]
   * @return : List[Synset] with the winning candidate, or empty if there is none
   */
  private[this] def lch(paths1: List[List[Synset]], paths2: List[List[Synset]]): List[Synset] = {
    val candidates: List[(Synset, Int)] = paths1.flatMap { path1 =>
      // built once per path1 rather than once per (path1, path2) pair
      val members = path1.toSet
      paths2.flatMap { path2 =>
        path2.zipWithIndex.find { case (node, _) => members.contains(node) }.toList
      }
    }
    if (candidates.isEmpty) Nil else List(candidates.minBy(_._2)._1)
  }

  /**
   * Returns the length of the shortest hypernym path from this synset to the root.
   * Since there can be several paths to the root, the minimum length is considered.
   *
   * @param synset : Synset
   * @return : shortest path size minus one, or -1 if there is no path
   */
  def minDepth(synset: Synset): Int = {
    val lengths = getAllHypernyms(synset).map(_.size)
    if (lengths.isEmpty) -1 else lengths.min - 1
  }

  /**
   * Returns the length of the longest hypernym path from this synset to the root.
   * Since there can be several paths to the root, the maximum length is considered.
   *
   * @param synset : Synset
   * @return : longest path size minus one, or -1 if there is no path
   */
  def maxDepth(synset: Synset): Int = {
    val lengths = getAllHypernyms(synset).map(_.size)
    if (lengths.isEmpty) -1 else lengths.max - 1
  }

  /**
   * Returns the antonyms of a word.
   *
   * @param word : Word
   * @return : List[Word]
   */
  def antonyms(word: Word): List[Word] =
    relatedLemmas(word, PointerType.ANTONYM)

  /**
   * Returns the words that are targets of the pointers of the given type leaving
   * `word`. Pointer targets that are not words are skipped.
   *
   * @param word : Word
   * @param ptr  : PointerType
   * @return : List[Word]
   */
  def relatedLemmas(word: Word, ptr: PointerType): List[Word] =
    word.getPointers(ptr).asScala.iterator
      .map(_.getTarget)
      // type pattern instead of asInstanceOf: no ClassCastException on other targets
      .collect { case target: Word => target }
      .toList
}