-
Notifications
You must be signed in to change notification settings - Fork 0
/
Wake.py
135 lines (111 loc) · 4.17 KB
/
Wake.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# -*- coding: utf-8 -*-
""" **************************************************************************
Created on 2021
@author: Omid Hajipoor
Email: hajipoor.omid@aut.ac.ir
Gmail: omid.hajipoor0770@Gmail.com
************************************************************************** """
import nltk
import learnWord2vec
import math
import Helper
from gensim.models import Word2Vec
class wake:
    """WAKE keyword extractor.

    Ranks candidate keywords of a document by combining the candidate's
    unigram probability in the document with its mean word2vec distance to
    every in-vocabulary token of the document (lower combined score = better).
    """

    def __init__(self, *args, **kwargs):
        """Build or load the underlying word2vec model.

        args[0]: raw domain text used as the training/reference corpus
                 (preprocessed via Helper.Pre_proc).
        args[1]: preTrainModel flag -- 0 trains a new model (args[2] must be a
                 two-element parameter pair forwarded to learnWord2vec.w2v);
                 any non-zero value loads a saved model from the path given
                 in args[3].
        """
        domainTxt = args[0]
        preTrainModel = args[1]
        # Normalize the corpus text before any tokenization/training.
        t = Helper.Pre_proc(domainTxt)
        domainTxt = t.PreProc()
        self.domainTxt = domainTxt
        if preTrainModel == 1 and len(args) == 2:
            # Caller asked for a pre-trained model but supplied no path.
            print('Error: ' + "Address for load model is not defined")
        elif preTrainModel == 0:
            param = args[2]
            creatMod = learnWord2vec.w2v(domainTxt, param[0], param[1])
            self.model = creatMod.learnW2v()
        else:
            try:
                addLoad = args[3]
                self.model = Word2Vec.load(addLoad)
                print('Model Was loaded...')
            except IndexError:
                # BUG FIX: a missing args[3] raises IndexError, not NameError,
                # so the original except clause could never trigger and the
                # intended fallback message was unreachable.
                print("Address for load model is not defined")

    # =============================================================================
    # input: txt , number of keyword
    # output: keywords and score
    # =============================================================================
    def keyword_EXT(self, txt, numKey):
        """Return the numKey top-ranked (word, score) pairs for txt.

        Lower scores rank first: score(w) = |log2(p(w)) * mean_dist(w)|, where
        p(w) is w's unigram probability in the preprocessed text and
        mean_dist(w) is the average word2vec distance from w to every
        in-vocabulary token of the text.

        NOTE: the original also built a neighbor-distance score dict
        (dis/disSum/co/score) whose result was never used in the returned
        value; that dead computation has been removed.
        """
        t = Helper.Pre_proc(txt)
        doc = t.PreProc()
        vocab = self.model.wv.vocab
        # Keep only tokens the model knows; OOV words have no embedding.
        final_word = [w for w in nltk.word_tokenize(doc) if w in vocab]
        # Unigram counts and probabilities over the preprocessed document.
        n = Helper.Pre_proc.ngrams(doc, 1)
        countN = sum(n.values())
        pw = {w: n[w] / countN for w in n}
        # Candidates ordered by document frequency (ascending); each
        # in-vocabulary candidate gets its mean distance to the doc words.
        sorted_by_value_ngram = sorted(n.items(), key=lambda kv: kv[1])
        com = {}
        for w1, _ in sorted_by_value_ngram:
            if w1 not in vocab:
                continue
            dist = 0
            coo = 0
            for fw in final_word:
                dist += self.model.wv.distance(w1, fw)
                coo += 1
            if coo == 0:
                # BUG FIX: an empty or entirely out-of-vocabulary document
                # made dist/coo a ZeroDivisionError; skip candidates that
                # have no comparison basis instead of crashing.
                continue
            com[w1] = dist / coo
        newScore = {}
        for w in com:
            # (the original also re-tested `w in com` while iterating com,
            # which is always true; only the pw membership check matters)
            if w not in pw:
                continue
            # p(w) can never be 0 for an observed unigram, but the original
            # guarded against it with a 0.99 fallback; keep it for safety.
            p = pw[w] if pw[w] != 0 else 0.99
            newScore[w] = abs(math.log2(p) * com[w])
        # Lowest score first; take the requested number of keywords.
        sorted_by_value_com = sorted(newScore.items(), key=lambda kv: kv[1])
        return sorted_by_value_com[0:numKey]