-
Notifications
You must be signed in to change notification settings - Fork 0
/
RANKING.PY
138 lines (111 loc) · 6.25 KB
/
RANKING.PY
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import FILEOPS22
import _pickle as cPickle
import operator
from math import log10
import collections
from collections import OrderedDict
import json
class SPIMIRanking:
    """Boolean keyword search and BM25-style ranking over an in-memory index.

    Attributes:
        index: Mapping loaded by ``FILEOPS22.readIndexIntoMemory()``.
            ``RankDocuments`` reads each value as JSON-encoded postings whose
            entries start with ``[doc_id, term_frequency]``.
        index1: Mapping loaded by ``FILEOPS22.readRankintomemory()``. The
            query methods read ``posting[0]`` as the document ID and
            ``posting[2]`` as the precomputed per-document term weight.
    """

    # Pickled ``(N, doc_length_dict, avg_doc_length)`` tuple read by the
    # ranking methods. Kept as a class attribute so it can be overridden.
    stats_path = "DISK/collection_stats"
    # File that RankDocuments() writes the weighted index to (as JSON).
    rank_file_path = 'Merge/rank_file.txt'
    # Query tokens that are boolean operators rather than search terms.
    _OPERATORS = frozenset(("and", "or"))

    def __init__(self):
        self.index = FILEOPS22.readIndexIntoMemory()
        self.index1 = FILEOPS22.readRankintomemory()

    @classmethod
    def _query_terms(cls, query, drop_operators=True):
        """Split *query* on whitespace into distinct terms, keeping order.

        Args:
            query: Raw query string.
            drop_operators: When true, the tokens ``and`` / ``or`` (any case)
                are removed so they are not looked up as search terms.

        Returns:
            List of unique tokens in first-seen order.
        """
        tokens = query.split()
        if drop_operators:
            tokens = [tok for tok in tokens if tok.lower() not in cls._OPERATORS]
        # dict.fromkeys de-duplicates while preserving order.
        return list(dict.fromkeys(tokens))

    def _doc_ids_per_term(self, terms):
        """Return one list of document-ID strings per term found in ``index1``.

        Terms that are absent from the index are skipped, not treated as an
        empty posting list.
        """
        return [
            [str(posting[0]) for posting in self.index1[term]]
            for term in terms
            if term in self.index1
        ]

    def _load_collection_stats(self):
        """Load the pickled ``(N, doc_length_dict, avg_doc_length)`` tuple.

        Raises:
            OSError: If ``stats_path`` cannot be opened.
        """
        # NOTE: unpickling can execute arbitrary code; the stats file must
        # only ever come from a trusted source (e.g. this project's indexer).
        with open(self.stats_path, 'rb') as stats_file:
            return cPickle.load(stats_file)

    def multiple_and_keyword_query(self, query):
        """Return IDs of documents that contain every indexed term of *query*.

        Operator tokens are ignored and terms missing from the index are
        skipped. When no term is indexed the result is an empty list
        (previously this raised ``TypeError``).

        Args:
            query: Query string such as ``"cat and dog"``.

        Returns:
            Sorted list of document IDs as strings.
        """
        terms = self._query_terms(query)
        for position, term in enumerate(terms, start=1):
            print("The terms are : ", str(position) + ": " + term)
        print("")

        doc_id_lists = self._doc_ids_per_term(terms)
        if doc_id_lists:
            results = sorted(set.intersection(*map(set, doc_id_lists)))
        else:
            # set.intersection() needs at least one operand.
            results = []
        print('Document IDS without Ranking for AND --- : ', results)
        return results

    def multiple_or_keyword_query(self, query):
        """Return IDs of documents that contain any indexed term of *query*.

        Operator tokens are ignored and terms missing from the index are
        skipped. When no term is indexed the result is an empty list
        (previously this raised ``TypeError``).

        Args:
            query: Query string such as ``"cat or dog"``.

        Returns:
            Sorted list of document IDs as strings.
        """
        terms = self._query_terms(query)
        for position, term in enumerate(terms, start=1):
            print("The terms are : ", str(position) + ": " + term)

        doc_id_lists = self._doc_ids_per_term(terms)
        # set().union() with no operands is simply the empty set.
        results = sorted(set().union(*doc_id_lists))
        print('Document IDS without Ranking for OR --- :', results)
        return results

    def single_keyword_query(self, query):
        """Return IDs of documents whose postings are stored under *query*.

        The whole query string is used as the lookup key, unmodified.

        Returns:
            List of document IDs as strings, in posting order; empty when the
            key is not in ``index1``.
        """
        postings = self.index1.get(query)
        doc_list = []
        if postings is not None:
            for match in postings:
                print("Document IDS without Ranking", str(match[0]))
                doc_list.append(str(match[0]))
        return doc_list

    def GetRankedResults(self, query):
        """Run *query* and return the matching documents ordered by score.

        A document's score is the sum, over the distinct query terms, of the
        stored term weight (``posting[2]``) times ``log10(N / df)``.

        Args:
            query: Single keyword, or keywords joined by ``and`` / ``or``.

        Returns:
            List of ``(doc_id, score)`` tuples, highest score first.

        Raises:
            OSError: If the collection statistics file cannot be opened.
        """
        lowered = query.lower()
        # NOTE(review): operator detection is a plain substring test, kept
        # from the original; any word starting with "and"/"or" after a space
        # also selects the boolean branch. Confirm before tightening.
        if ' and' in lowered:
            matches = self.multiple_and_keyword_query(query)
            is_boolean = True
        elif ' or' in lowered:
            matches = self.multiple_or_keyword_query(query)
            is_boolean = True
        else:
            matches = self.single_keyword_query(query)
            is_boolean = False
        matching_docs = set(matches)  # O(1) membership in the scoring loop

        N, _doc_lengths, _avg_doc_length = self._load_collection_stats()
        N = float(N)

        scores = {}
        # Operators are only dropped for boolean queries, so a single-keyword
        # search for the literal word "and"/"or" still gets scored.
        for term in self._query_terms(query, drop_operators=is_boolean):
            postings = self.index1.get(term)
            if not postings:
                continue
            idf = log10(N / float(len(postings)))
            for posting in postings:
                doc_id = str(posting[0])
                if doc_id in matching_docs:
                    # Accumulate: each query term contributes to the score.
                    scores[doc_id] = scores.get(doc_id, 0.0) + posting[2] * idf
        return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

    def RankDocuments(self):
        """Attach a BM25 term weight to every posting and write the index out.

        Each posting ``[doc_id, tf, ...]`` in ``self.index`` is replaced by
        ``[doc_id, tf, weight]`` (with ``k1 = 1.2`` and ``b = 0.75``), and the
        updated index is written as JSON to ``rank_file_path``. ``self.index``
        is updated in place.

        Raises:
            OSError: If the statistics file or the output file cannot be opened.
            KeyError: If a posting refers to a document with no recorded length.
        """
        k1 = 1.2
        b = 0.75
        _N, doc_length_dict, avg_doc_length = self._load_collection_stats()
        doc_length_dict = dict(doc_length_dict)

        # Only values of existing keys are reassigned, so iterating over
        # items() while updating is safe.
        for term, postings in self.index.items():
            # Values start out JSON-encoded; after a previous run they are
            # already lists, so decode only when needed.
            if isinstance(postings, (str, bytes, bytearray)):
                postings = json.loads(postings)
            self.index[term] = [
                [
                    post[0],
                    post[1],
                    self.calculate_rsv(
                        avg_doc_length, b, k1,
                        doc_length_dict[str(post[0])], post[1],
                    ),
                ]
                for post in postings
            ]

        with open(self.rank_file_path, "w") as output_file:
            output_file.write(json.dumps(self.index))

    def calculate_rsv(self, avg_doc_length, b, k1, ld, tftd):
        """Return the BM25 term-frequency component for one posting.

        Computes ``((k1 + 1) * tf) / (k1 * ((1 - b) + b * (ld / avg)) + tf)``.

        Args:
            avg_doc_length: Average document length of the collection.
            b: Length-normalisation parameter.
            k1: Term-frequency saturation parameter.
            ld: Length of the document.
            tftd: Frequency of the term in the document.

        Returns:
            The weight as a float.

        Raises:
            ZeroDivisionError: If ``avg_doc_length`` is zero.
        """
        avg_doc_length = float(avg_doc_length)
        b = float(b)
        k1 = float(k1)
        ld = float(ld)
        tftd = float(tftd)
        length_norm = (1 - b) + b * (ld / avg_doc_length)
        return float(((k1 + 1) * tftd) / (k1 * length_norm + tftd))