In [7]:
terms = {
    'hujan': [1.5, 0.4, 0.6, 1.0, 1.5, 1.6],
    'turun': [0.7, 1.0, 1.5, 1.5, 0.3, 1.1],
    'deras': [1.2, 1.0, 0.5, 0.6, 1.8]
}

doc_ids = {
    'hujan': [1, 2, 3, 6, 8, 11],
    'turun': [1, 3, 6, 8, 10, 12],
    'deras': [1, 6, 7, 10, 11]
}

In [8]:
def find_pivot(upper_bounds, threshold):
    acc_ub = 0
    for i, ub in enumerate(upper_bounds):
        acc_ub += ub
        if acc_ub >= threshold:
            return i
    return None

def wand_algorithm(terms, doc_ids, K):
    pointers = [0] * len(terms)
    upper_bounds = [max(scores) for scores in terms.values()]
    num_terms = len(terms)
    theta = 0
    top_K = []
    num_evaluated = 0

    while all(pointer < len(doc_ids[term]) for pointer, term in zip(pointers, terms)):
        candidate_doc = max(doc_ids[term][ptr] for term, ptr in zip(terms, pointers))
        pivot_idx = find_pivot(upper_bounds, theta)
        if pivot_idx is None:
            break

        pivot_term = list(terms.keys())[pivot_idx]
        pivot_doc = doc_ids[pivot_term][pointers[pivot_idx]]

        if pivot_doc == candidate_doc:
            sum_scores = sum(score[pointers[i]] for i, (term, score) in enumerate(terms.items()) if doc_ids[term][pointers[i]] == candidate_doc)
            if sum_scores >= theta:
                top_K.append((candidate_doc, sum_scores))
                top_K.sort(key=lambda x: x[1], reverse=True)
                if len(top_K) > K:
                    top_K.pop()
                if len(top_K) == K:
                    theta = top_K[-1][1]
            pointers[0] += 1  # Move the first pointer
            num_evaluated += 1
        else:
            pointers[pivot_idx] += 1  # Move the pivot pointer

        # Printing
        print("Pointers: ", {term: doc_ids[term][ptr] for term, ptr in zip(terms, pointers)})
        print("TOP-{}: {}".format(K, top_K))
        print("Threshold :", theta)
        print("Pivot : {}".format(pivot_term))
        print("curDoc : {}".format(candidate_doc))
        print("#documents that are fully evaluated: {}".format(num_evaluated))
        print("-"*30)

    return top_K

In [12]:
# Example usage:
top_1 = wand_algorithm(terms, doc_ids, 1)
print("Final TOP-2:", top_1)


Pointers:  {'hujan': 2, 'turun': 1, 'deras': 1}
TOP-1: [(1, 3.4000000000000004)]
Threshold : 3.4000000000000004
Pivot : hujan
curDoc : 1
#documents that are fully evaluated: 1
------------------------------
Pointers:  {'hujan': 2, 'turun': 1, 'deras': 6}
TOP-1: [(1, 3.4000000000000004)]
Threshold : 3.4000000000000004
Pivot : deras
curDoc : 2
#documents that are fully evaluated: 1
------------------------------
Pointers:  {'hujan': 3, 'turun': 1, 'deras': 6}
TOP-1: [(1, 3.4000000000000004)]
Threshold : 3.4000000000000004
Pivot : deras
curDoc : 6
#documents that are fully evaluated: 2
------------------------------
Pointers:  {'hujan': 6, 'turun': 1, 'deras': 6}
TOP-1: [(1, 3.4000000000000004)]
Threshold : 3.4000000000000004
Pivot : deras
curDoc : 6
#documents that are fully evaluated: 3
------------------------------
Pointers:  {'hujan': 8, 'turun': 1, 'deras': 6}
TOP-1: [(1, 3.4000000000000004)]
Threshold : 3.4000000000000004
Pivot : deras
curDoc : 6
#documents that are fully evaluated

IndexError: list index out of range

In [18]:
# Define inverted indexes
terms = {
    'hujan': [1.5, 0.4, 0.6, 1.0, 1.5, 1.6],
    'turun': [0.7, 1.0, 1.5, 1.5, 0.3, 1.1],
    'deras': [1.2, 1.0, 0.5, 0.6, 1.8]
}

doc_ids = {
    'hujan': [1, 2, 3, 6, 8, 11],
    'turun': [1, 3, 6, 8, 10, 12],
    'deras': [1, 6, 7, 10, 11]
}

# Helper functions
def safe_get(arr, idx, default):
    return arr[idx] if idx < len(arr) else default

def find_pivot(terms, pointers, UB, theta):
    acc = 0
    for term, ub in zip(terms, UB):
        acc += ub
        if acc >= theta:
            return term
    return None

# WAND Algorithm Implementation
def wand_algorithm(terms, doc_ids, K):
    pointers = {term: 0 for term in terms}
    UBs = {term: max(scores) for term, scores in terms.items()}
    results = []
    theta = 0
    curDoc = 0
    evaluated_docs = 0

    while True:
        candidate_doc = min(safe_get(doc_ids[term], ptr, float('inf')) for term, ptr in pointers.items())

        # If any pointer is at the end ("Last"), break
        if any(ptr == len(doc_ids[term]) for term, ptr in pointers.items()):
            break

        if candidate_doc == float('inf'):
            break

        pivot = find_pivot(terms.keys(), pointers.values(), UBs.values(), theta)

        if pivot is None:
            break

        pivot_doc_id = safe_get(doc_ids[pivot], pointers[pivot], float('inf'))

        if pivot_doc_id == candidate_doc:
            score = sum(safe_get(terms[term], pointers[term], 0) if doc == candidate_doc else 0 for term, doc in ((term, safe_get(doc_ids[term], ptr, float('inf'))) for term, ptr in pointers.items()))
            evaluated_docs += 1
            if score > theta:
                results.append((candidate_doc, score))
                results = sorted(results, key=lambda x: x[1], reverse=True)[:K]
                theta = results[-1][1] if results else 0
            curDoc = candidate_doc
            pointers[min(pointers, key=lambda t: safe_get(doc_ids[t], pointers[t], float('inf')))] += 1
        else:
            for term in terms:
                if safe_get(doc_ids[term], pointers[term], float('inf')) < pivot_doc_id:
                    pointers[term] += 1

        print("Pointers: ", {term: safe_get(doc_ids[term], ptr, 'Last') for term, ptr in pointers.items()})
        print(f"TOP-{K}: {results}")
        print(f"Threshold: {theta}")
        print(f"Pivot: {pivot if pivot_doc_id != float('inf') else 'Last'}")
        print(f"curDoc: {curDoc}")
        print(f"#documents that are fully evaluated: {evaluated_docs}")
        print('-' * 40)

    return results

# Example usage
top_1 = wand_algorithm(terms, doc_ids, 1)
print("Final TOP-1:", top_1)


Pointers:  {'hujan': 2, 'turun': 1, 'deras': 1}
TOP-1: [(1, 3.4000000000000004)]
Threshold: 3.4000000000000004
Pivot: hujan
curDoc: 1
#documents that are fully evaluated: 1
----------------------------------------
Pointers:  {'hujan': 2, 'turun': 3, 'deras': 1}
TOP-1: [(1, 3.4000000000000004)]
Threshold: 3.4000000000000004
Pivot: deras
curDoc: 1
#documents that are fully evaluated: 2
----------------------------------------
Pointers:  {'hujan': 2, 'turun': 3, 'deras': 6}
TOP-1: [(1, 3.4000000000000004)]
Threshold: 3.4000000000000004
Pivot: deras
curDoc: 1
#documents that are fully evaluated: 3
----------------------------------------
Pointers:  {'hujan': 3, 'turun': 6, 'deras': 6}
TOP-1: [(1, 3.4000000000000004)]
Threshold: 3.4000000000000004
Pivot: deras
curDoc: 1
#documents that are fully evaluated: 3
----------------------------------------
Pointers:  {'hujan': 6, 'turun': 6, 'deras': 6}
TOP-1: [(1, 3.4000000000000004)]
Threshold: 3.4000000000000004
Pivot: deras
curDoc: 1
#documents

In [19]:
import heapq

def find_pivot(terms, threshold):
    #find the first term such that the sum of UBs from beginning till this term is >= threshold
    accumulated_ub = 0.0
    for term in terms:
        accumulated_ub += term[2]  # assuming the UB is the third element of term's list
        if accumulated_ub >= threshold:
            return term
    return None

def pick_term(terms, curDoc):
    for term in terms:
        if term[1][0] > curDoc:  # assuming the docID is the first element of term's posting list
            return term
    return None

def advance_term_pointer(term, curDoc):
    for i, posting in enumerate(term[1]):
        if posting[0] > curDoc:
            term[1] = term[1][i:]
            return term
    return term

def search(terms, initial_threshold):
    terms = sorted(terms, key=lambda t: t[1][0][0])  # initial sort by first docID

    top_1 = []
    threshold = initial_threshold
    pivot = 0
    curDoc = 0
    num_fully_evaluated_docs = 0

    while True:
        pTerm = find_pivot(terms, threshold)
        if pTerm is None:  # No more pivot terms can be found
            break

        pivot = pTerm[1][0][0]  # assuming the docID is the first element of term's posting list

        if pivot <= curDoc:
            aTerm = pick_term(terms, curDoc)
            if aTerm is None:  # No terms can be advanced
                break

            aTerm = advance_term_pointer(aTerm, curDoc)
            terms = sorted(terms, key=lambda t: t[1][0][0])
        elif pivot == pTerm[1][-1][0]:  # Check if pivot == last docID in pTerm
            break
        else:
            # ... (do evaluation and score updating etc. here)
            # Example scoring step:
            score = sum(posting[1] for term in terms for posting in term[1] if posting[0] == pivot)
            if score > threshold:
                threshold = score
                heapq.heappush(top_1, (-score, pivot))  # Store top-1 in a min-heap (as (-score, docID))

            num_fully_evaluated_docs += 1
            curDoc = pivot
            for term in terms:
                term[1] = [posting for posting in term[1] if posting[0] > curDoc]
                if not term[1]:
                    return top_1, threshold, "last", curDoc, num_fully_evaluated_docs  # if some term is exhausted
            terms = sorted(terms, key=lambda t: t[1][0][0])

    return top_1, threshold, "last", curDoc, num_fully_evaluated_docs

# Example usage:
# terms: [term_name, [(docID, score), ...], UB]
terms = [
    ["hujan", [(1, 1.5), (2, 0.4), (3, 0.6), (6, 1.0), (8, 1.5), (11, 1.6)], 1.6],
    ["turun", [(1, 0.7), (3, 1.0), (6, 1.5), (8, 1.5), (10, 0.3), (12, 1.1)], 1.5],
    ["deras", [(1, 1.2), (6, 1.0), (7, 0.5), (10, 0.6), (11, 1.8)], 1.8]
]
initial_threshold = 0

top_1, threshold, pivot_status, curDoc, num_fully_evaluated_docs = search(terms, initial_threshold)

# Output:
# top_1: [(-score, docID), ...]
# threshold: float
# pivot_status: str ("last" or docID)
# curDoc: int
# num_fully_evaluated_docs: int


In [25]:
#By Jonathan Williams, written November 2020
import math
import bisect
iterators = {}

class Iterator:
    def __init__(self,postings,term,term_string):
        self.postings = postings
        self.docs, _ = zip(*self.postings)
        self.cursor = 0
        self.term = term
        self.term_string = term_string
    #return posting at current cursor position
    def current(self):
        return self.postings[self.cursor]
    #advance cursor position if not at last posting
    def next(self):
        if self.last():
            return False
        else:
            self.cursor += 1
            return True
    #advance cursor to nearest position less than doc
    def gallop_to(self, doc):
        if doc == self.postings[self.cursor][0]: return True
        if doc < self.postings[self.cursor][0]:
            return False
        elif doc > self.postings[-1][0]:
            self.cursor = len(self.postings)-1
            return False
        else:
            i = bisect.bisect_left(self.docs, doc, self.cursor)
            self.cursor = i
            return True
    def last(self):
        if self.cursor == len(self.postings)-1:
            return True
        else:
            return False
    def __str__(self):
        return "IT:: t:{}, cursor: {}\n postings: {}\nAt last element: {}\n".format(self.term, self.cursor, self.postings,self.last())


#creates iterator for term t, adds it to the list, returns first posting
def first_posting(postings, t, term_string):
    it = Iterator(postings, t, term_string)
    iterators[term_string] = it
    return it.current()
#return next posting or None
def next_posting(term_string):
    it = iterators[term_string]
    if it.next():
        return it.current()
    else:
        return None
#return nearest posting that is larger than or equal to doc or None
def seek_to_document(term_string, doc):
    it = iterators[term_string]
    if it.gallop_to(doc):
        return it.current()
    else:
        return None

def WAND_Algo(query_terms, top_k, inverted_index):
    max_weights = {}
    candidates = []
    for t in range(0, len(query_terms)):
        qterm = query_terms[t]
        if not inverted_index[qterm]: continue
        #collect upper bound weight amongst all docs in index for the query term
        max_weights[qterm] = max(inverted_index[qterm], key=lambda x: x[1])[1]
        #get first candidate (first posting in the iterator) for the query term
        c_did, c_w = first_posting(inverted_index[qterm], t, query_terms[t])
        candidates.append((c_did, c_w, query_terms[t]))
    theta = float("-inf")
    ans = []
    fully_evaluated = 0
    while candidates:
        if ans:
            print("TOP-1: [{}]".format(ans[0]))
        else:
            print("TOP-1: []")
        print("Threshold: {}".format(theta))
        candidates = sorted(candidates, key=lambda c: c[0])
        score_limit = 0
        pivot = 0
        pivot_found = False

        while pivot < len(candidates):
            tmp_s_lim = score_limit + max_weights[candidates[pivot][2]]
            if tmp_s_lim > theta:
                pivot_found = True
                break
            score_limit = tmp_s_lim
            pivot += 1
        #if pivot term is the last term then DONE
        if not pivot_found:
            break
        pivot_doc = candidates[pivot][0]
        print("Pointers are pointing to documents: {}".format([c[0] for c in candidates]))
        print("Pivot is pointing to term: {}".format(candidates[pivot][2]))

        if candidates[0][0] == pivot_doc:
            fully_evaluated+=1
            s = 0
            t = 0
            cand_len = len(candidates)
            removed_candidates = []
            while t < cand_len:
                if candidates[t][0] == pivot_doc:
                    s += candidates[t][1]
                    next_candidate = next_posting(candidates[t][2])
                    if not next_candidate: #candidate has reached end of posting list
                        removed_candidates.append(candidates[t])
                    else:
                        candidates[t] = next_candidate + (candidates[t][2],)
                    t += 1
                else:
                    break
            #remove candidates that are at the end of their posting list
            for r in removed_candidates:
                candidates.remove(r)
            #if pivot doc contains all query terms needed for its accumulated upper bound to be greater than threshold
            if s > theta:
                ans.append((s,pivot_doc))
                if len(ans) > top_k:
                    #the list should be sorted by score in decrease order, if two documents have same score, smaller document id precedes larger one
                    ans.remove(min(ans, key=lambda  x: (x[0],-x[1])))
                    theta = min(ans, key=lambda  x: x[0])[0]

            print("Pivot: {}".format(candidates[pivot][2]))
            print("curDoc: {} ---> document that is currently fully evaluated".format(pivot_doc))
            print("Documents that are fully evaluated: {}".format(fully_evaluated))
        else:
            removed_candidates = []
            for t in range(0,pivot):
                seeked_candidate = seek_to_document(candidates[t][2],pivot_doc)
                if not seeked_candidate: #remove candidates if they need to be advanced past their posting list bounds
                    removed_candidates.append(candidates[t])
                else:
                    candidates[t] = seeked_candidate + (candidates[t][2],)
            #remove candidates
            for r in removed_candidates:
                candidates.remove(r)
    ans = sorted(ans, key=lambda x: (-x[0],x[1]))
    return (ans,fully_evaluated)



In [26]:
inverted_index = {
    "hujan": [(1, 1.5), (2, 0.4), (3, 0.6), (6, 1.0), (8, 1.5), (11, 1.6)],
    "turun": [(1, 0.7), (3, 1.0), (6, 1.5), (8, 1.5), (10, 0.3), (12, 1.1)],
    "deras": [(1, 1.2), (6, 1.0), (7, 0.5), (10, 0.6), (11, 1.8)]
}
query_terms = ["hujan", "turun", "deras"]
top_k = 1

results, num_fully_evaluated = WAND_Algo(query_terms, top_k, inverted_index)
print("Final Results:", results)
print("Number of Documents Fully Evaluated:", num_fully_evaluated)


TOP-1: []
Threshold: -inf
Pointers are pointing to documents: [1, 1, 1]
Pivot is pointing to term: hujan
Pivot: hujan
curDoc: 1 ---> document that is currently fully evaluated
Documents that are fully evaluated: 1
TOP-1: [(3.4000000000000004, 1)]
Threshold: -inf
Pointers are pointing to documents: [2, 3, 6]
Pivot is pointing to term: hujan
Pivot: hujan
curDoc: 2 ---> document that is currently fully evaluated
Documents that are fully evaluated: 2
TOP-1: [(3.4000000000000004, 1)]
Threshold: 3.4000000000000004
Pointers are pointing to documents: [3, 3, 6]
Pivot is pointing to term: deras
TOP-1: [(3.4000000000000004, 1)]
Threshold: 3.4000000000000004
Pointers are pointing to documents: [6, 6, 6]
Pivot is pointing to term: deras
Pivot: deras
curDoc: 6 ---> document that is currently fully evaluated
Documents that are fully evaluated: 3
TOP-1: [(3.5, 6)]
Threshold: 3.5
Pointers are pointing to documents: [7, 8, 8]
Pivot is pointing to term: turun
TOP-1: [(3.5, 6)]
Threshold: 3.5
Pointers ar