In [211]:
import os
import math
import requests
import json
import heapq
import string
import re

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [212]:
# Notebook-wide configuration.
# applyStem switches between the stemmed and the unstemmed resources; `folder`
# is the matching path fragment (with trailing slash) spliced into resource paths.
applyStem = True
if applyStem:
    folder = "stemmed/"
else:
    folder = "unstemmed/"

# Shared Porter stemmer instance used by requireStemming.
stemmer = PorterStemmer()
docTextMap = {}

In [213]:
def requireStemming(token, applyStem):
    """Return the stem of `token` when `applyStem` is truthy, else `token` unchanged.

    Stemming is delegated to the module-level `stemmer`.
    """
    return stemmer.stem(token) if applyStem else token
def checkForStopWords(word):
    """Return True when the lower-cased `word` is in the module-level `stopwords`."""
    lowered = word.lower()
    return lowered in stopwords

def is_number(word):
    """Return True when `word` should be treated as a numeric token.

    A token counts as numeric when it starts with a digit (e.g. "3rd", "1989")
    or when it parses as a float (e.g. "-1.5", ".5"). An empty string is not a
    number; previously it raised IndexError on `word[0]`.
    """
    if not word:
        return False
    # A non-empty all-digit word necessarily starts with a digit, so the
    # leading-digit test already covers the plain-integer case.
    return word[0].isdigit() or is_float(word)


def is_float(word):
    """Return True when `float(word)` succeeds, False when it raises ValueError.

    NOTE(review): float() also accepts "nan", "inf" and "infinity", so those
    words are reported as floats too - confirm that this is intended.
    """
    try:
        float(word)
    except ValueError:
        return False
    return True

In [214]:
stopWordPath = "../Resources/IR_data/AP_DATA/stoplist.txt"

# One stop word per line; keep only the text before the line break.
with open(stopWordPath) as file:
    stopwords = [line.split("\n")[0] for line in file.readlines()]

# Punctuation marks are treated as stop words as well: every character of
# string.punctuation plus a few multi-character quote tokens.
# NOTE(review): "'s'" may have been meant as "'s" - confirm.
punctuations = list(string.punctuation)
extraPunc = ["``", "'s'", "'", "''"]
punctuations.extend(extraPunc)
stopwords.extend(punctuations)

print(f'Total number of stopwords: {len(stopwords)}')

Total number of stopwords: 531


In [215]:
# Helper method for processing text
def tokenize(text):
    """Lower-case `text` and return its tokens as a list of strings.

    A token is a run of word characters that may contain single interior
    periods (so "u.s.a" and "3.5" stay together); a trailing period is not
    part of the token.
    """
    token_pattern = r"\w+(?:\.?\w)*"
    lowered = text.lower()
    return re.findall(token_pattern, lowered)

In [216]:
# Tokenizes the text, drops stop words and numeric tokens, and optionally stems
def preprocess(text, applyStem):
    """Return the list of processed terms of `text`, in their original order.

    Tokens that are stop words (checkForStopWords) or numeric (is_number) are
    discarded; every remaining token goes through requireStemming.
    """
    return [
        requireStemming(token, applyStem)
        for token in tokenize(text)
        if not checkForStopWords(token) and not is_number(token)
    ]

In [217]:
# Path of the query file
queryFilePath = "../Resources/IR_data/AP_DATA/query_desc.51-100.short.txt"

# Maps each query ID to the list of preprocessed terms of that query
queryIdMap = {}

with open(queryFilePath) as file:
    fileQuery = file.readlines()
    for line in fileQuery:
        # A blank line would otherwise register a bogus query whose ID is
        # whitespace, and every ranking function would then emit results for it.
        if not line.strip():
            continue

        # The text before the first period is the query ID; the remainder is
        # the query text (its periods are dropped by the join, as before).
        parts = line.split(".")
        qid = parts[0].strip()
        queryIdMap[qid] = preprocess("".join(parts[1:]), applyStem)

for qid, query in queryIdMap.items():
    print(qid, query)

85 ['alleg', 'corrupt', 'public', 'offici']
59 ['weather', 'least', 'fatal', 'locat']
56 ['prime', 'lend', 'rate', 'prime', 'rate']
71 ['incurs', 'border', 'area', 'militari', 'forc', 'second', 'guerrilla', 'second']
64 ['result', 'polit', 'hostag', 'take']
62 ['militari', 'coup', 'd', 'etat', 'attempt']
93 ['support', 'nation', 'rifl', 'associ', 'nra']
99 ['develop', 'iran', 'contra', 'affair']
58 ['rail', 'strike', 'ongo', 'rail', 'strike']
77 ['poach', 'method', 'wildlif']
54 ['contract', 'agreement', 'reserv', 'launch', 'commerci', 'satellit']
87 ['current', 'offic', 'fail', 'institut']
94 ['crime', 'aid', 'comput']
100 ['non', 'communist', 'state', 'transfer', 'high', 'tech', 'good', 'technolog', 'nation']
89 ['invest', 'opec', 'downstream']
61 ['israel', 'iran', 'contra', 'affair']
95 ['comput', 'applic', 'crime']
68 ['concern', 'safeti', 'manufactur', 'worker', 'fine', 'diamet', 'fiber', 'product']
57 ['mci', 'bell', 'system']
97 ['fiber', 'optic', 'technolog']
98 ['fiber', 'opt

In [218]:
# Builds one result line: "<queryNo> Q0 <doc_id> <rank> <score> Exp"
def formatLine(queryNo, doc_id, rank, score):
    """Return one newline-terminated result line for a ranked document.

    The fields are separated by single spaces. Document ids in this notebook
    carry a leading and a trailing space; the id is stripped and the separators
    are written explicitly, so padded ids produce exactly the same line as
    before while unpadded ids no longer get glued to "Q0" and to the rank.
    """
    return '{} Q0 {} {} {} Exp\n'.format(queryNo, str(doc_id).strip(), rank, score)

# Appends the ranked results of one query to the output file.
def getScores(output_file, queryNo, score_list):
    """Append one formatted line per (score, doc_id) pair to `output_file`.

    The rank written for each pair is its 0-based position in `score_list`.
    The file is opened in append mode, so callers remove it beforehand when
    they want a fresh run file.
    """
    with open(output_file, "a") as file:
        file.writelines(
            formatLine(queryNo, doc_id, rank, score)
            for rank, (score, doc_id) in enumerate(score_list)
        )

In [219]:
# Document lengths: each non-blank line of docTextLen.txt is a JSON object
# mapping document id -> length. All lines are merged into docLen, so the
# later cells no longer depend on a leaked loop variable that only held the
# object parsed from the last line.
docLen = {}

with open(f'../Resources/{folder}docTextLen.txt') as file:
    for line in file:
        if not line.strip():
            continue
        docLen.update(json.loads(line))

doc_count = len(docLen)
if doc_count == 0:
    raise ValueError("docTextLen.txt contains no document lengths")

# Average document length over the whole collection
avgDocLen = sum(docLen.values()) / doc_count


# Term -> numeric token id mapping, stored as a single JSON document
with open(f'../Resources/{folder}tokenIndexMap.txt') as file:
    token_index_map = json.load(file)


In [220]:
# Maps a space-padded document name (' <name> ') to the index listed for it in
# doclist.txt. The padding presumably mirrors the keys of docLen, which are
# used to look entries up in this map - TODO confirm.
docID_index_map = {}
c = 0

# Each line of doclist.txt is "<index> <document name>". Forward slashes are
# used like in every other path of this notebook: they work on all platforms,
# whereas the former backslash literal contained invalid escape sequences.
with open('../Resources/IR_data/AP_DATA/doclist.txt', 'r') as file:
    for line in file:
        fields = line.split(maxsplit=1)
        if len(fields) < 2:
            # blank or incomplete line: nothing to register
            continue
        index, filename = fields
        docID_index_map[' ' + filename.strip() + ' '] = index

  with open('..\Resources\IR_data\AP_DATA\doclist.txt', 'r') as file:


In [221]:
# Read catalog file
def openCatalog(path):
    """Return the lines of the catalog file `path` as a list of strings.

    `path` is resolved inside the `catalogs/` directory of the active
    (stemmed or unstemmed) resource folder.
    """
    with open(f'../Resources/{folder}catalogs/{path}') as catalog_file:
        return catalog_file.readlines()

In [222]:
# Converts catalog lines to a {token: [start, length]} map
def catalogToPList(catalog):
    """Parse catalog lines of the form "<token> <int> <int>".

    Parameters:
        catalog: iterable of text lines.

    Returns:
        dict mapping the first field of each line (kept as a string) to a list
        holding the next two fields converted to int. When a token occurs more
        than once, the last line wins.

    Lines with fewer than three whitespace-separated fields are skipped; the
    previous `len(parts) > 1` guard let two-field lines through and then
    failed with IndexError on the third field.
    """
    positions = {}
    for line in catalog:
        parts = line.split()
        if len(parts) < 3:
            continue
        positions[parts[0]] = [int(parts[1]), int(parts[2])]

    return positions


In [223]:
def stringToDictionary(str):
    """Parse "docId:[n,n,...]" groups out of `str`.

    Returns a dict mapping each docId (kept as a string) to its list of ints.
    Groups may be separated by anything; only text matching the pattern is
    used. When a docId occurs more than once, the last occurrence wins.
    """
    # a run of digits, a colon, then a bracketed comma-separated list of digit runs
    posting_pattern = r'(\d+):(\[\d+(?:,\d+)*\])'

    return {
        hit.group(1): [int(number) for number in hit.group(2)[1:-1].split(',')]
        for hit in re.finditer(posting_pattern, str)
    }

In [224]:
def extractDict(key, position, path):
    """Read one inverted-index record and return it as {token: {docId: [ints]}}.

    Parameters:
        key: not used by this function.
        position: (start, length) pair; the file is sought to `start` and
            `length + 1` characters are read.
        path: file name inside the `invertedIndex/` directory of the active
            resource folder.

    The token is the text before the first ':' of the record. The postings
    text starts two characters after the token and excludes the last character
    read; it is parsed with stringToDictionary.
    """
    offset, size = position

    with open(f'../Resources/{folder}/invertedIndex/{path}') as index_file:
        index_file.seek(offset)
        record = index_file.read(size + 1)

    term = record.split(":")[0]
    postings_text = record[len(term) + 2:-1]

    return {term: stringToDictionary(postings_text)}

# Pre-computes the postings of every query term
catalog = openCatalog("0.txt")
catalogPos = catalogToPList(catalog)
# vocabulary size: unique terms in the collection
Vocab = len(catalogPos.keys())
# token id (as string) -> postings of that token
queryCatalogDict = {}

for qid, query in queryIdMap.items():
    for queryWord in query:
        token = str(token_index_map[queryWord])

        # Each distinct term is read from the index only once, even when it
        # occurs in several queries.
        if token in queryCatalogDict:
            continue

        queryWPos = catalogPos.get(token)
        if queryWPos is None:
            # Term has no catalog entry: record empty postings instead of
            # failing while unpacking a missing position.
            queryCatalogDict[token] = {}
            continue

        catalogDict = extractDict(token, queryWPos, "0.txt")
        queryCatalogDict[token] = catalogDict.get(token)


### OkapiTF

In [182]:
def okapiTF(path, queryIdMap, queryCatalogDict):
    """Score every document against every query with Okapi TF and write the top 1000.

    Parameters:
        path: output file; removed first because getScores appends to it.
        queryIdMap: query id -> list of query terms.
        queryCatalogDict: token id (string) -> postings of that token.

    For each query term the document receives
    tf / (tf + 0.5 + 1.5 * (docLen / avgDocLen)), summed over the terms.
    Reads the module-level docLen, avgDocLen, token_index_map and
    docID_index_map.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

    # Resolve the postings of each query term once instead of once per document.
    queryPostings = {
        qid: [queryCatalogDict[str(token_index_map[queryWord])] for queryWord in query]
        for qid, query in queryIdMap.items()
    }

    # query id -> list of (score, docId)
    okapiTf = {qid: [] for qid in queryIdMap.keys()}

    for index, docId in enumerate(docLen.keys()):
        if index % 10000 == 0:
            print(f'On {index}th doc')

        docIndex = docID_index_map[docId]
        lengthRatio = docLen[docId] / avgDocLen

        for qid, postingsList in queryPostings.items():
            doc_Okapi = 0

            for postings in postingsList:
                # tf is the number of positions recorded for this document
                tf = len(postings[docIndex]) if docIndex in postings else 0
                doc_Okapi += tf / (tf + 0.5 + 1.5 * lengthRatio)

            okapiTf[qid].append((doc_Okapi, docId))

    # nlargest yields the same list as sorted(..., reverse=True)[:1000]
    for qid, scored in okapiTf.items():
        getScores(path, qid, heapq.nlargest(1000, scored))

In [183]:
okapiTF(f'./output/{folder}okapiTf.txt', queryIdMap, queryCatalogDict)

On 0th doc
On 10000th doc
On 20000th doc
On 30000th doc
On 40000th doc
On 50000th doc
On 60000th doc
On 70000th doc
On 80000th doc


### TF-IDF

In [184]:
def tfIdf(path, queryIdMap, queryCatalogDict):
    """Score every document against every query with TF-IDF and write the top 1000.

    Parameters:
        path: output file; removed first because getScores appends to it.
        queryIdMap: query id -> list of query terms.
        queryCatalogDict: token id (string) -> postings of that token.

    Each query term contributes
    tf / (tf + 0.5 + 1.5 * (docLen / avgDocLen)) * log(D / df),
    where D is the number of documents and df the number of documents in the
    term's postings. Reads the module-level docLen, avgDocLen, token_index_map
    and docID_index_map.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

    # total number of documents in the corpus
    D = len(docLen)

    # Postings and idf depend only on the term, so they are resolved once per
    # query term rather than once per document.
    queryTerms = {}
    for qid, query in queryIdMap.items():
        terms = []
        for queryWord in query:
            postings = queryCatalogDict[str(token_index_map[queryWord])]
            dfw = len(postings)
            # A term without postings always has tf == 0; give it idf 0
            # instead of dividing by zero.
            idf = math.log(D / dfw) if D and dfw else 0.0
            terms.append((postings, idf))
        queryTerms[qid] = terms

    # query id -> list of (score, docId)
    scores = {qid: [] for qid in queryIdMap.keys()}

    for index, docId in enumerate(docLen.keys()):
        if index % 10000 == 0:
            print(f'On {index}th doc')

        docIndex = docID_index_map[docId]
        lengthRatio = docLen[docId] / avgDocLen

        for qid, terms in queryTerms.items():
            docTfIdf = 0

            for postings, idf in terms:
                tf = len(postings[docIndex]) if docIndex in postings else 0
                docTfIdf += tf / (tf + 0.5 + 1.5 * lengthRatio) * idf

            scores[qid].append((docTfIdf, docId))

    # nlargest yields the same list as sorted(..., reverse=True)[:1000]
    for qid, scored in scores.items():
        getScores(path, qid, heapq.nlargest(1000, scored))

In [185]:
tfIdf(f'./output/{folder}tfIdf.txt', queryIdMap, queryCatalogDict)

On 0th doc
On 10000th doc
On 20000th doc
On 30000th doc
On 40000th doc
On 50000th doc
On 60000th doc
On 70000th doc
On 80000th doc


## Okapi BM25

In [208]:
# Helper functions
def get1stTerm(D, df_w):
    """Return log((D + 0.5) / (df_w + 0.5)), the first factor used by bm25."""
    smoothed_ratio = (D + 0.5) / (df_w + 0.5)
    return math.log(smoothed_ratio)

def get2ndTerm(tf_wd, k1, b, lenD, avgLenD):
    """Return the second factor used by bm25 (term frequency in the document).

    Computes (tf_wd + k1*tf_wd) / (tf_wd + k1*((1 - b) + b*(lenD / avgLenD))).
    """
    numerator = tf_wd + k1 * tf_wd
    denominator = tf_wd + k1 * ((1 - b) + b * (lenD / avgLenD))
    return numerator / denominator

def get3rdTerm(tf_wq, k2):
    """Return (tf_wq + k2*tf_wq) / (tf_wq + k2), the third factor used by bm25."""
    numerator = tf_wq + k2 * tf_wq
    denominator = tf_wq + k2
    return numerator / denominator

In [209]:
def bm25(path, queryIdMap, queryCatalogDict):
    """Score every document against every query with Okapi BM25 and write the top 1000.

    Parameters:
        path: output file; removed first because getScores appends to it.
        queryIdMap: query id -> list of query terms.
        queryCatalogDict: token id (string) -> postings of that token.

    Each distinct query term contributes
    get1stTerm(D, df) * get2ndTerm(tf_doc, k1, b, docLen, avgDocLen) * get3rdTerm(tf_query, k2).
    Terms missing from token_index_map are skipped; before, the None check ran
    on str(None) and never fired, and such terms contributed exactly 0 because
    their document term frequency was always 0. Reads the module-level docLen,
    avgDocLen, token_index_map and docID_index_map.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

    k1 = 1.2
    k2 = 100
    b = 0.75

    # Count of total docs overall
    D = len(docLen)

    # Everything that depends only on the query is prepared once:
    # (postings, first factor, third factor) per distinct query term.
    queryTerms = {}
    for qid, query in queryIdMap.items():
        # frequencies of the words inside this query
        tf_wqs = {}
        for word in query:
            tf_wqs[word] = tf_wqs.get(word, 0) + 1

        terms = []
        for word, tf_wq in tf_wqs.items():
            tokenId = token_index_map.get(word)
            if tokenId is None:
                continue
            postings = queryCatalogDict.get(str(tokenId), {})
            terms.append((postings, get1stTerm(D, len(postings)), get3rdTerm(tf_wq, k2)))
        queryTerms[qid] = terms

    # query id -> list of (score, docId)
    okapi_bm25 = {qid: [] for qid in queryIdMap.keys()}

    for index, docId in enumerate(docLen.keys()):
        if index % 10000 == 0:
            print(f'On {index}th doc')

        docIndex = docID_index_map.get(docId, None)
        lenD = docLen.get(docId, 0)

        for qid, terms in queryTerms.items():
            doc_okapi_bm25 = 0

            for postings, firstTerm, thirdTerm in terms:
                tf_wd = len(postings[docIndex]) if docIndex in postings else 0
                doc_okapi_bm25 += firstTerm * get2ndTerm(tf_wd, k1, b, lenD, avgDocLen) * thirdTerm

            okapi_bm25[qid].append((doc_okapi_bm25, docId))

    # nlargest yields the same list as sorted(..., reverse=True)[:1000]
    for qid, scored in okapi_bm25.items():
        getScores(path, qid, heapq.nlargest(1000, scored))




In [210]:
bm25(f'./output/{folder}bm25-decompressed.txt', queryIdMap, queryCatalogDict)

On 0th doc
On 10000th doc
On 20000th doc
On 30000th doc
On 40000th doc
On 50000th doc
On 60000th doc
On 70000th doc
On 80000th doc


## Laplace Smoothing

In [229]:
def laplaceSmoothing(path, queryIdMap):
    """Score every document with a Laplace-smoothed unigram model and write the top 1000.

    Parameters:
        path: output file; removed first because getScores appends to it.
        queryIdMap: query id -> list of query terms.

    A query term that occurs in the document contributes
    log((tf + 1) / (docLen + Vocab)); a term that does not occur contributes a
    flat -1000 (kept exactly as in the original scoring).
    Terms missing from token_index_map count as not occurring; before, the
    None check ran on str(None) and never fired, with the same net result.
    Reads the module-level queryCatalogDict, Vocab, docLen, token_index_map
    and docID_index_map.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

    # Resolve the postings of each query term once instead of once per document.
    queryPostings = {}
    for qid, query in queryIdMap.items():
        postingsList = []
        for queryWord in query:
            tokenId = token_index_map.get(queryWord)
            if tokenId is None:
                postingsList.append({})
            else:
                postingsList.append(queryCatalogDict.get(str(tokenId), {}))
        queryPostings[qid] = postingsList

    # query id -> list of (score, docNo)
    laplace = {qid: [] for qid in queryIdMap.keys()}

    for index, docNo in enumerate(docLen.keys()):
        # progress output, like the other ranking functions in this notebook
        if index % 10000 == 0:
            print(f'On {index}th doc')

        docIndex = docID_index_map.get(docNo, None)
        denominator = docLen.get(docNo, 0) + Vocab

        for qid, postingsList in queryPostings.items():
            calulate = 0

            for postings in postingsList:
                tf = len(postings[docIndex]) if docIndex in postings else 0
                calulate += -1000 if tf == 0 else math.log((tf + 1) / denominator)

            laplace[qid].append((calulate, docNo))

    # nlargest yields the same list as sorted(..., reverse=True)[:1000]
    for qid, scored in laplace.items():
        getScores(path, qid, heapq.nlargest(1000, scored))



In [230]:
laplaceSmoothing(f'./output/{folder}laplace3.txt', queryIdMap)

On 0th doc
On 10000th doc
On 20000th doc
On 30000th doc
On 40000th doc
On 50000th doc
On 60000th doc
On 70000th doc
On 80000th doc


In [191]:
def all_indices_at_end(position_lists, indices):
    """Return True when every cursor in `indices` points at the last element of its list."""
    for i in range(len(indices)):
        if indices[i] < len(position_lists[i]) - 1:
            return False
    return True


def find_min_span(position_lists):
    """Return the smallest span that contains one position from every list.

    Parameters:
        position_lists: one list of positions per query term. The sweep
            assumes each list is in ascending order - TODO confirm against
            the index writer.

    Returns:
        The minimum of (max - min) over the windows visited by the sweep,
        capped at 1e5. 1e5 is also returned when any list is empty or when no
        lists are given.

    The sweep keeps one cursor per list and repeatedly advances the cursor
    with the smallest current value among the lists that can still advance.
    The window reached after the last advance is evaluated as well; before,
    it was skipped, so e.g. [[5], [7]] returned 1e5 and [[1, 10], [11]]
    returned 10 instead of 1.
    """
    for pos_list in position_lists:
        if len(pos_list) == 0:
            return 1e5

    min_span = 1e5
    if not position_lists:
        return min_span

    indices = [0] * len(position_lists)

    while not all_indices_at_end(position_lists, indices):
        min_span = update_min_span(position_lists, indices, min_span)
        next_index = find_next_index(position_lists, indices)
        indices[next_index] += 1

    # Every cursor now sits on its last element: score that final window too.
    return update_min_span(position_lists, indices, min_span)


def find_next_index(position_lists, indices):
    """Return the index of the list to advance next, or -1 when none can advance.

    Among the lists whose cursor is not yet on the last element, the one with
    the smallest current value is chosen; ties go to the earliest list.
    """
    min_val = 1e9
    next_index = -1

    for i in range(len(indices)):
        index = indices[i]

        if min_val > position_lists[i][index] and index + 1 != len(position_lists[i]):
            min_val = position_lists[i][index]
            next_index = i

    return next_index


def update_min_span(position_lists, indices, current_min_span):
    """Return the smaller of `current_min_span` and the span of the current window.

    The current window consists of the element under each cursor; its span is
    the difference between the largest and the smallest of those elements.
    """
    min_element, max_element = 1e9, -1

    for i in range(len(indices)):
        min_element = min(min_element, position_lists[i][indices[i]])
        max_element = max(max_element, position_lists[i][indices[i]])

    return min(current_min_span, abs(max_element - min_element))


In [192]:
def tfIdfProximitySearch(path, queryIdMap, queryCatalogDict):
    """Rank documents by TF-IDF plus a proximity bonus and write the top 1000 per query.

    Parameters:
        path: output file; removed first because getScores appends to it.
        queryIdMap: query id -> list of query terms.
        queryCatalogDict: token id (string) -> postings of that token.

    The TF-IDF part sums tf / (tf + 0.5 + 1.5*(docLen / avgDocLen)) * log(D / df)
    over the query terms. The proximity part is log(0.01 + exp(-minSpan)),
    where minSpan comes from find_min_span over the per-term position lists of
    the document. Reads the module-level docLen, avgDocLen, token_index_map
    and docID_index_map.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

    # total number of documents in the corpus
    corpusSize = len(docLen)

    # NOTE(review): assigned but never read; the two score parts are simply added.
    _lambda = 0.8

    # query id -> list of (score, docId)
    ranked = {qid: [] for qid in queryIdMap.keys()}

    for docNumber, docId in enumerate(docLen.keys()):
        if docNumber % 10000 == 0:
            print(f'On {docNumber}th doc')

        for qid, query in queryIdMap.items():
            positionsPerTerm = []
            tfIdfSum = 0

            for queryWord in query:
                postings = queryCatalogDict[str(token_index_map[queryWord])]
                docIndex = docID_index_map[docId]

                # positions of the term in this document (empty when absent)
                positions = postings[docIndex] if docIndex in postings else []
                positionsPerTerm.append(positions)

                tf = len(positions)
                dfw = len(postings.keys())
                tfIdfSum += tf / (tf + 0.5 + 1.5*(docLen[docId] / avgDocLen)) * math.log(corpusSize / dfw)

            minSpan = find_min_span(positionsPerTerm)
            proximityScore = math.log(0.01 + math.exp(-minSpan))

            ranked[qid].append((tfIdfSum + proximityScore, docId))

    for qid, scored in ranked.items():
        best = sorted(scored, reverse=True)[:1000]
        getScores(path, qid, best)

In [193]:
tfIdfProximitySearch(f'./output/{folder}tfIdfProximitySearch.txt', queryIdMap, queryCatalogDict)

On 0th doc
On 10000th doc
On 20000th doc
On 30000th doc
On 40000th doc
On 50000th doc
On 60000th doc
On 70000th doc
On 80000th doc
