# 22k4102 - IR ASSIGNMENT 1 - Boolean Model for Information Retreival

In [1]:
import numpy as np
import pandas as pd
import os
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
def preprocessing(text):
    with open('Stopword-List.txt', 'r') as stopwordsFile:
        stopwords = stopwordsFile.read().split()
    ps = PorterStemmer()
    text = text.lower().replace("-", " ") # to convert words like time-series into time series
    terms = word_tokenize(text)
    return [ps.stem(term) for term in terms if term.isalnum() and term not in stopwords and term != "or" and term != "not"] # The provided stopwords list does not contain or & not so I have handled them here

# Checking if preprocessing function is working correctly

In [3]:
x = preprocessing("time AND series OR classification")
print(x)

['time', 'seri', 'classif']


# Building Inverted Index

In [4]:
def buildInvertedIndex(terms, docID, invertedIndex):
    for term in terms:
        if term not in invertedIndex:
            invertedIndex[term] = []
        if docID not in invertedIndex[term]:
            invertedIndex[term].append(docID)

# Building Positional Index

In [5]:
def buildPositionalIndex(terms, docID, positionalIndex):
    for p, term in enumerate(terms):
        if term not in positionalIndex:
            positionalIndex[term] = {}
        if docID not in positionalIndex[term]:
            positionalIndex[term][docID] = []
        positionalIndex[term][docID].append(p)

# Saving Indexes

In [6]:
def saveIndexes(invertedIndex, positionalIndex):
    with open("invertedIndex.txt", "w") as f1: # this will create a txt file for inverted index
        json.dump(invertedIndex, f1)
    with open("positionalIndex.txt", "w") as f2: # # this will create a txt file for positional index
        json.dump(positionalIndex, f2)

# Loading Indexes

In [7]:
def loadIndexes():
    with open("invertedIndex.txt", "r") as f1: # loading the previously created inverted index txt file
        invertedIndex = json.load(f1)
    with open("positionalIndex.txt", "r") as f2: # loading the previously created positional index txt file
        positionalIndex = json.load(f2)
    return invertedIndex, positionalIndex

# Constructing Indexes

In [8]:
def invAndPosIndexes(directory):
    invertedIndex = {} 
    positionalIndex = {}
    
    for docID in range(1, 449):
        path = os.path.join(directory, f"{docID}.txt")
        with open(path, 'r', encoding='latin-1') as document: # I have used latin-1 encoding because the encoding format of a couple of files is not utf-8. Latin-1 resolved that issue
            terms = preprocessing(document.read())
            buildInvertedIndex(terms, docID, invertedIndex) # this will build the inverted index
            buildPositionalIndex(terms, docID, positionalIndex) # this will build the positional index
    saveIndexes(invertedIndex, positionalIndex) # this will create file for both indexes
    return invertedIndex, positionalIndex

# Boolean Queries (Simple & Complex)

In [9]:
def booleanQuery(query, invertedIndex):
    terms = preprocessing(query)
    nonPreprocessedTerms = query.upper().split() # I have done this because preprocessing would remove stopwords like 'AND' which would not return correct boolean query result
    operators = [] # a list of operators found in the query
    
    for i in range(len(nonPreprocessedTerms)):
        if nonPreprocessedTerms[i].upper() == 'AND' or nonPreprocessedTerms[i].upper() == 'OR' or nonPreprocessedTerms[i].upper() == 'NOT':
            operators.append(nonPreprocessedTerms[i])
    
    docs = set(range(1, 449))

    if terms[0].upper() == "NOT": #Handling queries which have NOT as the first term for eg "NOT autoencoders"
        return sorted(docs - set(invertedIndex.get(terms[1], set())))
    
    result = set(invertedIndex.get(terms[0], set()))

    for i in range(len(operators)):
        operator = operators[i]
        nextTerm = set(invertedIndex.get(terms[i + 1], set()))
        
        if operator == "AND": # handling intersection i.e. AND
            result &= nextTerm
        elif operator == "OR": # handling union i.e. OR
            result |= nextTerm 
        elif operator == "NOT": # handling difference i.e. NOT
            result -= nextTerm
    
    return sorted(result)

# Proximity Queries
## (I have assumed here that user will enter the proximity distance IN the query - I have extracted the distance directly from the query and passed it as a parameter in the proximityQuery function)

In [10]:
def proximityQuery(query, dist, positionalIndex):
    terms = preprocessing(query)
    print(terms)
    term1 = terms[0]
    term2 = terms[1]
    
    result = set()

    if term1 in positionalIndex and term2 in positionalIndex: # if both terms exist in the positional index
        docs = []                                             # this will store matching docs i.e the docs that contain both term1 and term2
        for docID in positionalIndex[term1]:
            if docID in positionalIndex[term2]:
                docs.append(docID)

        for docID in docs:
            p1 = positionalIndex[term1][docID] # list of positions where term1 appears
            p2 = positionalIndex[term2][docID] # list of positions where term2 appears

            for individualP1 in p1: # individualP1 and individualP2 are position pointers
                for individualP2 in p2:
                    if abs(individualP1 - individualP2) <= dist+1: # I have done +1 in the distance because my positions are starting from 0 in the positional index.
                        result.add(docID)
                        break

    return sorted(result)

# Importing Documents and building Indexes

In [11]:
directory = "./Abstracts"
invertedIndex, positionalIndex = invAndPosIndexes(directory)

# Loading Indexes from File

In [12]:
invertedIndex, positionalIndex = loadIndexes()

# Boolean Query

In [None]:
query = "NOT autoencoder"
result = booleanQuery(query, invertedIndex)
print(f"Boolean Query Result: {result}")

# Proximity Query

In [14]:
query = "feature track /5"
x = query.split()
dist = int(x[2][1:])
print(dist)
result = proximityQuery(query, dist, positionalIndex)
print(f"Proximity Query Result: {result}")

5
['featur', 'track']
Proximity Query Result: ['13', '212']


# GUI 

In [15]:
import tkinter as tk
from tkinter import messagebox

def handleBooleanQuery():
    booleanWindow = tk.Toplevel(root)
    booleanWindow.title("Boolean Query")
    booleanWindow.geometry("600x200")
    booleanWindow.configure(bg="aqua")

    tk.Label(booleanWindow, text="Enter Boolean Query (for eg: 'time AND series AND classification'):", font=("Arial", 14), bg="aqua", fg="black").pack(pady=10)
    booleanQueryEntry = tk.Entry(booleanWindow, width=50, font=("Arial", 12))
    booleanQueryEntry.pack(pady=10)

    def processBooleanQuery():
        query = booleanQueryEntry.get().strip()
        if len(query.split()) > 5:
            messagebox.showerror("Error", "Boolean Query length must not exceed 5 terms!")
            return
        result = booleanQuery(query, invertedIndex)
        messagebox.showinfo("Boolean Query Result", f"Result: {result}")
        booleanWindow.destroy()

    tk.Button(booleanWindow, text="Submit", command=processBooleanQuery, font=("Arial", 12), bg="white", fg="black").pack(pady=10)

def handleProximityQuery():
    proximityWindow = tk.Toplevel(root)
    proximityWindow.title("Proximity Query")
    proximityWindow.geometry("500x200")
    proximityWindow.configure(bg="aqua")

    tk.Label(proximityWindow, text="Enter Proximity Query (for eg: 'feature track /5'):", font=("Arial", 14), bg="aqua", fg="black").pack(pady=10)
    proximityQueryEntry = tk.Entry(proximityWindow, width=50, font=("Arial", 12))
    proximityQueryEntry.pack(pady=10)

    def processProximityQuery():
        query = proximityQueryEntry.get().strip()
        x = query.split()
        if len(x) != 3:
            messagebox.showerror("Error", "Proximity Query must be of exactly length = 3!")
            return
        try:
            dist = int(x[2][1:])
        except ValueError:
            messagebox.showerror("Error", "Invalid distance!")
            return
        result = proximityQuery(query, dist, positionalIndex)
        messagebox.showinfo("Proximity Query Result", f"Result: {result}")
        proximityWindow.destroy()

    tk.Button(proximityWindow, text="Submit", command=processProximityQuery, font=("Arial", 12), bg="white", fg="black").pack(pady=10)

def handleExit():
    exitWindow = tk.Toplevel(root)
    exitWindow.geometry("400x150")
    exitWindow.configure(bg="aqua")

    tk.Label(exitWindow, text="Thank you!", font=("Arial", 16), bg="aqua", fg="black").pack(pady=20)
    tk.Button(exitWindow, text="Close", command=root.destroy, font=("Arial", 12), bg="red", fg="black").pack(pady=10)

root = tk.Tk()
root.title("Boolean Model for IR")
root.geometry("600x400")
root.configure(bg="aqua")

tk.Label(root, text="Select your choice:", font=("Arial", 18), bg="aqua", fg="black").pack(pady=20)
tk.Button(root, text="1. Boolean Query", command=handleBooleanQuery, font=("Arial", 14), width=20, bg="white", fg="black").pack(pady=10)
tk.Button(root, text="2. Proximity Query", command=handleProximityQuery, font=("Arial", 14), width=20, bg="white", fg="black").pack(pady=10)
tk.Button(root, text="3. Exit", command=handleExit, font=("Arial", 14), width=20, bg="red", fg="black").pack(pady=10)

root.mainloop()