# 22k4102 - IR ASSIGNMENT 2 - Vector Space Model for Information Retrieval

In [205]:
import numpy as np
import pandas as pd
import os
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [206]:
# Characters treated as word separators. Each one is mapped to a single space
# so that, for example, "k-means" is tokenized as the two terms "k" and "means".
_SEPARATOR_CHARS = "-/\\|@#$%^&*(){}[]:;\"'<>,.?!=+`~"
_SEPARATOR_TABLE = str.maketrans(_SEPARATOR_CHARS, " " * len(_SEPARATOR_CHARS))

# Stopword set, filled on the first call to _loadStopwords() and reused afterwards.
_stopwordCache = None


def _loadStopwords():
    """Return the stopwords from 'Stopword-List.txt' as a frozenset, reading the file only once."""
    global _stopwordCache
    if _stopwordCache is None:
        with open('Stopword-List.txt', 'r') as stopwordsFile:
            _stopwordCache = frozenset(stopwordsFile.read().split())
    return _stopwordCache


def preprocessing(text):
    """Normalize raw text into a list of lemmatized index terms.

    Steps, in order: lowercase the text, replace every separator character
    with a space, tokenize with NLTK's word_tokenize, drop tokens that are
    not purely alphanumeric or that appear in the stopword list, and
    lemmatize what remains with WordNetLemmatizer.

    The stopword file is read on the first call only and kept as a set, so
    using this function as a per-document tokenizer no longer re-reads the
    file for every document; edits made to 'Stopword-List.txt' after the
    first call are not picked up.

    Args:
        text: The raw document or query string.

    Returns:
        A list of lemmatized terms in their original order (duplicates kept).

    Raises:
        FileNotFoundError: If 'Stopword-List.txt' is not in the working
            directory on the first call.
    """
    stopwords = _loadStopwords()
    lemmatizer = WordNetLemmatizer()
    # One translate pass replaces the former chain of per-character replace calls.
    cleaned = text.lower().translate(_SEPARATOR_TABLE)
    return [
        lemmatizer.lemmatize(term)
        for term in word_tokenize(cleaned)
        if term.isalnum() and term not in stopwords
    ]

In [207]:
# Smoke-test the preprocessing pipeline on a sample string; the printed list
# shows the hyphenated word split in two and the plural forms lemmatized.
x = preprocessing("supervised kernel k-means cluster feet babies")
print(x)

['supervised', 'kernel', 'k', 'mean', 'cluster', 'foot', 'baby']


In [208]:
def buildInvertedIndex(terms, docID, invertedIndex):
    """Record in `invertedIndex` that `docID` contains each of `terms`.

    `invertedIndex` maps a term to the list of document IDs containing it
    and is updated in place. A document ID is appended to a term's list at
    most once, however many times the term occurs in `terms`.
    """
    for token in terms:
        postings = invertedIndex.setdefault(token, [])
        if docID in postings:
            continue  # this document is already listed for the term
        postings.append(docID)

In [209]:
def buildPositionalIndex(terms, docID, positionalIndex):
    """Record in `positionalIndex` the position of every term of one document.

    `positionalIndex` maps term -> {docID -> [positions]} and is updated in
    place. Positions are the zero-based offsets of the term within `terms`,
    appended in the order they are encountered.
    """
    for position, token in enumerate(terms):
        perDocument = positionalIndex.setdefault(token, {})
        perDocument.setdefault(docID, []).append(position)

In [210]:
def saveIndexes(invertedIndex, positionalIndex):
    """Write both indexes as JSON into the current working directory.

    The inverted index goes to 'invertedIndex.txt' and the positional index
    to 'positionalIndex.txt'; existing files are overwritten.
    """
    targets = (
        ("invertedIndex.txt", invertedIndex),
        ("positionalIndex.txt", positionalIndex),
    )
    # Written one after the other, inverted index first.
    for fileName, index in targets:
        with open(fileName, "w") as handle:
            json.dump(index, handle)

In [211]:
def loadIndexes():
    """Read back the two index files written by saveIndexes.

    Returns:
        A tuple (invertedIndex, positionalIndex) parsed from
        'invertedIndex.txt' and 'positionalIndex.txt' in the current
        working directory.
    """
    def readJson(fileName):
        # Parse one JSON file and close it before returning.
        with open(fileName, "r") as handle:
            return json.load(handle)

    return readJson("invertedIndex.txt"), readJson("positionalIndex.txt")

In [212]:
def invAndPosIndexes(directory, numDocs=448):
    """Build, save and return the inverted and positional indexes of a corpus.

    Documents are read from '<directory>/1.txt' through
    '<directory>/<numDocs>.txt' (latin-1 encoded), passed through
    `preprocessing`, and added to both indexes under their integer document
    ID. The finished indexes are then written to disk by `saveIndexes`.

    Args:
        directory: Folder holding the numbered '.txt' documents.
        numDocs: Number of documents to index. Defaults to 448, the corpus
            size this notebook was written for.

    Returns:
        A tuple (invertedIndex, positionalIndex) of plain dicts.

    Raises:
        FileNotFoundError: If any of the numbered document files is missing.
    """
    invertedIndex = {}
    positionalIndex = {}

    for docID in range(1, numDocs + 1):
        path = os.path.join(directory, f"{docID}.txt")
        with open(path, 'r', encoding='latin-1') as document:
            rawText = document.read()
        # The file is closed before the slower tokenizing and indexing work starts.
        terms = preprocessing(rawText)
        buildInvertedIndex(terms, docID, invertedIndex)  # term -> [docIDs]
        buildPositionalIndex(terms, docID, positionalIndex)  # term -> {docID -> [positions]}
    saveIndexes(invertedIndex, positionalIndex)  # persist both indexes as JSON files
    return invertedIndex, positionalIndex

In [213]:
# Build both indexes over the abstracts corpus. invAndPosIndexes also writes
# them to disk through saveIndexes.
directory = "./Abstracts"
invertedIndex, positionalIndex = invAndPosIndexes(directory)

In [214]:
def loadDocuments(directory, numDocs=448):
    """Read the raw text of every corpus document, for building the TF-IDF matrix.

    Args:
        directory: Folder holding the documents as '1.txt', '2.txt', ...
        numDocs: Number of documents to read. Defaults to 448, the corpus
            size this notebook was written for.

    Returns:
        A list of document strings in which index i holds document i + 1.

    Raises:
        FileNotFoundError: If any of the numbered document files is missing.
    """
    documents = []
    for docID in range(1, numDocs + 1):
        path = os.path.join(directory, f"{docID}.txt")
        with open(path, 'r', encoding='latin-1') as f:
            documents.append(f.read())
    return documents

In [215]:
def TFIDF(documents):
    """Fit a TF-IDF model on `documents`.

    The vectorizer tokenizes with the notebook's own `preprocessing`
    function, so documents and later queries share the same term
    normalization.

    Returns:
        A tuple (vectorizer, matrix): the fitted TfidfVectorizer and the
        TF-IDF matrix it produced for `documents`.
    """
    vectorizer = TfidfVectorizer(tokenizer=preprocessing)
    return vectorizer, vectorizer.fit_transform(documents)

In [216]:
def VSM(query, vec, tfidfMatrix, alpha):
    """Return the documents whose cosine similarity to `query` exceeds `alpha`.

    Args:
        query: The free-text query string.
        vec: The fitted vectorizer returned by `TFIDF`; it is used to map the
            query into the same term space as the documents.
        tfidfMatrix: The document TF-IDF matrix returned by `TFIDF`, one row
            per document.
        alpha: Similarity threshold; only documents scoring strictly above
            it are returned.

    Returns:
        A list of (docID, similarity) tuples in ascending docID order.
    """
    qVector = vec.transform([query])  # query vector
    # One similarity per matrix row; [0] selects the row for our single query.
    similarities = cosine_similarity(qVector, tfidfMatrix)[0]

    # Row i of the matrix is document i + 1, so the ID comes from the row
    # position rather than a fixed 1..448 list. The result is therefore
    # already in ascending docID order for a matrix of any size.
    return [
        (docID, sim)
        for docID, sim in enumerate(similarities, start=1)
        if sim > alpha
    ]

In [220]:
# Fit the TF-IDF model once for the whole corpus; vec and tfidfMatrix are
# module-level and are also used by the GUI query handler further down.
documents = loadDocuments(directory)
vec, tfidfMatrix = TFIDF(documents)
alpha = 0.001  # similarity a document must exceed to be reported

# Example query run end to end through the vector space model.
query = "github mashup apis"
result = VSM(query, vec, tfidfMatrix, alpha)

print("Result:")
for docID, score in result:
    print(f"Doc: {docID}, Similarity Score: {score:.5f}")

Result:
Doc: 178, Similarity Score: 0.13735
Doc: 362, Similarity Score: 0.30305


# GUI

In [224]:
import tkinter as tk
from tkinter import messagebox

def handleVSMQuery():
    """Open a dialog that collects a query and an alpha threshold and shows the matches.

    The dialog is a child of the module-level `root` window. On submit the
    query is run through `VSM` with the module-level `vec` and
    `tfidfMatrix`, the outcome is shown in a message box, and the dialog is
    closed. A non-numeric alpha shows an error and leaves the dialog open.
    """
    dialog = tk.Toplevel(root)
    dialog.title("Vector Space Model (VSM) Query")
    dialog.geometry("600x250")
    dialog.configure(bg="aqua")

    bodyFont = ("Arial", 12)

    # Widgets are created and packed in the order they should appear.
    queryPrompt = tk.Label(
        dialog,
        text="Enter VSM Query (e.g. 'github mashup apis'):",
        font=("Arial", 14),
        bg="aqua",
        fg="black",
    )
    queryPrompt.pack(pady=10)
    queryBox = tk.Entry(dialog, width=60, font=bodyFont)
    queryBox.pack(pady=10)

    alphaPrompt = tk.Label(
        dialog,
        text="Enter alpha value (e.g. 0.001):",
        font=bodyFont,
        bg="aqua",
        fg="black",
    )
    alphaPrompt.pack()
    alphaBox = tk.Entry(dialog, width=20, font=bodyFont)
    alphaBox.pack(pady=5)

    def onSubmit():
        """Validate the inputs, run the query, report the result and close the dialog."""
        queryText = queryBox.get().strip()
        try:
            threshold = float(alphaBox.get().strip())
        except ValueError:
            messagebox.showerror("Error", "Alpha must be a valid number!")
            return

        matches = VSM(queryText, vec, tfidfMatrix, threshold)

        if matches:
            message = "\n".join(
                f"Doc: {docID}, Similarity Score: {score:.5f}" for docID, score in matches
            )
        else:
            message = "No documents found above the threshold!"
        messagebox.showinfo("VSM Result", message)
        dialog.destroy()

    submitButton = tk.Button(
        dialog,
        text="Submit",
        command=onSubmit,
        font=bodyFont,
        bg="white",
        fg="black",
    )
    submitButton.pack(pady=10)

# Main application window; handleVSMQuery opens its query dialog as a child of it.
root = tk.Tk()
root.title("Vector Space Model for IR")
root.geometry("600x400")
root.configure(bg="aqua")

# Main menu: one button opens the query dialog, the other closes the application.
tk.Label(root, text="Select your choice:", font=("Arial", 18), bg="aqua", fg="black").pack(pady=20)
tk.Button(root, text="1. VSM Query", command=handleVSMQuery, font=("Arial", 14), width=20, bg="white", fg="black").pack(pady=10)
tk.Button(root, text="2. Exit", command=root.destroy, font=("Arial", 14), width=20, bg="red", fg="black").pack(pady=10)

# Hand control to the Tk event loop.
root.mainloop()