Step 1: Importing Necessary Libraries

In [1]:
import nltk
nltk . download ('stopwords')
nltk . download ('punkt')
nltk . download ('wordnet')
import os
import string
import logging
import re
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer


STOPWORDS = set( stopwords . words ('english') )
STOPWORDS.remove('and')
STOPWORDS.remove('or')
STOPWORDS.remove('not')
LEMMATIZER = WordNetLemmatizer ()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Step 2: Reading Documents

In [2]:
# Function to load documents from a specified directory
def load_documents(directory):
    documents = {}
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as file:
                documents[filename] = file.read()
    return documents

documents = load_documents('path_to_documents')

Step 3: Text Cleaning

In [3]:
# Function to clean and preprocess text (lowercase, tokenization, stopwords removal, and lemmatization)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [LEMMATIZER.lemmatize(token) for token in tokens if token not in STOPWORDS]
    return tokens

cleaned_documents = {filename: clean_text(content) for filename, content in documents.items()}

Step 4: Inverted Index Construction

In [4]:
# Function to create an inverted index
def create_inverted_index(documents):
    inverted_index = defaultdict(set)
    for filename, tokens in documents.items():
        for word in tokens:
            inverted_index[word].add(filename)
    return inverted_index

inverted_index = create_inverted_index(cleaned_documents)

Step 5: Boolean Query Processing: AND Operation

In [5]:
# Initialize all_documents with the set of all document filenames
all_documents = set(documents.keys())

# Function for 'AND' query (finds common documents for all terms)
def and_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result &= inverted_index.get(term, set())
    return result

Step 6: Boolean Query Processing: OR Operation

In [6]:
# Function for 'OR' query (finds documents that contain any of the terms)
def or_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result |= inverted_index.get(term, set())
    return result

Step 7: Boolean Query Processing: NOT Operation

In [7]:
# Function for 'NOT' query (finds documents that do not contain the specified term)
def not_query(term, inverted_index, all_documents):
    return all_documents - inverted_index.get(term, set())

Step 8: Boolean Query Processing: Convert “doc ids” to Filenames

In [8]:
# Function to convert document IDs (filenames) to a list
def convert_doc_ids_to_filenames(doc_ids):
    return list(doc_ids)

Step 9: Main Function

In [9]:
# Function to process the query and execute the appropriate Boolean operation
def process_query(query, inverted_index, all_documents):
    # Tokenize and preprocess the query
    terms = [LEMMATIZER.lemmatize(term) for term in word_tokenize(query.lower()) if term not in STOPWORDS]
    if 'and' in terms:
        terms.remove('and')
        result = and_query(terms, inverted_index)
    elif 'or' in terms:
        terms.remove('or')
        result = or_query(terms, inverted_index)
    elif 'not' in terms:
        terms.remove('not')
        result = not_query(terms[0], inverted_index, all_documents)
    else:
        result = inverted_index.get(terms[0], set())
    return convert_doc_ids_to_filenames(result)

Step 10: Writing Query Results to Check File

In [10]:
# Example usage
query = "not amy"
result = process_query(query, inverted_index, all_documents)
print(result)

['Doc7.txt', 'Doc3.txt', 'Doc8.txt', 'Doc4.txt', 'Doc2.txt', 'Doc10.txt', 'Doc1.txt', 'Doc6.txt']
