In [4]:
from os import remove
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import os
import string
import logging
import re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove('not')
STOPWORDS.remove('or')
STOPWORDS.remove('and')
LEMMATIZER = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
#Loading the documents
def load_documents(directory):
    documents = {}
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            with open(os.path.join(directory, filename), 'r') as f:
                documents[filename] = f.read()
    return documents

documents = load_documents('Documents')

In [8]:
#Cleaning and preprocessing text
def clean_text(text):

  #Lowercasing the text
  text = text.lower()

  #Removing non-alphanumeric characters
  text = re.sub(r'\W+', ' ', text)

  #Tokenizing
  tokens = word_tokenize(text)

  #Removing stopwords
  tokens = [token for token in tokens if token not in STOPWORDS]

  #Lemmatizing
  tokens = [LEMMATIZER.lemmatize(token) for token in tokens]

  return tokens

#Cleaned document
cleaned_documents = {filename: clean_text(text) for filename, text in documents.items()}

In [9]:
#Inverted index
def create_inverted_index(documents):
    inverted_index = defaultdict(set)
    for filename, tokens in documents.items():
        for word in tokens:
            inverted_index[word].add(filename)
    return inverted_index

#Creating inverted index
inverted_index = create_inverted_index(cleaned_documents)

In [11]:
#Initializing all_documents with the set of all documents filenames
all_documnets = set(documents.keys())

#Function for 'AND' query
def and_query(terms, inverted_index):
  result - inverted_index.get(terms[0], set())
  for term in terms[1:]:
    result &= inverted_index.get(term, set())
  return result

In [12]:
#Function for 'OR' query
def or_query(terms, inverted_index):
  result = inverted_index.get(terms[0], set())
  for term in terms[1:]:
    result |= inverted_index.get(term, set())
  return result

In [13]:
#Function for 'NOT' query
def not_query(term, inverted_index, all_documnets):
  return all_documents - inverted_index.get(term, set())

In [14]:
#Converting document to a list
def convert_doc_ids_to_filenames(doc_ids):
  return list(doc_ids)

In [18]:
#Processing the queries and boolean
def process_query(query, inverted_index, all_documnets):

  #Tokenize and preprocessing query
  terms = [LEMMATIZER.lemmatize(term) for term in word_tokenize(query.lower()) if term not in STOPWORDS]
  if 'and' in terms:
    terms.remove('and')
    result = and_query(terms, inverted_index) # Fixed: Removed extra = and aligned indentation
  elif 'or' in terms:
    terms.remove('or')
    result = or_query(terms, inverted_index) # Fixed: Removed extra ` and aligned indentation
  elif 'not' in terms:
    terms.remove('not')
    result = not_query(terms[0], inverted_index, all_documnets) # Fixed: Changed all_documents to all_documnets
  else:
    result = inverted_index.get(terms[0], set())
  return convert_doc_ids_to_filenames(result)

In [23]:
#Example usage
query = "AI"
result = process_query(query, inverted_index, all_documnets)
print(result)

['Doc2.txt', 'Doc4.txt', 'Doc1.txt']
