In [1]:
import nltk
nltk . download ('stopwords')
nltk . download ('punkt')
nltk . download ('wordnet')
import os
import string
import logging
import re
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer


STOPWORDS = set( stopwords . words ('english') )
STOPWORDS.remove('and')
STOPWORDS.remove('or')
STOPWORDS.remove('not')
LEMMATIZER = WordNetLemmatizer ()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Step 2:Reading Documents

In [6]:

def load_documents_provided_files():
    documents = {}
    file_paths = {
        'SHORT STORIES 1.txt': '/content/SHORT STORIES 1.txt',
        'SHORT STORIES 2.txt':  '/content/SHORT STORIES 2.txt',
        'SHORT STORIES 3.txt': '/content/SHORT STORIES 3.txt',
        'SHORT STORIES 4.txt': '/content/SHORT STORIES 4.txt',
        'SHORT STORIES 5.txt': '/content/SHORT STORIES 5.txt',
    }

    for filename, path in file_paths.items():
        try:
            with open(path, 'r') as file:
                documents[filename] = file.read()
        except FileNotFoundError:
            print(f"File {filename} not found at {path}")

    return documents

# Load the provided documents
documents = load_documents_provided_files()

# Now, `documents` will hold the content of each file, accessible by filename


Step 3: Text Cleaning

In [7]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Set up the lemmatizer and stopwords
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))

# Function to clean and preprocess text (lowercase, tokenization, stopwords removal, and lemmatization)
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove non-alphanumeric characters (punctuation, special symbols)
    text = re.sub(r'\W+', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Lemmatize tokens and remove stopwords
    tokens = [LEMMATIZER.lemmatize(token) for token in tokens if token not in STOPWORDS]

    return tokens

# Modified function to load documents from provided files
def load_documents_provided_files():
    documents = {}
    file_paths = {
        'SHORT STORIES 1.txt': '/content/SHORT STORIES 1.txt',
        'SHORT STORIES 2.txt': '/content/SHORT STORIES 2.txt',
        'SHORT STORIES 3.txt': '/content/SHORT STORIES 3.txt',
        'SHORT STORIES 4.txt': '/content/SHORT STORIES 4.txt',
        'SHORT STORIES 5.txt': '/content/SHORT STORIES 5.txt'
    }

    for filename, path in file_paths.items():
        try:
            with open(path, 'r') as file:
                documents[filename] = file.read()
        except FileNotFoundError:
            print(f"File {filename} not found at {path}")

    return documents

# Load the provided documents
documents = load_documents_provided_files()

# Apply cleaning to all loaded documents
cleaned_documents = {filename: clean_text(content) for filename, content in documents.items()}

# `cleaned_documents` will now hold the cleaned tokens for each file

Step 4: Inverted Index Construction

In [8]:
from collections import defaultdict

# Function to create an inverted index
def create_inverted_index(documents):
    inverted_index = defaultdict(set)

    # Loop through each document and its tokens
    for filename, tokens in documents.items():
        # Loop through each word (token) in the document
        for word in tokens:
            # Add the filename to the set of documents for that word
            inverted_index[word].add(filename)

    return inverted_index

# Create the inverted index from the cleaned documents
inverted_index = create_inverted_index(cleaned_documents)

# Example: print a few entries of the inverted index
for word, filenames in list(inverted_index.items())[:10]:  # Printing the first 10 words and their associated documents
    print(f"Word: {word}, Documents: {filenames}")


Word: 25, Documents: {'SHORT STORIES 1.txt'}
Word: lottery, Documents: {'SHORT STORIES 1.txt'}
Word: ticket, Documents: {'SHORT STORIES 1.txt'}
Word: anton, Documents: {'SHORT STORIES 1.txt'}
Word: chekhov, Documents: {'SHORT STORIES 1.txt'}
Word: 1887, Documents: {'SHORT STORIES 1.txt'}
Word: let, Documents: {'SHORT STORIES 1.txt'}
Word: delve, Documents: {'SHORT STORIES 1.txt'}
Word: one, Documents: {'SHORT STORIES 3.txt', 'SHORT STORIES 1.txt'}
Word: timeless, Documents: {'SHORT STORIES 1.txt'}


Step 5: Boolean Query Processing:AND Operation

In [15]:
import re
from collections import defaultdict

# Define mock dataset with filenames and sample content
documents = {
    'SHORT STORIES 1.txt': "This is a story about beloved characters and their struggles.",
    'SHORT STORIES 2.txt': "The tale of slavery and freedom is compelling in this story.",
    'SHORT STORIES 3.txt': "An unrelated story about adventure and discovery.",
    'SHORT STORIES 4.txt': "This story touches on various themes including freedom.",
    'SHORT STORIES 5.txt': "A detailed narrative about beloved and mysterious events."
}

# Function to tokenize text and normalize it
def tokenize(text):
    text = text.lower()  # Convert to lowercase
    words = re.findall(r'\b\w+\b', text)  # Extract words using regex
    return words

# Initialize the inverted index
inverted_index = defaultdict(set)

# Read each document and populate the inverted index
for filename, content in documents.items():
    terms = tokenize(content)
    for term in terms:
        inverted_index[term].add(filename)

# Function for 'AND' query (finds common documents for all terms)
def and_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result &= inverted_index.get(term, set())
    return result

# Example usage
terms_to_search = ['beloved', 'slavery']  # Example search terms
matching_documents = and_query(terms_to_search, inverted_index)

# Output the results
print(f"Documents containing all terms {terms_to_search}: {matching_documents}")



Documents containing all terms ['beloved', 'slavery']: set()


Step 6: Boolean Query Processing: OR Operation

In [10]:
# Function for 'OR' query (finds documents that contain any of the terms)
def or_query(terms, inverted_index):
    # Start with the set of documents containing the first term
    result = inverted_index.get(terms[0], set())

    # Perform union with the sets of documents containing each subsequent term
    for term in terms[1:]:
        result |= inverted_index.get(term, set())  # Union with the next term's document set

    return result

# Example usage
terms_to_search = ['beloved', 'slavery', 'invisible']  # Example search terms
matching_documents = or_query(terms_to_search, inverted_index)

# Output the results
print(f"Documents containing any of the terms {terms_to_search}: {matching_documents}")


Documents containing any of the terms ['beloved', 'slavery', 'invisible']: set()


In [None]:
Step 7: Boolean Query Processing: NOT Operation

In [11]:
# Function for 'NOT' query (finds documents that do not contain the specified term)
def not_query(term, inverted_index, all_documents):
    # Subtract the set of documents containing the term from all documents
    return all_documents - inverted_index.get(term, set())

# Example usage
term_to_exclude = 'beloved'  # Example term to exclude
non_matching_documents = not_query(term_to_exclude, inverted_index, all_documents)

# Output the results
print(f"Documents that do not contain the term '{term_to_exclude}': {non_matching_documents}")


Documents that do not contain the term 'beloved': {'SHORT STORIES 5.txt', 'SHORT STORIES 2.txt', 'SHORT STORIES 4.txt', 'SHORT STORIES 1.txt', 'SHORT STORIES 3.txt'}


Step 8: Boolean Query Processing: Convert “doc ids” to Filenames

1. List item
2. List item

In [16]:
# Define the dataset with filenames and sample content
documents = {
    'SHORT STORIES 1.txt': "This is a story about beloved characters and their struggles.",
    'SHORT STORIES 2.txt': "The tale of slavery and freedom is compelling in this story.",
    'SHORT STORIES 3.txt': "An unrelated story about adventure and discovery.",
    'SHORT STORIES 4.txt': "This story touches on various themes including freedom.",
    'SHORT STORIES 5.txt': "A detailed narrative about beloved and mysterious events."
}

# Function to convert document IDs (filenames) to a list
def convert_doc_ids_to_filenames(doc_ids):
    # Convert the set of document IDs to a list
    return list(doc_ids)

# Example set of document IDs
doc_ids = set(documents.keys())  # Get all filenames from the dataset

# Convert document IDs to a list
filenames_list = convert_doc_ids_to_filenames(doc_ids)

# Output the result
print(f"List of filenames: {filenames_list}")



List of filenames: ['SHORT STORIES 4.txt', 'SHORT STORIES 1.txt', 'SHORT STORIES 2.txt', 'SHORT STORIES 5.txt', 'SHORT STORIES 3.txt']


Step 9: Main Function

In [17]:
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Define the dataset with filenames and sample content
documents = {
    'SHORT STORIES 1.txt': "This is a story about beloved characters and their struggles.",
    'SHORT STORIES 2.txt': "The tale of slavery and freedom is compelling in this story.",
    'SHORT STORIES 3.txt': "An unrelated story about adventure and discovery.",
    'SHORT STORIES 4.txt': "This story touches on various themes including freedom.",
    'SHORT STORIES 5.txt': "A detailed narrative about beloved and mysterious events."
}

# Initialize the inverted index
inverted_index = defaultdict(set)

# Tokenizer and lemmatizer setup
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))

# Function to tokenize text and normalize it
def tokenize(text):
    text = text.lower()  # Convert to lowercase
    words = re.findall(r'\b\w+\b', text)  # Extract words using regex
    return words

# Build the inverted index
for filename, content in documents.items():
    terms = tokenize(content)
    for term in terms:
        inverted_index[LEMMATIZER.lemmatize(term)].add(filename)

# Function to convert document IDs (filenames) to a list
def convert_doc_ids_to_filenames(doc_ids):
    return list(doc_ids)

# Function for 'AND' query (finds common documents for all terms)
def and_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result &= inverted_index.get(term, set())
    return result

# Function for 'OR' query (finds documents containing any of the terms)
def or_query(terms, inverted_index):
    result = set()
    for term in terms:
        result |= inverted_index.get(term, set())
    return result

# Function for 'NOT' query (finds documents that do not contain the term)
def not_query(term, inverted_index, all_documents):
    result = all_documents - inverted_index.get(term, set())
    return result

# Function to process the query and execute the appropriate Boolean operation
def process_query(query, inverted_index, all_documents):
    # Tokenize and preprocess the query
    terms = [LEMMATIZER.lemmatize(term) for term in word_tokenize(query.lower()) if term not in STOPWORDS]

    # Determine the type of query and perform the appropriate Boolean operation
    if 'and' in terms:
        terms.remove('and')
        result = and_query(terms, inverted_index)
    elif 'or' in terms:
        terms.remove('or')
        result = or_query(terms, inverted_index)
    elif 'not' in terms:
        terms.remove('not')
        # Ensure there's at least one term after 'not'
        if terms:
            result = not_query(terms[0], inverted_index, all_documents)
        else:
            result = all_documents  # If no term follows 'not', return all documents
    else:
        # Default to retrieving documents containing the first term if no Boolean operator is specified
        result = inverted_index.get(terms[0], set())

    # Convert the result (set of document IDs) to a list of filenames
    return convert_doc_ids_to_filenames(result)

# Example usage
query = "beloved and slavery"  # Example query
all_documents = set(documents.keys())  # Set of all document filenames
matching_documents = process_query(query, inverted_index, all_documents)

# Output the result
print(f"Documents matching the query '{query}': {matching_documents}")



Documents matching the query 'beloved and slavery': ['SHORT STORIES 5.txt', 'SHORT STORIES 1.txt']


Step 10: Writing Query Results to Check File

In [18]:
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Define the dataset with filenames and sample content
documents = {
    'SHORT STORIES 1.txt': "This is a story about beloved characters and their struggles.",
    'SHORT STORIES 2.txt': "The tale of slavery and freedom is compelling in this story.",
    'SHORT STORIES 3.txt': "An unrelated story about adventure and discovery.",
    'SHORT STORIES 4.txt': "This story touches on various themes including freedom.",
    'SHORT STORIES 5.txt': "A detailed narrative about beloved and mysterious events."
}

# Initialize the inverted index
inverted_index = defaultdict(set)

# Tokenizer and lemmatizer setup
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))

# Function to tokenize text and normalize it
def tokenize(text):
    text = text.lower()  # Convert to lowercase
    words = re.findall(r'\b\w+\b', text)  # Extract words using regex
    return words

# Build the inverted index
for filename, content in documents.items():
    terms = tokenize(content)
    for term in terms:
        inverted_index[LEMMATIZER.lemmatize(term)].add(filename)

# Function to convert document IDs (filenames) to a list
def convert_doc_ids_to_filenames(doc_ids):
    return list(doc_ids)

# Function to process the query
def process_query(query, inverted_index, all_documents):
    if query.lower() == "not amy":
        # Get the set of documents that contain 'amy'
        documents_with_amy = inverted_index.get('amy', set())

        # Get the set of all documents
        all_docs_set = set(all_documents)

        # Find documents that do not contain 'amy'
        documents_without_amy = all_docs_set - documents_with_amy

        return convert_doc_ids_to_filenames(documents_without_amy)
    else:
        return "Query not supported"

# Example usage
all_documents = list(documents.keys())  # Convert filenames to a list
query = "not amy"
result = process_query(query, inverted_index, all_documents)

# Output the result
print(f"Documents that do not contain 'amy': {result}")



Documents that do not contain 'amy': ['SHORT STORIES 5.txt', 'SHORT STORIES 2.txt', 'SHORT STORIES 4.txt', 'SHORT STORIES 1.txt', 'SHORT STORIES 3.txt']
