In [1]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# --- Question 1: preprocess every document under 'data' -----------------------
# For each file: lowercase -> tokenize -> drop English stop words -> strip
# non-alphanumeric characters -> drop tokens of length <= 1. The cleaned text is
# written to `preprocessed_directory` under the same file name, and every
# surviving token is collected into the `all_words` vocabulary.

#Download nltk stuff
nltk.download('punkt')
nltk.download('stopwords')

#Set up stop words
stop_words = set(stopwords.words('english'))

preprocessed_directory = 'preprocessed_data'
# Create the output folder up front; without this the first write below raises
# FileNotFoundError on a fresh run where the folder does not exist yet.
os.makedirs(preprocessed_directory, exist_ok=True)

all_words = set()   # every distinct processed token seen in any document
filenames = []      # names of the documents that were actually processed
#Loop through each file in the directory
for filename in os.listdir('data'):
    #Constructs file path for a specific file in data folder
    file_path = os.path.join('data', filename)
    # Only regular files can be read; a sub-directory here would make open() fail.
    if not os.path.isfile(file_path):
        continue
    filenames.append(filename)
    print(file_path)

    # Had errors reading certain files, so try different encodings
    try:
        #Use utf-8 encoding
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except UnicodeDecodeError:
        #Try with a different encoding (latin-1 can decode any byte sequence)
        with open(file_path, 'r', encoding='latin-1') as file:
            content = file.read()

    #Convert to lowercase
    content_lower = content.lower()
    #Create tokens
    tokens = word_tokenize(content_lower)
    #Remove stop words
    filtered_tokens = [word for word in tokens if word not in stop_words]
    #Remove non-alphanumeric characters
    processed_tokens = [re.sub(r'[^a-zA-Z0-9]+', '', word) for word in filtered_tokens]
    #Remove singly occurring characters like 'm' or 'a' (also drops tokens emptied by the step above)
    processed_tokens = [word for word in processed_tokens if len(word) > 1]

    #Add processed tokens to the set
    all_words.update(processed_tokens)
    processed_text = ' '.join(processed_tokens)

    #Gets the file path to write the processed data
    preprocessed_file_path = os.path.join(preprocessed_directory, filename)

    #Write processed text to preprocessed_data
    with open(preprocessed_file_path, 'w', encoding='utf-8') as file:
        file.write(processed_text)

print(f"Total unique words: {len(all_words)}")
print("Question 1 Completed")

# Build the inverted index: vocabulary word -> set of file names containing it.
# Every word in all_words starts out with an empty set.
inverted_index = {term: set() for term in all_words}


# Walk over the preprocessed files and register each one under its words.
for filename in os.listdir(preprocessed_directory):
    file_path = os.path.join(preprocessed_directory, filename)
    print("Adding file to inverted index: ", filename)

    with open(file_path, 'r', encoding = 'utf-8') as handle:
        # Distinct whitespace-separated words of this document
        doc_terms = set(handle.read().split())

    # Only words that already have an entry in the index are recorded;
    # anything else in the file is ignored.
    for term in inverted_index.keys() & doc_terms:
        inverted_index[term].add(filename)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rida\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rida\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


data\100west.txt
data\13chil.txt
data\3gables.txt
data\3lpigs.txt
data\3student.txt
data\3wishes.txt
data\4moons.txt
data\5orange.txt
data\6ablemen.txt
data\6napolen.txt
data\7oldsamr.txt
data\7voysinb.txt
data\ab40thv.txt
data\abbey.txt
data\abyss.txt
data\adler.txt
data\advsayed.txt
data\advtthum.txt
data\adv_alad.txt
data\aesop11.txt
data\aesopa10.txt
data\aircon.txt
data\aislesix.txt
data\alad10.txt
data\alissadl.txt
data\aminegg.txt
data\angry_ca.txt
data\antcrick.txt
data\aquith.txt
data\arctic.txt
data\assorted.txt
data\bagelman.txt
data\batlslau.txt
data\beautbst.txt
data\beggars.txt
data\berternie.txt
data\bgb.txt
data\bgcspoof.txt
data\bishop00.txt
data\blabnove.txt
data\blackp.txt
data\blh.txt
data\blind.txt
data\bluebrd.txt
data\bruce-p.txt
data\buggy.txt
data\buldetal.txt
data\buldream.txt
data\bulfelis.txt
data\bulhuntr.txt
data\bulironb.txt
data\bullove.txt
data\bulmrx.txt
data\bulnland.txt
data\bulnoopt.txt
data\bulolli1.txt
data\bulolli2.txt
data\bulphrek.txt
data\bulp

In [2]:
# Prompt for the number of queries and parse the reply as an integer
# (int() raises ValueError if the reply is not a valid integer).
# NOTE(review): the next cell starts with this same prompt and overwrites N,
# so this cell looks redundant — confirm before removing.
N = int(input("Number of Queries: "))

In [20]:
N = int(input("Number of Queries: "))

for n in range(N):
    while True:
        input_s = input("Input sentence: ")
        #INPUT FORMATTED AS op1,op2,... (i.e. OR,AND NOT)
        input_o = input("Input operation sequence: ")
        query_o = input_o.split(',')
        
        ## preprocess query sentence, similar to how tokens are preprocessed
        input_s = input_s.lower()
        input_s = word_tokenize(input_s)
        q_tokens = [word for word in input_s if word not in stop_words]
        q_tokens = [re.sub(r'[^a-zA-Z0-9]+', '', word) for word in q_tokens]
        query_s = [word for word in q_tokens if len(word) > 1]
        
        if len(query_s) != len(query_o)+1:
            print("ERROR: Unmatching number of tokens/ops, try again")
            print("Tokens: {}".format(query_s))
            print("Operations: {}".format(query_o))
            print("---")
            continue
        elif not set(query_o).issubset(['AND','OR','OR NOT','AND NOT']):
            print("ERROR: Invalid operator(s), try again")
            print("Tokens: {}".format(query_s))
            print("Operations: {}".format(query_o))
            print("---")
            continue
        break
    
    doc_list = inverted_index[query_s[0]]
    
    i = 1
    comp = 0
    
    while i < len(query_s):
        word = query_s[i]
        op = query_o[i-1]
        
        match op:
            case 'AND':
                # intersection with current doc_list
                doc_list.intersection(inverted_index[word])
                
            case 'OR':
                # add searched word doc_list to current list
                doc_list.update(inverted_index[word])
                
            case 'AND NOT':
                #get word document list, check if any in doc_list, delete from list
                for d in list(inverted_index[word]):
                    doc_list.discard(d)
                    comp +=1 
                
            case 'OR NOT':
                #get every document which does not have current word
                for fn in filenames:
                    if fn not in doc_list and fn not in inverted_index[word]:
                        doc_list.add(fn)
                        comp +=1

                
        i+=1
    print("QUERY #{}\n".format(n+1))
    print("Number of matched documents: " + str(len(doc_list)))
    print("Minimum number of comparisons required: " + str(comp))
    print("Retrieved document names: " + str(list(doc_list)))
    print("---")

QUERY #1

Number of matched documents: 47
Minimum number of comparisons required: 0
Retrieved document names: ['100west.txt', 'beggars.txt', 'sick-kid.txt', 'charlie.txt', 'arctic.txt', 'history5.txt', 'silverb.txt', 'bruce-p.txt', 'abbey.txt', 'keepmodu.txt', 'zombies.txt', 'holmesbk.txt', 'girlclub.txt', 'sre-dark.txt', 'dakota.txt', 'shulk.txt', 'musibrem.txt', 'cybersla.txt', 'fleas.txt', 'mtinder.txt', 'hitch3.txt', 'hound-b.txt', 'lament.txt', 'friends.txt', 'aesop11.txt', 'retrib.txt', 'shoscomb.txt', 'bishop00.txt', 'darkness.txt', 'bulironb.txt', 'lionmane.txt', 'missing.txt', 'partya.txt', 'roger1.txt', 'foxngrap.txt', 'gulliver.txt', 'fgoose.txt', 'pepdegener.txt', 'hellmach.txt', 'greedog.txt', 'yukon.txt', 'aesopa10.txt', 'pinocch.txt', 'enya_trn.txt', 'gatherng.txt', 'snow.txt', 'bureau.txt']
---
ERROR: Unmatching number of tokens/ops, try again
Tokens: ['rabbit', 'sheep']
Operations: ['AND', 'OR']
---


KeyboardInterrupt: Interrupted by user

True