In [137]:
# Importing libraries
import os
import json
import csv
from collections import defaultdict
import nltk
from nltk.corpus import stopwords

# Directory that is scanned for .txt documents and that receives newly inserted ones.
folder_path = './docs/'
# CSV file that receives one "doc_id,path" row per indexed document.
csv_filename = 'docId_filePath_mapping.csv'
# JSON file the positional inverted index is written to and reloaded from.
json_filename = 'pos_inverted_index.json'
# English stop words from NLTK; a word whose lowercase form is in this set is
# left out of the index.
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be available
# locally (e.g. after nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))


In [138]:
def read_documents(folder_path):
    """Load every ``.txt`` file directly inside ``folder_path``.

    Document IDs start at 1 and are assigned in sorted filename order, so the
    same directory contents always produce the same ID-to-file mapping.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``(documents, document_id_to_path)`` tuple. ``documents`` maps each
        document ID to the file's full text; ``document_id_to_path`` maps the
        same ID to ``os.path.join(folder_path, filename)``. Both are empty when
        no matching file exists.
    """
    documents = {}
    document_id_to_path = {}

    # os.listdir() returns names in arbitrary order; sorting keeps IDs stable
    # across runs. Directories that merely end in ".txt" are skipped because
    # open() would fail on them.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith(".txt") and os.path.isfile(os.path.join(folder_path, f))
    )

    if not files:
        print("No documents found in the specified directory.")
        return documents, document_id_to_path

    for document_id, filename in enumerate(files, start=1):
        path = os.path.join(folder_path, filename)
        with open(path, 'r') as file:
            documents[document_id] = file.read()
        document_id_to_path[document_id] = path

    return documents, document_id_to_path

In [139]:
def create_positional_inverted_index(docs, stopword_set=None):
    """Build a positional inverted index over ``docs``.

    Each document is split on whitespace. A word is indexed (case preserved)
    when its lowercase form is not a stop word and it is longer than three
    characters. Positions are offsets into the full whitespace-split word
    list, so skipped words still advance the position counter.

    Args:
        docs: Mapping of document ID to document text.
        stopword_set: Optional collection of lowercase words to skip. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        A ``defaultdict`` mapping each word to
        ``[document_count, {doc_id: [positions]}, ...]`` with exactly one
        posting dict per document that contains the word.
    """
    if stopword_set is None:
        stopword_set = stop_words

    index = defaultdict(lambda: [0])
    for doc_id, content in docs.items():
        for position, word in enumerate(content.split()):
            if word.lower() in stopword_set or len(word) <= 3:
                continue
            postings = index[word]
            # Documents are processed one at a time, so if this document
            # already has a posting for the word it is the last one. (Testing
            # ``doc_id in postings[1:]`` would compare the ID against whole
            # dicts and never match.)
            if len(postings) > 1 and doc_id in postings[-1]:
                postings[-1][doc_id].append(position)
            else:
                postings[0] += 1
                postings.append({doc_id: [position]})
    return index


In [140]:
def save_index_to_json(index, filename):
    """Write ``index`` to ``filename`` as JSON indented by four spaces.

    Any existing file at ``filename`` is overwritten.
    """
    with open(filename, 'w') as out_file:
        out_file.write(json.dumps(index, indent=4))
        
def save_doc_id_filepath_mapping(mapping, filename):
    """Write ``mapping`` to ``filename`` as CSV, one ``doc_id,path`` row per entry.

    Rows follow the iteration order of ``mapping``; no header row is written
    and any existing file is overwritten.
    """
    rows = ([doc_id, path] for doc_id, path in mapping.items())
    with open(filename, 'w', newline='') as out_file:
        csv.writer(out_file).writerows(rows)

In [141]:
def load_index_from_json(filename):
    """Read ``filename`` and return the JSON-decoded index.

    JSON object keys are always strings, so document IDs inside the returned
    postings are strings even if they were integers when the index was saved.
    """
    with open(filename, 'r') as source:
        return json.loads(source.read())

def search_phrase_in_index(phrase, index):
    """Return the IDs of documents that contain ``phrase`` as consecutive words.

    The phrase is split on whitespace. A document matches when there is a
    start position ``p`` such that the i-th query word occurs at position
    ``p + i`` for every word of the phrase. Lookups are case-sensitive and use
    the words exactly as typed.

    Args:
        phrase: Query string.
        index: Mapping of word to ``[count, {doc_id: [positions]}, ...]``.

    Returns:
        A set of document IDs in whatever key type the index uses (strings for
        an index reloaded from JSON). The set is empty when the phrase is
        blank, when any query word is missing from the index, or when the
        words never appear next to each other in that order.
    """
    words = phrase.split()
    if not words:
        # A blank query has no first word to look up.
        return set()

    positions_per_word = []
    for word in words:
        if word not in index:
            return set()
        positions_per_word.append(_positions_by_document(index[word]))

    first_word, later_words = positions_per_word[0], positions_per_word[1:]
    matching_docs = set()
    for doc_id, start_positions in first_word.items():
        for start in start_positions:
            if all(
                start + offset in word_positions.get(doc_id, ())
                for offset, word_positions in enumerate(later_words, start=1)
            ):
                matching_docs.add(doc_id)
                break
    return matching_docs


def _positions_by_document(postings):
    """Collapse ``[count, {doc_id: [positions]}, ...]`` into ``{doc_id: set(positions)}``."""
    positions = {}
    # Merge rather than assign so that several posting dicts for the same
    # document are combined instead of overwriting one another.
    for posting in postings[1:]:
        for doc_id, doc_positions in posting.items():
            positions.setdefault(doc_id, set()).update(doc_positions)
    return positions



In [142]:
def insert_new_document(folder_path, content):
    """Store ``content`` in a new ``doc_<n>.txt`` file inside ``folder_path``.

    ``n`` is the smallest positive integer whose filename is not already
    present in the directory listing taken at call time.

    Returns:
        The bare filename (no directory part) of the file that was written.
    """
    taken_names = os.listdir(folder_path)

    candidate_id = 1
    while f"doc_{candidate_id}.txt" in taken_names:
        candidate_id += 1
    new_filename = f"doc_{candidate_id}.txt"

    target_path = os.path.join(folder_path, new_filename)
    with open(target_path, 'w') as out_file:
        out_file.write(content)

    return new_filename

In [143]:
def delete_document(folder_path, doc_id, document_id_to_path, index):
    """Delete a document's file and remove it from the mapping and the index.

    Args:
        folder_path: Unused; kept so existing callers keep working.
        doc_id: ID of the document to delete.
        document_id_to_path: Mapping of document ID to file path; the entry
            for ``doc_id`` is removed in place.
        index: Mapping of word to ``[count, {doc_id: [positions]}, ...]``,
            updated in place. Words left without postings are dropped, and
            the leading count is reset to the number of postings that remain.

    Prints a confirmation on success, or a "not found" message (changing
    nothing) when ``doc_id`` is not in ``document_id_to_path``.
    """
    if doc_id not in document_id_to_path:
        print(f"Document with ID {doc_id} not found.")
        return

    os.remove(document_id_to_path[doc_id])

    # Compare IDs as strings so the document is also found in an index that
    # was reloaded from JSON, where the integer IDs have become string keys.
    target = str(doc_id)
    words_to_delete = []

    for word, postings in index.items():
        kept = [
            posting for posting in postings[1:]
            if not any(str(key) == target for key in posting)
        ]
        if not kept:
            # Defer the deletion: the dict must not change size while iterating.
            words_to_delete.append(word)
        elif len(kept) != len(postings) - 1:
            # Keep the leading count in step with the postings that remain.
            index[word] = [len(kept)] + kept

    for word in words_to_delete:
        del index[word]

    del document_id_to_path[doc_id]

    print(f"Document with ID {doc_id} has been successfully deleted.")


In [146]:
def main():
    """Build and persist the index, then perform one menu action.

    The documents in ``folder_path`` are indexed, the index and the
    ID-to-path mapping are written to ``json_filename`` and ``csv_filename``,
    and the index is reloaded from JSON. The user is then asked once to
    search for a phrase, insert a document, delete a document, or exit.
    Insert and delete persist the updated index and mapping again.
    """
    documents, document_id_to_path = read_documents(folder_path)

    if not documents:
        print("Exiting due to lack of documents.")
        return
    index = create_positional_inverted_index(documents)

    save_index_to_json(index, json_filename)

    save_doc_id_filepath_mapping(document_id_to_path, csv_filename)

    loaded_index = load_index_from_json(json_filename)

    print("\nWhat would you like to do?")
    print("1. Search for a phrase")
    print("2. Insert a new document")
    print("3. Delete a document")
    print("4. Exit")

    choice = input("\nEnter your choice (1, 2, 3, or 4): ")

    if choice == "1":
        phrase = input("\nEnter a phrase query: ")
        # A blank query cannot match anything; skip the lookup entirely.
        doc_ids = search_phrase_in_index(phrase, loaded_index) if phrase.strip() else set()
        if doc_ids:
            print("\nThe phrase is found in the following documents:")
            for doc_id in doc_ids:
                # IDs come back as strings from the JSON-loaded index.
                print(document_id_to_path[int(doc_id)])
        else:
            print("\nNo documents found containing the phrase.")

    elif choice == "2":

        new_content = input("\nEnter the content of the new document: ")

        new_filename = insert_new_document(folder_path, new_content)

        new_doc_id = max(document_id_to_path.keys()) + 1
        document_id_to_path[new_doc_id] = os.path.join(folder_path, new_filename)

        new_index = create_positional_inverted_index({new_doc_id: new_content})
        # Merge word by word. dict.update() would replace the postings of
        # every word the new document shares with existing documents.
        for word, new_postings in new_index.items():
            merged = index.setdefault(word, [0])
            merged[0] += new_postings[0]
            merged.extend(new_postings[1:])

        save_index_to_json(index, json_filename)
        save_doc_id_filepath_mapping(document_id_to_path, csv_filename)

        print(f"\nDocument inserted successfully with ID {new_doc_id} and filename {new_filename}.")

    elif choice == "3":
        raw_doc_id = input("\nEnter the document ID to be deleted: ")
        try:
            doc_id_to_delete = int(raw_doc_id)
        except ValueError:
            # Nothing was changed, so there is nothing to save again.
            print(f"Invalid document ID: {raw_doc_id!r}.")
            return
        delete_document(folder_path, doc_id_to_delete, document_id_to_path, index)

        save_index_to_json(index, json_filename)
        save_doc_id_filepath_mapping(document_id_to_path, csv_filename)

    elif choice == "4":
        print("Exiting the program.")
    else:
        print("Invalid choice. Exiting the program.")

# Run the workflow once: index the documents, then handle a single menu choice.
main()


What would you like to do?
1. Search for a phrase
2. Insert a new document
3. Delete a document
4. Exit


Document with ID 6 has been successfully deleted.
