In [48]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import defaultdict
import re
import string

# Fetch the NLTK data packages this script relies on: the tokenizer models,
# the stopword lists and the WordNet database.
for _nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_package)

# Step 1: Read the dataset
# The dataset is expected to be a folder containing the .txt documents to index.
DATASET_DIR = "covid_data"  # Change this to your dataset folder

def read_files(dataset_dir):
    """Load every ``.txt`` file located directly inside *dataset_dir*.

    Args:
        dataset_dir: Path of the folder holding the text documents.

    Returns:
        dict mapping each file name (not the full path) to the complete
        text of that file. Entries are inserted in sorted file-name order.

    Raises:
        OSError: Propagated from ``os.listdir``/``open`` (for example when
            *dataset_dir* does not exist).
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    corpus = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # corpus (and everything printed from it) stable between runs.
    for filename in sorted(os.listdir(dataset_dir)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(dataset_dir, filename)
        # A sub-directory may also be named "*.txt"; open() would fail on it.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8') as file:
            corpus[filename] = file.read()
    return corpus

# Raw corpus: file name -> full text of that file, loaded from DATASET_DIR.
corpus = read_files(DATASET_DIR)

# Step 2: Preprocess the text (tokenization, normalization, stemming, lemmatization)
def preprocess_text(text):
    """Convert the raw text of one document into a list of normalised terms.

    Steps, in order: lowercase the text, tokenise it with NLTK's
    ``word_tokenize``, keep only purely alphanumeric tokens, drop English
    stopwords, and lemmatise what remains with ``WordNetLemmatizer``.

    Args:
        text: Raw text of a single document.

    Returns:
        list of processed tokens in their original order; repeated words
        are kept.
    """
    words = word_tokenize(text.lower())

    # NOTE: the isalnum() test runs before the substitution, so a token that
    # contains any non-alphanumeric character (e.g. a hyphenated word) is
    # discarded entirely rather than cleaned up.
    alnum_words = []
    for word in words:
        if word.isalnum():
            alnum_words.append(re.sub(r'\W+', '', word))

    english_stopwords = set(stopwords.words('english'))
    content_words = [word for word in alnum_words if word not in english_stopwords]

    # The Porter stemming step is disabled in this pipeline; only
    # lemmatisation is applied to the remaining tokens.
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(word) for word in content_words]

# Preprocess each document in the corpus.
# Result: document name -> list of tokens returned by preprocess_text().
preprocessed_corpus = {doc: preprocess_text(text) for doc, text in corpus.items()}

# Step 3: Create an inverted index (sorted)
def create_inverted_index(corpus):
    """Build a term -> documents mapping from a tokenised corpus.

    Args:
        corpus: dict mapping a document id to its iterable of tokens.

    Returns:
        ``defaultdict(set)`` whose keys are terms (in order of first
        appearance) and whose values are the sets of document ids that
        contain the term. Being a defaultdict, indexing an unknown term
        yields (and stores) an empty set.
    """
    postings = defaultdict(set)
    # Flatten the corpus into (term, document) pairs, preserving order.
    occurrences = (
        (term, name)
        for name, terms in corpus.items()
        for term in terms
    )
    for term, name in occurrences:
        postings[term].add(name)
    return postings
    

# Sort the inverted index
def sort_inverted_index(corpus):
    """Build an inverted index for *corpus* and return it in sorted form.

    Args:
        corpus: dict mapping a document id to its iterable of tokens.

    Returns:
        Plain dict whose keys (terms) are inserted in ascending order and
        whose values are sorted lists of the document ids containing the
        term.
    """
    index = create_inverted_index(corpus)
    ordered = {}
    # Walk the terms alphabetically so the resulting dict iterates in order.
    for term in sorted(index):
        ordered[term] = sorted(index[term])
    return ordered
    
# Unsorted index (term -> set of document names); this is the index handed to
# boolean_query() further down.
inverted_index = create_inverted_index(preprocessed_corpus)
# Sorted variant (term -> sorted list of document names) used for display.
# Note: sort_inverted_index() builds its own index from the corpus rather
# than reusing `inverted_index`.
sorted_inverted_index=sort_inverted_index(preprocessed_corpus)

# Step 4: Query the inverted index
def boolean_query(query, inverted_index, all_docs):
    """Evaluate a simple single-operator boolean query against the index.

    Supported forms (operators are case-insensitive):

    * ``t1 AND t2 [AND t3 ...]`` - documents containing every term
    * ``t1 OR t2 [OR t3 ...]``   - documents containing at least one term
    * ``NOT t``                  - documents of *all_docs* not containing ``t``
    * ``t``                      - documents containing ``t``

    Only one kind of operator is honoured per query; they are checked in the
    order AND, OR, NOT. Mixed-operator queries are not supported. Query terms
    are lowercased and stripped of punctuation only, then matched verbatim
    against the index keys. For ``NOT`` and for operator-less queries only
    the first term is used.

    Args:
        query: The query string.
        inverted_index: Mapping of term -> collection of document ids (sets
            or lists). It is never modified, even when it is a defaultdict.
        all_docs: Iterable of every document id; needed for ``NOT``.

    Returns:
        Sorted list of matching document ids. Empty when nothing matches or
        when the query contains no terms.
    """
    normalized = query.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    query_tokens = normalized.split()

    def postings(term):
        # .get() neither raises on a plain dict nor inserts an empty entry
        # into a defaultdict the way index[term] would for an unknown term.
        return set(inverted_index.get(term, ()))

    def terms_without(operator):
        # Drop every occurrence of the operator, not just the first one, so
        # chained queries such as "a AND b AND c" work.
        return [token for token in query_tokens if token != operator]

    if 'and' in query_tokens:
        terms = terms_without('and')
        if terms:
            result_docs = set.intersection(*(postings(t) for t in terms))
        else:
            result_docs = set()
    elif 'or' in query_tokens:
        terms = terms_without('or')
        result_docs = set().union(*(postings(t) for t in terms))
    elif 'not' in query_tokens:
        terms = terms_without('not')
        result_docs = set(all_docs) - postings(terms[0]) if terms else set()
    else:
        result_docs = postings(query_tokens[0]) if query_tokens else set()

    # Sorted so that repeated runs print the documents in the same order.
    return sorted(result_docs)
   
# Display results

def display_results(stage, data):
    """Print a titled section showing *data*.

    Args:
        stage: Label placed in the ``--- ... ---`` header line.
        data: Value to show. A dict is printed one ``key: value`` pair per
            line; anything else is printed as-is on a single ``print`` call.
    """
    print(f"\n--- {stage} ---")

    # Non-mapping values need no per-entry formatting.
    if not isinstance(data, dict):
        print(data)
        return

    for label, content in data.items():
        print(f"{label}: {content}")

# Main execution
if __name__ == "__main__":
    # Show each pipeline stage: raw text, preprocessed tokens, sorted index.
    display_results("Original Corpus", corpus)
    display_results("Preprocessed Corpus", preprocessed_corpus)
    display_results("Inverted Index", sorted_inverted_index)

    # Universe of document names, required to evaluate NOT queries.
    all_documents = set(corpus.keys())

    # Sample boolean queries
    queries = [
        "vaccine AND time",
        "vaccine OR utilized",
        "NOT version",
        "version",
    ]

    # Process and print the results for each query. This loop lives inside
    # the guard because it needs `all_documents`, which is only bound here;
    # at module level it would raise NameError when the file is imported.
    for query in queries:
        result = boolean_query(query, inverted_index, all_documents)
        print(f"Query: '{query}' => Documents: {result}")


--- Original Corpus ---
doc1.txt: Facilitators and barriers to COVID-19 vaccine
uptake among women in two regions of
Ghana: A qualitative study

Although COVID-19 vaccines are available, evidence suggests that several factors hinder
or facilitate their use. Several studies have found gender differences in COVID-19 vaccine
uptake, with women less likely to vaccinate than men in many countries, including Ghana.
These studies, however, have primarily been quantitative. This study used a qualitative
approach to examine the facilitators and barriers to vaccine uptake among women in
Ghana. Using a cross-sectional descriptive qualitative research design, 30 women in the
Greater Accra and Ashanti regions of Ghana were conveniently sampled and interviewed
using a semi-structured interview guide. Fifteen (15) interviews were conducted in each
region. The data were transcribed verbatim and analysed thematically using QSR NVivo
version 10 software. Among the key factors that facilitate COVID-19 v

[nltk_data] Downloading package punkt to C:\Users\Subhassini
[nltk_data]     Sridharan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Subhassini
[nltk_data]     Sridharan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Subhassini
[nltk_data]     Sridharan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
