## Group 56  
## Problem Statement Assignment - 1 Problem Set - 9


## Contributors
| **Name**                | **BITS ID**      | **Contributions**  |
|-------------------------|-----------------|-------------------|
| SUBHASIS CHAKRABORTY    | 2023 AC 05309   | 100 %             |
| LALITHA SHREE V         | 2023 AC 05278   | 100 %             |
| RAMYA S                 | 2023 AC 05705   | 100 %             |
| REVATHI P               | 2023 AD 05044   | 100 %             |


In [26]:
# Installation of nltk
# In Jupyter, the console commands can be executed by the ‘!’ sign before the command within the cell
!pip install nltk
!pip install openpyxl





### Develop an application to preprocess a document collection for effective Boolean query retrieval. Implement a function that takes a dictionary of documents (where document IDs serve as keys and text content as values) and constructs an inverted index from the pre-processed text. Compute and display the number of tokens at various stages of preprocessing:
-   A.    Before preprocessing
-   B.    After stopword removal
-   C.    After normalization
-   D.    After stemming/lemmatization. 

In [27]:
import os
from docx import Document
import PyPDF2
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load and process all files from a dataset directory
def load_documents_from_folder(folder_path):
    """Load text, docx, pdf, and xlsx files from a folder into a dictionary."""
    documents = {}
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as file:
                documents[filename] = file.read()
        elif filename.endswith('.docx'):
            doc = Document(file_path)
            documents[filename] = '\n'.join([para.text for para in doc.paragraphs])
        elif filename.endswith('.pdf'):
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                documents[filename] = '\n'.join(page.extract_text() for page in reader.pages)
        elif filename.endswith('.xlsx'):
            xls = pd.ExcelFile(file_path)
            documents[filename] = '\n'.join(xls.parse(sheet_name).to_string(index=False) for sheet_name in xls.sheet_names)
    return documents


# Preprocess text
def preprocess_text(text, use_stemming=True):
    # Tokenization
    tokens = word_tokenize(text)
    token_count_before = len(tokens)

    # Stopword Removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]
    token_count_stopwords_removed = len(tokens)

    # Normalization (lowercasing, removing punctuation)
    # not removing numbers
    tokens = [word.lower() for word in tokens if word.isalnum()]
    token_count_normalized = len(tokens)

    # Stemming/Lemmatization
    if use_stemming:
        stemmer = PorterStemmer()
        tokens =  [stemmer.stem(token) for token in tokens]
    else:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    token_count_lemmatized = len(tokens)

    return tokens, token_count_before, token_count_stopwords_removed, token_count_normalized, token_count_lemmatized

# Build inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(set)
    token_counts_per_doc = {}

    for doc_id, text in documents.items():
        tokens, count_before, count_stopwords_removed, count_normalized, count_lemmatized = preprocess_text(text)
        for token in tokens:
            inverted_index[token].add(doc_id)

        # Store token counts for the document
        token_counts_per_doc[doc_id] = {
            "before": count_before,
            "stopwords_removed": count_stopwords_removed,
            "normalized": count_normalized,
            "lemmatized": count_lemmatized,
        }

    return inverted_index, token_counts_per_doc

# executor Program
def preprocess_executor():
    dataset_path = "Dataset"  
    documents = load_documents_from_folder(dataset_path)
    inverted_index, token_counts_per_doc = build_inverted_index(documents)

    total_counts = {"before": 0, "stopwords_removed": 0, "normalized": 0, "lemmatized": 0}
    # Display token counts for each document
    for doc_id, counts in token_counts_per_doc.items():
        print(f"Document: {doc_id}")
        print(f"Tokens before preprocessing: {counts['before']}")
        print(f"Tokens after stopword removal: {counts['stopwords_removed']}")
        print(f"Tokens after normalization: {counts['normalized']}")
        print(f"Tokens after lemmatization: {counts['lemmatized']}\n")

        # Accumulate totals
        total_counts["before"] += counts["before"]
        total_counts["stopwords_removed"] += counts["stopwords_removed"]
        total_counts["normalized"] += counts["normalized"]
        total_counts["lemmatized"] += counts["lemmatized"]

    # Display total token counts
    print("Total Token Counts:")
    print(f"Tokens before preprocessing: {total_counts['before']}")
    print(f"Tokens after stopword removal: {total_counts['stopwords_removed']}")
    print(f"Tokens after normalization: {total_counts['normalized']}")
    print(f"Tokens after lemmatization: {total_counts['lemmatized']}\n")

    # Display inverted index
    print("Inverted Index:")
    for term, doc_ids in sorted(inverted_index.items()):
        print(f"{term}: {sorted(doc_ids)}")


preprocess_executor()


Document: Nature_1.docx
Tokens before preprocessing: 167
Tokens after stopword removal: 110
Tokens after normalization: 84
Tokens after lemmatization: 84

Document: Nature_10.pdf
Tokens before preprocessing: 155
Tokens after stopword removal: 99
Tokens after normalization: 79
Tokens after lemmatization: 79

Document: Nature_2.pdf
Tokens before preprocessing: 197
Tokens after stopword removal: 114
Tokens after normalization: 85
Tokens after lemmatization: 85

Document: Nature_3.txt
Tokens before preprocessing: 144
Tokens after stopword removal: 96
Tokens after normalization: 63
Tokens after lemmatization: 63

Document: Nature_4.xlsx
Tokens before preprocessing: 120
Tokens after stopword removal: 75
Tokens after normalization: 62
Tokens after lemmatization: 62

Document: Nature_5.docx
Tokens before preprocessing: 85
Tokens after stopword removal: 57
Tokens after normalization: 46
Tokens after lemmatization: 46

Document: Nature_6.pdf
Tokens before preprocessing: 171
Tokens after stopword

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Subhasis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Subhasis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Subhasis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2.     Implement dictionary compression techniques to estimate space savings and compare their efficiency. Techniques to be implemented include:
·       Without blocking
·       With blocking
·       Blocking combined with front coding. (for both dictionary and posting lists)
Analyze the space utilization of the compressed dictionary for each method and present the results.

In [23]:
import sys

# Compression without blocking
def compress_without_blocking(dictionary):
    compressed_dict = {}  # Store term and offset
    current_offset = 0
    for term in sorted(dictionary):
        compressed_dict[term] = current_offset
        current_offset += len(term) + 1  # +1 for null terminator
    return compressed_dict

# Compression with blocking
def compress_with_blocking(dictionary, block_size):
    compressed_dict = {}
    current_offset = 0
    block = []
    block_start = None

    for idx, term in enumerate(sorted(dictionary)):
        if idx % block_size == 0:
            block_start = term
            block = []
            compressed_dict[block_start] = current_offset
        block.append(term)
        current_offset += len(term) + 1

    return compressed_dict

# Compression with blocking and front coding
def compress_with_front_coding_legacy(dictionary, block_size):
    compressed_dict = {}
    current_offset = 0
    block = []
    block_start = None

    for idx, term in enumerate(sorted(dictionary)):
        if idx % block_size == 0:
            block_start = term
            block = []
            compressed_dict[block_start] = current_offset
            prev_term = term
            current_offset += len(term) + 1
        else:
            prefix_length = 0
            while (prefix_length < len(prev_term) and prefix_length < len(term) and prev_term[prefix_length] == term[prefix_length]):
                prefix_length += 1
            compressed_dict[f"{prefix_length}|{term[prefix_length:]}"] = current_offset
            current_offset += len(term) - prefix_length + 1
            prev_term = term

    return compressed_dict

def compress_with_front_coding(dictionary, block_size):
    """Compress dictionary and posting lists using blocking combined with front coding."""
    compressed_dict = []
    keys = sorted(dictionary.keys())

    for i in range(0, len(keys), block_size):
        block = keys[i:i + block_size]
        base = block[0]
        # Compress terms in the block using front coding
        encoded_block = [base] + [key[len(base):] for key in block[1:]]
        # Compress posting lists (convert sets to lists and apply front coding)
        postings = [sorted(list(dictionary[key])) for key in block]
        compressed_postings = compress_posting_lists_with_front_coding(postings)
        compressed_dict.append((encoded_block, compressed_postings))

    return compressed_dict

def compress_posting_lists_with_front_coding(postings):
    """Compress posting lists using front coding for strings."""
    compressed_postings = []
    for block in postings:
        if not block:
            compressed_postings.append([])
            continue
        base = block[0]
        # For strings, store suffixes relative to the base
        encoded_block = [base] + [doc[len(base):] for doc in block[1:] if doc.startswith(base)]
        compressed_postings.append(encoded_block)
    return compressed_postings


# Calculate dictionary size
def calculate_size(dictionary):
    return sum(sys.getsizeof(key) + sys.getsizeof(value) for key, value in dictionary.items())

# Main execution
if __name__ == "__main__":
    dataset_path = "Dataset" 
    documents = load_documents_from_folder(dataset_path)
    inverted_index, token_counts_per_doc = build_inverted_index(documents)

    # Compress the dictionary
    dictionary = {term: postings for term, postings in inverted_index.items()}

    compressed_without_blocking = compress_without_blocking(dictionary)
    compressed_with_blocking = compress_with_blocking(dictionary, block_size=8)
    compressed_with_front_coding = compress_with_front_coding(dictionary, block_size=8)

    # Analyze space savings
    original_size = calculate_size(dictionary)
    size_without_blocking = calculate_size(compressed_without_blocking)
    size_with_blocking = calculate_size(compressed_with_blocking)
    size_with_front_coding = sum(sys.getsizeof(block) for block in compressed_with_front_coding)


    print("Space Utilization Analysis:")
    print(f"Original Dictionary Size: {original_size} bytes")
    print(f"Compressed Without Blocking: {size_without_blocking} bytes")
    print(f"Compressed With Blocking: {size_with_blocking} bytes")
    print(f"Compressed With Front Coding: {size_with_front_coding} bytes")



Space Utilization Analysis:
Original Dictionary Size: 120960 bytes
Compressed Without Blocking: 33936 bytes
Compressed With Blocking: 4250 bytes
Compressed With Front Coding: 3192 bytes
