In [11]:
import os
import re
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [12]:
# Hand-picked stopword set: tokens listed here are dropped by
# remove_stopwords() before lemmatization.
STOPWORDS = {
    'a', 'an', 'and', 'as', 'at',
    'by', 'for', 'in', 'is', 'it',
    'of', 'on', 'the', 'to', 'with',
}

# Converting text to lowercase
def to_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Removing URLs
def remove_urls(text):
    """Delete URL-like runs from ``text``.

    Any run of non-whitespace characters that begins with ``http`` or
    ``www`` is removed (the match may start in the middle of a word).
    Matching is case-sensitive, so callers that need to catch ``HTTP://``
    or ``WWW.`` should lowercase the text first.

    Args:
        text: Input string.

    Returns:
        A new string with the matched runs replaced by the empty string;
        surrounding whitespace is left untouched.
    """
    # ``http\S+`` already covers every ``https...`` run, so a separate
    # ``https\S+`` alternative could never be reached. No ``^``/``$`` anchors
    # are used, so ``re.MULTILINE`` had no effect and is omitted.
    return re.sub(r'http\S+|www\S+', '', text)

# Removing special characters
def remove_special_characters(text):
    """Strip every character that is neither a word character nor whitespace.

    Word characters follow the regex ``\\w`` class, so letters, digits and
    the underscore are kept; punctuation and other symbols are deleted
    without inserting a replacement.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

# Removing digits
def remove_digits(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [13]:
# Tokenization
def tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the list of pieces.

    Leading and trailing whitespace produce no empty tokens; an empty or
    whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

# Stopword removal
def remove_stopwords(tokens):
    """Return a new list holding the tokens that are not in ``STOPWORDS``.

    Order and duplicates of the surviving tokens are preserved. The
    comparison is exact, so tokens must already be in the same case as the
    entries of ``STOPWORDS``.
    """
    kept = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

# Lemmatization
def lemmatize_word(word):
    """Reduce ``word`` to a crude stem by stripping one common suffix.

    This is plain suffix stripping, not dictionary-based lemmatization.
    The suffixes ``ing``, ``ed`` and ``s`` are tried in that order and only
    the first one that applies is removed.

    Args:
        word: A single token.

    Returns:
        ``word`` without the matched suffix, or ``word`` unchanged when no
        suffix applies or when the word consists of nothing but the suffix.
    """
    for suffix in ('ing', 'ed', 's'):
        # Strip exactly the suffix length ('ed' is two characters, not
        # three) and require a non-empty stem so that tokens such as
        # "ing" or "s" never collapse to the empty string.
        if word.endswith(suffix) and len(word) > len(suffix):
            return word[:-len(suffix)]
    return word

def lemmatize_tokens(tokens):
    """Apply ``lemmatize_word`` to each token and return the results as a list.

    The output has the same length and order as ``tokens``.
    """
    return list(map(lemmatize_word, tokens))

In [14]:
# Full text preprocessing pipeline
def process_text(text):
    """Run the full preprocessing pipeline and return the resulting tokens.

    String-level stages run first, in this order: lowercase, URL removal,
    special-character removal, digit removal. The cleaned string is then
    tokenized, stopwords are dropped, and the remaining tokens are
    lemmatized.

    Args:
        text: Raw input string.

    Returns:
        List of processed tokens.
    """
    cleaned = text
    # Stages that map a string to a string.
    for string_stage in (to_lowercase, remove_urls,
                         remove_special_characters, remove_digits):
        cleaned = string_stage(cleaned)

    words = tokenize_text(cleaned)
    # Stages that map a token list to a token list.
    for token_stage in (remove_stopwords, lemmatize_tokens):
        words = token_stage(words)
    return words

In [15]:
class IRSystem:
    """Small boolean (AND) retrieval system backed by an inverted index.

    Documents are added one at a time; each receives a sequential integer
    id starting at 0. A query matches a document only if every processed
    query term occurs in that document.
    """

    def __init__(self):
        """Create an empty index."""
        # term -> integer term id, assigned in order of first appearance
        self.vocab = {}
        # raw text of every added document, positioned by doc id
        self.docs = []
        # term id -> set of ids of the documents containing that term
        self.index = defaultdict(set)
        # id that the next added document will receive
        self.doc_counter = 0
        # doc id -> filename passed to add_doc
        self.doc_map = {}

    def clean_text(self, text):
        """Turn ``text`` into index terms using the module-level ``process_text``."""
        return process_text(text)

    def add_doc(self, text, filename):
        """Index one document.

        Args:
            text: Raw document text; stored unmodified in ``self.docs``.
            filename: Label recorded for the document in ``self.doc_map``.
        """
        words = self.clean_text(text)
        doc_id = self.doc_counter
        self.docs.append(text)
        self.doc_map[doc_id] = filename
        for word in words:
            # setdefault assigns the next free term id on first sight.
            word_id = self.vocab.setdefault(word, len(self.vocab))
            self.index[word_id].add(doc_id)
        self.doc_counter += 1

    def add_docs_from_folder(self, folder):
        """Index every ``*.txt`` file found directly inside ``folder``.

        Files are read as UTF-8 and visited in sorted filename order, so
        doc ids are the same on every run (``os.listdir`` itself returns
        entries in arbitrary order).

        Args:
            folder: Path of the directory to scan (not recursive).
        """
        for file in sorted(os.listdir(folder)):
            if file.endswith('.txt'):
                with open(os.path.join(folder, file), 'r', encoding='utf-8') as f:
                    self.add_doc(f.read(), file)

    def search(self, query):
        """Return the ids of documents containing every term of ``query``.

        Args:
            query: Raw query string; processed with ``clean_text``.

        Returns:
            A new ``set`` of doc ids. The set is empty when any term is
            unknown, when no document contains all terms, or when the query
            yields no terms at all (empty or stopword-only query).
        """
        result = None
        for word in self.clean_text(query):
            word_id = self.vocab.get(word)
            if word_id is None:
                # A term that was never indexed cannot match any document.
                return set()
            doc_ids = self.index[word_id]
            # Copy on the first term so callers never receive (and cannot
            # mutate) the index's own posting set.
            result = set(doc_ids) if result is None else result & doc_ids
            if not result:
                # The intersection can only shrink; stop early.
                return set()
        # ``result`` is still None when the query produced no terms.
        return result if result is not None else set()

    def get_filenames(self, doc_ids):
        """Map doc ids to the filenames they were added with.

        Args:
            doc_ids: Iterable of doc ids, e.g. the result of ``search``.

        Returns:
            List of filenames in the iteration order of ``doc_ids``.

        Raises:
            KeyError: If an id is not present in ``self.doc_map``.
        """
        return [self.doc_map[doc_id] for doc_id in doc_ids]


In [16]:
# Placeholder entry point. It performs no work; the driver code lives in the
# ``if __name__ == "__main__":`` block below, which calls main() last.
def main():
    """Do nothing and return ``None``.

    Kept so that the existing ``main()`` call keeps working.
    """
    return None

if __name__ == "__main__":
    # Build the index from the .txt files in the corpus folder.
    ir = IRSystem()
    ir.add_docs_from_folder('/content/books')

    # Two sample queries; a document matches only if it contains every
    # processed query term.
    input1, input2 = (
        "Causes: bacteria deep wound infection susceptibility debility",
        "The classification made is British matron",
    )
    result1, result2 = ir.search(input1), ir.search(input2)

    # Report the matching filenames, separated by blank lines.
    print(f"'{input1}' is in document:", ir.get_filenames(result1))
    print('\n')
    print(f"'{input2}' is in document:", ir.get_filenames(result2))

    main()

'Causes: bacteria deep wound infection susceptibility debility' is in document: ['book4.txt']


'The classification made is British matron' is in document: ['book1.txt', 'book3.txt']
