In [8]:
import os
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
import pickle

##### Tokenization #####

In [2]:
def process_text_files(folder_path):
    """Tokenize every ``.txt`` file in a folder into cleaned sentence tokens.

    Each matching file is read as UTF-8 and split into sentences with NLTK's
    ``sent_tokenize``. Every sentence then has all characters other than word
    characters and whitespace removed, is lower-cased, split on whitespace,
    and filtered against NLTK's English stop-word list.

    Files are visited in sorted filename order. ``os.listdir`` returns names
    in arbitrary order, and the position of a book in the returned list is
    what downstream code uses as the book index, so the order must be stable
    between runs.

    Sentences that end up with no tokens are kept as empty lists, so a
    sentence's position in the result matches its position in the
    ``sent_tokenize`` output.

    Args:
        folder_path: Directory to scan (not recursive). Only names ending in
            ``.txt`` are processed.

    Returns:
        A list with one entry per processed file; each entry is a list of
        sentences, and each sentence is a list of token strings.
    """
    stop_words = set(stopwords.words('english'))
    tokenized_text = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, filename)
        # Read the whole file first so the handle is closed before the
        # (potentially slow) tokenization starts.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        cleaned_sentences = []
        for sentence in sent_tokenize(text):
            # Remove punctuation and other non-word, non-space characters.
            # NOTE(review): this happens before stop-word filtering, so a
            # contraction such as "don't" becomes "dont" and may no longer
            # match the stop-word list -- confirm this is intended.
            cleaned_sentence = re.sub(r'[^\w\s]', '', sentence)
            tokens = cleaned_sentence.lower().split()
            cleaned_sentences.append(
                [token for token in tokens if token not in stop_words]
            )
        tokenized_text.append(cleaned_sentences)

    return tokenized_text

In [3]:
# Function to save tokens to a text file
def save_tokens_to_file(tokenized_text, filename):
    """Write tokens to a UTF-8 text file, one sentence per line.

    Every token is followed by a single space and every sentence is
    terminated by a newline. Sentences from all books are written back to
    back with no separator between books.

    The parent directory of ``filename`` is created if it does not exist.
    A bare filename (no directory component) is written to the current
    working directory.

    Any exception raised while creating the directory or writing the file
    is caught and reported with ``print("Error:", ...)`` instead of being
    raised.

    Args:
        tokenized_text: Iterable of books; each book is an iterable of
            sentences; each sentence is an iterable of token strings.
        filename: Destination path of the text file.
    """
    directory = os.path.dirname(filename)
    try:
        # os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises, so only create a directory when one is
        # actually named.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as file:
            for book in tokenized_text:
                for sentence_tokens in book:
                    # Each token keeps its trailing space so the file
                    # format stays exactly as before.
                    line = ''.join(token + ' ' for token in sentence_tokens)
                    file.write(line + '\n')
    except Exception as e:
        print("Error:", e)

In [5]:
# Tokenize every O-Level book found in the converted-books folder.
olevel_folder = "ConvertedBooks/Olevel"
olevel_tokenized_text = process_text_files(olevel_folder)

# Preview the first sentence of the first book as a sanity check.
print("Tokenized text for O-Level book:")
# Guard the preview: indexing [0][0] raises IndexError when the folder has
# no .txt files or the first book contains no sentences.
if olevel_tokenized_text and olevel_tokenized_text[0]:
    print(olevel_tokenized_text[0][0])
else:
    print("(no sentences found in {})".format(olevel_folder))

Tokenized text for O-Level book:
['resource', 'endorsed', 'cambridge', 'assessment', 'international', 'education', 'provides', 'support', 'option', 'b', 'cambridge', 'igcse', 'igcse', '91', 'level', 'syllabuses', '047009772147', 'examination', '2020']


In [6]:
# Save the O-Level tokens to a text file.
# The path variable now holds the real destination and is what gets passed
# to the save call; previously it named a different file and was unused.
olevel_output_file = "GeneratedTokens/Olevel/Book.txt"
save_tokens_to_file(olevel_tokenized_text, olevel_output_file)
# NOTE: save_tokens_to_file prints errors instead of raising them, so this
# message is printed even when the save failed -- check for an "Error:" line.
print("Tokens saved to files successfully.")

Tokens saved to files successfully.


##### Indexing #####

In [None]:
def create_inverted_index(tokenized_text):
    """Map each token to the list of places where it occurs.

    Args:
        tokenized_text: Sequence of books; each book is a sequence of
            sentences; each sentence is a sequence of tokens.

    Returns:
        A ``defaultdict(list)`` whose keys are tokens and whose values are
        lists of ``(book_index, sentence_index)`` tuples in encounter order.
        A token that occurs several times in one sentence contributes one
        tuple per occurrence.
    """
    # Flatten the nested structure into a stream of (token, location) pairs.
    occurrences = (
        (word, (book_no, sentence_no))
        for book_no, sentences in enumerate(tokenized_text)
        for sentence_no, words in enumerate(sentences)
        for word in words
    )

    postings = defaultdict(list)
    for word, location in occurrences:
        postings[word].append(location)
    return postings

In [7]:
# Build the inverted index for the O-Level books and look up one sample word.
olevel_inverted_index = create_inverted_index(olevel_tokenized_text)
word = 'mathematics'
print(f"\nO-Level books containing the word '{word}' are indexed at:")
# .get() avoids inserting an empty entry into the defaultdict for a word
# that is not in the index.
print(olevel_inverted_index.get(word, []))


O-Level books containing the word 'mathematics' are indexed at:
[(0, 7035), (0, 8418), (0, 8419)]


In [9]:
# Persist the O-Level inverted index to disk with pickle.
# NOTE(review): the file holds binary pickle data despite its ".txt"
# extension; the path is left unchanged here.
olevel_index_file = "Indexing/Olevel/Book.txt"
directory = os.path.dirname(olevel_index_file)
try:
    # Create the target directory if it is missing, then write the index.
    os.makedirs(directory, exist_ok=True)
    with open(olevel_index_file, 'wb') as index_handle:
        pickle.dump(olevel_inverted_index, index_handle)
except Exception as e:
    print("Error:", e)
else:
    print("Inverted indices saved to files successfully.")


Inverted indices saved to files successfully.
