In [1]:
import ebooklib
from ebooklib import epub
import re
import os

import enchant

from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words, stopwords, names

import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
nltk.download('stopwords')
from bs4 import BeautifulSoup

from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from scipy.sparse import random
from sklearn.decomposition import TruncatedSVD

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.neighbors import NearestNeighbors

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Abdulmù\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abdulmù\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Read epub into paragraphs of max length

In [2]:
def merge_strings_until_limit(strings, min_length, max_length, test_for_max=0):
    merged_string = ""
    merged_strings = []

    for s in strings:
        if len(merged_string) <= min_length:
            merged_string += s
        elif len(merged_string) > max_length and test_for_max < 5:
            split_paragraph = merged_string.split('.')
            split_paragraph_re_point = [sp + '.' for sp in split_paragraph]
            merged = merge_strings_until_limit(split_paragraph_re_point, min_length, max_length, test_for_max + 1)
            merged_strings.extend(merged)
            merged_string = s
        else:
            merged_strings.append(merged_string)
            merged_string = s

    if merged_string:
        merged_strings.append(merged_string)

    return merged_strings

#### Read epub books and paragraphs 

In [3]:
def read_epub_paragraphs(epub_file, ID):
    book = epub.read_epub(epub_file)                                            # Read the EPUB file and store its content in the 'book' variable
    paragraphs = []

    for item in book.get_items():                                               # Iterate through each item in the EPUB book
        if item.get_type() == ebooklib.ITEM_DOCUMENT:                           # Check if the item is a document
            content = item.get_content().decode('utf-8')                        # Get the content of the document and decode it from bytes to string
            paragraphs.extend(content.strip().split("<p>")) 
            for i in range(len(paragraphs)):
                paragraphs[i] = re.sub('<[^<]+?>', '', paragraphs[i])                      # Remove HTML tags
                paragraphs[i] = re.sub('\s+', ' ', paragraphs[i])                          # Replace multiple whitespaces with a single space
                paragraphs[i] = re.sub('\n', ' ', paragraphs[i])
            # paragraphs.extend(content.strip().split("&#13;"))                   # Split the content into paragraphs and add them to the list
    paragraphs = merge_strings_until_limit(paragraphs, 100, 2500)               # Merge consecutive strings until they reach a certain limit
    paragraphs = [paragraph for i, paragraph in enumerate(paragraphs) if len(paragraph) > 160] 
    paragraphs = [paragraph for i, paragraph in enumerate(paragraphs) if "http" not in paragraph] 
    paragraphs = [{'paragraph':paragraphs[i], 'nr':i,'bookID':ID} for i, paragraph in enumerate(paragraphs)]
    print(paragraphs)
    print(len(paragraphs))
    
    return paragraphs                                                     # Return the paragraphs, excluding the first and last elements

In [13]:
def read_epub_folder(folder_path):
    paragraphs = []                                                             # Empty list to store paragraphs
    BooksList=sorted(os.listdir(folder_path))                                           # Get a list of files in the specified folder
    for index in range(0,10):                                                    # Iterate through the first 8 items in the list
        filename = BooksList[index]                                             # Get the filename at the current index
        if filename.endswith('.epub'):                                          # Check if the file has the .epub extension
            file_path = os.path.join(folder_path, filename)                     # Construct the full path to the file
            print(f"File: {filename}, Index: {index}")                          # Print information about the file and its index
            paragraphs.append(read_epub_paragraphs(file_path, index))           # Read paragraphs from the EPUB file and append them to the list
            # print(paragraphs)
    
    return paragraphs

In [14]:
folder_path = '../train_files/'
epub_books=read_epub_folder(folder_path)

File: 15-000-londoners-the-root-and-branch-petition.epub, Index: 0
[{'paragraph': ' 15,000 Londoners The Root and Branch Petition 1640 Retrieved on 19th May 2021 from history.hanover.edu and archive.org The petition was published in Gee, Henry, and William John Hardy, ed., Documents Illustrative of English Church History (New York: Macmillan, 1896), 537–545. The response was published in Calder, Robert, The Priesthood of the Old and New Testament by Succession (J. Wilson, 1773), 116–119. ', 'nr': 0, 'bookID': 0}, {'paragraph': ' Editors’ Introduction:  THIS petition was presented by 1,500 persons on Dec. 11, 1640, on behalf of 15,000 Londoners who had signed it. The Commons postponed its consideration, but in the following February referred it to a committee. The petition must be distinguished from the Root and Branch Bill said to have been drawn up by St. John, and presented to Parliament by Vane and Cromwell in May, 1641. The bill was dropped in the House of Commons, and finally aban

In [15]:
epub_books[5][104]

{'paragraph': ' Mandarini, Matteo. 2008. ‘Not fear but hope in the Apocalypse’ Ephemera 8(2): 176-181.  Mason, Kelvin and Mark Whitehead. 2012. Transition Urbanism and the Contested Politics of the Spatial Practice. Antipode, 44(2): 493 -516. ',
 'nr': 104,
 'bookID': 5}

### Vectorise Text - 1

In [16]:
ENGLISH_DICT1 = enchant.request_dict("en_GB")
ENGLISH_DICT2 = enchant.request_dict("en_US")

In [17]:
def is_english_word(word):
    # Initialize the Enchant English dictionary
    return (ENGLISH_DICT1.check(word) or ENGLISH_DICT2.check(word))

is_english_word('test')

True

### Preprocess the paragraphs

In [18]:
# epub_books: list
#        A book: list
#            paragraphs: list
                    # dictionary

In [19]:
def preprocess(books):
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    processed_paragraphs = []

    for book in books:
        for paragraph in book:
            words = gensim.utils.simple_preprocess(paragraph['paragraph'], min_len=3, deacc=True)
            lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
            filtered_words = [word for word in lemmatized_words if word not in stop_words and is_english_word(word)]
            stemmed_words = [stemmer.stem(word) for word in filtered_words]
            processed_paragraph = " ".join(stemmed_words)
            processed_paragraphs.append(processed_paragraph)

    return processed_paragraphs

In [20]:
processedbooks = preprocess(epub_books)

In [21]:
processedbooks[0]

'root branch petit retriev may histori archiv org petit publish gee henri john hardi document illustr english church histori new york respons publish priesthood old new testament success'

# TF-IDF

In [22]:
vectorizer_TFIDF = TfidfVectorizer(min_df=3)

In [23]:
tfidf_matrix = vectorizer_TFIDF.fit_transform(processedbooks)

In [24]:
vectorizer_TFIDF.get_feature_names_out()

array(['abandon', 'abil', 'abl', ..., 'yet', 'york', 'zero'], dtype=object)

In [25]:
# Search

query = 'rule'
processedQuery = preprocess([[{'paragraph':query}]])[0]
print(processedQuery)

rule


In [26]:
query_vector = vectorizer_TFIDF.transform([processedQuery])
query_vector

<1x1313 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [27]:
similarities = cosine_similarity(query_vector, tfidf_matrix)

In [62]:
# Top 5
similarities.flatten().argsort()[-5:][::-1]

array([242, 125,  48, 279, 109], dtype=int64)

In [63]:
def flatten(lst):
    flattened_list = []
    for item in lst:
        if isinstance(item, list):
            flattened_list.extend(flatten(item))
        else:
            flattened_list.append(item)
    return flattened_list

In [64]:
ALLParagraphsList=flatten(epub_books)

In [76]:
def print_results(ALLParagraphsList, *indices):
    [print(ALLParagraphsList[index]) for index in indices]

print_results(ALLParagraphsList, 242,125,48,279,109)

{'paragraph': ' So, if “a handful of good-for-nothings or hotheads, or even a single individual pig-headedly say no, is anarchy then to be ruled out?” Damn it! Let’s not bandy phoney arguments. Such individuals are free to say no, but they will not be able to stop others from pushing for yes—and so they will have to fit in as best they can. And if “good-for-nothings and hotheads” were sufficiently numerous as to be in a position to seriously thwart society and prevent it from blithely functioning, then …sad to say, anarchy would still be a way off. ', 'nr': 4, 'bookID': 7}
{'paragraph': ' Anarchist critiques of parliamentary politics started from Pierre Joseph Proudhon’s deception after his experience at the 1848 Assemblee nationale constituante, the assembly which followed the insurrections of February 1848 and ruled the French Second Republic from 4 May 1848 to 26 May 1849. The first political thinker who labelled himself explicitly as "an anarchist", Proudhon hoped to represent ther