In [1]:
import ebooklib
from ebooklib import epub
import re
import os

In [2]:
# Switch the kernel's working directory to the folder that is scanned for EPUB files further down.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere unchanged.
%cd E:\bbks

E:\bbks


In [3]:
def merge_strings_until_limit(strings, min_length, max_length, test_for_max = 0):
    """Greedily concatenate consecutive strings into chunks of bounded length.

    Strings are appended to the current chunk while the chunk is at most
    ``min_length`` characters long. As soon as it is longer, the chunk is
    emitted and the next string starts a new chunk. A chunk that ended up
    longer than ``max_length`` is split into sentences at ``'.'`` and those
    sentences are merged again by a recursive call with the same limits.

    Args:
        strings: Iterable of text fragments in reading order.
        min_length: A chunk keeps growing while ``len(chunk) <= min_length``.
        max_length: Chunks longer than this are re-split at sentence ends.
        test_for_max: Current recursion depth. Re-splitting is only attempted
            while this is below 5; callers normally leave it at the default.

    Returns:
        List of merged chunks in input order.
    """
    merged_strings = []

    def _split_sentences(text):
        # Split at '.', keeping each period attached to the sentence it ends.
        # ''.join(result) == text: no period is invented for a tail that had
        # none, and no lone '.' is produced when the text ends with a period.
        pieces = text.split('.')
        sentences = [piece + '.' for piece in pieces[:-1]]
        if pieces[-1]:
            sentences.append(pieces[-1])
        return sentences

    def _emit(chunk):
        # Append a finished chunk, re-splitting it first when it is too long.
        if len(chunk) > max_length and test_for_max < 5:
            sentences = _split_sentences(chunk)
            # A chunk without any sentence break cannot be shortened by
            # recursing, so it is kept whole.
            if len(sentences) > 1:
                merged_strings.extend(
                    merge_strings_until_limit(sentences, min_length, max_length, test_for_max + 1)
                )
                return
        merged_strings.append(chunk)

    merged_string = ""
    for s in strings:
        if len(merged_string) <= min_length:
            merged_string += s
        else:
            _emit(merged_string)
            merged_string = s

    # The trailing chunk goes through the same length check as all the others.
    if merged_string:
        _emit(merged_string)

    return merged_strings

In [4]:
def read_epub_paragraphs(epub_file, ID):
    """Extract the text of one EPUB file as a list of paragraph records.

    Every document item of the book is decoded as UTF-8, stripped of markup
    tags, whitespace-normalised and split on the literal text ``&#13;``.
    The fragments of all items are then merged into chunks with
    ``merge_strings_until_limit(..., 400, 1000)``.

    Args:
        epub_file: Path of the EPUB file, passed to ``epub.read_epub``.
        ID: Value stored under ``'bookID'`` in every returned record.

    Returns:
        List of dicts with the keys ``'paragraph'`` (chunk text), ``'nr'``
        (position of the chunk in the merged list, counted before short
        chunks are filtered out) and ``'bookID'``. Chunks of 50 characters
        or fewer are dropped, and the first and last remaining records are
        removed as well.
    """
    book = epub.read_epub(epub_file)
    paragraphs = []

    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        content = item.get_content().decode('utf-8')
        content = re.sub(r'<[^<]+?>', '', content)  # Remove HTML tags
        # Raw string avoids the invalid '\s' escape. Collapsing whitespace
        # also replaces every newline, so no separate '\n' pass is needed.
        content = re.sub(r'\s+', ' ', content)
        # NOTE(review): "&#13;" is the carriage-return character reference,
        # presumably left at paragraph ends by the EPUB producer — confirm.
        paragraphs.extend(content.strip().split("&#13;"))

    paragraphs = merge_strings_until_limit(paragraphs, 400, 1000)
    records = [
        {'paragraph': text, 'nr': i, 'bookID': ID}
        for i, text in enumerate(paragraphs)
        if len(text) > 50
    ]
    # Drop the first and last surviving record.
    return records[1:-1]

In [5]:
import os

def read_all_epub_paragraphs(folder_path):
    """Collect the paragraph records of every ``.epub`` file in a folder.

    Files are processed in sorted name order, so the position of a record in
    the returned list is the same on every run and every platform.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        One flat list with the records returned by ``read_epub_paragraphs``
        for each book; the file name without its extension is used as the
        book id.
    """
    all_paras = []

    # os.listdir() returns names in arbitrary order; sort for reproducibility.
    for f in sorted(os.listdir(folder_path)):
        if not f.endswith(".epub"):
            continue
        file_path = os.path.join(folder_path, f)
        book_id = os.path.splitext(f)[0]
        all_paras.extend(read_epub_paragraphs(file_path, book_id))

    return all_paras

In [6]:
# Parse every EPUB in the folder into one flat list of paragraph records.
folder_path = r"E:\bbks"  # NOTE(review): absolute, machine-specific path
all_paras = read_all_epub_paragraphs(folder_path)
# NOTE(review): displaying the whole list prints every paragraph; a slice would keep the output short.
all_paras



[{'paragraph': 'Poemata: Latin, Greek and Italian Poems Latin, Greek and Italian Poems by John Milton John Milton Translator : William Cowper Copyright © 2015 epubBooks All Rights Reserved. This publication is protected by copyright. By payment of the required fees, you have been granted the non-exclusive, non-transferable right to access and read the text of this ebook on-screen or via personal text-to-speech computer systems. No part of this text may be reproduced, transmitted, downloaded, decompiled, reverse engineered, stored in or introduced into any information storage and retrieval system, in any form or by any means, whether electronic or mechanical, now known or hereinafter invented, without the express written permission of epubBooks. www.epubbooks.com',
  'nr': 1,
  'bookID': 'milton-poemata-latin-greek-and-italian-poems'},
 {'paragraph': 'Complimentary Pieces Addressed to the Author.[1] Well as the author knows that the following testimonies are not so much about as above h

In [33]:
# Spot-check a single record by its position in the combined list.
all_paras[1200]

{'paragraph': " And―?' 'In the mind. In human memories.' 'In memory. Very well, then. We, the Party, control all records, and we control all memories. Then we control the past, do we not?' 'But how can you stop people remembering things?' cried Winston again momentarily forgetting the dial. 'It is involuntary. It is outside oneself. How can you control memory? You have not controlled mine!' O'Brien's manner grew stern again.",
 'nr': 995,
 'bookID': 'orwell-nineteen-eighty-four'}

In [7]:
from nltk.corpus import words

In [8]:
import gensim
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words, stopwords, names

In [10]:
def get_paragraphs(all_paras):
    """Return the ``'paragraph'`` text of every record, in the given order."""
    return [record['paragraph'] for record in all_paras]

In [13]:
from nltk.corpus import words
# Entries of the NLTK `words` corpus as a set; process_para tests tokens for membership in it.
english_words_set = set(words.words())

In [14]:
def process_para(paras):
    """Normalise paragraphs into space-joined strings of stemmed tokens.

    Each paragraph is tokenised with ``gensim.utils.simple_preprocess``
    (``min_len=3``, ``deacc=True``) and lemmatised with WordNet. Tokens are
    kept only if they are not English stop words and occur in the
    module-level ``english_words_set``; the survivors are Porter-stemmed.

    Args:
        paras: Iterable of raw paragraph strings.

    Returns:
        List with one processed string per input paragraph, in input order.
    """
    # These helpers do not depend on the paragraph, so build them once instead
    # of once per iteration (the stop-word list in particular is reloaded by
    # every stopwords.words() call).
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    para2 = []
    for p in paras:
        # Named `tokens` so the imported nltk `words` corpus is not shadowed.
        tokens = gensim.utils.simple_preprocess(p, min_len=3, deacc=True)
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        kept = [w for w in lemmas if (w not in stop_words) and (w in english_words_set)]
        para2.append(" ".join(stemmer.stem(w) for w in kept))

    return para2

In [15]:
# Pull the raw paragraph texts out of the records, then normalise them for retrieval.
# para2[i] is the processed form of all_paras[i]['paragraph'].
paras = get_paragraphs(all_paras)
para2 = process_para(paras)

In [16]:
# Inspect the processed paragraphs.
# NOTE(review): this displays the entire list; a slice would keep the output short.
para2

['poem poem translat copyright right reserv public copyright payment fee non exclus non transfer right access read text screen via person text speech comput system part text may revers inform storag retriev system form mean whether electron mechan known hereinaft without express written permiss',
 'complimentari piec author well author know follow testimoni much men great ingenu well friend apt abund zeal prais rather draw like wa yet unwil world remain alway ignor composit much especi ha friend much importun solicit public',
 'awar excess commend envi would hand thrust much danger tribut may right belong time cannot deni set highest valu suffrag judici distinguish person manso marqui villa featur form mien manner mind intellig refin thi pieti fault free thou wouldst angl angel',
 'epigram poet worthi three laurel poesi urn depress boast henceforth thi let flood sinc shall singl match three sound thi homer thi name equal fame',
 'gentleman ode exalt sky may form starri crown beyond hel

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import random
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [29]:
def get_three_best_results(para2,all_paras,query):
    """Find the paragraph closest to ``query`` under three different models.

    The query is normalised with ``process_para`` and matched against the
    processed paragraphs using (1) TF-IDF cosine similarity, (2) cosine
    similarity after TruncatedSVD of the TF-IDF matrix and (3) a Doc2Vec model
    trained on the processed paragraphs. All models are fitted on every call.

    Args:
        para2: Processed paragraphs (space-joined tokens), aligned by index
            with ``all_paras``.
        all_paras: Records with the keys ``'paragraph'``, ``'nr'``, ``'bookID'``.
        query: Raw query text.

    Returns:
        Flat list of nine strings: for each of TF-IDF, SVD and Doc2Vec the
        labelled paragraph text, then ``'nr:<nr>'``, then ``'ID: <bookID>'``.
    """
    vectorizer = TfidfVectorizer(min_df=3)
    tfidf_matrix = vectorizer.fit_transform(para2)

    processedQuery = process_para([query])
    query_vector = vectorizer.transform(processedQuery)
    tfidf_best = int(cosine_similarity(query_vector, tfidf_matrix).argmax())

    # The randomized solver rejects n_components larger than the vocabulary,
    # so clamp the target dimensionality for small corpora.
    n_components = min(250, tfidf_matrix.shape[1])
    svd = TruncatedSVD(n_components=n_components, algorithm='randomized')
    reduced_matrix = svd.fit_transform(tfidf_matrix)
    reduced_query_vec = svd.transform(query_vector)
    svd_best = int(cosine_similarity(reduced_query_vec, reduced_matrix).argmax())

    # Doc2Vec works on token lists. Passing the joined string would make it
    # treat every single character as a word.
    documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(para2)]
    # NOTE(review): alpha == min_alpha keeps the learning rate constant over
    # all epochs — confirm this is intended.
    model = Doc2Vec(vector_size=250, min_count=0, alpha=0.025, min_alpha=0.025, epochs=100)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

    # process_para returns a one-element list; infer from that element's tokens.
    query_tokens = processedQuery[0].split()
    vector = model.infer_vector(query_tokens)
    # Document tags are the integer positions assigned above.
    doc2vec_best = int(model.dv.most_similar(positive=[vector])[0][0])

    results = []
    for label, index in (('TF-IDF: ', tfidf_best), ('SVD: ', svd_best), ('Doc2Vec: ', doc2vec_best)):
        record = all_paras[index]
        results.append(label + record['paragraph'])
        results.append('nr:' + str(record['nr']))
        results.append('ID: ' + record['bookID'])
    return results

In [30]:
# Example query: show the best match from each of the three models.
get_three_best_results(para2,all_paras,'Tall white pyramids reaching for the sky')

['TF-IDF:  The Ministry of Truth—Minitrue, in Newspeak [Newspeak was the official language of Oceania. For an account of its structure and etymology see Appendix.]—was startlingly different from any other object in sight. It was an enormous pyramidal structure of glittering white concrete, soaring up, terrace after terrace, 300 metres into the air. From where Winston stood it was just possible to read, picked out on its white face in elegant lettering, the three slogans of the Party: WAR IS PEACE FREEDOM IS SLAVERY IGNORANCE IS STRENGTH The Ministry of Truth contained, it was said, three thousand rooms above ground level, and corresponding ramifications below.',
 'nr:11',
 'ID: orwell-nineteen-eighty-four',
 "SVD:  O'Brien motioned with his head to the man in the white coat, who had stood immobile throughout the proceedings. The man in the white coat bent down and looked closely into Winston's eyes, felt his pulse, laid an ear against his chest, tapped here and there, then he nodded to

In [34]:
def get_topk_best_results(para2, all_paras, query, k):
    """Return the ``k`` paragraphs most similar to ``query`` by TF-IDF cosine similarity.

    Args:
        para2: Processed paragraphs (space-joined tokens), aligned by index
            with ``all_paras``.
        all_paras: Records with the keys ``'paragraph'``, ``'nr'``, ``'bookID'``.
        query: Raw query text; it is normalised with ``process_para``.
        k: Number of results wanted. Values of 0 or less yield an empty list;
            values above the number of paragraphs yield all of them.

    Returns:
        List of dicts with the keys ``'paragraph'``, ``'nr'`` and
        ``'bookID'``, ordered from most to least similar.
    """
    # A slice of [-0:] would select *every* paragraph and a negative k would
    # slice from the wrong end, so answer those cases before fitting anything.
    if k <= 0:
        return []

    vectorizer = TfidfVectorizer(min_df=3)
    tfidf_matrix = vectorizer.fit_transform(para2)

    processedQuery = process_para([query])
    query_vector = vectorizer.transform(processedQuery)
    similarities = cosine_similarity(query_vector, tfidf_matrix)

    # argsort is ascending: take the last k positions and reverse them.
    topk_indices = similarities.argsort()[0][-k:][::-1]

    return [
        {
            'paragraph': all_paras[index]['paragraph'],
            'nr': all_paras[index]['nr'],
            'bookID': all_paras[index]['bookID'],
        }
        for index in topk_indices
    ]

In [35]:
# Example query: the three paragraphs ranked highest by TF-IDF cosine similarity.
get_topk_best_results(para2, all_paras, 'Tall white pyramids reaching for the sky', 3)

[{'paragraph': ' The Ministry of Truth—Minitrue, in Newspeak [Newspeak was the official language of Oceania. For an account of its structure and etymology see Appendix.]—was startlingly different from any other object in sight. It was an enormous pyramidal structure of glittering white concrete, soaring up, terrace after terrace, 300 metres into the air. From where Winston stood it was just possible to read, picked out on its white face in elegant lettering, the three slogans of the Party: WAR IS PEACE FREEDOM IS SLAVERY IGNORANCE IS STRENGTH The Ministry of Truth contained, it was said, three thousand rooms above ground level, and corresponding ramifications below.',
  'nr': 11,
  'bookID': 'orwell-nineteen-eighty-four'},
 {'paragraph': " Outside man there is nothing.' 'But the whole universe is outside us. Look at the stars! Some of them are a million light–years away. They are out of our reach for ever.' 'What are the stars?' said O'Brien indifferently. 'They are bits of fire a few 