### Imports

In [163]:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import regex as re
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import json
import seaborn as sns
import zipfile
import cssutils
import logging
cssutils.log.setLevel(logging.CRITICAL)

### Load Ebook and run Ace Checker

In [164]:
# Get Path
# The name is expected WITHOUT the ".epub" extension because it is also used as
# the name of the Ace results sub-folder. Surrounding whitespace and an
# accidentally typed ".epub" suffix are removed so that `filepath` never ends
# up as "<name>.epub.epub".
filename = input("Kindly provide the filename of the EPUB: ").strip()
if filename.endswith('.epub'):
    filename = filename[:-len('.epub')]
filepath = filename + '.epub'

# Run Ace Checker on Epub
# Uncomment for first use to navigate to sub-folder
# %cd OAPEN Ebooks
# !ace --verbose --force --subdir --outdir results $filepath
# !ace --verbose --subdir --outdir results $filepath

Kindly provide the filename of the EPUB: e9a21834-1853-40ae-a2ea-0515d2ade7a3


### Check errors

In [165]:
# Read the Ace report that belongs to this EPUB from the results folder
report_path = 'results/' + filename + '/report.json'
with open(report_path, encoding="utf8") as f:
    report = json.load(f)

# Metadata of the tested publication and the per-document assertion groups
metadata = report['earl:testSubject']['metadata']
assertions = report['assertions']

# Every failed test: only groups whose overall outcome is 'fail' are inspected,
# and within those only the individual assertions that failed are kept.
errors = [
    assertion['earl:test']
    for entry in assertions
    if entry['earl:result']['earl:outcome'] == 'fail'
    for assertion in entry['assertions']
    if assertion['earl:result']['earl:outcome'] == 'fail'
]

Wall time: 9.63 ms


### Extract error titles and check for target errors

In [166]:
# Titles of all failed tests found in the Ace report
error_titles = {error['dct:title'] for error in errors}

# The pagebreak fix is needed when either page-list error was reported
pagebreak_titles = {'epub-pagelist-missing-pagebreak', 'epub-pagelist-broken'}
PageBreakFix = not error_titles.isdisjoint(pagebreak_titles)
if PageBreakFix:
    print('PageBreakFix: True')

# The link fix is needed when links inside text blocks were flagged
LinkFix = 'link-in-text-block' in error_titles
if LinkFix:
    print('LinkFix: True')

### Open and disassemble with Ebooklib

In [168]:
# Load the EPUB unconditionally: later cells reference `book` whether or not a
# fix was requested, so binding it only inside the `if` left `book` undefined
# (NameError further down) for books without any of the targeted errors.
#
# NOTE(review): the original comment here stated that ignoring the NCX is the
# default "to avoid duplication at end of pipeline". To request it explicitly
# use options={'ignore_ncx': True} -- the key has no trailing space -- and
# confirm the default against the installed ebooklib version.
book = epub.read_epub(filepath)

if not (PageBreakFix or LinkFix):
    print("No errors detected (within this thesis' scope)")

Wall time: 24.1 ms


### Declare helper functions

In [169]:
def LoadHTMLSnippets(error_type):
    """
    Collect, per EPUB item, the HTML snippets reported for one error type.

    Reads the module-level Ace `report`. Every entry of report['assertions']
    contributes one key (its 'earl:testSubject' url); the value is the list of
    'html' snippets of those nested assertions whose test title equals
    `error_type` (an empty list when there are none).
    """

    snippets_by_chapter = {}

    for document in report['assertions']:
        url = document['earl:testSubject']['url']
        snippets_by_chapter[url] = [
            found['earl:result']['html']
            for found in document['assertions']
            if found['earl:test']['dct:title'] == error_type
        ]

    return snippets_by_chapter



def RefineHTMLSnippets(linkfix_dict):
    """
    Refines the HTML snippets so they are easier to match in the BS4 soup.

    Each snippet is reduced to its first double-quoted `href="..."` attribute.

    Parameters
    ----------
    linkfix_dict : dict
        Maps a chapter name to a list of HTML snippet strings.

    Returns
    -------
    dict
        Same keys as `linkfix_dict`; each value lists the `href="..."`
        fragments found in that chapter's snippets, in order. Snippets that
        contain no such attribute are skipped instead of raising a TypeError
        on the failed match.
    """

    refined_dict = {}
    href_pattern = r'href=".*?"'

    for chapter, snippets in linkfix_dict.items():
        refined_dict[chapter] = []

        for snippet in snippets:
            re_match_object = re.search(href_pattern, snippet)

            # No double-quoted href in this snippet: nothing to match on later
            if re_match_object is None:
                continue

            refined_dict[chapter].append(re_match_object[0])

    return refined_dict

### Fix links

In [170]:
# These names are consumed by the reassembly cell, so they are always defined,
# even when only the pagebreak fix (or no fix) is requested. Previously
# `chapter_item_list` existed only when LinkFix was True.
chapter_item_list = []
href_pattern = r'href=".*?"'
error_links = []
unerror_links = []

if LinkFix:
    # Load refined HTML snippets of link-in-text-block errors
    linkfix_dict = LoadHTMLSnippets('link-in-text-block')
    href_dict = RefineHTMLSnippets(linkfix_dict)
else:
    # Nothing to underline: every document is passed through untouched
    linkfix_dict = {}
    href_dict = {}

for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):

    # Documents that are unknown to the report, or that have no flagged links,
    # are kept as they are
    html_snippets = href_dict.get(item.get_name())
    if not html_snippets:
        chapter_item_list.append(item)
        continue

    # Create BS4 object from chapter
    soup = BeautifulSoup(item.content, 'html.parser')

    for link in soup.find_all('a'):

        # skip link if it has no href
        if 'href' not in link.attrs:
            continue

        # The serialised tag may not contain a double-quoted href (e.g. when
        # the value itself contains a double quote); such a link cannot be
        # matched against the report, so it is skipped rather than crashing.
        href_match = re.search(href_pattern, str(link))
        if href_match is None:
            continue
        link_href = href_match[0]

        if link_href in html_snippets:
            # Append to, rather than replace, any inline style already present
            existing_style = link.get('style', '').strip().rstrip(';')
            if existing_style:
                link['style'] = existing_style + ";text-decoration:underline"
            else:
                link['style'] = "text-decoration:underline"
            unerror_links.append((item.get_name(), link_href, html_snippets))
        else:
            error_links.append((item.get_name(), link_href, html_snippets))

    # Write the modified markup back into the EPUB item
    item.content = str(soup).encode('utf-8')
    chapter_item_list.append(item)

Wall time: 46.8 ms


### Save modified EPUB

In [171]:
# Reassemble the EPUB
updated_book = epub.EpubBook()
updated_book.set_title(book.get_metadata('DC', 'title')[0][0])
updated_book.set_language(book.get_metadata('DC', 'language')[0][0])

updated_book.set_identifier(book.get_metadata('DC', 'identifier')[0][0])
updated_book.add_metadata('DC', 'creator', book.get_metadata('DC', 'creator')[0][0])
try:
    updated_book.add_metadata('DC', 'publisher', book.get_metadata('DC', 'publisher')[0][0])
except IndexError:
    updated_book.add_metadata('DC', 'publisher', 'unknown publisher')
    print('unknown publisher here')

for item in book.get_items():
    # 9 = ITEM_DOCUMENT 
    if item.get_type() == 9:
#         print('found content, break')
        break
    updated_book.add_item(item)
#     print('added item', item)

# print('\nchapter added here\n')    
for item in chapter_item_list:
    updated_book.add_item(item)
#     print('added a chapter', item)

checksum = []
for item in book.get_items():
    if checksum:
        if item.get_type() != 9:
            updated_book.add_item(item)
    elif item.get_type() == 9:
        checksum.append('checkvalue')
        
# Take the original book's table of content and spine
updated_book.toc = book.toc
updated_book.spine = book.spine

# Add the Ncx and Nav file
updated_book.add_item(epub.EpubNcx())
updated_book.add_item(epub.EpubNav())

# Save the Updated EPUB
epub.write_epub('pre_patch.zip', updated_book, {})
epub.write_epub('pre_patch.epub', updated_book, {})

Wall time: 760 ms


### Patch Ebook

In [172]:
# Remember the <head> element of every (X)HTML item of the original book,
# keyed by the item's file name without its directory part.
head_dict = {}
for item in book.get_items():
    item_name = item.get_name()
    if not item_name.endswith(('.xhtml', '.html', '.htm')):
        continue

    soup = BeautifulSoup(item.content, 'html.parser')
    head_tag = soup.find('head')
    head_dict[item_name.rsplit('/', 1)[-1]] = head_tag
        
def is_text_decoration_none_important(decl):
    """Return True only for a `text-decoration: none !important` declaration.

    `decl` must expose `name`, `value` and `priority` attributes; they are
    compared against 'text-decoration', 'none' and 'important' in that order.
    """
    if decl.name != 'text-decoration':
        return False
    if decl.value != 'none':
        return False
    return decl.priority == 'important'

Wall time: 1.18 s


In [173]:
# Patch the ebooklib-generated archive (pre_patch.epub) file by file and write
# the result to post_patch.epub and post_patch.zip:
#   * (X)HTML documents get the <head> of the original document back,
#   * the OPF receives pagination-source metadata (when PageBreakFix is set),
#     the schema.org accessibility metadata of the original OPF and xml:lang,
#   * stylesheets lose the `!important` on `text-decoration: none`,
#   * every other file is copied unchanged.
# All archives are opened with context managers so they are closed even when a
# file fails to process (the original book archive used to be left open).

HTML_SUFFIXES = ('.xhtml', '.html', '.htm')

# schema.org properties carried over from the original OPF, in the order in
# which they are appended to the new OPF.
SCHEMA_PROPERTIES = (
    'schema:accessModeSufficient',
    'schema:accessibilitySummary',
    'schema:accessMode',
    'schema:accessibilityFeature',
    'schema:accessibilityHazard',
)

# --- Collect the accessibility metadata of the original book ----------------
# Every occurrence is kept: a property such as schema:accessibilityFeature may
# be repeated, and keeping only one string per property dropped all but the
# last value.
schema_values = {prop: [] for prop in SCHEMA_PROPERTIES}

with zipfile.ZipFile(filepath, "r") as old_book:
    for old_info in old_book.infolist():
        if not old_info.filename.endswith(".opf"):
            continue

        old_soup = BeautifulSoup(old_book.read(old_info), 'lxml-xml')
        old_metadata = old_soup.find('metadata')

        # Start afresh for each OPF so that, as before, the last one wins
        schema_values = {prop: [] for prop in SCHEMA_PROPERTIES}
        if old_metadata is None:
            continue

        for meta in old_metadata.find_all('meta'):
            prop = meta.get('property')
            if prop in schema_values and meta.string:
                schema_values[prop].append(str(meta.string))

# --- Rewrite the archive ----------------------------------------------------
with zipfile.ZipFile("pre_patch.epub", "r") as input_archive, \
        zipfile.ZipFile("post_patch.epub", "w") as output_archive_epub, \
        zipfile.ZipFile("post_patch.zip", "w") as output_archive_zip:

    for info in input_archive.infolist():
        name = info.filename
        content = input_archive.read(info)

        if name.endswith(HTML_SUFFIXES):

            # Find head tag inside content
            soup = BeautifulSoup(content, 'html.parser')
            head_tag = soup.find('head')

            # Use the head of the original document with the same file name;
            # documents unknown to head_dict fall back to the first stored head.
            basename = name.split('/')[-1]
            if basename in head_dict:
                replacement_head = head_dict[basename]
            else:
                replacement_head = next(iter(head_dict.values()), None)

            # Only swap when both sides exist (a missing head on either side
            # previously raised an exception)
            if head_tag is not None and replacement_head is not None:
                head_tag.replace_with(replacement_head)

            modification = str(soup).encode('utf-8')

        elif name.endswith(".opf"):

            # The OPF is XML, so it is parsed with the XML parser (as done for
            # the original OPF above). html.parser treats <meta> as a void
            # element, which detaches the text of
            # <meta property="...">value</meta> entries.
            soup = BeautifulSoup(content, 'lxml-xml')
            metadata = soup.find('metadata')
            package = soup.find('package')

            if PageBreakFix:
                # Add metadata to the .opf file that describes the origin of the pagefile
                pagesource_tag = soup.new_tag('dc:source', id="pg-src")
                meta_tag_1 = soup.new_tag('meta', property="source-of", refines="#pg-src")
                meta_tag_2 = soup.new_tag('meta', property="pageBreakSource")

                pagesource_tag.string = 'AccessiPub'
                meta_tag_1.string = 'pagination'
                meta_tag_2.string = 'AccessiPub'

                metadata.append(pagesource_tag)
                metadata.append(meta_tag_1)
                metadata.append(meta_tag_2)

            # Re-add the accessibility metadata found in the original OPF
            for prop in SCHEMA_PROPERTIES:
                for value in schema_values[prop]:
                    schema_tag = soup.new_tag('meta', property=prop)
                    schema_tag.string = value
                    metadata.append(schema_tag)

            # Take language from DC metadata and add it to <package> tag for higher accessibility
            # Only add language if not defined yet. xml:lang is an ATTRIBUTE of
            # <package>; the former `package.find('xml:lang')` searched for a
            # child element and therefore always overwrote an existing value.
            if not package.get('xml:lang'):
                try:
                    package['xml:lang'] = book.get_metadata('DC', 'language')[0][0]
                except IndexError:
                    # Leave the attribute unset: a free-text sentence is not a
                    # valid language tag.
                    print('unknown language here')

            # Load modification back into bytes
            modification = str(soup).encode('utf-8')

        elif name.endswith(".css"):

            # Parse content string into css sheet
            try:
                sheet = cssutils.parseString(content)
            except UnicodeDecodeError:
                sheet = cssutils.parseString(content, encoding='latin-1')

            for rule in sheet.cssRules:
                if rule.type != rule.STYLE_RULE:
                    continue
                if any(is_text_decoration_none_important(decl) for decl in rule.style):
                    print(rule)
                    # Drop only the `!important` priority. Assigning
                    # rule.style.cssText replaced the whole declaration block
                    # and discarded every other property of the rule.
                    rule.style.setProperty('text-decoration', 'none', '')
                    print(rule)

            css_bytes = sheet.cssText
            try:
                css_text = css_bytes.decode('utf-8')
            except UnicodeDecodeError:
                css_text = css_bytes.decode('latin-1')
            modification = css_text.encode('utf-8')

        else:
            # For the other file types, simply copy the original content
            modification = content

        output_archive_epub.writestr(name, modification)
        output_archive_zip.writestr(name, modification)

Wall time: 1.44 s


### Evaluation

In [174]:
# Re-run the Ace checker on the patched EPUB written by the previous cell.
# NOTE(review): with --subdir and --outdir the report presumably ends up in
# results/post_patch/ (mirroring results/<filename>/ used above) -- confirm.
!ace --verbose --force --subdir --outdir results post_patch.epub


[36mverbose[39m: Ace 1.3.2, Node v18.15.0, Windows_NT 10.0.22631
[36mverbose[39m: Options:
[32minfo[39m:    Processing post_patch.epub
[36mverbose[39m: Extracting EPUB
[32minfo[39m:    Parsing EPUB
[36mverbose[39m: at location 'C:\Users\kolts\AppData\Local\Temp\tmp-13364-3V5mGe34uwPJ'
[32minfo[39m:    Analyzing accessibility metadata
[32minfo[39m:    Checking package...
[32minfo[39m:    - EPUB\content.opf: 5 issues found
[32minfo[39m:    Checking documents...
[36mverbose[39m: - Processing Text/Cover.xhtml
[36mverbose[39m: - Processing Text/TitlePage.xhtml
[36mverbose[39m: - Processing Text/TOC.xhtml
[36mverbose[39m: - Processing Text/Preface.xhtml
[36mverbose[39m: Converting aXe results to ace for Text/Cover.xhtml
[32minfo[39m:    - Text/Cover.xhtml: No issues found
[36mverbose[39m: - Processing Text/Contributors.xhtml
[36mverbose[39m: Converting aXe results to ace for Text/TitlePage.xhtml
[32minfo[39m:    - Text/TitlePage.xhtml: No issues found
