In [1]:
from IPython.display import clear_output

In [16]:
!pip install easyocr
!pip install pyspellchecker
clear_output()

In [3]:
!unzip blackwhite.zip -d blackwhite
clear_output()

In [18]:
from re import findall
from os import listdir
from easyocr import Reader
from torch.cuda import is_available
from spellchecker import SpellChecker

In [65]:
class MangaTextExtractor:
    '''Extract spell-corrected words from every manga page in a folder.

    Each page is passed to an easyocr ``Reader``; detections at or below a
    confidence threshold are dropped, the remaining text is lowercased and
    split into words, and every word the ``SpellChecker`` reports as unknown
    is replaced by its suggested correction (when one exists).
    '''

    def __init__(self, folder, lang='ru', max_distance=2):
        '''Collect the page files and set up the OCR reader and spellchecker.

        Args:
            folder: path of the directory whose entries are the manga pages.
            lang: language code given both to the easyocr ``Reader`` and to
                the ``SpellChecker``.
            max_distance: value passed as ``distance`` to the ``SpellChecker``.
        '''

        # getting info about pages; listdir gives no ordering guarantee, so
        # the names are sorted naturally ("2.png" before "10.png") to make the
        # order of the returned texts deterministic
        self.folder = folder
        self.all_pages = sorted(listdir(self.folder), key=self._natural_key)

        # deciding where to infer the model
        self.GPU = is_available()

        # initializing the reader
        self.reader = Reader([lang], gpu=self.GPU)

        # initializing the spellchecker
        self.checker = SpellChecker(language=lang, distance=max_distance)

    @staticmethod
    def _natural_key(file_name):
        '''Return a sort key that orders digit runs in a name numerically.'''

        # every token becomes a tuple of the same shape so numeric and textual
        # tokens can be compared with each other without a TypeError;
        # numeric tokens sort before textual ones
        return [
            (0, int(digits), '') if digits else (1, 0, text)
            for digits, text in findall(r'(\d+)|(\D+)', file_name)
        ]

    def get_text_from_pages(self, confidence=0.1):
        '''Return list of texts for each page in a folder

        Args:
            confidence: detections whose score is not strictly greater than
                this value are discarded.

        Returns:
            A list with one entry per page (in ``self.all_pages`` order); each
            entry is the list of lowercase words found on that page, with
            unknown words replaced by the spellchecker's correction.
        '''

        # going through each page and appending detected texts
        detections = []
        for file_name in self.all_pages:

            # getting the raw detection from easyocr
            detection = self.reader.readtext(f"{self.folder}/{file_name}")

            # keeping the lowercased text (index 1) of every detection whose
            # confidence (index 2) passes the threshold
            page_text = " ".join(
                det[1].lower() for det in detection if det[2] > confidence
            )

            # detecting the words presented in lowercase
            words = findall(r'\w+', page_text)

            # correcting each distinct unknown word only once; correction()
            # may return None when it has no candidate, in which case the
            # word is kept exactly as it was detected
            corrections = {
                word: self.checker.correction(word) or word
                for word in self.checker.unknown(words)
            }

            # appending the corrected words
            detections.append([corrections.get(word, word) for word in words])

        return detections

In [66]:
# build the extractor for the unpacked chapter folder
MTE = MangaTextExtractor('blackwhite')

# run OCR and spell correction over every page of the chapter
texts = MTE.get_text_from_pages()

# show the result: one line per page, each line being that page's word list
print('\n'.join(map(str, texts)))