In [1]:
from IPython.display import clear_output

In [27]:
# installing the needed libraries
!pip install easyocr
!pip install pyspellchecker
!pip install sentence-transformers
!pip install annoy
clear_output()

In [4]:
# getting the manga data
!unzip dataset.zip -d dataset
clear_output()

In [28]:
import cv2
import numpy as np

from re import findall
from os import listdir
from pathlib import Path
from easyocr import Reader
from annoy import AnnoyIndex
from torch.cuda import is_available
from spellchecker import SpellChecker
from sentence_transformers import SentenceTransformer

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
class MangaTextExtractor:
    '''Simple class to extract text from manga page'''

    def __init__(self, lang='ru', max_distance=1, confidence=0.1):
        '''Get the content of folder with manga'''

        # setting up how confident we should be in the text extraction
        self.confidence = confidence

        # deciding where to infer the model
        self.GPU = True if is_available() else False

        # initializing the reader
        self.reader = Reader([lang], gpu=self.GPU)

        # initializing the spellchecker
        self.checker = SpellChecker(language=lang, distance=max_distance)

    def get_text_from_page(self, page_file_name):
        '''Return list of texts for each page in a folder'''

        # getting the raw detection from easyocr
        detection = self.reader.readtext(page_file_name)

        # filtering out some predictions by confidence
        detection = list(filter(lambda det: det[2] > self.confidence, detection))

        # detecting the words presented in lowercase
        words = findall(r'\w+', " ".join(list(map(lambda det: det[1].lower(), detection))))

        # correcting the spellchecking of those words
        misspelled = self.checker.unknown(words)

        # replacing misspelled words with correct versions
        for i in range(len(words)):
            if words[i] in misspelled:
                words[i] = self.checker.correction(words[i])

        # returning the corrected words
        return words

In [9]:
"""THE FOLLOWING CODE IS TAKEN FROM LAB 8 OF OUR COMPUTER VISION COURSE"""

class YOLO:
    def __init__(self, confidence=0.5, threshold=0.3):
        
        self.CONFIDENCE = confidence
        self.THRESHOLD = threshold

        # load the custom class labels our YOLO model was trained on
        with open(f"yolo/darknet.labels", 'r') as f:
            self.labels = f.read().split("\n")

        # initialize a list of colors to represent each possible class label
        np.random.seed(42)
        self.COLORS = np.random.randint(0, 255, size=(len(self.labels), 3),	dtype="uint8")

        # derive the paths to the YOLO weights and model configuration
        weightsPath = f"yolo/yolo_best.weights"
        configPath = f"yolo/yolov4.cfg"

        # load YOLO object detector trained on our dataset (15 classes)
        net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
        net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

        # determine only the *output* layer names that we need from YOLO
        ln = net.getLayerNames()
        ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]

        self.net = net
        self.ln = ln

    def IoU(self, box1, box2):
        # left -> rightmost
        left_x = max(box1[0], box2[0])
        # bottom - topmost
        bottom_y = min(box1[3], box2[3])
        # right - leftmost
        right_x = min(box1[2], box2[2])
        # top - bottommost
        top_y = max(box1[1], box2[1])
        
        # compute intersection area
        interArea = (top_y - bottom_y) * (right_x - left_x)

        # compute the area of both the prediction and ground-truth
        ground_area = (box1[2] - box1[0]) * (box1[1]-box1[3])
        predicted_area = (box2[2] - box2[0]) * (box2[1]-box2[3])

        areaOrNegative = interArea/(ground_area + predicted_area - interArea)

        # compute the IoU
        return max(0, areaOrNegative)
    
    def forward(self, image):
        (H, W) = image.shape[:2]
        # construct a blob from the input image and then perform a forward
        # pass of the YOLO object detector, giving us our bounding boxes and
        # associated probabilities
        blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416), swapRB=True, crop=False)
        self.net.setInput(blob)
        layerOutputs = self.net.forward(self.ln)
        
        # initialize our lists of detected bounding boxes, confidences, and
        # class IDs, respectively
        boxes = []
        confidences = []
        classIDs = []

        for output in layerOutputs:
            for detection in output:
                # extract the class ID and confidence
                scores = detection[5:]
                classID = np.argmax(scores)
                confidence = scores[classID]

                # filter out weak predictions 
                if confidence > self.CONFIDENCE:
                    # scale the bounding box coordinates back relative to the
                    # size of the image, keeping in mind that YOLO actually
                    # returns the center (x, y)-coordinates of the bounding
                    # box followed by the boxes' width and height
                    box = detection[0:4] * np.array([W, H, W, H])
                    (centerX, centerY, width, height) = box.astype("int")

                    # use the center (x, y)-coordinates to derive the top and
                    # and left corner of the bounding box
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))

                    # update our list of bounding box coordinates, confidences,
                    # and class IDs
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    classIDs.append(classID)
                
        self.boxes = np.array(boxes)
        self.confidences = np.array(confidences)
        self.classIDs = np.array(classIDs)
    
    def non_max_supression(self):
        idxs = np.argsort(-self.confidences)
        confidences = self.confidences[idxs]
        boxes = self.boxes[idxs]
        classIDs = self.classIDs[idxs]

        for i in range(len(boxes)):
            x,y,w,h = boxes[i][0],  boxes[i][1], boxes[i][2], boxes[i][3]
            box1 = [x, y, x+w, y+h]
            for j in range(i+1, len(boxes)):
                # If they are of the same class
                # and have a IoU above self.THRESHOLD
                # we regard them as describing the same object and
                # set the confidence of the box with lower confidence to 0
                x, y, width, height = boxes[j][0],  boxes[j][1], boxes[j][2], boxes[j][3]
                box2 = [x, y, x+width, y+height]

                if self.IoU(box1, box2) > self.THRESHOLD and classIDs[i] == classIDs[j]:
                    if (confidences[i] >= confidences[j]):
                        confidences[j] = 0
                    else:
                        confidences[i] = 0

        idxs = np.where(confidences>0)
        self.boxes = boxes[idxs]
        self.confidences = confidences[idxs]
        self.classIDs = classIDs[idxs]
  
    def detect(self, image):

        self.forward(image)
        self.non_max_supression()
        
        return self.classIDs

In [18]:
class MangaFeatureExtractor:
    '''Class to extract the vectorized features from manga directory'''

    def __init__(self):
        '''Initializing the modules that to vectorizing'''

        # modules that do text embeddings 
        self.text_extractor = MangaTextExtractor() # extracting
        self.sbert = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2') # embedding

        # yolo module that do extraction of visual features
        self.yolo = YOLO(confidence=0.01, threshold=0.2)

    def get_features(self, folder):
        '''
        Calculate features for all manga pages in specified directory
        and return them in two separate dictionaries
        '''

        # recursively searching all the jpg and manga pages in the specified folders
        pages_file_names_jpg = list(map(str, Path(folder).rglob("*.jpg")))
        pages_file_names_png = list(map(str, Path(folder).rglob("*.png")))
        pages_file_names = pages_file_names_jpg + pages_file_names_png

        ###################
        # TEXT EMBEDDINGS #
        ###################

        # getting text from the text extractor for all pages
        manga_texts = list(map(lambda path: self.text_extractor.get_text_from_page(path),
                               pages_file_names))
        
        # getting the list om embeddings of texts on manga pages
        text_embs = list(map(lambda text: self.sbert.encode(" ".join(text)) if text else None,
                             manga_texts))

        # wrapping features into dictionary for later usage
        embeddings_dict = dict(zip(pages_file_names, text_embs))

        #########################
        # VISUAL OBJECTS COUNTS #
        #########################

        # structure to hold the counts of visual words
        visual_words = []

        # going through each page and getting counts of classes instances
        for page_file_name in pages_file_names:

            # opening the page
            page = cv2.imread(page_file_name)

            # getting predictions for a page
            classes = self.yolo.detect(page)

            # iniailizing a counter with amount of trainable classes
            counter = [0 for _ in range(len(self.yolo.labels))]

            # populating the counter with amoutns of class instances
            for class_ in classes:
                counter[class_] += 1

            # appending counts to the overall structure if counts are not a zero vector
            visual_words.append(None if all(count == 0 for count in counter) else counter)

        # wrapping counter features in a dict
        counts_dict = dict(zip(pages_file_names, visual_words))

        return embeddings_dict, counts_dict

In [19]:
# initializing the manga feature extractor
MFE = MangaFeatureExtractor()

# preparing the data to be indexed
query_texts, query_counts = MFE.get_features('dataset/Boruto')

# geenrating the query data to be queried
index_texts, index_counts = MFE.get_features('dataset')

In [91]:
class AnnoyIdx:
    def __init__(self, text_idx, img_idx):
        self.word_index, self.word_reverse = self.build_index(text_idx)
        self.img_index, self.img_reverse = self.build_index(img_idx) 
        
    def build_index(self, index, trees=10, dist='angular'):
        
        #https://stackoverflow.com/questions/24068306/is-there-a-way-to-remove-nan-from-a-dictionary-filled-with-data
        # Cleaning index
        index = clean_dict = {k: index[k] for k in index if not index[k] is None}
        
        # Extracting dimensionality of data
        index_dim = len(next(iter(index.values())))
        print(index_dim)
        
        # Initializing trees
        result_tree = AnnoyIndex(index_dim, dist)
        result_reverse = {}
        total = 0
        
        # Inserting items
        for idx, key in enumerate(index.keys()):
            result_reverse[idx] = key
            result_tree.add_item(idx, index[key])
          
        # Building trees
        result_tree.build(trees)
            
        return result_tree, result_reverse
    
    def find_similar(self, text_dict, img_dict, top_n=5, nns=10):

        # removing entries that do not contain features
        text_dict = {file_name:feature for file_name, feature in text_dict.items() if feature is not None}
        img_dict = {file_name:feature for file_name, feature in img_dict.items() if feature is not None}

        # sctructure to hold prediction with their cosine similarities 
        predictions = []
        for key in text_dict:
            
            # Extracting vectors
            text = text_dict[key]
            
            # Getting top-k results
            best_words, dist_words = self.word_index.get_nns_by_vector(text, nns, include_distances=True)
            
            # appending prediction with its metric distance to the prediction structure
            for word, dist in zip(best_words, dist_words):
                predictions.append((self.extract_name(self.word_reverse[word]), dist))
        
        # doing the same thing for visual features
        for key in img_dict:

            # Extracting vectors
            image = img_dict[key]

            # Getting top-k results
            best_imgs, dist_imgs = self.img_index.get_nns_by_vector(image, nns, include_distances=True)

            # appending prediction with its metric distance to the prediction structure
            for img, dist in zip(best_imgs, dist_imgs):
                predictions.append((self.extract_name(self.word_reverse[img]), dist))

        # removing zeros and wrapping into dict
        predictions_dict = dict()
        for pred, dist in predictions:
            if dist != 0:
                if pred in predictions_dict and dist < predictions_dict[pred]:
                    predictions_dict[pred] = dist
                else:
                    predictions_dict[pred] = dist

        predictions = list(predictions_dict.items())
        predictions.sort(key=lambda pred: pred[1])

        return predictions[:top_n]
            
    def extract_name(self, name):
        return name.split('/')[1]

In [92]:
# build an index
index = AnnoyIdx(index_texts, index_counts)

# generate top picks
print('Top picks for Boruto')
print(*index.find_similar(query_texts, query_counts), sep='\n')

768
15
Top picks for Boruto
('Onepiece2', 0.5721490383148193)
('Mobpsycho100', 0.5851652026176453)
('Onepunchman', 0.6255282759666443)
('JJBASteelBallRun', 0.6330125331878662)
('ВеликийизБродячихпсов', 0.7457972168922424)
