In [44]:
from kraken import blla
from kraken import rpred
from kraken.lib import models
from PIL import Image
import cv2 as cv
import json
import numpy as np
from matplotlib import pyplot as plt
import os

model_path = 'KrakenModel/HTR-United-Manu_McFrench.mlmodel'
model = models.load_any(model_path)

In [45]:
main_dir = "test/oneFile/extract_image"

# OCR all files inside

def process_images(main_dir, ocr = False, crop = False) :
    image_extension = (".jpg", ".png", ".svg", "jpeg")
    for (dirpath, subdirnames, filenames) in os.walk(main_dir):
        for filename in filenames :
            if not filename.lower().endswith(image_extension) : 
                continue
            filepath = dirpath+os.sep+filename
            print("Processing : "+filepath)
            im = Image.open(filepath)


            # Segmentations and predictions
            baseline_seg = blla.segment(im) # Json
            
            
            if ocr :
                # https://kraken.re/main/api.html#recognition
                ocr_img(model, im, baseline_seg)

                
            # Cropping segmented text
            cropping(baseline_seg, filepath, predictions, crop)

            im.close()
            print("Done processing")
            return None

In [46]:
def ocr_img(model, im, baseline_seg) :
    ocr_dir = 'tmp'+os.sep+'ocr_result'
    os.makedirs(ocr_dir, exist_ok=True)
    
    # https://kraken.re/main/api.html#recognition
    predictions = [record for record in rpred.rpred(model, im, baseline_seg)]

    with open(ocr_dir+os.sep+filename[:-4]+'_ocr.txt', 'w') as f:
        for record in predictions :
            #print(record.prediction)
            f.write(record.prediction+"\n")
        print("Created "+ocr_dir+os.sep+filename[:-4]+'_ocr.txt')
    

In [47]:
# Cropping segmented text
def cropping(json_data, filepath, predictions, crop =False) :
    
    img = cv.imread(filepath, cv.IMREAD_COLOR)
    filename = filepath.split(os.sep)[-1]
    
    cropping_dir = "cropped"+os.sep+filename
    segmented_img_dir = "tmp"+os.sep+"segmented"
    os.makedirs(cropping_dir, exist_ok=True)
    os.makedirs(segmented_img_dir, exist_ok=True)
    
    
    name_iterator = 1
    
    for line in json_data["lines"]:

        # Baselines
        baselines = line["baseline"]
        for i in range(1,len(baselines)):
            img = cv.line(img, baselines[i-1],
                          baselines[i], (0, 0, 255), 5)

        # Boundaries
        boundaries = line["boundary"]
        x_min = x_max = boundaries[0][0] 
        y_min = y_max = boundaries[0][1] 
        for i in range(1, len(boundaries)):
            img = cv.line(img, boundaries[i-1],
                          boundaries[i], (255, 0, 0, 0.25), 5)
            x = boundaries[i][0]
            y = boundaries[i][1]
            x_min = (x if x < x_min else x_min)
            x_max = (x if x > x_max else x_max)
            y_min = (y if y < y_min else y_min)
            y_max = (y if y > y_max else y_max)

        # Crop and save each region segmented by kraken
        if crop :
            cropped = img_original[y_min:y_max, x_min:x_max].copy()
            cropped_img_path = cropping_dir+os.sep+filename[:-4]+"_"+str(name_iterator)+".jpg"
            cv.imwrite(cropped_img_path, cropped)

            """
            # Deprecated due to bad results and time wasting
            # OCR cropped image
            cropped_im = Image.open(cropped_img_path)

            baseline_seg = blla.segment(im) # this takes time

            with open(cropped_img_path[:-4]+'.json', 'w') as f:
                f.write(str(baseline_seg)+"\n")

            pred_it = rpred.rpred(model, cropped_im, baseline_seg)
            """


            with open(cropped_img_path[:-4]+'.gt.txt', 'w') as f:
                f.write(predictions[name_iterator-1].prediction+"\n")
                print(predictions[name_iterator-1])
            print("DID >"+cropped_img_path)

        name_iterator +=1

    #Show original image with segmentation
    #plt.imshow(img)
    #plt.show()

    cv.imwrite(segmented_img_dir+os.sep+filename[:-4]+"_segmented.jpg", img)


In [48]:
process_images(main_dir, ocr = False, crop = False)

Processing : test/oneFile/extract_image/ms1620-2-138.jpg
Done processing


In [5]:
def LevenshteinDist(s1, s2) :
    m, n = len(s1)+1, len(s2)+1
    
    D= [[0]*(n) for i in range(m)]
    
    """
    for i in range(m) :
        D[i][0] = i
    for j in range(n) :
        D[0][j] = j
    """    
    
    for j in range(1,n) :
        for i in range(1,m) :
            cost = 0 if s1[i-1] == s2[j-1] else 1
            D[i][j] = min(
                D[i-1][j]+1,  # effacement du nouveau caractère de s1
                D[i][j-1]+1,  # insertion dans s2 du nouveau caractère de s1
                D[i-1][j-1]+cost) # substitution
                
    
    for i in D:
         #print(i)
        pass
    return D[m-1][n-1]


string1 = "chiens"
string2 = "niche"

print(LevenshteinDist(string1,string2))

3


In [17]:
from Levenshtein import distance as levenshtein_distance


string1 = "niche"
string2 = "chiens"

print(levenshtein_distance(string1,string2))

5
