In [1]:
import pytesseract
import cv2
from matplotlib import pyplot as plt
import os

In [2]:
def display(im_path):
    """Render the image stored at ``im_path`` in a borderless matplotlib figure.

    The figure size in inches is the image's pixel dimensions divided by a
    nominal 80 pixels per inch. The figure's own DPI is not set here, so the
    on-screen scale depends on matplotlib's configured default.

    Args:
        im_path: Path of the image file, read with ``plt.imread``.
    """
    pixels_per_inch = 80
    picture = plt.imread(im_path)

    # Only the first two axes (rows, columns) matter for sizing.
    rows, cols = picture.shape[:2]

    # Figure size in inches as (width, height).
    canvas = plt.figure(
        figsize=(cols / float(pixels_per_inch), rows / float(pixels_per_inch))
    )

    # A single axes spanning the entire figure, so no margin surrounds the image.
    axes = canvas.add_axes([0, 0, 1, 1])

    # No spines, ticks or labels.
    axes.axis('off')

    # The gray colormap is requested for every image passed in.
    axes.imshow(picture, cmap='gray')

    plt.show()

In [3]:
def generate_image(image_data, filename):
    """Write ``image_data`` to ``temp/<filename>.jpg`` for visual inspection.

    The ``temp`` directory is created if it does not exist yet.

    Args:
        image_data: Image array accepted by ``cv2.imwrite``.
        filename: Base name of the output file, without directory or extension.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written.
    """
    os.makedirs("temp", exist_ok=True)
    out_path = f"temp/{filename}.jpg"
    # cv2.imwrite signals failure through its boolean return value rather than
    # by raising; ignoring it would leave a missing or stale file unnoticed.
    if not cv2.imwrite(out_path, image_data):
        raise OSError(f"cv2.imwrite could not write {out_path!r}")

In [4]:
image_path = "data/list-of-names.jpg"
image = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file is missing or cannot
# be decoded; fail here with a clear message rather than with an AttributeError
# on the .copy() call below.
if image is None:
    raise FileNotFoundError(f"Could not read image at {image_path!r}")
# Keep an untouched copy, because a later cell draws rectangles on `image` in place.
base_image = image.copy()

In [5]:
# Convert the loaded image from BGR to grayscale; the blur and threshold cells
# below operate on this result.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [6]:
# Save the grayscale stage (generate_image writes temp/gray.jpg) for inspection.
generate_image(gray, "gray")

In [7]:
# Smooth the grayscale image with a 7x7 Gaussian kernel before thresholding.
# NOTE(review): the third argument (sigmaX) is 0; per the OpenCV docs this is
# understood to derive sigma from the kernel size — confirm for your version.
blur = cv2.GaussianBlur(gray, (7,7), 0)

In [8]:
# Save the blurred stage (generate_image writes temp/blur.jpg) for inspection.
generate_image(blur, "blur")

In [9]:
# Binarize the blurred image using the inverted-binary and Otsu flags combined.
# cv2.threshold returns a pair; index [1] keeps the thresholded image.
# NOTE(review): with THRESH_OTSU the explicit threshold argument (0) is
# presumably ignored in favour of the computed one — confirm in the OpenCV docs.
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

In [10]:
# Save the thresholded stage (generate_image writes temp/thresh.jpg) for inspection.
generate_image(thresh, "thresh")

In [11]:
# Build a rectangular structuring element of size (3, 13) and dilate the binary
# image once with it; the contour search in the next cell runs on `dilate`.
# NOTE(review): OpenCV sizes are (width, height), so this kernel is taller than
# wide — presumably so vertically adjacent text lines merge into column-sized
# blobs; confirm intent.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 13))
dilate = cv2.dilate(thresh, kernel, iterations=1)
# Save the dilated stage (generate_image writes temp/dilate.jpg) for inspection.
generate_image(dilate, "dilate")

In [12]:
# Outer contours of the dilated blobs.
found = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# findContours may return a 2-tuple or a longer tuple; the contour list sits at
# index 0 in the former case and index 1 otherwise.
if len(found) == 2:
    contour_list = found[0]
else:
    contour_list = found[1]

# Order the contours left to right by the x coordinate of their bounding box.
cnts = list(contour_list)
cnts.sort(key=lambda contour: cv2.boundingRect(contour)[0])

In [13]:
# Run OCR on every sufficiently large contour region and collect the text lines.
results = []
for c in cnts:
    x, y, w, h = cv2.boundingRect(c)
    # Only regions taller than 200 px and wider than 20 px are processed
    # (presumably to keep column-sized blobs and drop noise — confirm thresholds).
    if h > 200 and w > 20:
        # Crop from the untouched copy. A slice of `image` would be a NumPy view,
        # so the rectangle drawn on `image` just below (and any rectangle from an
        # earlier iteration or a previous run of this cell) would appear inside
        # the pixels handed to Tesseract.
        roi = base_image[y:y+h, x:x+w]
        # Mark the region on `image`, which is saved after the loop.
        cv2.rectangle(image, (x, y), (x+w, y+h), (38, 255, 12), 2)
        ocr_res = pytesseract.image_to_string(roi).split("\n")
        results.extend(ocr_res)
# Save the annotated image (generate_image writes temp/image_bounding_box_new.jpg).
generate_image(image, "image_bounding_box_new")

In [15]:
# Dump every collected OCR line (blank strings included) for a quick visual check.
print(results)

[' ', '', 'Abdel-Malek, A., 301, 319,', '', '495', '', 'Abernethy, D. B., 134f.', '', 'Achinger, H., 304', '', 'Adler, M., 479', '', 'The affluent worker in the', 'class structure, 336f.', '', 'Agarwala, S.N., 496', '', 'The Age of Affluence 1951-', '1964, 510', '', 'Agnelli, A., 145', '', 'Agosti, A., 514', '', 'Albrecht, W., 150', '', 'Alexander, R. J., 317', '', 'Der algerische Sozialismus,', '489', '', 'Almasy, E., 124', '', 'Alston, P. L., 343', '', 'Altholz, J. L., 510', '', 'Ammassari, G., 514', '', 'Anders, K., 150', '', 'Anderson, E. N., 485', '', 'Andréani, E., 330', '', 'Andréas, B., 331', '', 'Andreasi, A., 514', '', 'Andrews, W., 340', '', 'Angell, A., 491', '', 'Anikeev, V. V., 165', '', 'Anklage und Botschaft, 144', '', 'Annales de Démographie', 'Historique 1969, 305', '', 'Ansart, P., 123', '', 'Anweiler, O., 517', '', 'Archiv fiir Sozialgeschichte,', 'IX, 483', '', 'Archives de Jules Humbert-', '', 'Droz, I, 484', '', 'Archivi di “‘Giustizia e Liber-', '', 'ta” (1915-1

In [21]:
# Reduce each OCR line to its first space-delimited token (after trimming
# surrounding whitespace and removing any newline characters).
leading_tokens = (
    line.strip().replace("\n", "").split(" ")[0] for line in results
)

# Keep tokens longer than two characters that begin with "A" or "B". Tokens are
# stored as found, so any trailing punctuation is retained.
names = [
    token
    for token in leading_tokens
    if len(token) > 2 and token[0] in ("A", "B")
]

In [22]:
# Show the filtered first tokens collected in `names`.
print(names)

['Abdel-Malek,', 'Abernethy,', 'Achinger,', 'Adler,', 'Agarwala,', 'Agnelli,', 'Agosti,', 'Albrecht,', 'Alexander,', 'Almasy,', 'Alston,', 'Altholz,', 'Ammassari,', 'Anders,', 'Anderson,', 'Andréani,', 'Andréas,', 'Andreasi,', 'Andrews,', 'Angell,', 'Anikeev,', 'Anklage', 'Annales', 'Ansart,', 'Anweiler,', 'Archiv', 'Archives', 'Archivi', 'Arian,', 'Arima,', 'Aron,', 'Aronson,', 'Ashraf,', 'Ashtor,', 'Asociacién', 'Atkin,', 'Atkinson,', 'Austromarxismus,', 'Auty,', 'Avakumovié,', 'Avrich,', 'Azmon,', 'Baer,', 'Bahlo,', 'Bahne,', 'Bailey,', 'Bairoch,', 'Bakoenin,', 'Bakunin,', 'Bakuninisti', 'Ballon,', 'Barbadoro,', 'Barnes,', 'Barrére,', 'Barry,', 'Bartel,', 'Bartke,', 'Bartley,', 'Bartoli,', 'Bastide,', 'Bauer,', 'Bayer-Lothe,', 'Bayod', 'Bebel,', 'Bécarud,', 'Bechhofer,', 'Behr,', 'Benda,', 'Benn,', 'Bennassar,', 'Bennett,', 'Beradt,', 'Berger,', 'Bergner,', 'Berman,', 'Bernard,', 'Bernstein,', 'Bernstein,', 'Bernstein,', 'Bernstein,', 'Berque,', 'Besnard,', 'Bethell,', 'Bethell,', '