# Rotate Text in image

The images from the slide scanner are rarely in an orientation where the text runs horizontally across the page. This is a script that aims to rotate the images to correct for this. 

--- 
### Import dependencies

In [1]:
import pytesseract as pytess
import numpy as np 
import skimage 
import scipy 
import aicsimageio
import os 
import tifffile as tf
from PIL import Image
import tkinter as tk 
import pandas as pd
import matplotlib.pyplot as plt
from tkinter import filedialog

----
### Find the four corners of the text in the image

In [2]:
def corners_of_text(letter_properties, col_tol=10, row_tol=50):
    '''Locate the letter centroids sitting at the four corners of the text.

    Parameters
    ----------
    letter_properties : pandas.DataFrame
        One row per letter, with the columns 'centroid-0' (used here as the
        vertical position) and 'centroid-1' (used here as the horizontal
        position).
    col_tol : float, optional
        Width of the band, measured from the smallest / largest 'centroid-1',
        searched for the top-left / bottom-right letters. Defaults to 10.
    row_tol : float, optional
        Height of the band, measured from the smallest / largest 'centroid-0',
        searched for the top-right / bottom-left letters. Defaults to 50.

    Returns
    -------
    numpy.ndarray
        Array of shape (4, 2) ordered as [top left, bottom right, top right,
        bottom left]. Every row is [centroid-1, centroid-0] truncated to int.

    Raises
    ------
    ValueError
        If letter_properties has no rows, or a tolerance is not positive.
    '''
    if len(letter_properties) == 0:
        raise ValueError('letter_properties is empty: no letters to find corners from.')
    if col_tol <= 0 or row_tol <= 0:
        raise ValueError('col_tol and row_tol must both be positive.')

    # Order top-to-bottom, then left-to-right; the input frame is left untouched.
    ordered = letter_properties.sort_values(['centroid-0', 'centroid-1'])
    rows = ordered['centroid-0']
    cols = ordered['centroid-1']

    # Top left: the highest letter within the left-most band.
    left_band = ordered[cols < cols.min() + col_tol]
    left_band = left_band[left_band['centroid-0'] == left_band['centroid-0'].min()]
    top_left = [int(left_band['centroid-1'].iloc[0]),
                int(left_band['centroid-0'].iloc[0])]

    # Bottom right: the lowest letter within the right-most band.
    right_band = ordered[cols > cols.max() - col_tol]
    right_band = right_band[right_band['centroid-0'] == right_band['centroid-0'].max()]
    bot_right = [int(right_band['centroid-1'].iloc[-1]),
                 int(right_band['centroid-0'].iloc[-1])]

    first_row = rows.iloc[0]   # smallest 'centroid-0' after sorting
    last_row = rows.iloc[-1]   # largest 'centroid-0' after sorting

    # Top right: the right-most letter within the top band.
    tr = _extreme_centroid(ordered[rows < first_row + row_tol], np.max)
    if tr == top_left:
        # The top band only held the top-left letter. This comparison matches
        # every row (first_row is the minimum), so the right-most letter of
        # the whole table is used instead.
        tr = _extreme_centroid(ordered[rows > first_row - row_tol], np.max)

    # Bottom left: the left-most letter within the bottom band.
    bl = _extreme_centroid(ordered[rows > last_row - row_tol], np.min)
    if bl == bot_right:
        # Same fallback as above, mirrored: the left-most letter of the whole table.
        bl = _extreme_centroid(ordered[rows < last_row + row_tol], np.min)

    four_corners = np.array([top_left, bot_right, tr, bl])

    return(four_corners)


def _extreme_centroid(candidates, pick):
    '''Return [centroid-1, centroid-0] of the first row whose 'centroid-1' equals pick('centroid-1').'''
    horizontal = candidates['centroid-1']
    position = np.where(horizontal == pick(horizontal))[0][0]
    return [int(horizontal.iloc[position]),
            int(candidates['centroid-0'].iloc[position])]

----
### Calculate Angle

In [3]:
def angle_calc(four_corners):
    '''Estimate the tilt of the text, in degrees, from its four corner points.

    Parameters
    ----------
    four_corners : array-like of shape (4, 2)
        Corner points as rows of [x, y]. Rows 0-1 are the candidates for the
        anchor point and rows 2-3 the candidates for the second point.

    Returns
    -------
    numpy.float64
        Angle in degrees, in the range [-90, 90], between the line joining
        the two chosen points and the y axis. 0.0 is returned when the two
        points coincide, so no direction can be derived.
    '''
    four_corners = np.asarray(four_corners)

    # Anchor: whichever of the first two rows holds the smallest value.
    # NOTE(review): the minimum is taken over both the x and y columns;
    # confirm whether only the x column was intended.
    anchor_rows = np.where(four_corners[0:2, :] == np.min(four_corners[0:2, :]))[0]
    point_1 = four_corners[anchor_rows[0], :]

    # Second point: whichever of rows 2-3 is horizontally closest to the anchor.
    nearest = np.abs(four_corners[2:, 0] - point_1[0]).argmin() + 2
    point_2 = four_corners[nearest, :]

    delta_x = point_1[0] - point_2[0]
    delta_y = point_1[1] - point_2[1]

    # arctan(delta_x / delta_y) only depends on the sign of the ratio, so flip
    # both deltas when delta_y is negative; arctan2 then returns the same angle
    # without dividing, which also covers delta_y == 0.
    if delta_y < 0:
        delta_x, delta_y = -delta_x, -delta_y

    angle = 180 * (np.arctan2(delta_x, delta_y) / np.pi)

    return(angle)

---
### Get images folder

In [4]:
# Open a dialogue asking for the folder that contains the image stack.
root = tk.Tk()
root.attributes('-topmost', True)  # Keep the dialogue above other windows
root.withdraw() # Stops a second window opening
folder = filedialog.askdirectory(title = 'Select Stack file')

# A cancelled dialogue returns an empty value; stop with a clear message
# rather than letting os.listdir fail on it.
if not folder:
    raise FileNotFoundError('No folder was selected.')

all_files = os.listdir(folder)

# Keep only the .png images. os.listdir returns names in arbitrary order, so
# sort them to make the order of the OCR results reproducible between runs.
files = sorted(name for name in all_files if name.endswith('.png'))

print(len(files))

29


----
### Get image data

In [5]:
# Initialise
# Region measurements requested for every labelled object (a tuple keeps the
# column order of the resulting table fixed).
props = ('label', 'area', 'centroid')
# Labelled objects at or below this area (in pixels) are ignored.
min_letter_area = 250
results = []
# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to override the default install path used on the original machine.
pytess.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:/Users/rcorbyn/AppData/Local/Programs/Tesseract-OCR/tesseract.exe')
# Characters tidied out of the OCR text: new lines, spaces and '/' become '_',
# '=' is removed.
ocr_cleanup = str.maketrans({chr(10): '_', ' ': '_', '/': '_', '=': None})

# Loop around all files
for file in files:
    # Open the image, closing the file handle once the pixels are in memory.
    with Image.open(os.path.join(folder, file)) as scan:
        image_data = np.array(scan)
    # Segment just the letters in the image.
    # NOTE(review): the full array is labelled and only index 0 of the last
    # axis is kept, so this presumably expects a multi-channel image; a 2-D
    # greyscale image would fail here - confirm.
    letters = skimage.measure.label(image_data)[:, :, 0]
    # Find the properties of the letters.
    letter_props = pd.DataFrame( skimage.measure.regionprops_table(letters, properties = props) )
    # Ignore the labels with an area of min_letter_area pixels or fewer.
    letter_props = letter_props[ letter_props['area'] > min_letter_area ]
    # Find the letters at the corners of the label.
    corners = corners_of_text(letter_props)
    # Find the angle of rotation
    angle = angle_calc(corners)
    # Rotate the image by the angle found above.
    rotate_im = scipy.ndimage.rotate(image_data, ( -1 * angle) )

    # Perform OCR.
    text = pytess.image_to_string(rotate_im)
    # Clean up the text from the OCR.
    text = text.translate(ocr_cleanup)
    # Save the results of the OCR.
    results.append(text)

In [6]:
# Show the cleaned OCR text collected for each image.
results

['GS_LIV_RBBCMT_RMQ30_4d_21_9446__SM_ROS__25x_522_',
 'GS_LIV_RBCMT_RMH152_7c_24_21330__DT_ROS_25x_522_',
 'GS_LIV_RBCMT_RMH153_6c_24_18455__GS_RO9__25x_522_',
 'GS_LIV_RBCMT_RMH135_2d_23_2635__SM_RO9Q__25x_522_',
 'GS_LIV_BCMT__RB__RMQ35_2a_21_15696__SM_RO9Q__25x_522_',
 'LIV_RBCMT_RMH126_5a_23_2641_SM_RO9__25x_522_',
 'GS_LIV_RBBCMT_RMQ34_4a_21_21_467__SM_RO9__25x_522_',
 'eh_LIV_RBCMT_RMH135_2b_23_2631__SM_ROS_25x_522_',
 'GS_LIV_RBCMT_RMH156_1b_24_18453__GS_RO9__25x_522_',
 'GS_LIV_RBCMT_RMH156_1f_24_18452__GS_RO9__25x_522_',
 'LIV_RBCMT_RMH135_2c_23_2633_SMRO9_25x_522_',
 'LIV_RBBCMT_RMQ34_1b_21_16603_SMRO9__25x_522_',
 'GS_LIV_RBBCMT_RMQ34_3b_21_15694__SM_ROS__25x_522_',
 'GS_LIV_RBBCMT_RMQ30_4c_21_9444__SM_RO9Q__25x_522_',
 'GS_LIV_RBBCMT_RMQ35_2c_21_15698__SM_RO9_25x_522_',
 'GS_LIV_RBCMT_RMH152_7b_24_21328__DT_ROQ9_25x_522_',
 'GS_LIV_RBCMT._RMH137_2d_23_2645__SM_RO9__25x_522_',
 'LIV_RBCMT_RMH126_Sb_23_2643_SM_ROQ_25x_522_',
 'LIV_RBBCMT_RMQ35_2d_21_15699_SM_RO9__25x_522_',
 