## To Do (5/9/23)

* clean up description processing code and wrap it all into a single function
    * works similarly to image_processor

In [1]:
import usImageProc as uip

import pandas as pd
from PIL import Image
import cv2
import os
import re
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import largestinteriorrectangle as lir
from tqdm import tqdm
from time import sleep

import re

import easyocr
# configure easyocr reader
reader = easyocr.Reader(['en'])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# configuration
description_kw = ['breast','lt','long','rt','trans','area','palpated','axilla','areolar','radial','marked','supraclavicular','oblique','contrast']
description_kw_expand= ['cm','fn','breast','lt','long','rt','trans','area',
                        'palpated','axilla','areolar','radial','marked',
                        'supraclavicular','oblique','contrast','retroareolar',
                        'harmonics','axillary','subareolar','nipple','anti', 
                        'periareolar','subclavicular']
description_kw_contract = ['retro areolar', 
                           'sub areoloar', 
                           'peri areolar',
                           'anti -rad']
description_kw_sub = {'scm':'5 cm', 
                      'anti radial':'anti-rad', 
                      'axillary':'axilla', 
                      'axlla':'axilla',
                      'subclavcular':'subclavicular'}

description_labels_dict = {
    'area':{'breast':['breast'],
            'axilla':['axilla'],
            'supraclavicular':['superclavicular','supraclavicular'],
            'subclavicular':['subclavicular','subclavcular']},
    'laterality':{'left':['lt','left'],
                  'right':['rt','right']},
    'orientation':{'long':['long'],
                    'trans':['trans'],
                    'anti-radial':['anti-rad','anti-radial'],
                    'radial':['radial'],
                    'oblique':['oblique']}
}

# image id followed by dictionary of corrections to apply
corrections = { 4094:{'description':'long lt breast 10.00 scm fn area palpated', 'area':'breast'} }

In [3]:
# note these numbers refer to the filenames, e.g.
# 2 → 000002_cropped.png
# subtract 1 from each to get the index in the alphabetized list

sectors = [2,11,12,94,100,287,291,522,523,525,526,
          527,528,530,531,533,536,544,559,635,637,
          638,639,640,641,645,646,892,916,917,918,
          919,920,972,973,978,983,984,1140,1146,1147,
          1150,1498,1553,1555,1556,1557,1710,1711,
           1712,1713,1714,1715,1716,1717,1718,1856,
          1857,1861,1862,1863,1864,1973,1978,1979,
          1982,1984,1985,1987,1988,1992,1995,1998]
traps = [ ]
len(sectors)

73

In [4]:
# this cell is for text utilities
import re

# Helper functions for text processing - mostly used to extract description from image

def contains_substring(input_string, substring_list):
    for substring in substring_list:
        if substring in input_string:
            return True
    return False

def has_digit(input_string):
    pattern = re.compile(r'\d') # Compile a regular expression pattern to match digits
    return bool(pattern.search(input_string)) # Return True if a match is found, False otherwise

def text_freq_df_column( df, col = 'description'):
    """Compute frequencies of words in column of strings.  Not case sensitive.
    
    Helps to identify keywords and also mispellings
    
    Args:
        df: Pandas dataframe 
        col: column of strings for frequency analysis (defaults to 'description')
    
    Returns:
        counts:  pd series of counts indexed by words in descending order of frequency
        
    Example:
        db = pd.read_csv('database_total.csv')
        db = db.fillna('')
        counts = text_freq_df_column(db)
        counts[0:60]
    """

    soup = []
    for d in df[col]:
        if d is not None:
            split_soup = d.split(' ')
            for s in split_soup:
                s = s.lower()
                s = s.replace('/','')
                if s !='' and not has_digit(s):
                    soup.append(s)
    print(len(soup))
    counts = pd.Series(soup).value_counts()
    return counts

def pad_substrings_with_spaces(substrings, input_str):
    # Iterate over each substring in the list
    for substring in substrings:
        # Replace each occurrence of the substring with the same substring padded with spaces
        input_str = input_str.replace(substring, f" {substring} ")

    # remove duplicate spaces    
    words = input_str.split()
    input_str = ' '.join(words)
    
    # Return the string
    return input_str

def clean_text(input_str, sub_dict, kw_expand, kw_contract):
    """Process input string and add spaces around keywords, substitute for common OCR mistakes
    
    Args:
        input_str: string to be processed
        kw_list:  any word in this list will be searched and padded with spaces
        repair_dict: keys are substrings that will be replaced by their corresponding values
    
    Returns:
        output_str: repaired string
    """
    
    # first make substitutions
    for k in sub_dict.keys():
        input_str = input_str.replace( k, sub_dict[k] )
        
    # now add spaces around all substrings in kw_expand
    for substring in kw_expand:
        input_str = input_str.replace( substring, f" {substring} " )
        
    # remove duplicate spaces
    words = input_str.split()
    input_str = ' '.join(words)
    
    # remove spaces from words in kw_contract
    for substring in kw_contract:
        input_str = input_str.replace( substring, substring.replace(' ','') )
        
    return input_str

def clean_text_df( df, sub_dict, kw_expand, kw_contract, col = 'description'):
    df[col] = df[col].apply(clean_text, args = (sub_dict, kw_expand, kw_contract) )

def label_parser(x, label_dict={}):
    for k in label_dict.keys():
        labels = label_dict[k]
        if contains_substring(x,labels):
            return k
    return 'unknown'

def find_time_substring(text):
    # Regular expression to match time substrings of the form HH:MM or HH.MM
    # does not need to be have blank spaces
    pattern = r'\d{1,2}[:.]\d{2}'
    
    # Find all matches in the input text
    matches = re.findall(pattern, text)
    
    if len(matches)==0:
        return 'unknown'
    else:
        # Return only the first match
        time = matches[0].replace('.',':')
        return time
    
def find_cm_substring(input_str):
    """Find first substring of the form #cm or # cm or #-#cm or #-# cm, not case sensitive
    
    Args:
        input_str:  string
        
    Returns:
        list of matched substrings
    """
    # Regular expression to match s
    pattern = r'\d+(-\d+)?\s*cm'
    
    input_str = input_str.lower()
    input_str = input_str.replace("scm","5cm") #easyocr sometimes misreads 5cm as scm
    
    # Find all matches in the input string
    matches = re.finditer(pattern, input_str)
    
    # get list of matches
    list_of_matches = [m.group() for m in matches]
    
    if len(list_of_matches)==0:
        return 'unknown'
    else:
        return list_of_matches[0]
    
def extract_descript_features( input_str, labels_dict ):
    
    output_dict = {}
    for feature in labels_dict.keys():
        levels_dict = labels_dict[feature]
        output_dict[feature] = label_parser( input_str, levels_dict)

    output_dict['clock_pos'] = find_time_substring(input_str)
    output_dict['nipple_dist'] = find_cm_substring(input_str)
    
    return output_dict
    
def extract_descript_features_df(df, labels_dict, col = 'description'):

    # first extract simple text features
    for feature in labels_dict.keys():
        levels_dict = labels_dict[feature]
        df[feature] = df[col].apply( label_parser, label_dict = levels_dict )
    
    # extract clock_position
    df['clock_pos'] = df[col].apply( find_time_substring )
    
    # extract nipple_dist
    df['nipple_dist'] = df[col].apply( find_cm_substring )
    
    return df


In [5]:
description = 'long rt breast 8.00 5cm fn'
dict = extract_descript_features( description, description_labels_dict)
dict

{'area': 'breast',
 'laterality': 'right',
 'orientation': 'long',
 'clock_pos': '8:00',
 'nipple_dist': '5cm'}

In [6]:
db = pd.read_csv('database_total.csv')
db = db.iloc[:100]
db2 = extract_descript_features_df( db, description_labels_dict )
db2

FileNotFoundError: [Errno 2] No such file or directory: 'database_total.csv'

In [25]:
image_folder_path = r"C:/Users/jbaggett/image"
proc_images_folder = r"C:/Users/jbaggett/proc_image"

input_file = r"database.csv"
output_file = r"database_testing.csv"

# processing configuration
debug = False
write_images = True
display_images = False

# open database and get filenames to be processed
db_in = pd.read_csv(input_file)
files = db_in['filename']
image_numbers = np.arange(len(files))

# open or create output database
import os.path
check_db_out = os.path.isfile(output_file)
if check_db_out:
    db_out = pd.read_csv(output_file)
else:
    db_out = db_in.copy()
    new_features = ['processed','crop_x', 'crop_y', 'crop_w', 'crop_h', 'description', 'size', 'sector_detected', 'darkness','area','laterality','orientation','clock_pos','nipple_dist']
    for nf in new_features:
        db_out[nf] = None
    db_out['processed'] = False
                

for i in tqdm(image_numbers):
    sleep(0.01)
    if not db_out['processed'][i]:
        file_name = db_in['filename'][i]
        us_x = min(db_in['us_x0'][i],0)
        us_y = db_in['us_y0'][i]
        us_w = db_in['us_x1'][i]-us_x
        us_h = db_in['us_y1'][i]-us_y
        rect_us = (us_x, us_y, us_w, us_h)
        #print('rect_us: ', rect_us)
        # Check if the file is an image
        if file_name.endswith(('.png', '.jpg', '.jpeg', '.bmp')):
            # Construct the full path to the image file

            if debug:
                print('Processing: ', file_name )

            full_filename = os.path.join(image_folder_path, file_name)
            image_out_path = os.path.join(proc_images_folder, file_name)

            # Open the image file and store it in an image object
            img = Image.open(full_filename)

            # recast image as numpy array
            img = np.array(img)
            img_orig = img.copy()

            img_dict = uip.img_processor(img, reader, 
                                         rect_US = rect_us,
                                         kw_list = description_kw)
            if debug: 
                print(img_dict)
                print('Processing Complete: ', file_name)

            # insert into total database
            new_features = ['crop_x', 'crop_y', 'crop_w', 'crop_h', 'description', 'size', 'is_sector', 'darkness']
            crop_x, crop_y, crop_w, crop_h = img_dict['rect_crop']
            description = img_dict['text_description']
            db_out.loc[i,'crop_x'] = crop_x
            db_out.loc[i,'crop_y'] = crop_y
            db_out.loc[i,'crop_w'] = crop_w
            db_out.loc[i,'crop_h'] = crop_h
            db_out.loc[i,'description'] = description
            db_out.loc[i,'size'] = img_dict['text_size']
            db_out.loc[i,'sector_detected'] = img_dict['sector_detected']
            db_out.loc[i,'processed'] = True
            db_out.loc[i,'darkness'] = img_dict['darkness']
            if len(description)>0:
                feature_dict = extract_descript_features( description, description_labels_dict )
                display_str = ''
                for feature in feature_dict.keys():
                    db_out.loc[i,feature] = feature_dict[feature]
                    display_str = display_str + feature_dict[feature] + ' '
            else:
                display_str = ''

            if write_images or display_images: # add description and crop region to image
                img_orig = uip.add_rect(img_orig, img_dict['rect_crop'])
                img_orig = uip.add_text(img_orig, display_str)

            if write_images:
                cv2.imwrite(image_out_path,img_orig)
            if display_images:
                img2 = img_orig.copy()
                img2 = uip.add_rect(img2, img_dict['rect_machine'])
                img2 = uip.add_rect(img2, img_dict['rect_description'])
                img2 = uip.add_rect(img2, img_dict['rect_colorbar'])
                if len(img_dict['rect_sizebox'])>0:
                    img2 = uip.add_rect(img2, img_dict['rect_sizebox'])
                    
                fig, (ax1,ax2) = plt.subplots(1,2,figsize=(20, 15)) 

                ax1.imshow(img_orig,cmap='gray')   
                ax2.imshow(img2,cmap='gray')
                fig.show()                
                
db_out.to_csv(output_file,index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 4098/4098 [29:50<00:00,  2.29it/s]


In [39]:
text_freq_df_column( db_out, col = 'area')

4096


breast             3563
axilla              486
unknown              29
supraclavicular      16
subclavicular         2
dtype: int64

In [58]:
db_out[ db_out['patient_id']==4]['image_id']

12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
20    21
21    22
22    23
23    24
24    25
25    26
26    27
27    28
Name: image_id, dtype: int64

In [61]:
# database utilities

def db_filters( db_in, db_out = None, only_breast = True, only_gray = False, only_calipers = False, max_darkness = 50):
    # db_in is the name of the csv file containing our database
    # returns a dataframe with filterrs applied and optionally writes to db_out
    pass

def fetch_index_for_patient_id( id, db, only_gray = False, only_calipers = False ):
    # id is a patient id number that should be listed in database
    # only_gray = True → return only monochrome files (not doppler)
    # only_calipers = True → return only files that include calipers
    # returns list of indices
    
    if id in db['patient_id'].tolist():
         indices= db.index[db['patient_id']==id].tolist()
    else:
        indices = []
    return indices

fetch_index_for_patient_id( 4, db_out )

[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]

In [107]:
image_folder_path = r"C:/Users/jbaggett/image"


def find_nearest_images( db, patient_id):

    idx = np.array(fetch_index_for_patient_id(patient_id, db_out))
    num_images = len(idx)
    result = {}

    for j,c in enumerate(idx):
        x = db_out.loc[c]['crop_x']
        y = db_out.loc[c]['crop_y']
        w = db_out.loc[c]['crop_w']
        h = db_out.loc[c]['crop_h']
        img_stack = np.zeros((num_images,w*h))
        root_image = np.where(calipers==c)
        for i,image_id in enumerate(idx):
            file_name = db.loc[image_id]['filename']
            full_filename = os.path.join(image_folder_path, file_name)
            img = Image.open(full_filename)
            img = np.array(img)
            img,_ = uip.make_grayscale(img)
            img = img[y:y+h,x:x+w]
            img = img.flatten()
            img_stack[i,:] = img
        img_stack = np.abs(img_stack - img_stack[j,:])
        img_stack = np.mean( img_stack, axis=1 ) 
        img_stack[j] = 1000
        sister_image = np.argmin(img_stack)
        distance = img_stack[sister_image]
        result[c] = {'filename': db.loc[c]['filename'],
                     'sister_filename':db.loc[ idx[sister_image]]['filename'],
                     'distance': distance}
    return result


input_file = r'database_total.csv'
output_file = r'datbase_total_v2.csv'

db_in = pd.read_csv(input_file)
patient_ids = db_in['patient_id'].unique()
        
db_out = db_in.copy()
db_out['closest_fn']=''
db_out['distance'] = -1

for pid in tqdm(patient_ids):
    result = find_nearest_images(db_in, 4)
    idxs = result.keys()
    for i in idxs:
        db_out.loc[i,'closest_fn'] = result[i]['sister_filename']
        db_out.loc[i,'distance'] = result[i]['distance']
        
db_out.to_csv(output_file,index=False)


100%|████████████████████████████████████████████████████████████████████████████████| 361/361 [21:34<00:00,  3.59s/it]


In [91]:
x = np.array([ [1,2,3],[4,5,6],[7,8,9] ] )
print(x)
print(x-x[0,:])

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[0 0 0]
 [3 3 3]
 [6 6 6]]


In [46]:
import numpy as np
from scipy.spatial.distance import cdist

def compute_distance_matrix(arr):
    """
    Computes a distance matrix for differences along the first dimension of a numpy array.
    """
    return cdist(arr, arr, 'euclidean')

# Define a numpy array
arr = np.array([[1, 2], [3, 4], [5, 6]])

# Compute the distance matrix
dist_matrix = compute_distance_matrix(arr)

# Print the result
print(dist_matrix)


[[0.         2.82842712 5.65685425]
 [2.82842712 0.         2.82842712]
 [5.65685425 2.82842712 0.        ]]


'This is a 3 cm test of long lt'

In [None]:
# fix this up
# the size box seems to always have the horizontal position
# but the vertical coordinates can vary
#
# read the text in that region, if any, using easyocr
# use the text coordinates to estimate the height of the rectangle
# may not use erosion and dilation at all
def get_box_rect(img, rect = (720,520,160,200), thresh = 40):
    # img is HxW or HxWx3 image in numpy array
    # look for box in lower right corner and return coordinates
    # return tuple (x,y,w,h) if box detected, otherwise empty tuple ()

    img_gray, is_color = make_grayscale(img)
    img_bw = make_mask(img_gray, thresh = thresh)

    # blackout everything not in box region (this won't generalize cuz the approx box loc is hardwired)
    H,W = img_bw.shape
    x,y,w,h = rect
    rect = (x,y,min(W-x,w),min(H-y,h)) # truncate if box extends too far down or right
    img_bw = blackout_rectangle_exterior(img_bw,rect)
    
    # dilate a bit to make sure box is closed
    kernel = np.ones((5,5),np.uint8)
    img_dilated = cv2.dilate(img_bw,kernel,iterations=1)
    
    # get contours and find largest, if its big enough get box coordinates
    contours, hierarchy = cv2.findContours(img_dilated, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    area = 0
    size_strings = []
    if len(contours) > 0:
        c = max(contours,key=cv2.contourArea)
        area = cv2.contourArea(c)
        if area >= 1000:
            x,y,w,h = cv2.boundingRect(c)
            img_cleaned = blackout_rectangle_exterior(img, (x,y,w,h) )
            result = reader.readtext(img_cleaned,paragraph=False)
            size_box_found = len(result)>0
            if size_box_found:
                size_strings = [r[1] for r in result]

    if (area >= 1000) and size_box_found:
        box_rect = (x,y,w,h)
    else:
        box_rect = None
        
    return box_rect, size_strings

def get_text_box(img, thresh = 170, rect = (22,500,811,220), solidify = True, pos0 = (409,655)):
    # img is HxW or HxWx3 image in numpy array
    # look for text at bottom of image
    # return tuple (x,y,w,h) of rectangle containing text and text string
    # this works best if the text has a clear separation from the "size box" in the lower right corner
    # remove the "size box" before text extraction
    # pos0 is used as reference point, the closest "paragraph" to the ref point will be returned
    #     if multiple strings are found in the rectangle

    # if image has multiple channels convert it to grayscale (could probably improve this)
    if len(img.shape)>2:
        img_gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
    else:
        img_gray = img.copy()
    
    # black out everything outside of text region
    H,W = img_gray.shape
    img_gray = blackout_rectangle_exterior( img_gray, rect)
                                                                                
    # convert to black and white to increase contrast
    threshold = thresh
    th,img_bw = cv2.threshold(img_gray,threshold,255,cv2.THRESH_BINARY)
    
    # new approach 4/12/23
    #
    # use easyocr as first pass and get longest string and coordinates of box
    # use tesseract to extract string from the box provided by easyocr
    img_focused = blackout_rectangle_exterior( img, rect)
    result = reader.readtext(img_focused,paragraph=True)
    #print('Easyocr results: ',result)
    lengths = [ len(r[1]) for r in result ]
    
    if len(lengths)==0: # no text detected in region
        return [],[] # return empty and try again
    
    # this needs to be redone to check content of string, not just position
    distsq = [ (r[0][0][0]-pos0[0])**2 + (r[0][0][1]-pos0[1])**2 for r in result ]
    #idx = np.argmax( np.array(lengths) )
    idx = np.argmin( np.array(distsq) )
    string = result[idx][1]
    coord = result[idx][0]
    rect_txt = easyocr_coord_to_rect( coord )
    img_bw = blackout_rectangle_exterior( img_bw, rect_txt)
    text = pytesseract.image_to_string(img_bw)
    text = text.replace("\n"," ")
    return rect_txt, text



### finish this function
### 1. return coordinates with respect to original rectangle
### 2. implement multiple methods 
def extract_text_from_rectangle(img, rect = (22,500,811,220), method = 'easyocr', thresh = 170, solidify = True):
    # img is HxW or HxWx3 image in numpy array
    # rect = (x0,y0,w,h) is region
    # returns string and bounding rectangle with respect to original image
    (x,y,w,h) = rect
    img_cropped = img[y:y+h,x:x+w]
    # use the reader from the global scope, bad coding, refactor later
    result = reader.readtext(img_cropped,paragraph=True)
    if len(result)>0:
        size_strings = result[0][1]
        coords = result[0][0]
        # rectangle coord relative to input rect
        rect1 = easyocr_coord_to_rect(coords)
        rect_text = (rect[0]+rect1[0], rect[1]+rect1[1],rect1[2],rect1[3])
    else:
        size_strings = []
        rect_text = []
    
    return size_strings, rect_text

In [300]:
x = None
x == None

True

In [2]:
rect = (10,20,50,30)
x,y,w,h = rect
print(x,y,w,h)

10 20 50 30


In [None]:
def size_box_extracter(img):
    # img is HxW or HxWx3 image in numpy array
    # returns string and bounding rectangle with respect to original image
    
    # start by looking for the last row of text in the size box by using easyocr
    x = 722
    y = 688
    w = 900-x
    h = 720-y
    img_cropped = img[y:y+h,x:x+w]
    # use the reader from the global scope, bad coding, refactor later
    result = reader.readtext(img_cropped,paragraph=True)
    
    if len(result)>0:
        size_string = result[0][1]
        if len(size_string)<3: # it detected something but not a full size row
            size_string = []
            rect_box = []
        else:
            # print('Size String in Extraction: ',size_string)
            first_char = size_string[0]
            if not first_char.isnumeric():
                first_char = '2' # this is a hack
            num_rows = int(first_char)
            h = 34*num_rows+26+5 # approx height of box
            y = 720 - h # approximate top of box
            x = 720 # tie this to the coordinate extraction
            w = 900-x
            #print('Inside size_box_extracter init guess: ',(x,y,w,h))
            rect_box,size_string = get_box_rect(img, rect = (x,y,w,h), thresh = 40)
            #print('Inside size_box_extracter rect_box: ',rect_box)
    else:
        size_string = []
        rect_box = []
    
    return rect_box, size_string