In [13]:
import io
from google.cloud import vision
from google.cloud.vision import types

from tqdm import tqdm
import statistics

In [2]:
# Module-level Cloud Vision client; get_response() sends every OCR request through it.
# NOTE(review): presumably authenticates via credentials found in the environment — confirm.
client = vision.ImageAnnotatorClient()



In [4]:
def get_response(filein):
    """Run document text detection on a single image file.

    The file at ``filein`` is read as raw bytes, wrapped in a Vision ``Image``
    and passed to the module-level ``client``. Whatever
    ``client.document_text_detection`` returns is handed back unchanged.
    """
    with io.open(filein, 'rb') as image_file:
        raw_bytes = image_file.read()
    return client.document_text_detection(image=types.Image(content=raw_bytes))

In [5]:
# OCR the ten page images (indices 0 through 9) in order, keeping one response per page.
response_list = [
    get_response("./convert/7.012noteslindrew-{}.png".format(page_index))
    for page_index in range(10)
]

In [297]:
# Characters that need escaping before they can appear in LaTeX body text.
special = "&%$#_{}~^\\"

# Default escape: prefix the character with a backslash.
special_map = {ch: "\\" + ch for ch in special}

# These three are overridden with named text commands instead of the
# backslash-prefixed form. The trailing "{}" terminates the control word:
# without it a following letter fuses into an undefined command
# (e.g. "\textasciitildeb") and a following space is swallowed.
special_map["~"] = "\\textasciitilde{}"
special_map["^"] = "\\textasciicircum{}"
special_map["\\"] = "\\textbackslash{}"

# Square brackets are wrapped in a group.
# NOTE(review): presumably so a "[" right after "\\" or "\item" is not parsed
# as an optional argument — confirm.
special_map["["] = "{[}"
special_map["]"] = "{]}"

def parse_sym(s):
    """Return the LaTeX replacement for symbol text ``s``.

    ``s`` is looked up in ``special_map``; text without an entry is returned
    unchanged.
    """
    return special_map.get(s, s)

def parse_detected_break(text, detected_break, in_itemize = False):
    """Attach the separator implied by an OCR break to ``text``.

    ``detected_break.type`` selects the separator:

    * 1 -> a single space
    * 2 -> four spaces
    * 3 or 5 -> a line ending: a bare newline when ``in_itemize`` is true,
      otherwise a LaTeX line break (two backslashes) followed by a newline
    * anything else (including 0 / None) -> nothing

    The separator is placed before ``text`` when ``detected_break.is_prefix``
    is truthy and after it otherwise.

    NOTE(review): the numbers presumably follow the Vision ``BreakType`` enum
    (1 SPACE, 2 SURE_SPACE, 3 EOL_SURE_SPACE, 5 LINE_BREAK) — confirm.
    """
    kind = detected_break.type

    if kind == 1:
        separator = " "
    elif kind == 2:
        separator = "    "
    elif kind in (3, 5):
        # Inside an itemize a plain newline is enough; elsewhere force a line break.
        separator = "\n" if in_itemize else "\\\\\n"
    else:
        separator = ""

    return separator + text if detected_break.is_prefix else text + separator

def avg_sym_width(block):
    """Return the median symbol width in ``block``.

    Despite the "avg" in the name, the statistic is the median. A symbol's
    width is the x distance between vertices 0 and 1 of its bounding box.

    Returns 0 when the block contains no symbols, rather than letting
    ``statistics.median`` raise ``StatisticsError`` on empty data.
    """
    widths = [
        sym.bounding_box.vertices[1].x - sym.bounding_box.vertices[0].x
        for paragraph in block.paragraphs
        for word in paragraph.words
        for sym in word.symbols
    ]
    if not widths:
        return 0
    return statistics.median(widths)

def avg_sym_heights(document):
    """Return the median symbol height over every page of ``document``.

    Despite the "avg" in the name, the statistic is the median. A symbol's
    height is the y distance between vertices 0 and 2 of its bounding box.

    Returns 0 when the document contains no symbols (e.g. it has no pages),
    rather than letting ``statistics.median`` raise ``StatisticsError``.
    """
    heights = [
        sym.bounding_box.vertices[2].y - sym.bounding_box.vertices[0].y
        for page in document.pages
        for block in page.blocks
        for paragraph in block.paragraphs
        for word in paragraph.words
        for sym in word.symbols
    ]
    if not heights:
        return 0
    return statistics.median(heights)

def avg_sym_height_block(block):
    """Return the median symbol height within a single ``block``.

    Despite the "avg" in the name, the statistic is the median. A symbol's
    height is the y distance between vertices 0 and 2 of its bounding box.

    Returns 0 when the block contains no symbols, rather than letting
    ``statistics.median`` raise ``StatisticsError`` on empty data.
    """
    heights = [
        sym.bounding_box.vertices[2].y - sym.bounding_box.vertices[0].y
        for paragraph in block.paragraphs
        for word in paragraph.words
        for sym in word.symbols
    ]
    if not heights:
        return 0
    return statistics.median(heights)

def get_raw(block):
    """Return the text of every symbol in ``block`` joined end to end.

    Symbols are visited paragraph by paragraph and word by word; no spaces,
    line breaks or LaTeX escaping are added.
    """
    return "".join(
        symbol.text
        for paragraph in block.paragraphs
        for word in paragraph.words
        for symbol in word.symbols
    )

def extract_block(block):
    """Convert one OCR block into LaTeX source text.

    Symbols are escaped with ``parse_sym`` and joined with the separators from
    ``parse_detected_break`` at symbol, word, paragraph and block level.

    A bullet or hyphen at the start of a line starts an item in an itemize
    environment. A "." directly after a lone "1" or "|" is treated as a
    bullet too, and so is any symbol that follows a "1" at the start of a
    line. Nesting is inferred from the horizontal offset between consecutive
    bullet symbols, measured in multiples of the block's median symbol width.
    A line inside a list that does not start with an item closes every open
    level, and any levels still open at the end of the block are closed.

    Returns the LaTeX text for the block as a string.
    """
    avg_width = avg_sym_width(block)

    b = ""

    in_itemize = False
    itemize_levels = 0
    last_itemize = None  # bullet symbol of the previous item, used for indent comparison

    for paragraph in block.paragraphs:
        p = ""
        for word in paragraph.words:
            w = ""
            for symbol in word.symbols:
                detected_break = symbol.property.detected_break
                text = parse_sym(symbol.text)

                # NOTE(review): presumably OCR misreads a bullet as "1" or "|",
                # so "1." / "|." at paragraph start (or a "1" right after a
                # newline) is rewritten to a bullet — confirm.
                if text == "." and (p == "1" or p == "|"):
                    p = ""
                    text = "•"
                elif len(p) >= 2 and p[-2:] == "\n1":
                    text = "•"
                    p = p[:-1]

                # Exact match against the two bullet characters. The previous
                # substring test (text in "•-") also matched the empty string.
                # NOTE(review): the line-start test only inspects p, not w, so
                # a "-" inside the first word of a line also matches — confirm
                # whether that can occur in practice.
                if text in ("•", "-") and (p == "" or p[-1] == '\n'):
                    text = ""
                    if not in_itemize:
                        text += "\\begin{itemize}\n"
                        in_itemize = True
                        itemize_levels += 1
                    # in_itemize is always true here, so every bullet emits an item.
                    text += "\\item "

                    if last_itemize is not None and itemize_levels <= 3:
                        # Horizontal shift of this bullet relative to the previous one.
                        dist = symbol.bounding_box.vertices[0].x - last_itemize.bounding_box.vertices[0].x

                        if dist >= 4 * avg_width:
                            # Indented far enough to the right: open a nested list.
                            text = "\\begin{itemize}\n" + text
                            itemize_levels += 1
                            in_itemize = True
                        elif dist <= -4 * avg_width and itemize_levels >= 2:
                            # Moved back to the left: close one nesting level.
                            text = "\\end{itemize}\n" + text
                            itemize_levels -= 1
                            in_itemize = (itemize_levels != 0)

                    last_itemize = symbol
                w += parse_detected_break(text, detected_break, in_itemize)

            # A word that begins a new line inside a list but carries no item
            # marker ends the list: close every open level.
            if in_itemize and "\\item" not in w:
                if p != "" and p[-1] == '\n':
                    p += "\\end{itemize}\n" * itemize_levels
                    itemize_levels = 0
                    in_itemize = False
                    last_itemize = None

            p += parse_detected_break(w, word.property.detected_break, in_itemize)

        b += parse_detected_break(p, paragraph.property.detected_break, in_itemize)

    # Close any list still open when the block ends.
    if in_itemize:
        b += "\\end{itemize}\n" * itemize_levels

    text = parse_detected_break(b, block.property.detected_break)
    # Drop a forced line break that ended up directly in front of a list opening.
    text = text.replace("\\\\\\begin{itemize}", "\\begin{itemize}")
    text = text.replace("\\\\\n\\begin{itemize}", "\n\\begin{itemize}")
    return text

In [296]:
# Build the complete LaTeX document, then write it to dev2take2.tex.

# Read the template inside a context manager so its handle is closed.
with open("format.tex", "r") as template_file:
    fmt = template_file.read()

pieces = []
for response in response_list:
    document = response.full_text_annotation
    avg_height = avg_sym_heights(document)
    for page in document.pages:
        for block in page.blocks:
            height = avg_sym_height_block(block)
            if height >= 1.9 * avg_height:
                # Blocks whose symbols are much taller than the page median are
                # treated as headings. The title is escaped like body text so
                # characters such as "%" or "&" cannot break the \section{}.
                title = "".join(parse_sym(ch) for ch in get_raw(block))
                pieces.append("\\newpage \n \\section{" + title + "}")
            else:
                pieces.append(extract_block(block))

al = "".join(pieces)
text = fmt + al + "\\end{document}"

# Open the output only after everything has been built, so a failure above
# does not leave a truncated dev2take2.tex behind.
with open("dev2take2.tex", 'w') as f:
    f.write(text)


