In [1]:
from PIL import Image
import sys

import pyocr
import pyocr.builders

In [2]:
tools = pyocr.get_available_tools()

In [3]:
if len(tools) == 0:
    print('please install tesseract')
    print('brew install tesseract')

tesseractがインストールされてない場合はhomebrew使ってインストールしてください

In [4]:
tool = tools[0]

In [5]:
langs = tool.get_available_languages()

In [6]:
langs

['eng', 'osd']

In [7]:
lang = langs[0]

In [8]:
def run_ocr(image_path):
    txt = tool.image_to_string(
        Image.open(image_path),
        lang=lang,
        builder=pyocr.builders.TextBuilder()
    )
    return txt.split('\n')

In [9]:
special_words = ['to form a paste', 'until cloudy']

In [10]:
def parse_chemical(chemical):
    words = chemical.split(' ')

    was_digit  = False
    chemical_name = []
    name = ''
    amount = 0
    unit = ''
    
    # for special word
    for w in special_words:
        if w in chemical:
            name = ' '.join(words[:-1*len(w.split(' '))])
            amount = w
            
            return {'name': name, 'amount': amount, 'unit': unit}

    for w in words:
        if w[0].isdigit() is False and was_digit is False:
            chemical_name.append(w)
            continue
        if w[0].isdigit():
            try:
                amount = float(w)
            except:
                amount = w
            was_digit = True
        if was_digit:
            unit = w
            continue

    return {'name': ' '.join(chemical_name), 'amount': amount, 'unit': unit}

In [11]:
method_group_prefix = [' (A)', ' (B)', ' (A+B)', '(A and B)']

In [12]:
method_exception = ['(sulphide']

In [13]:
def parse_method(method):
    target = ''
    for m in method_group_prefix:
        if m in method:
            target = m.replace(' ', '')
        method = method.replace(m, '')
    
    is_method = True

    method_part = []
    period_part = []
    words = method.split(' ')
    for w in words:
        if '(' in w and w not in method_exception:
            is_method = False
        if is_method:
            method_part.append(w)
        else:
            period_part.append(w.replace('(', '').replace(')', ''))
        
    method = ' '.join(method_part)
    unit = ''
    if len(period_part) > 0:
        unit = period_part[-1]
    period = ' '.join(period_part[:-1])
    return {'name': method, 'period': period, 'unit': unit, 'target': target}

In [14]:
chemicals_group_prefix = ['A', 'B']
method_group_substring = ['(A)', '(B)', '(A+B)']

In [15]:
def parse_text(text):
    state = 0
    json = {}
    
    title = ''
    
    chemicals = []
    new_chemical_group = True
    is_plural = False
    
    methods = []
    
    description = ''
    appendixes = []
    is_new = True
    
    for t in text:
        
        if t == '':
            if state is not 4:
                state += 1
            else:
                is_new = True
            continue
        
        # avoiding strange new line.
        if state is 4 and len(t.split(' ')) < 2:
            state = 3
            
        # if chemicals is separated 2 groups, detect it.
        if state is 2 and is_plural and t.split(' ')[0] in chemicals_group_prefix:
            state = 1
            
        if state is 1:
            if t.split(' ')[0] in chemicals_group_prefix:
                is_plural = True
                new_chemical_group = True
                t = ' '.join(t.split(' ')[1:])
            
        # method checker
        for sub in method_group_substring:
            if sub in t:
                state = 2
        
        if state is 0:
            title = t
        if state is 1:
            if new_chemical_group:
                chemicals.append([t])
                new_chemical_group = False
            else:
                chemicals[-1].append(t)
        if state is  2:
            methods.append(t)
        if state is 3:
            if len(description) is 0:
                description = t
            else:
                description += ' ' + t
        if state is 4:
            if is_new:
                appendixes.append(t)
                is_new = False
            else:
                appendixes[-1] += ' ' + t
    
    recipes = []
    for i in range(len(chemicals)):
        materials = []
        for c in chemicals[i]:
            parsed = parse_chemical(c)
            materials.append(parsed)
        
        if len(chemicals) == len(methods):
            method = parse_method(methods[i])
        if len(methods) < len(chemicals):
            method = parse_method(methods[i-1])
        
        recipes.append({'materials': materials, 'method': method})
        
    
    json = {
        'title': title,
        'recipes': recipes,
        'description': description,
        'appendixes': appendixes
    }
    
    return json

In [16]:
import os

In [17]:
from IPython.display import display, clear_output
import ipywidgets as widtgets

In [268]:
dir_path = './dataset/docs_cropped/bronze_cast_2/'
files = os.listdir(dir_path)

In [265]:
cursor = 0
result = ''
results = []

In [273]:
def check_ocr_result(b):
    file_path = os.path.join(dir_path, files[cursor])
    global result
    result = run_ocr(file_path)
    clear_output()
    for t in result:
        print(t)

In [274]:
def parse_result(b):
    json = parse_text(result)
    results.append(json)
    
    global cursor
    cursor += 1
    
    clear_output()
    print(results[-1]['title'])
    print('\nrecipes')
    print(results[-1]['recipes'])
    print('\ndescription: ')
    print(results[-1]['description'])

In [275]:
check_ocr_button = widtgets.Button(description='run ocr')
display(check_ocr_button)
check_ocr_button.on_click(check_ocr_result)

parse_text_button = widgets.Button(description='parse')
display(parse_text_button)
parse_text_button.on_click(parse_result)

1.213 Blue-green patina on mottled light brown Semi-matt

recipes
[{'materials': [{'amount': 90.0, 'name': 'Ammonium sulphate', 'unit': 'gm'}, {'amount': 90.0, 'name': 'Copper nitrate', 'unit': 'gm'}, {'amount': 1.0, 'name': 'Ammonia (.880 solution)', 'unit': 'cm3'}, {'amount': 1.0, 'name': 'Water', 'unit': 'litre'}], 'method': {'unit': 'hours', 'name': 'Cloth technique', 'period': 'Twenty', 'target': ''}}]

description: 
Soft cotton cloth which has been soaked with the solution is applied to the surface of the object, and stippled into place with a stiff bristle—brush. The object is then left for a period of about twenty hours. The cloth should be removed when it is very nearly dry, and the object then left to dry in air without washing. The blue-green patina tends to develop during the drying period. When treatment is complete and


## memo
1.9, 1.108, 2.107, 108は成分がないため省略

### 手動でデータを修正する場合はここでやる

In [528]:
tmp = result

In [529]:
tmp

['1.199* Grey-brown Matt',
 '',
 'Ammonium chloride 350 gm',
 'Copper acetate ‘ 200 gm',
 'Water 1 litre',
 '',
 'Cotton-wool technique (Ten to twenty hours)',
 '',
 'Cotton-wool, moistened with the solution, is applied to the surface of the object.',
 'This is then left for a period of ten or twenty hours, ensuring that the cotton-wool',
 'remains moist. After the ﬁrst few hours, the surface should be periodically examined',
 'by exposing a small portion, to check the progress. When the desired surface ﬁnish',
 'has been reached, the object is removed, washed in cold water and allowed to dry in',
 '',
 'air. When completely dry, it may be wax ﬁnished.']

In [530]:
tmp = [
'1.199* Grey-brown Matt',
 '',
 'Ammonium chloride 350 gm',
 'Copper acetate 200 gm',
 'Water 1 litre',
 '',
 'Cotton-wool technique (Ten to twenty hours)',
 '',
 'Cotton-wool, moistened with the solution, is applied to the surface of the object.',
 'This is then left for a period of ten or twenty hours, ensuring that the cotton-wool',
 'remains moist. After the ﬁrst few hours, the surface should be periodically examined',
 'by exposing a small portion, to check the progress. When the desired surface ﬁnish',
 'has been reached, the object is removed, washed in cold water and allowed to dry in',
 'air. When completely dry, it may be wax ﬁnished.'
]

In [531]:
result = tmp

In [532]:
parse_text(result)

{'appendixes': [],
 'description': 'Cotton-wool, moistened with the solution, is applied to the surface of the object. This is then left for a period of ten or twenty hours, ensuring that the cotton-wool remains moist. After the ﬁrst few hours, the surface should be periodically examined by exposing a small portion, to check the progress. When the desired surface ﬁnish has been reached, the object is removed, washed in cold water and allowed to dry in air. When completely dry, it may be wax ﬁnished.',
 'recipes': [{'materials': [{'amount': 350.0,
     'name': 'Ammonium chloride',
     'unit': 'gm'},
    {'amount': 200.0, 'name': 'Copper acetate', 'unit': 'gm'},
    {'amount': 1.0, 'name': 'Water', 'unit': 'litre'}],
   'method': {'name': 'Cotton-wool technique',
    'period': 'Ten to twenty',
    'target': '',
    'unit': 'hours'}}],
 'title': '1.199* Grey-brown Matt'}

In [141]:
import json

In [None]:
with open('./blass.json', 'w') as f:
    json.dump(results, f)

### 後処理
- IDつける
- 素材、手法の表記揺れ対応

In [18]:
import json

In [19]:
with open('./bronze.json', 'r') as f:
    bronze = json.load(f)

In [56]:
with open('./blass.json', 'r') as f:
    blass = json.load(f)

### memo

1.9, 1.108, 2.107, 108は成分がないため省略

In [52]:
bronze_id = 1001
for item in bronze['colorings']:
    if bronze_id == 1009:
        bronze_id += 1
    if bronze_id == 1108:
        bronze_id += 1
        
    item['id'] = bronze_id
    bronze_id += 1

In [57]:
blass_id = 2001
for item in blass['colorings']:
    if blass_id == 2107:
        blass_id += 1
    if blass_id == 2108:
        blass_id += 1
    
    item['id'] = blass_id
    blass_id += 1

### 成分の表記揺れ対応

In [61]:
bronze_materials = []
for item in bronze['colorings']:
    for recipe in item['recipes']:
        for material in recipe['materials']:
            bronze_materials.append(material['name'])

In [63]:
import pandas as pd

In [64]:
bronze_mat = pd.DataFrame(bronze_materials, columns=['material'])

In [67]:
bronze_mat.groupby('material').agg({'material': 'count'}).to_excel('./material_check_bronze.xlsx')

In [94]:
blass_materials = []

for item in blass['colorings']:
    for recipe in item['recipes']:
        for material in recipe['materials']:
            blass_materials.append(material['name'])

blass_mat = pd.DataFrame(blass_materials, columns=['material'])

In [95]:
blass_mat.groupby('material').agg({'material': 'count'}).to_excel('./material_check_blass.xlsx')

## Bronze
手修正内容:

Ammonium chloride lgm -> Ammonium chloride 1 gm

Ammonium Chloride -> Ammonium chloride

F erric oxide -> Ferric oxide

Ferric sulphate ng　-> Ferric sulphate 5 gm

Water . -> Water

Water to form -> Water

Water to form a taste  -> Water

In [92]:
for item in bronze['colorings']:
    for recipe in item['recipes']:
        for material in recipe['materials']:
            if material['name'] == 'Water to form a taste':
                print(item['id'])
                material['name'] = 'Water'
                material['amount'] = 'to form a taste'

1126


### Blass
手修正内容

Acetic acid (10% solution) llitre -> Acetic acid (10% solution) 1 litre

Ammonium chloride lOgm -> Ammonium chloride 10 gm

F erric sulphate -> Ferric sulphate

Water to form a -> Water

In [113]:
for item in blass['colorings']:
    for recipe in item['recipes']:
        for material in recipe['materials']:
            if material['name'] == 'Water ‘':
                print(item['id'])
                material['name'] = 'Water'
                # material['amount'] = ''
                # material['unit'] = 'gm'

2163


### 手法の表記揺れ対応

In [127]:
bronze_methods = []
for item in bronze['colorings']:
    for recipe in item['recipes']:
        bronze_methods.append(recipe['method']['name'])

In [128]:
bronze_methods_mat = pd.DataFrame(bronze_methods, columns=['method'])
bronze_methods_mat.groupby('method').agg({'method': 'count'}).to_excel('./method_check_bronze.xlsx')

### bronze

Cold scratch—brushing -> Cold scratch-brushing

Cotton-Wool technique -> Cotton-wool technique

Cotton~wool technique ->

Cotton—wool technique ->

Hot immersion and scratCh-brushing -> Hot immersion and scratch-brushing

In [126]:
for item in bronze['colorings']:
    for recipe in item['recipes']:
        if recipe['method']['name'] == 'Hot immersion and scratCh-brushing':
            print(item['id'])
            recipe['method']['name'] = 'Hot immersion and scratch-brushing'

1083


In [129]:
blass_methods = []
for item in blass['colorings']:
    for recipe in item['recipes']:
        blass_methods.append(recipe['method']['name'])
        
blass_methods_mat = pd.DataFrame(blass_methods, columns=['method'])
blass_methods_mat.groupby('method').agg({'method': 'count'}).to_excel('./method_check_blass.xlsx')

### Blass

Boiling immersion, scratch—brushing -> Boiling immersion and scratch-brushing

Cold scratch—brushing -> Cold scratch-brushing

Cotton—wool technique -> Cotton-wool technique

Hot immersion . -> Hot immersion

Torch technique -> Torch technique 

In [134]:
for item in blass['colorings']:
    for recipe in item['recipes']:
        if recipe['method']['name'] == 'Torch technique ':
            print(item['id'])
            recipe['method']['name'] = 'Torch technique'

2075
2075
2084
2084


In [139]:
with open('bronze.json', 'w') as f:
    json.dump(bronze, f)

In [140]:
with open('blass.json', 'w') as f:
    json.dump(blass, f)