# Extract text from XML file

In [3]:
from bs4 import BeautifulSoup
import re

def _han_text_of_paragraph(paragraph):
    """Return the Han characters of one <paragraph>, ignoring annotations."""
    # Annotations (original notes / commentary) are removed entirely.
    for note in paragraph.find_all('annotation'):
        note.decompose()

    # <index> elements keep their text; only the tag itself is dropped.
    for index_node in paragraph.find_all('index'):
        index_node.replace_with(index_node.get_text())

    # Flatten what is left to text and keep the Han characters only.
    return extract_chinese_characters_only(paragraph.get_text())


def extract_chinese_chars_from_level3(xml_file_path):
    """Collect the Han characters found under level3 > text > content > paragraph.

    The file at ``xml_file_path`` is read as UTF-8 and parsed with
    BeautifulSoup's ``'xml'`` parser. For every ``<level3>`` element the first
    ``<text>`` and, inside it, the first ``<content>`` are located; each
    ``<paragraph>`` below that is stripped of ``<annotation>`` elements and
    reduced to its Han characters. Progress messages are printed as it goes.

    Returns:
        All extracted characters concatenated in document order, or ``""``
        when no ``<level3>`` element exists or an ``Exception`` is raised
        (the error is printed, not propagated).
    """
    try:
        with open(xml_file_path, 'r', encoding='utf-8') as xml_file:
            raw_xml = xml_file.read()

        document = BeautifulSoup(raw_xml, 'xml')

        # Step 1: every <level3> element in the document.
        sections = document.find_all('level3')
        if not sections:
            print("❌ <level3> 태그를 찾을 수 없습니다.")
            return ""

        print(f"🔍 발견된 <level3> 태그: {len(sections)}개")

        pieces = []

        for sec_no, section in enumerate(sections):
            # Step 2: follow level3 > text > content > paragraph and skip the
            # section as soon as one level of the path is missing.
            text_node = section.find('text')
            if not text_node:
                print(f"  ❌ level3[{sec_no}]에서 <text> 태그 없음")
                continue

            content_node = text_node.find('content')
            if not content_node:
                print(f"  ❌ level3[{sec_no}]에서 <content> 태그 없음")
                continue

            para_nodes = content_node.find_all('paragraph')
            if not para_nodes:
                print(f"  ❌ level3[{sec_no}]에서 <paragraph> 태그 없음")
                continue

            print(f"  ✅ level3[{sec_no}]에서 {len(para_nodes)}개 paragraph 발견")

            # Steps 3-6 happen per paragraph in the helper.
            for para_no, para in enumerate(para_nodes):
                han_only = _han_text_of_paragraph(para)
                if not han_only:
                    continue
                pieces.append(han_only)
                print(f"    📝 paragraph[{para_no}]: {han_only[:50]}... (총 {len(han_only)}자)")

        combined = ''.join(pieces)
        print(f"\n📄 총 추출된 한자: {len(combined)}자")
        return combined

    except Exception as e:
        print(f"오류: {e}")
        return ""

# Han ideograph ranges accepted by extract_chinese_characters_only:
#   U+4E00-U+9FFF    CJK Unified Ideographs (basic block)
#   U+3400-U+4DBF    CJK Extension A
#   U+20000-U+2A6DF  CJK Extension B
#   U+2A700-U+2B73F  CJK Extension C
#   U+2B740-U+2B81F  CJK Extension D
#   U+2B820-U+2CEAF  CJK Extension E
# The supplementary-plane ranges need the 8-digit \U escape; \u only takes
# four hex digits and cannot express them.
_HAN_CHAR_PATTERN = re.compile(
    r'['
    r'\u4e00-\u9fff'
    r'\u3400-\u4dbf'
    r'\U00020000-\U0002a6df'
    r'\U0002a700-\U0002b73f'
    r'\U0002b740-\U0002b81f'
    r'\U0002b820-\U0002ceaf'
    r']'
)


def extract_chinese_characters_only(text):
    """Return only the Han characters of ``text``, in their original order.

    Simplified, traditional and Japanese forms are all kept as long as they
    fall in the CJK Unified Ideographs block or Extensions A-E; every other
    character (Latin, digits, punctuation, Hangul, kana, whitespace) is
    dropped.

    Args:
        text: The string to filter.

    Returns:
        The matching characters joined into one string ("" if there are none).
    """
    return ''.join(_HAN_CHAR_PATTERN.findall(text))

def find_target_in_chinese_text(xml_file_path, target_text):
    """Search the Han text extracted from an XML file for a target passage.

    Both sides are reduced to Han characters first: the XML through
    ``extract_chinese_chars_from_level3`` and ``target_text`` through
    ``extract_chinese_characters_only``. An exact substring search is tried
    first; if it fails, fixed-size windows of the target are searched
    individually to find partial overlaps. Progress is printed throughout.

    Args:
        xml_file_path: Path of the XML file to extract text from.
        target_text: Passage to look for; non-Han characters are ignored.

    Returns:
        On an exact match::

            {'found': True, 'position': int,
             'extracted_text': str, 'target_text': str}

        otherwise::

            {'found': False, 'partial_matches': list[dict],
             'extracted_text': str, 'target_text': str}

        where each partial match holds ``target_start`` (offset in the
        target), ``xml_pos`` (first offset in the extracted text) and
        ``matched_text``. A target without any Han character is never
        reported as found and yields no partial matches.
    """
    preview_len = 400

    # Keep only the Han characters of the level3 paragraphs.
    extracted_chinese = extract_chinese_chars_from_level3(xml_file_path)

    # The label reports the real preview size (it used to say 200 while 400
    # characters were printed).
    print(f"📝 추출된 한자 텍스트 (처음 {preview_len}자):")
    print(extracted_chinese[:preview_len])

    # Reduce the target to Han characters as well so both sides compare alike.
    target_chinese = extract_chinese_characters_only(target_text)

    print(f"\n🎯 목표 한자 텍스트 :")
    print(target_chinese)

    # An empty needle is a substring of everything; treat it as "not found"
    # rather than reporting a bogus hit at position 0.
    start_pos = extracted_chinese.find(target_chinese) if target_chinese else -1

    if start_pos != -1:
        print(f"\n✅ 한자 텍스트에서 목표를 발견했습니다!")
        print(f"📍 위치: {start_pos}번째 문자부터")
        print(f"📝 매칭된 구간: {extracted_chinese[start_pos:start_pos+100]}...")

        return {
            'found': True,
            'position': start_pos,
            'extracted_text': extracted_chinese,
            'target_text': target_chinese
        }

    print(f"\n❌ 완전 매칭 실패")

    # Partial matching: slide a window over the target and look each window
    # up in the extracted text.
    partial_matches = []

    if target_chinese:
        # Window is a third of the target, capped at 30 characters and never
        # below 1 (a zero-length window would match at every offset).
        min_length = max(1, min(30, len(target_chinese) // 3))

        for i in range(len(target_chinese) - min_length + 1):
            substring = target_chinese[i:i + min_length]
            match_pos = extracted_chinese.find(substring)
            if match_pos != -1:
                partial_matches.append({
                    'target_start': i,
                    'xml_pos': match_pos,
                    'matched_text': substring
                })

    if partial_matches:
        print(f"🔍 부분 매칭 발견: {len(partial_matches)}개")
        # Only the first three hits are shown to keep the output short.
        for i, match in enumerate(partial_matches[:3]):
            print(f"  {i+1}. 위치 {match['xml_pos']}: {match['matched_text']}")

    return {
        'found': False,
        'partial_matches': partial_matches,
        'extracted_text': extracted_chinese,
        'target_text': target_chinese
    }


In [83]:
# 찾고자 하는 텍스트
target_text = "太祖康獻至仁啓運聖文神武大王姓李氏諱旦字君晉古諱成桂號松軒全州大姓也有司空諱翰仕新羅娶太宗王十世孫軍尹金殷義之女生侍中諱自延侍中生僕射諱天祥僕射生阿幹諱光禧阿幹生司徒三重大匡諱立全司徒生諱兢休兢休生諱廉順廉順生諱承朔承朔生諱充慶充慶生諱景英景英生諱忠敏忠敏生諱華華生諱珍有珍有生諱宮進宮進生大將軍諱勇夫大將軍生內侍執奏諱降執奏娶侍中文公諱克謙之女生將軍諱陽茂將軍娶上將軍李公諱康濟之女生諱安社是爲穆祖性豪放有志四方初在全州時年二十餘勇略過人山城別監入館因官妓事與州官有隙州官與按廉議上聞發兵圖之穆祖聞之遂徙居江陵道三陟縣民願從而徙者百七十餘家嘗造船十五隻以備倭既元也窟大王兵侵諸郡穆祖保頭陀山城以避配通前日山城別監新除按廉使又將至穆祖恐禍及挈家浮海至東北面宜州"

# Look the target passage up in the text extracted from the XML file.
result = find_target_in_chinese_text(
    '/Users/user/paddleocrtest/target_to_compare/2nd_waa_000.xml',
    target_text,
)

# Summarise whether the exact match succeeded.
print("🎉 매칭 성공!" if result['found'] else "🔍 부분 매칭 확인이 필요할 수 있습니다.")


🔍 발견된 <level3> 태그: 134개
  ✅ level3[0]에서 1개 paragraph 발견
    📝 paragraph[0]: 太祖康獻至仁啓運聖文神武大王姓李氏諱旦字君晋古諱成桂號松軒全州大姓也有司空諱翰仕新羅娶太宗王十世孫軍... (총 581자)
  ✅ level3[1]에서 1개 paragraph 발견
    📝 paragraph[0]: 明年乙卯散吉聞于元帝元爲立斡東千戶所給降金牌爲南京等處五千戶所首千戶兼達魯花赤斡東在南京東南九十餘里... (총 154자)
  ✅ level3[2]에서 1개 paragraph 발견
    📝 paragraph[0]: 斡東東南三十餘里有海島曰者考羅北連於陸穆祖築石城以放牛馬... (총 28자)
  ✅ level3[3]에서 1개 paragraph 발견
    📝 paragraph[0]: 憲宗八年受散吉令旨管領李春文大純趙奧魯哥兒卓靑尙哉光奕張哥等八介百戶之任上充兼扢扎百戶句當... (총 45자)
  ✅ level3[4]에서 1개 paragraph 발견
    📝 paragraph[0]: 世祖皇帝中統二年辛酉六月尙書省給降本所行使銅印... (총 23자)
  ✅ level3[5]에서 1개 paragraph 발견
    📝 paragraph[0]: 至元元年甲子五月欽受宣命仍充斡東千戶句當至元十一年甲戌十二月薨葬于孔州城南五里後遷葬于咸興府之義興部... (총 56자)
  ✅ level3[6]에서 1개 paragraph 발견
    📝 paragraph[0]: 穆祖配孝妃李氏非一李也千牛衛長史諱公肅之女生諱行里是爲翼祖至元十二年乙亥三月襲職... (총 40자)
  ✅ level3[7]에서 1개 paragraph 발견
    📝 paragraph[0]: 十八年辛巳世祖征日本天下兵船會于合浦翼祖蒙上司文字將本所人戶簽撥軍人與雙城摠管府三撒千戶蒙古大塔失等... (총 115자)
  ✅ level3[8]에서 1개 paragraph 발견
    📝 paragraph[0]: 初穆祖時時往峴城諸女眞千戶達魯花赤皆願納交遂與之從遊諸千戶禮待甚厚必宰牛馬享宴輒留數日諸千戶有至斡東... (총 492자)
  ✅ level3

In [None]:
import opencc
# OpenCC converters applied to the OCR output in the cells below.
# "jp2t": Japanese new-form kanji (shinjitai) -> traditional Chinese characters.
jp2t = opencc.OpenCC("jp2t")
# 's2t': simplified Chinese -> traditional Chinese characters.
s2t = opencc.OpenCC('s2t')
def convert_ocr_result(jp, simple, text):
    """Run ``text`` through two converters, one after the other.

    Args:
        jp: Converter applied first; must provide ``convert(str) -> str``.
        simple: Converter applied to the output of ``jp``; same interface.
        text: The string to convert.

    Returns:
        The result of the second conversion.
    """
    return simple.convert(jp.convert(text))

In [23]:
a        = "太祖康獻至仁啓運聖文神武大王姓李氏諱旦字君晋古諱成桂號松軒全州大姓也有司空諱翰仕新羅娶太宗王十世孫軍尹金殷義之女生侍中諱自延侍中生僕射諱天祥僕射生阿干諱光禧阿干生司徒三重大匡諱立全司徒生諱兢休兢休生諱廉順廉順生諱承朔承朔生諱充慶充慶生諱景英景英生諱忠敏忠敏生諱華華生諱珍有珍有生諱宮進宮進生大將軍諱勇夫大將軍生內侍執奏諱隣執奏娶侍中文公諱克謙之女生將軍諱陽茂將軍娶上將軍李公諱康濟之女生諱安社是爲穆祖性豪放有志四方初在全州時年二十餘勇略過人山城別監入館因官妓事與州官有隙州官與按廉議上聞發兵圖之穆祖聞之遂徙居江陵道三陟縣民願從而徙者百七十餘家嘗造船十五隻以備倭旣元也窟大王兵侵諸郡穆祖保頭陀山城以避亂適前日山城別監新除按廉使又將至穆祖恐禍及挈家浮海至東北面宜州" # ext
OCR_data = "太祖康献至仁啓运聖文神武大王姓李氏諱旦字君晋古諱成桂號松軒全州大姓也有司空諱翰仕新罗娶太宗王十世孫軍尹金殷義之女生侍中諱自延侍中生僕射諱天祥僕射生阿干諱光禧阿干生司徒三重大匡諱立全司徒生諱兢休兢休生諱廉顺廉顺生諱承朔承朔生諱充慶充慶生諱景英景英生諱忠敏忠敏生諱华华生諱珍有珍有生諱宫進宫進生大将军諱勇夫大将军生内侍执奏諱降执奏娶侍中文公諱克谦之女生将军諱阳茂将軍娶上将军李公諱康濟之女生諱安社是為穆祖性豪放有志四方初在全州时年二十餘勇略過人山城别监入馆因官妓事與州官有隙州官與按廉議上聞發兵圖之穆祖聞之遂徙居江陵道三陟縣民愿從而徙者百七十餘家嘗造船十五隻以備倭既元也窟大王兵侵諸郡穆祖保頭陀山城以避配通前日山城别监新除按廉使又将至穆祖恐祸及挈家浮海至东北面宜州"
# Convert the OCR output with each ordering of the two converters:
#   jp_sp : jp2t first, then s2t
#   st    : s2t twice
#   jp    : jp2t twice
#   sp_jp : s2t first, then jp2t
jp_sp, st, jp, sp_jp = (
    convert_ocr_result(first, second, OCR_data)
    for first, second in (
        (jp2t, s2t),
        (s2t, s2t),
        (jp2t, jp2t),
        (s2t, jp2t),
    )
)
b = '太祖康獻至仁啓運聖文神武大王姓李氏諱旦字君晉古諱成桂號松軒全州大姓也有司空諱翰仕新羅娶太宗王十世孫軍尹金殷義之女生侍中諱自延侍中生僕射諱天祥僕射生阿幹諱光禧阿幹生司徒三重大匡諱立全司徒生諱兢休兢休生諱廉順廉順生諱承朔承朔生諱充慶充慶生諱景英景英生諱忠敏忠敏生諱華華生諱珍有珍有生諱宮進宮進生大將軍諱勇夫大將軍生內侍執奏諱降執奏娶侍中文公諱克謙之女生將軍諱陽茂將軍娶上將軍李公諱康濟之女生諱安社是爲穆祖性豪放有志四方初在全州時年二十餘勇略過人山城別監入館因官妓事與州官有隙州官與按廉議上聞發兵圖之穆祖聞之遂徙居江陵道三陟縣民願從而徙者百七十餘家嘗造船十五隻以備倭既元也窟大王兵侵諸郡穆祖保頭陀山城以避配通前日山城別監新除按廉使又將至穆祖恐禍及挈家浮海至東北面宜州'

# Compare each converted OCR string with the reference text `a`, position by
# position, and print every index at which at least one of them differs.
length = len(a)
strings = [jp_sp, st, sp_jp]
strings2 = ["jp_sp", "st", "sp_jp"]  # display labels, parallel to `strings`

# Positions can only be compared where both strings have a character, so a
# length difference is reported up front instead of surfacing as IndexError.
for label, candidate in zip(strings2, strings):
    if len(candidate) != length:
        print(f"length mismatch: {label} has {len(candidate)} chars, origin has {length}")

for i, origin_char in enumerate(a):
    # (label, char) for every candidate that disagrees with the reference at
    # this index; candidates too short to reach index i are skipped.
    diffs = [
        (label, candidate[i])
        for label, candidate in zip(strings2, strings)
        if i < len(candidate) and candidate[i] != origin_char
    ]
    if diffs:
        print_str = f"char {i} : origin = {origin_char}, " + "".join(
            " " + label + ": " + char for label, char in diffs
        )
        print(print_str)

char 21 : origin = 晋,  jp_sp: 晉 st: 晉 sp_jp: 晉
char 74 : origin = 干,  jp_sp: 幹 st: 幹 sp_jp: 幹
char 79 : origin = 干,  jp_sp: 幹 st: 幹 sp_jp: 幹
char 160 : origin = 隣,  jp_sp: 降 st: 降 sp_jp: 降
char 197 : origin = 爲,  st: 為
char 280 : origin = 旣,  jp_sp: 既 st: 既 sp_jp: 既
char 299 : origin = 亂,  jp_sp: 配 st: 配 sp_jp: 配
char 300 : origin = 適,  jp_sp: 通 st: 通 sp_jp: 通


In [25]:
import opencc
# "t2s": traditional Chinese -> simplified Chinese.
simple = opencc.OpenCC("t2s")

# Bring both sides to simplified form: the OCR output goes through jp2t and
# then t2s, the reference text `a` through t2s only.
jp_to_tr_to_simple = convert_ocr_result(jp2t, simple, OCR_data)
simplified = simple.convert(a)

# Print index, OCR-side character and reference-side character wherever the
# two simplified strings disagree. `length` is len(a) from the earlier cell.
for i in range(length):
    ocr_char, ref_char = jp_to_tr_to_simple[i], simplified[i]
    if ocr_char != ref_char:
        print(i, ocr_char, ref_char)

# Dump the reference and both simplified strings, one per line.
print(a, jp_to_tr_to_simple, simplified, sep="\n")

160 降 隣
299 配 乱
300 通 适
太祖康獻至仁啓運聖文神武大王姓李氏諱旦字君晋古諱成桂號松軒全州大姓也有司空諱翰仕新羅娶太宗王十世孫軍尹金殷義之女生侍中諱自延侍中生僕射諱天祥僕射生阿干諱光禧阿干生司徒三重大匡諱立全司徒生諱兢休兢休生諱廉順廉順生諱承朔承朔生諱充慶充慶生諱景英景英生諱忠敏忠敏生諱華華生諱珍有珍有生諱宮進宮進生大將軍諱勇夫大將軍生內侍執奏諱隣執奏娶侍中文公諱克謙之女生將軍諱陽茂將軍娶上將軍李公諱康濟之女生諱安社是爲穆祖性豪放有志四方初在全州時年二十餘勇略過人山城別監入館因官妓事與州官有隙州官與按廉議上聞發兵圖之穆祖聞之遂徙居江陵道三陟縣民願從而徙者百七十餘家嘗造船十五隻以備倭旣元也窟大王兵侵諸郡穆祖保頭陀山城以避亂適前日山城別監新除按廉使又將至穆祖恐禍及挈家浮海至東北面宜州
太祖康献至仁启运圣文神武大王姓李氏讳旦字君晋古讳成桂号松轩全州大姓也有司空讳翰仕新罗娶太宗王十世孙军尹金殷义之女生侍中讳自延侍中生仆射讳天祥仆射生阿干讳光禧阿干生司徒三重大匡讳立全司徒生讳兢休兢休生讳廉顺廉顺生讳承朔承朔生讳充庆充庆生讳景英景英生讳忠敏忠敏生讳华华生讳珍有珍有生讳宫进宫进生大将军讳勇夫大将军生内侍执奏讳降执奏娶侍中文公讳克谦之女生将军讳阳茂将军娶上将军李公讳康济之女生讳安社是为穆祖性豪放有志四方初在全州时年二十余勇略过人山城别监入馆因官妓事与州官有隙州官与按廉议上闻发兵图之穆祖闻之遂徙居江陵道三陟县民愿从而徙者百七十余家尝造船十五只以备倭既元也窟大王兵侵诸郡穆祖保头陀山城以避配通前日山城别监新除按廉使又将至穆祖恐祸及挈家浮海至东北面宜州
太祖康献至仁启运圣文神武大王姓李氏讳旦字君晋古讳成桂号松轩全州大姓也有司空讳翰仕新罗娶太宗王十世孙军尹金殷义之女生侍中讳自延侍中生仆射讳天祥仆射生阿干讳光禧阿干生司徒三重大匡讳立全司徒生讳兢休兢休生讳廉顺廉顺生讳承朔承朔生讳充庆充庆生讳景英景英生讳忠敏忠敏生讳华华生讳珍有珍有生讳宫进宫进生大将军讳勇夫大将军生内侍执奏讳隣执奏娶侍中文公讳克谦之女生将军讳阳茂将军娶上将军李公讳康济之女生讳安社是为穆祖性豪放有志四方初在全州时年二十余勇略过人山城别监入馆因官妓事与州官有隙州官与按廉议上闻发兵图之穆祖闻之遂徙居江陵道三陟县民愿从而徙者百七十余家尝造船十五只以备倭既元也窟大王兵侵诸郡穆祖保头陀山城以避乱适前日山城别监新除按廉使又将