In [None]:
import pandas as pd
import re
from g2pk import G2p

# G2P 객체 생성
g2p = G2p()

# Special tokens mapped to the temporary placeholder text that stands in for
# them while English words are being rewritten.
special_tokens = {
    '#Person1#': '임시토큰11',
    '#Person2#': '임시토큰12',
    '#Person3#': '임시토큰13',
    '#PhoneNumber#': '임시토큰14',
    '#Address#': '임시토큰15',
    '#PassportNumber#': '임시토큰16'
}

# Temporarily swap special tokens for their placeholders
def replace_special_tokens(text):
    """Return ``text`` with every key of ``special_tokens`` replaced by its value.

    The replacements are applied one after another, in the dictionary's
    iteration order.
    """
    masked = text
    for original in special_tokens:
        masked = masked.replace(original, special_tokens[original])
    return masked

# Put the original special tokens back in place of their placeholders
def restore_special_tokens(text):
    """Return ``text`` with every value of ``special_tokens`` replaced by its key.

    This is the inverse of ``replace_special_tokens``; replacements run in
    the dictionary's iteration order.
    """
    restored = text
    for original, placeholder in special_tokens.items():
        restored = restored.replace(placeholder, original)
    return restored

# Collect the distinct English words found in a summary or dialogue
def extract_unique_english_words(text):
    """Return the distinct ASCII-letter words of ``text`` as a list.

    ``text`` is split on whitespace and every character other than
    ``a-z``/``A-Z`` is removed from each piece. Pieces that end up empty, or
    that equal one of the ``special_tokens`` placeholder values, are
    discarded. The result comes from a set, so its order is unspecified.
    """
    stripped = (re.sub(r'[^a-zA-Z]', '', chunk) for chunk in text.split())
    unique = {w for w in stripped if w and w not in special_tokens.values()}
    return list(unique)

# Convert an English name to its Korean pronunciation
def english_to_korean_pronunciation(name):
    """Return whatever the module-level ``g2p`` converter produces for ``name``."""
    hangul = g2p(name)
    return hangul

# Replace English names in the summary and dialogue with Korean pronunciations
def replace_english_names(row):
    """Rewrite the English words of one row with their ``g2p`` output.

    The special tokens in ``row['dialogue']`` and ``row['summary']`` are
    first masked with ``replace_special_tokens``. English words are then
    collected from both texts; words found only in the summary are replaced
    in the summary, and words found in both texts are replaced in both.
    Words found only in the dialogue are left alone. The masked placeholders
    are NOT restored here.

    Args:
        row: mapping-like object with ``'dialogue'`` and ``'summary'`` string
            entries (used with ``DataFrame.apply(..., axis=1)``).

    Returns:
        The same ``row`` object with both entries overwritten.
    """
    # Mask special tokens so they are never picked up as English words.
    dialogue = replace_special_tokens(row['dialogue'])
    summary = replace_special_tokens(row['summary'])

    summary_words = set(extract_unique_english_words(summary))
    dialogue_words = set(extract_unique_english_words(dialogue))

    def swap_word(text, e_word, k_word):
        # Replace only whole runs of letters. A plain str.replace would also
        # rewrite the word inside longer ones ("Ann" inside "Anna"). Hangul
        # or punctuation glued to the word still counts as a boundary.
        pattern = r'(?<![A-Za-z])' + re.escape(e_word) + r'(?![A-Za-z])'
        # A callable replacement keeps backslashes in k_word literal.
        return re.sub(pattern, lambda _match: k_word, text)

    # Words that only the summary contains.
    for e_word in summary_words - dialogue_words:
        k_word = english_to_korean_pronunciation(e_word)
        summary = swap_word(summary, e_word, k_word)
        print(f'Replacing {e_word} with {k_word} in summary only.')

    # Words shared by the summary and the dialogue.
    for e_word in summary_words & dialogue_words:
        k_word = english_to_korean_pronunciation(e_word)
        summary = swap_word(summary, e_word, k_word)
        dialogue = swap_word(dialogue, e_word, k_word)
        print(f'Replacing {e_word} with {k_word} in both summary and dialogue.')

    # Hand the updated texts back through the row.
    row['summary'] = summary
    row['dialogue'] = dialogue

    return row

# Example of building the DataFrame (kept commented out)
# new_ordered_df = pd.DataFrame({
#     'dialogue': ordered_df[ordered_df['fname'] == 'train_3104']['dialogue'],
#     'summary': ordered_df[ordered_df['fname'] == 'train_3104']['summary']
# })

# Replace English names in every row's summary and dialogue.
# NOTE(review): new_ordered_df is not created in this cell; it must already
# exist in the notebook session when this runs.
new_ordered_df = new_ordered_df.apply(replace_english_names, axis=1)

# Restore the special tokens of one row
def restore_tokens(row):
    """Run ``restore_special_tokens`` over the row's dialogue and summary.

    The ``'dialogue'`` entry is processed first, then ``'summary'``; both are
    written back into ``row``, which is returned.
    """
    for column in ('dialogue', 'summary'):
        row[column] = restore_special_tokens(row[column])
    return row

# Apply the restoration to every row
new_ordered_df = new_ordered_df.apply(restore_tokens, axis=1)

# # Inspect the result
# print(new_ordered_df['summary'])
# print(new_ordered_df['dialogue'])

In [None]:
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from g2pk import G2p
import pandas as pd
from googletrans import Translator

# Create the G2P converter and the Google Translate client
g2p = G2p()
translator =Translator()

# Load the English NER model and wrap it in a token-classification pipeline
tokenizer_en = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model_en = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_en = pipeline("ner", model=model_en, tokenizer=tokenizer_en)



# Special tokens mapped to the '#...#' Hangul placeholders that replace them
# while the text is being processed.
special_tokens = {
    '#Address#': '#임시주소153#',
    '#CarNumber#': '#임시차량번호154#',
    '#CardNumber#': '#임시카드번호155#',
    '#DateOfBirth#': '#임시생년월일156#',
    '#Email#': '#임시이메일157#',
    '#PassportNumber#': '#임시여권번호158#',
    '#Person#': '#임시인물159#',
    '#Person1#': '#임시인물160#',
    '#Person2#': '#임시인물161#',
    '#Person3#': '#임시인물162#',
    '#Person4#': '#임시인물163#',
    '#Person5#': '#임시인물164#',
    '#Person6#': '#임시인물165#',
    '#Person7#': '#임시인물166#',
    '#PhoneNumber#': '#임시전화번호167#',
    '#SSN#': '#임시주민번호168#'
}
# Temporarily swap special tokens for their placeholders
def replace_special_tokens(text):
    """Return ``text`` with every key of ``special_tokens`` replaced by its value.

    The replacements are applied one after another, in the dictionary's
    iteration order.
    """
    masked = text
    for original in special_tokens:
        masked = masked.replace(original, special_tokens[original])
    return masked

# Put the original special tokens back in place of their placeholders
def restore_special_tokens(text):
    """Return ``text`` with every value of ``special_tokens`` replaced by its key.

    This is the inverse of ``replace_special_tokens``; replacements run in
    the dictionary's iteration order.
    """
    restored = text
    for original, placeholder in special_tokens.items():
        restored = restored.replace(placeholder, original)
    return restored

def detect_language(word):
    """Return True when ``word`` is non-empty and made only of Hangul syllables.

    Every character must lie in the precomposed syllable range '가'..'힣'.
    An empty (or otherwise falsy) ``word`` yields False: ``all()`` over an
    empty string is True, which would otherwise let an empty translation be
    accepted as a valid Korean rendering.
    """
    return bool(word) and all('가' <= char <= '힣' for char in word)

# Split one precomposed Hangul syllable into its jamo via code-point arithmetic
def decompose_hangul(syllable):
    """Return ``(initial, vowel, final)`` jamo characters for ``syllable``.

    The offset of the syllable from U+AC00 is split into an initial index
    (units of 588), a vowel index (units of 28) and a final index. The
    indices are added to U+1100, U+1161 and U+11A7 respectively. ``final``
    is ``None`` when the final index is 0.
    """
    offset = ord(syllable) - 0xAC00
    initial_index, within_initial = divmod(offset, 588)
    vowel_index, final_index = divmod(within_initial, 28)

    initial = chr(0x1100 + initial_index)
    vowel = chr(0x1161 + vowel_index)
    final = chr(0x11A7 + final_index) if final_index != 0 else None  # no final -> None
    return initial, vowel, final

def split_syllables(text):
    """Return one tuple per character of ``text``.

    Characters in the range '가'..'힣' are expanded with
    ``decompose_hangul``; any other character becomes the 1-tuple
    ``(char,)``.
    """
    return [
        decompose_hangul(char) if '가' <= char <= '힣' else (char,)
        for char in text
    ]

# Initial-consonant jamo (U+1100..U+1112) paired with the final-consonant jamo
# (U+11A8..U+11C2) of the same letter. The two ranges are not laid out at a
# constant distance from each other, so an explicit table is required.
# Tensed initials with no final form (U+1104, U+1108, U+110D) are omitted.
_INITIAL_TO_FINAL = {
    '\u1100': '\u11A8',  # giyeok
    '\u1101': '\u11A9',  # ssang-giyeok
    '\u1102': '\u11AB',  # nieun
    '\u1103': '\u11AE',  # digeut
    '\u1105': '\u11AF',  # rieul
    '\u1106': '\u11B7',  # mieum
    '\u1107': '\u11B8',  # bieup
    '\u1109': '\u11BA',  # siot
    '\u110A': '\u11BB',  # ssang-siot
    '\u110B': '\u11BC',  # ieung
    '\u110C': '\u11BD',  # jieut
    '\u110E': '\u11BE',  # chieut
    '\u110F': '\u11BF',  # kieuk
    '\u1110': '\u11C0',  # tieut
    '\u1111': '\u11C1',  # pieup
    '\u1112': '\u11C2',  # hieut
}
_FINAL_TO_INITIAL = {final: initial for initial, final in _INITIAL_TO_FINAL.items()}


def normalize_consonant(consonant):
    """Convert a consonant jamo between its initial and final forms.

    An initial-consonant jamo is mapped to the final-consonant jamo of the
    same letter, and vice versa. The previous fixed offset of 169 produced
    the wrong jamo for almost every letter and left the Hangul block
    entirely in the final-to-initial direction.

    Args:
        consonant: a single jamo character, or ``None``.

    Returns:
        ``None`` for ``None``; the counterpart jamo when one exists;
        otherwise ``consonant`` unchanged (cluster finals, tensed initials
        without a final form, and non-jamo characters).
    """
    if consonant is None:
        return None
    if consonant in _INITIAL_TO_FINAL:
        return _INITIAL_TO_FINAL[consonant]
    return _FINAL_TO_INITIAL.get(consonant, consonant)

# Syllable-by-syllable similarity, with initial/final consonant conversion
def calculate_similarity(word1, word2):
    """Score how closely two words match, position by position.

    Both words are split with ``split_syllables`` and compared index by
    index over the shorter length. Per position:

    * equal initial consonants add 4/11;
    * equal vowels add 1/3;
    * equal finals add 1/3 (two missing finals count as equal);
    * when exactly one side has no final, the other side's final is
      compared with the next syllable's initial on the side lacking it
      (converted by ``normalize_consonant``); a hit adds 1/3.

    Positions holding a non-Hangul character (a 1-tuple) have no jamo to
    compare; they earn the full per-position credit only when both
    characters are identical. Previously such input raised ``IndexError``.

    The total is divided by the longer word's length, so a perfect match
    scores slightly above 1. A trace is printed for every comparison.

    Returns:
        The similarity as a float; ``0.0`` when both words are empty
        (previously a ``ZeroDivisionError``).
    """
    word1_split = split_syllables(word1)
    word2_split = split_syllables(word2)

    match_count = 0
    max_count = max(len(word1_split), len(word2_split))
    if max_count == 0:
        # Nothing to compare, and the final division would fail.
        return 0.0

    for i, (syllable1, syllable2) in enumerate(zip(word1_split, word2_split)):
        print(f"Comparing syllable '{syllable1}' with '{syllable2}'")

        if len(syllable1) != 3 or len(syllable2) != 3:
            # At least one side is a non-Hangul character: compare it whole.
            if syllable1 == syllable2:
                match_count += 4/11 + 1/3 + 1/3
                print(f"Partial match for '{syllable1[0]}' and '{syllable2[0]}'. Match count: {match_count}")
            else:
                print(f"No match for '{syllable1[0]}' and '{syllable2[0]}'")
            continue

        # Initial consonant: weighted 4/11, slightly more than the other parts.
        if syllable1[0] == syllable2[0]:
            match_count += 4/11
            print(f"Partial match for '{syllable1[0]}' and '{syllable2[0]}'. Match count: {match_count}")
        else:
            print(f"No match for '{syllable1[0]}' and '{syllable2[0]}'")

        # Vowel: weighted 1/3.
        if syllable1[1] == syllable2[1]:
            match_count += 1/3
            print(f"Partial match for '{syllable1[1]}' and '{syllable2[1]}'. Match count: {match_count}")
        else:
            print(f"No match for '{syllable1[1]}' and '{syllable2[1]}'")

        # Final consonant: weighted 1/3.
        if syllable1[2] == syllable2[2]:
            match_count += 1/3
            print(f"Match for 종성 '{syllable1[2]}' and '{syllable2[2]}'. Match count: {match_count}")
        elif syllable1[2] is None and syllable2[2] is not None:
            # word1 has no final here: its next initial may carry the sound.
            if i + 1 < len(word1_split):
                next_choseong = normalize_consonant(word1_split[i + 1][0])
                if next_choseong == syllable2[2]:
                    match_count += 1/3
                    print(f"Match for 종성 '{syllable2[2]}' with next 초성 '{next_choseong}'. Match count: {match_count}")
                else:
                    print(f"No match for 종성 '{syllable1[2]}' and '{syllable2[2]}'")
        elif syllable2[2] is None and syllable1[2] is not None:
            # Mirror case: word2 has no final here.
            if i + 1 < len(word2_split):
                next_choseong = normalize_consonant(word2_split[i + 1][0])
                if next_choseong == syllable1[2]:
                    match_count += 1/3
                    print(f"Match for 종성 '{syllable1[2]}' with next 초성 '{next_choseong}'. Match count: {match_count}")
                else:
                    print(f"No match for 종성 '{syllable1[2]}' and '{syllable2[2]}'")
        else:
            print(f"No match for 종성 '{syllable1[2]}' and '{syllable2[2]}'")

    # Normalise by the longer word so missing syllables lower the score.
    similarity = match_count / max_count
    print(f"Final similarity: {similarity:.2f}\n")

    return similarity

# Convert an English name to Hangul; names present on both sides are converted directly
def convert_word(word, entity_type, context_words, is_common):
    """Choose a Hangul rendering for ``word`` when it is tagged as a person.

    Two candidates are produced: the ``g2p`` output and the Google
    translation. A candidate is usable when ``detect_language`` accepts it.

    * Entity types other than ``'B-PER'``/``'I-PER'``: ``word`` is returned.
    * ``is_common`` true: the translation if usable, else the g2p output if
      usable, else the translation anyway.
    * ``context_words`` non-empty: each usable candidate is matched against
      the context word with the highest ``calculate_similarity``. If the
      better score exceeds 0.6 that context word is returned; otherwise the
      translation (or the g2p output when it is the only usable candidate).
      With no usable candidate, ``word`` is returned.
    * Otherwise: the first usable candidate (translation first), else
      ``word``.
    """
    if entity_type not in ['B-PER', 'I-PER']:
        return word  # other entity types are left untouched

    korean_pronunciation = g2p(word)
    google_translation = translator.translate(word, src='en', dest='ko').text
    print(f"Korean Pronunciation of '{word}': {korean_pronunciation}")
    print(is_common)
    print(f"Korean Pronunciation of '{word}': {google_translation} from google")

    google_usable = detect_language(google_translation)

    # Same name on both sides: convert directly, preferring the translation.
    if is_common:
        if google_usable or not detect_language(korean_pronunciation):
            return google_translation
        return korean_pronunciation

    # No Korean context to match against.
    if not context_words:
        if google_usable:
            return google_translation
        if detect_language(korean_pronunciation):
            return korean_pronunciation
        return word

    g2p_usable = detect_language(korean_pronunciation)
    if not (google_usable or g2p_usable):
        return word

    def closest_context_word(candidate):
        # Context word with the highest similarity to the candidate.
        return max(context_words, key=lambda w: calculate_similarity(candidate, w), default=candidate)

    if google_usable and g2p_usable:
        g2p_best = closest_context_word(korean_pronunciation)
        google_best = closest_context_word(google_translation)
        g2p_score = calculate_similarity(korean_pronunciation, g2p_best)
        google_score = calculate_similarity(google_translation, google_best)
        print(f"Best match for '{korean_pronunciation}' in context: '{g2p_best}' with similarity {g2p_score:.2f}")
        print(f"Best match for '{google_translation}' in context: '{google_best}' with similarity {google_score:.2f}")

        # Ties go to the translation-based match.
        if g2p_score <= google_score:
            winner, winner_score = google_best, google_score
        else:
            winner, winner_score = g2p_best, g2p_score
        return winner if winner_score > 0.6 else google_translation

    if google_usable:
        google_best = closest_context_word(google_translation)
        google_score = calculate_similarity(google_translation, google_best)
        print(f"Best match for '{google_translation}' in context: '{google_best}' with similarity {google_score:.2f}")
        return google_best if google_score > 0.6 else google_translation

    g2p_best = closest_context_word(korean_pronunciation)
    g2p_score = calculate_similarity(korean_pronunciation, g2p_best)
    print(f"Best match for '{korean_pronunciation}' in context: '{g2p_best}' with similarity {g2p_score:.2f}")
    return g2p_best if g2p_score > 0.6 else korean_pronunciation

# Main per-row routine: recognise entities, then convert them
def process_text(row):
    """Convert the English entities of one row and store the rewritten texts.

    English words are extracted from ``row['dialogue']`` and
    ``row['summary']``, classified with ``classify_entities`` (using the
    module-level ``ner_en`` pipeline), and each classified word is passed
    to ``convert_word`` together with the Hangul words found in both texts.
    Converted words are substituted into both texts.

    Substitution only touches whole runs of letters. The previous plain
    ``str.replace`` also rewrote a word inside longer ones (for example
    "Tom" inside "Tomorrow").

    Args:
        row: mapping-like object with ``'dialogue'`` and ``'summary'``
            string entries.

    Returns:
        ``row`` with ``'new_dialogue'`` and ``'new_summary'`` added; the
        original two entries are not modified.
    """
    dialogue = row['dialogue']
    summary = row['summary']

    english_words_dialogue = extract_english_words(dialogue)
    print(english_words_dialogue)
    english_words_summary = extract_english_words(summary)
    print(english_words_summary)
    english_words = list(set(english_words_dialogue + english_words_summary))
    entities = classify_entities(english_words, ner_en)  # English NER

    # Hangul words from both texts serve as matching candidates.
    context_words = re.findall(r'\b[가-힣]+\b', dialogue + " " + summary)
    print(context_words)

    # Sets make the per-entity membership checks constant time.
    dialogue_vocab = set(english_words_dialogue)
    summary_vocab = set(english_words_summary)

    for word, entity_type in entities.items():
        is_common = word in dialogue_vocab and word in summary_vocab
        converted_word = convert_word(word, entity_type, context_words, is_common)
        if converted_word == word:
            continue  # nothing to substitute
        # Lookarounds keep the match from starting or ending inside a longer
        # ASCII word; attached Hangul or punctuation is still a boundary.
        pattern = re.compile(r'(?<![A-Za-z])' + re.escape(word) + r'(?![A-Za-z])')
        # A callable replacement keeps backslashes in the conversion literal.
        dialogue = pattern.sub(lambda _match, repl=converted_word: repl, dialogue)
        summary = pattern.sub(lambda _match, repl=converted_word: repl, summary)

    row['new_dialogue'] = dialogue
    row['new_summary'] = summary
    return row

# English word extraction
def extract_english_words(text):
    """Return every run of ASCII letters in ``text`` that forms a word.

    A run counts when it is not directly attached to another word character,
    with one exception: Hangul syllables may touch it. ``\\b`` is
    Unicode-aware and sees no boundary between a Latin letter and Hangul,
    so the previous ``\\b[A-Za-z]+\\b`` pattern missed names with a Korean
    particle attached (e.g. "Peter는"). Runs attached to digits, underscores
    or other non-Hangul word characters are still skipped, as before.

    Returns:
        The matches in order of appearance, duplicates included.
    """
    # [^\W가-힣] means "a word character that is not a Hangul syllable".
    return re.findall(r'(?<![^\W가-힣])[A-Za-z]+(?![^\W가-힣])', text)


# Entity classification
def classify_entities(words, ner_pipeline):
    """Map each word to the entity label of its first NER result.

    ``ner_pipeline`` is called once per word, in order. Words for which it
    returns an empty result are left out. The resulting mapping is printed
    and returned.
    """
    tagged = ((word, ner_pipeline(word)) for word in words)
    entities = {word: results[0]['entity'] for word, results in tagged if results}
    print(f"Extracted Entities: {entities}")  # show what the NER model produced
    return entities




# Mask the special tokens before any English-word processing.
# NOTE(review): df_filtered is not created in this cell; it must already
# exist in the notebook session when this runs.
df_filtered['dialogue'] = df_filtered['dialogue'].apply(replace_special_tokens)
df_filtered['summary'] = df_filtered['summary'].apply(replace_special_tokens)

# Adds the 'new_dialogue' and 'new_summary' columns.
df_filtered = df_filtered.apply(process_text, axis=1)

# Restore the special tokens in the original columns
df_filtered['dialogue'] = df_filtered['dialogue'].apply(restore_special_tokens)
df_filtered['summary'] = df_filtered['summary'].apply(restore_special_tokens)

# ... and in the converted columns
df_filtered['new_dialogue'] = df_filtered['new_dialogue'].apply(restore_special_tokens)
df_filtered['new_summary'] = df_filtered['new_summary'].apply(restore_special_tokens)



In [None]:
import pandas as pd
from googletrans import Translator

# Create the translator client
translator = Translator()

# Example DataFrame (replace with the real df)


# 1. Keep only the rows whose english_words_count is greater than 0
df_filtered = new_ordered_df[new_ordered_df['english_words_count'] > 0].copy()

# 2. Translate dialogue and summary to English, then back to Korean
#    (two translate calls per cell value)
df_filtered['dialogue_translated'] = df_filtered['dialogue'].apply(lambda x: translator.translate(translator.translate(x, src='ko', dest='en').text, src='en', dest='ko').text)
df_filtered['summary_translated'] = df_filtered['summary'].apply(lambda x: translator.translate(translator.translate(x, src='ko', dest='en').text, src='en', dest='ko').text)

# 3. Write the re-translated text back into the original df for the same rows
new_ordered_df.loc[new_ordered_df['english_words_count'] > 0, 'dialogue'] = df_filtered['dialogue_translated']
new_ordered_df.loc[new_ordered_df['english_words_count'] > 0, 'summary'] = df_filtered['summary_translated']

# Inspect the result
print(new_ordered_df[['dialogue', 'summary']])

In [5]:
# Independent copy of the rows whose english_words_count is greater than 0
df_filtered = new_ordered_df[new_ordered_df['english_words_count'] > 0].copy()

In [6]:
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from g2pk import G2p
import pandas as pd
from googletrans import Translator

# Create the G2P converter and the Google Translate client
g2p = G2p()
translator =Translator()

# Load the English NER model and wrap it in a token-classification pipeline
tokenizer_en = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model_en = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_en = pipeline("ner", model=model_en, tokenizer=tokenizer_en)



# Special tokens mapped to the '#...#' Hangul placeholders that replace them
# while the text is being processed.
special_tokens = {
    '#Address#': '#임시주소153#',
    '#CarNumber#': '#임시차량번호154#',
    '#CardNumber#': '#임시카드번호155#',
    '#DateOfBirth#': '#임시생년월일156#',
    '#Email#': '#임시이메일157#',
    '#PassportNumber#': '#임시여권번호158#',
    '#Person#': '#임시인물159#',
    '#Person1#': '#임시인물160#',
    '#Person2#': '#임시인물161#',
    '#Person3#': '#임시인물162#',
    '#Person4#': '#임시인물163#',
    '#Person5#': '#임시인물164#',
    '#Person6#': '#임시인물165#',
    '#Person7#': '#임시인물166#',
    '#PhoneNumber#': '#임시전화번호167#',
    '#SSN#': '#임시주민번호168#'
}
# Temporarily swap special tokens for their placeholders
def replace_special_tokens(text):
    """Return ``text`` with every key of ``special_tokens`` replaced by its value.

    The replacements are applied one after another, in the dictionary's
    iteration order.
    """
    masked = text
    for original in special_tokens:
        masked = masked.replace(original, special_tokens[original])
    return masked

# Put the original special tokens back in place of their placeholders
def restore_special_tokens(text):
    """Return ``text`` with every value of ``special_tokens`` replaced by its key.

    This is the inverse of ``replace_special_tokens``; replacements run in
    the dictionary's iteration order.
    """
    restored = text
    for original, placeholder in special_tokens.items():
        restored = restored.replace(placeholder, original)
    return restored

def detect_language(word):
    """Return True when ``word`` is non-empty and made only of Hangul syllables.

    Every character must lie in the precomposed syllable range '가'..'힣'.
    An empty (or otherwise falsy) ``word`` yields False: ``all()`` over an
    empty string is True, which would otherwise let an empty translation be
    accepted as a valid Korean rendering.
    """
    return bool(word) and all('가' <= char <= '힣' for char in word)

# Split one precomposed Hangul syllable into its jamo via code-point arithmetic
def decompose_hangul(syllable):
    """Return ``(initial, vowel, final)`` jamo characters for ``syllable``.

    The offset of the syllable from U+AC00 is split into an initial index
    (units of 588), a vowel index (units of 28) and a final index. The
    indices are added to U+1100, U+1161 and U+11A7 respectively. ``final``
    is ``None`` when the final index is 0.
    """
    offset = ord(syllable) - 0xAC00
    initial_index, within_initial = divmod(offset, 588)
    vowel_index, final_index = divmod(within_initial, 28)

    initial = chr(0x1100 + initial_index)
    vowel = chr(0x1161 + vowel_index)
    final = chr(0x11A7 + final_index) if final_index != 0 else None  # no final -> None
    return initial, vowel, final

def split_syllables(text):
    """Return one tuple per character of ``text``.

    Characters in the range '가'..'힣' are expanded with
    ``decompose_hangul``; any other character becomes the 1-tuple
    ``(char,)``.
    """
    return [
        decompose_hangul(char) if '가' <= char <= '힣' else (char,)
        for char in text
    ]

# Initial-consonant jamo (U+1100..U+1112) paired with the final-consonant jamo
# (U+11A8..U+11C2) of the same letter. The two ranges are not laid out at a
# constant distance from each other, so an explicit table is required.
# Tensed initials with no final form (U+1104, U+1108, U+110D) are omitted.
_INITIAL_TO_FINAL = {
    '\u1100': '\u11A8',  # giyeok
    '\u1101': '\u11A9',  # ssang-giyeok
    '\u1102': '\u11AB',  # nieun
    '\u1103': '\u11AE',  # digeut
    '\u1105': '\u11AF',  # rieul
    '\u1106': '\u11B7',  # mieum
    '\u1107': '\u11B8',  # bieup
    '\u1109': '\u11BA',  # siot
    '\u110A': '\u11BB',  # ssang-siot
    '\u110B': '\u11BC',  # ieung
    '\u110C': '\u11BD',  # jieut
    '\u110E': '\u11BE',  # chieut
    '\u110F': '\u11BF',  # kieuk
    '\u1110': '\u11C0',  # tieut
    '\u1111': '\u11C1',  # pieup
    '\u1112': '\u11C2',  # hieut
}
_FINAL_TO_INITIAL = {final: initial for initial, final in _INITIAL_TO_FINAL.items()}


def normalize_consonant(consonant):
    """Convert a consonant jamo between its initial and final forms.

    An initial-consonant jamo is mapped to the final-consonant jamo of the
    same letter, and vice versa. The previous fixed offset of 169 produced
    the wrong jamo for almost every letter and left the Hangul block
    entirely in the final-to-initial direction.

    Args:
        consonant: a single jamo character, or ``None``.

    Returns:
        ``None`` for ``None``; the counterpart jamo when one exists;
        otherwise ``consonant`` unchanged (cluster finals, tensed initials
        without a final form, and non-jamo characters).
    """
    if consonant is None:
        return None
    if consonant in _INITIAL_TO_FINAL:
        return _INITIAL_TO_FINAL[consonant]
    return _FINAL_TO_INITIAL.get(consonant, consonant)

# Syllable-by-syllable similarity, with initial/final consonant conversion
def calculate_similarity(word1, word2):
    """Score how closely two words match, position by position.

    Both words are split with ``split_syllables`` and compared index by
    index over the shorter length. Per position:

    * equal initial consonants add 4/11;
    * equal vowels add 1/3;
    * equal finals add 1/3 (two missing finals count as equal);
    * when exactly one side has no final, the other side's final is
      compared with the next syllable's initial on the side lacking it
      (converted by ``normalize_consonant``); a hit adds 1/3.

    Positions holding a non-Hangul character (a 1-tuple) have no jamo to
    compare; they earn the full per-position credit only when both
    characters are identical. Previously such input raised ``IndexError``.

    The total is divided by the longer word's length, so a perfect match
    scores slightly above 1.

    Returns:
        The similarity as a float; ``0.0`` when both words are empty
        (previously a ``ZeroDivisionError``).
    """
    word1_split = split_syllables(word1)
    word2_split = split_syllables(word2)

    match_count = 0
    max_count = max(len(word1_split), len(word2_split))
    if max_count == 0:
        # Nothing to compare, and the final division would fail.
        return 0.0

    for i, (syllable1, syllable2) in enumerate(zip(word1_split, word2_split)):
        if len(syllable1) != 3 or len(syllable2) != 3:
            # At least one side is a non-Hangul character: compare it whole.
            if syllable1 == syllable2:
                match_count += 4/11 + 1/3 + 1/3
            continue

        # Initial consonant: weighted 4/11, slightly more than the other parts.
        if syllable1[0] == syllable2[0]:
            match_count += 4/11

        # Vowel: weighted 1/3.
        if syllable1[1] == syllable2[1]:
            match_count += 1/3

        # Final consonant: weighted 1/3.
        final1, final2 = syllable1[2], syllable2[2]
        if final1 == final2:
            match_count += 1/3
        elif final1 is None:
            # word1 has no final here: its next initial may carry the sound.
            if i + 1 < len(word1_split):
                next_choseong = normalize_consonant(word1_split[i + 1][0])
                if next_choseong == final2:
                    match_count += 1/3
        elif final2 is None:
            # Mirror case: word2 has no final here.
            if i + 1 < len(word2_split):
                next_choseong = normalize_consonant(word2_split[i + 1][0])
                if next_choseong == final1:
                    match_count += 1/3

    # Normalise by the longer word so missing syllables lower the score.
    similarity = match_count / max_count

    return similarity

# Convert an English name to Hangul; names present on both sides are converted directly
def convert_word(word, entity_type, context_words, is_common):
    """Choose a Hangul rendering for ``word`` when it is tagged as a person.

    Two candidates are produced: the ``g2p`` output and the Google
    translation. A candidate is usable when ``detect_language`` accepts it.

    * Entity types other than ``'B-PER'``/``'I-PER'``: ``word`` is returned.
    * ``is_common`` true: the translation if usable, else the g2p output if
      usable, else the translation anyway.
    * ``context_words`` non-empty: each usable candidate is matched against
      the context word with the highest ``calculate_similarity``. If the
      better score exceeds 0.69 that context word is returned; otherwise the
      translation (or the g2p output when it is the only usable candidate).
      With no usable candidate, ``word`` is returned.
    * Otherwise: the first usable candidate (translation first), else
      ``word``.
    """
    if entity_type not in ['B-PER', 'I-PER']:
        return word  # other entity types are left untouched

    korean_pronunciation = g2p(word)
    google_translation = translator.translate(word, src='en', dest='ko').text

    google_usable = detect_language(google_translation)

    # Same name on both sides: convert directly, preferring the translation.
    if is_common:
        if google_usable or not detect_language(korean_pronunciation):
            return google_translation
        return korean_pronunciation

    # No Korean context to match against.
    if not context_words:
        if google_usable:
            return google_translation
        if detect_language(korean_pronunciation):
            return korean_pronunciation
        return word

    g2p_usable = detect_language(korean_pronunciation)
    if not (google_usable or g2p_usable):
        return word

    def closest_context_word(candidate):
        # Context word with the highest similarity to the candidate.
        return max(context_words, key=lambda w: calculate_similarity(candidate, w), default=candidate)

    if google_usable and g2p_usable:
        g2p_best = closest_context_word(korean_pronunciation)
        google_best = closest_context_word(google_translation)
        g2p_score = calculate_similarity(korean_pronunciation, g2p_best)
        google_score = calculate_similarity(google_translation, google_best)
        print(f"Best match for '{korean_pronunciation}' in context: '{g2p_best}' with similarity {g2p_score:.2f}")
        print(f"Best match for '{google_translation}' in context: '{google_best}' with similarity {google_score:.2f}")

        # Ties go to the translation-based match.
        if g2p_score <= google_score:
            winner, winner_score = google_best, google_score
        else:
            winner, winner_score = g2p_best, g2p_score
        return winner if winner_score > 0.69 else google_translation

    if google_usable:
        google_best = closest_context_word(google_translation)
        google_score = calculate_similarity(google_translation, google_best)
        print(f"Best match for '{google_translation}' in context: '{google_best}' with similarity {google_score:.2f}")
        return google_best if google_score > 0.69 else google_translation

    g2p_best = closest_context_word(korean_pronunciation)
    g2p_score = calculate_similarity(korean_pronunciation, g2p_best)
    print(f"Best match for '{korean_pronunciation}' in context: '{g2p_best}' with similarity {g2p_score:.2f}")
    return g2p_best if g2p_score > 0.69 else korean_pronunciation

# Main per-row routine: recognise entities, then convert them
def process_text(row):
    """Convert the English entities of one row and store the rewritten texts.

    English words are extracted from ``row['dialogue']`` and
    ``row['summary']``, classified with ``classify_entities`` (using the
    module-level ``ner_en`` pipeline), and each classified word is passed
    to ``convert_word`` together with the Hangul words found in both texts.
    Converted words are substituted into both texts.

    Substitution only touches whole runs of letters. The previous plain
    ``str.replace`` also rewrote a word inside longer ones (for example
    "Tom" inside "Tomorrow").

    Args:
        row: mapping-like object with ``'dialogue'`` and ``'summary'``
            string entries.

    Returns:
        ``row`` with ``'new_dialogue'`` and ``'new_summary'`` added; the
        original two entries are not modified.
    """
    dialogue = row['dialogue']
    summary = row['summary']

    english_words_dialogue = extract_english_words(dialogue)
    english_words_summary = extract_english_words(summary)
    english_words = list(set(english_words_dialogue + english_words_summary))
    entities = classify_entities(english_words, ner_en)  # English NER

    # Hangul words from both texts serve as matching candidates.
    context_words = re.findall(r'\b[가-힣]+\b', dialogue + " " + summary)

    # Sets make the per-entity membership checks constant time.
    dialogue_vocab = set(english_words_dialogue)
    summary_vocab = set(english_words_summary)

    for word, entity_type in entities.items():
        is_common = word in dialogue_vocab and word in summary_vocab
        converted_word = convert_word(word, entity_type, context_words, is_common)
        if converted_word == word:
            continue  # nothing to substitute
        # Lookarounds keep the match from starting or ending inside a longer
        # ASCII word; attached Hangul or punctuation is still a boundary.
        pattern = re.compile(r'(?<![A-Za-z])' + re.escape(word) + r'(?![A-Za-z])')
        # A callable replacement keeps backslashes in the conversion literal.
        dialogue = pattern.sub(lambda _match, repl=converted_word: repl, dialogue)
        summary = pattern.sub(lambda _match, repl=converted_word: repl, summary)

    row['new_dialogue'] = dialogue
    row['new_summary'] = summary
    return row

# English word extraction
def extract_english_words(text):
    """Return every run of ASCII letters in ``text`` that forms a word.

    A run counts when it is not directly attached to another word character,
    with one exception: Hangul syllables may touch it. ``\\b`` is
    Unicode-aware and sees no boundary between a Latin letter and Hangul,
    so the previous ``\\b[A-Za-z]+\\b`` pattern missed names with a Korean
    particle attached (e.g. "Peter는"). Runs attached to digits, underscores
    or other non-Hangul word characters are still skipped, as before.

    Returns:
        The matches in order of appearance, duplicates included.
    """
    # [^\W가-힣] means "a word character that is not a Hangul syllable".
    return re.findall(r'(?<![^\W가-힣])[A-Za-z]+(?![^\W가-힣])', text)


# Entity classification
def classify_entities(words, ner_pipeline):
    """Map each word to the entity label of its first NER result.

    ``ner_pipeline`` is called once per word, in order. Words for which it
    returns an empty result are left out. The resulting mapping is printed
    and returned.
    """
    tagged = ((word, ner_pipeline(word)) for word in words)
    entities = {word: results[0]['entity'] for word, results in tagged if results}
    print(f"Extracted Entities: {entities}")  # show what the NER model produced
    return entities




# Mask the special tokens before any English-word processing.
# df_filtered is the filtered copy built from new_ordered_df in the
# preceding cell.
df_filtered['dialogue'] = df_filtered['dialogue'].apply(replace_special_tokens)
df_filtered['summary'] = df_filtered['summary'].apply(replace_special_tokens)

# Adds the 'new_dialogue' and 'new_summary' columns.
df_filtered = df_filtered.apply(process_text, axis=1)

# Restore the special tokens in the original columns
df_filtered['dialogue'] = df_filtered['dialogue'].apply(restore_special_tokens)
df_filtered['summary'] = df_filtered['summary'].apply(restore_special_tokens)

# ... and in the converted columns
df_filtered['new_dialogue'] = df_filtered['new_dialogue'].apply(restore_special_tokens)
df_filtered['new_summary'] = df_filtered['new_summary'].apply(restore_special_tokens)



Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracted Entities: {}
Extracted Entities: {'RAM': 'I-ORG'}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {'J': 'I-ORG'}
Extracted Entities: {}
Extracted Entities: {'PAD': 'I-ORG'}
Extracted Entities: {'ACRE': 'I-ORG', 'GRADE': 'I-ORG'}
Extracted Entities: {'ARM': 'I-ORG', 'IT': 'I-LOC'}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {'VISA': 'I-ORG'}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {'NBA': 'I-ORG'}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {'KTCTWY': 'I-ORG'}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
Extracted Entities: {}
E

Best match for '러너' in context: '먼저' with similarity 0.67
Best match for '탐' in context: '톰' with similarity 0.70

In [2]:
import pandas as pd

# Load the working DataFrame from a CSV file at a fixed absolute path.
new_ordered_df = pd.read_csv('/dj/new_filtered.csv')


In [12]:
# One-row sample DataFrame for trying out the token masking helpers.
# The adjacent string literals inside each list have no commas between them,
# so Python concatenates them into a single string per column.
df = pd.DataFrame({
    'dialogue': [
        "#Person1#: 안녕, Peter Johnson! 어떻게 지냈어? 우리가 마지막으로 만난 지 꽤 됐네. "
        "#Person2#: 안녕, 죤 스미트! 난 잘 지냈어. 어제 뉴욕에서 돌아왔어. "
        "#Person1#: 뉴욕에서 좋은 시간 보냈어? 거기서 뭘 했어? "
        "#Person2#: 네, 정말 좋은 시간이었어. 주로 친구들을 만나고 몇 가지 비즈니스 미팅도 했어. "
        "#Person1#: 좋았겠네! 나도 뉴욕에 가본 지 오래됐어. 다음에 같이 가자. "
        "#Person2#: 좋은 생각이야. 그때는 우리 둘 다 여유를 좀 가질 수 있었으면 좋겠어. "
    ],
    'summary': [
        "#Person7# Peter Johnson and John Smith catch up after not seeing each other for a while. "
        "John mentions that he recently returned from New York, where he spent time meeting friends and attending business meetings. "
        "They both agree to plan a trip to New York together in the future."
    ]
})


import pandas as pd
import re


# Special tokens mapped to the '#...#' Hangul placeholders that replace them
# while the text is being processed.
special_tokens = {
    '#Address#': '#임시주소153#',
    '#CarNumber#': '#임시차량번호154#',
    '#CardNumber#': '#임시카드번호155#',
    '#DateOfBirth#': '#임시생년월일156#',
    '#Email#': '#임시이메일157#',
    '#PassportNumber#': '#임시여권번호158#',
    '#Person#': '#임시인물159#',
    '#Person1#': '#임시인물160#',
    '#Person2#': '#임시인물161#',
    '#Person3#': '#임시인물162#',
    '#Person4#': '#임시인물163#',
    '#Person5#': '#임시인물164#',
    '#Person6#': '#임시인물165#',
    '#Person7#': '#임시인물166#',
    '#PhoneNumber#': '#임시전화번호167#',
    '#SSN#': '#임시주민번호168#'
}
# Temporarily swap special tokens for their placeholders
def replace_special_tokens(text):
    """Return ``text`` with every key of ``special_tokens`` replaced by its value.

    The replacements are applied one after another, in the dictionary's
    iteration order.
    """
    masked = text
    for original in special_tokens:
        masked = masked.replace(original, special_tokens[original])
    return masked

# Put the original special tokens back in place of their placeholders
def restore_special_tokens(text):
    """Return ``text`` with every value of ``special_tokens`` replaced by its key.

    This is the inverse of ``replace_special_tokens``; replacements run in
    the dictionary's iteration order.
    """
    restored = text
    for original, placeholder in special_tokens.items():
        restored = restored.replace(placeholder, original)
    return restored



# Mask the special tokens in both columns of the sample frame ...
df['dialogue'] = df['dialogue'].apply(replace_special_tokens)
df['summary'] = df['summary'].apply(replace_special_tokens)


# ... and restore them straight away; nothing runs in between in this cell.
df['dialogue'] = df['dialogue'].apply(restore_special_tokens)
df['summary'] = df['summary'].apply(restore_special_tokens)

In [14]:
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from g2pk import G2p
import pandas as pd
from googletrans import Translator
from konlpy.tag import Mecab
# Create the Mecab tagger instance
mecab = Mecab()

# Create the G2P converter and the Google Translate client
g2p = G2p()
translator =Translator()

# Load the English NER model and wrap it in a token-classification pipeline
tokenizer_en = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model_en = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_en = pipeline("ner", model=model_en, tokenizer=tokenizer_en)

def detect_language(word):
    """Return True when ``word`` is non-empty and made only of Hangul syllables.

    Every character must lie in the precomposed syllable range '가'..'힣'.
    An empty (or otherwise falsy) ``word`` yields False: ``all()`` over an
    empty string is True, which would otherwise let an empty translation be
    accepted as a valid Korean rendering.
    """
    return bool(word) and all('가' <= char <= '힣' for char in word)

# Split one precomposed Hangul syllable into its jamo via code-point arithmetic
def decompose_hangul(syllable):
    """Return ``(initial, vowel, final)`` jamo characters for ``syllable``.

    The offset of the syllable from U+AC00 is split into an initial index
    (units of 588), a vowel index (units of 28) and a final index. The
    indices are added to U+1100, U+1161 and U+11A7 respectively. ``final``
    is ``None`` when the final index is 0.
    """
    offset = ord(syllable) - 0xAC00
    initial_index, within_initial = divmod(offset, 588)
    vowel_index, final_index = divmod(within_initial, 28)

    initial = chr(0x1100 + initial_index)
    vowel = chr(0x1161 + vowel_index)
    final = chr(0x11A7 + final_index) if final_index != 0 else None  # no final -> None
    return initial, vowel, final

def split_syllables(text):
    """Return one tuple per character of ``text``.

    Characters in the range '가'..'힣' are expanded with
    ``decompose_hangul``; any other character becomes the 1-tuple
    ``(char,)``.
    """
    return [
        decompose_hangul(char) if '가' <= char <= '힣' else (char,)
        for char in text
    ]

# Conjoining-jamo code points: initial consonant -> matching final consonant.
# The two Unicode ranges are ordered differently (the final range contains
# clusters such as U+11AA, and has no counterpart for U+1104, U+1108 and
# U+110D), so a constant offset cannot convert between them.
_CHOSEONG_TO_JONGSEONG = {
    0x1100: 0x11A8,  # ㄱ
    0x1101: 0x11A9,  # ㄲ
    0x1102: 0x11AB,  # ㄴ
    0x1103: 0x11AE,  # ㄷ
    0x1105: 0x11AF,  # ㄹ
    0x1106: 0x11B7,  # ㅁ
    0x1107: 0x11B8,  # ㅂ
    0x1109: 0x11BA,  # ㅅ
    0x110A: 0x11BB,  # ㅆ
    0x110B: 0x11BC,  # ㅇ
    0x110C: 0x11BD,  # ㅈ
    0x110E: 0x11BE,  # ㅊ
    0x110F: 0x11BF,  # ㅋ
    0x1110: 0x11C0,  # ㅌ
    0x1111: 0x11C1,  # ㅍ
    0x1112: 0x11C2,  # ㅎ
}
_JONGSEONG_TO_CHOSEONG = {
    final: initial for initial, final in _CHOSEONG_TO_JONGSEONG.items()
}


def normalize_consonant(consonant):
    """Convert a conjoining consonant between its initial and final form.

    An initial consonant (U+1100..U+1112) is mapped to the final consonant
    for the same letter, and a final consonant (U+11A8..U+11C2) is mapped to
    the matching initial consonant.

    Args:
        consonant: A single character, or ``None``.

    Returns:
        The converted character; ``None`` for ``None``; the input unchanged
        when it has no counterpart in the other position (tense initials
        U+1104/U+1108/U+110D, final clusters, or any non-jamo character).
    """
    if consonant is None:
        return None
    code = ord(consonant)

    # Initial -> final.
    if code in _CHOSEONG_TO_JONGSEONG:
        return chr(_CHOSEONG_TO_JONGSEONG[code])

    # Final -> initial.
    if code in _JONGSEONG_TO_CHOSEONG:
        return chr(_JONGSEONG_TO_CHOSEONG[code])

    return consonant

# Syllable-by-syllable similarity, including a cross-syllable comparison
# between a final consonant and the following syllable's initial consonant.
def calculate_similarity(word1, word2):
    """Score how alike two words are, comparing them position by position.

    Both words are split with ``split_syllables`` and compared index by
    index over the length of the shorter one.  For each pair of Hangul
    syllables the score grows by 4/11 for an equal initial consonant, 1/3
    for an equal vowel and 1/3 for an equal final consonant.  When exactly
    one of the two syllables has no final consonant, the other syllable's
    final is instead compared with the initial consonant of the next
    syllable of the word lacking it (converted by ``normalize_consonant``).
    A pair involving a non-Hangul character scores the full per-syllable
    weight when the characters are identical and nothing otherwise.

    The sum is divided by the length of the longer word.  The weights add up
    to slightly more than 1, so identical words score a little above 1.0.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        float: The similarity ratio; 0.0 when both words are empty.
    """
    word1_split = split_syllables(word1)
    word2_split = split_syllables(word2)

    match_count = 0
    max_count = max(len(word1_split), len(word2_split))

    # zip() stops at the shorter word; extra syllables only affect max_count.
    for i, (syllable1, syllable2) in enumerate(zip(word1_split, word2_split)):
        print(f"Comparing syllable '{syllable1}' with '{syllable2}'")

        # split_syllables() yields 1-tuples for non-Hangul characters, which
        # have no vowel/final slot to index; compare them as whole characters.
        if len(syllable1) < 3 or len(syllable2) < 3:
            if syllable1 == syllable2:
                match_count += 4/11 + 1/3 + 1/3
                print(f"Partial match for '{syllable1[0]}' and '{syllable2[0]}'. Match count: {match_count}")
            else:
                print(f"No match for '{syllable1[0]}' and '{syllable2[0]}'")
            continue

        # Initial consonant (weighted 4/11).
        if syllable1[0] == syllable2[0]:
            match_count += 4/11
            print(f"Partial match for '{syllable1[0]}' and '{syllable2[0]}'. Match count: {match_count}")
        else:
            print(f"No match for '{syllable1[0]}' and '{syllable2[0]}'")

        # Vowel (weighted 1/3).
        if syllable1[1] == syllable2[1]:
            match_count += 1/3
            print(f"Partial match for '{syllable1[1]}' and '{syllable2[1]}'. Match count: {match_count}")
        else:
            print(f"No match for '{syllable1[1]}' and '{syllable2[1]}'")

        # Final consonant (weighted 1/3).
        if syllable1[2] == syllable2[2]:
            match_count += 1/3
            print(f"Match for 종성 '{syllable1[2]}' and '{syllable2[2]}'. Match count: {match_count}")
        elif syllable1[2] is None and syllable2[2] is not None:
            # word1 has no final here: check whether its next initial
            # consonant corresponds to word2's final consonant.
            if i + 1 < len(word1_split):
                next_choseong = normalize_consonant(word1_split[i + 1][0])
                if next_choseong == syllable2[2]:
                    match_count += 1/3
                    print(f"Match for 종성 '{syllable2[2]}' with next 초성 '{next_choseong}'. Match count: {match_count}")
                else:
                    print(f"No match for 종성 '{syllable1[2]}' and '{syllable2[2]}'")
        elif syllable2[2] is None and syllable1[2] is not None:
            # Mirror case: word2 has no final here.
            if i + 1 < len(word2_split):
                next_choseong = normalize_consonant(word2_split[i + 1][0])
                if next_choseong == syllable1[2]:
                    match_count += 1/3
                    print(f"Match for 종성 '{syllable1[2]}' with next 초성 '{next_choseong}'. Match count: {match_count}")
                else:
                    print(f"No match for 종성 '{syllable1[2]}' and '{syllable2[2]}'")
        else:
            print(f"No match for 종성 '{syllable1[2]}' and '{syllable2[2]}'")

    # Ratio against the longer word; two empty words would divide by zero.
    similarity = match_count / max_count if max_count else 0.0
    print(f"Final similarity: {similarity:.2f}\n")

    return similarity

# Convert an English personal name to Hangul, preferring a spelling already
# present in the surrounding Korean text when one is similar enough.
def convert_word(word, entity_type, context_words, is_common):
    """Return the replacement text for one English word.

    Words whose ``entity_type`` is not 'B-PER' / 'I-PER' are returned
    unchanged.  For person names two candidates are produced: ``g2p(word)``
    and the en->ko result of ``translator.translate``.

    * ``is_common`` true: the translation if it is Hangul, else the g2p
      output if it is Hangul, else the translation as-is.
    * ``context_words`` non-empty: each Hangul candidate is matched against
      the context words with ``calculate_similarity``; the best-scoring
      context word is returned when its score exceeds 0.6, otherwise the
      translation (or the g2p output when only that one is Hangul).
    * no context: the first Hangul candidate (translation first), else
      ``word``.
    """
    if entity_type not in ('B-PER', 'I-PER'):
        return word  # every other entity type is left untouched

    korean_pronunciation = g2p(word)
    google_translation = translator.translate(word, src='en', dest='ko').text
    print(f"Korean Pronunciation of '{word}': {korean_pronunciation}")
    print(is_common)
    print(f"Korean Pronunciation of '{word}': {google_translation} from google")

    google_is_korean = detect_language(google_translation)

    # Paths that never consult the context words.
    if is_common or not context_words:
        if google_is_korean:
            return google_translation
        if detect_language(korean_pronunciation):
            return korean_pronunciation
        return google_translation if is_common else word

    g2p_is_korean = detect_language(korean_pronunciation)
    if not google_is_korean and not g2p_is_korean:
        return word

    if google_is_korean and g2p_is_korean:
        best_match = max(context_words, key=lambda w: calculate_similarity(korean_pronunciation, w), default=korean_pronunciation)
        best_match_google = max(context_words, key=lambda w: calculate_similarity(google_translation, w), default=google_translation)
        similarity_score = calculate_similarity(korean_pronunciation, best_match)
        similarity_score_google = calculate_similarity(google_translation, best_match_google)
        print(f"Best match for '{korean_pronunciation}' in context: '{best_match}' with similarity {similarity_score:.2f}")
        print(f"Best match for '{google_translation}' in context: '{best_match_google}' with similarity {similarity_score_google:.2f}")

        # Ties go to the translation-based match.
        if similarity_score <= similarity_score_google:
            best_match, similarity_score = best_match_google, similarity_score_google

        return best_match if similarity_score > 0.6 else google_translation

    if google_is_korean:
        best_match_google = max(context_words, key=lambda w: calculate_similarity(google_translation, w), default=google_translation)
        similarity_score_google = calculate_similarity(google_translation, best_match_google)
        print(f"Best match for '{google_translation}' in context: '{best_match_google}' with similarity {similarity_score_google:.2f}")

        return best_match_google if similarity_score_google > 0.6 else google_translation

    # Only the g2p output is Hangul.
    best_match = max(context_words, key=lambda w: calculate_similarity(korean_pronunciation, w), default=korean_pronunciation)
    similarity_score = calculate_similarity(korean_pronunciation, best_match)
    print(f"Best match for '{korean_pronunciation}' in context: '{best_match}' with similarity {similarity_score:.2f}")

    return best_match if similarity_score > 0.6 else korean_pronunciation

# Main per-row function: recognise entities, then convert them.
def process_text(row):
    """Convert English person names in one row's dialogue and summary.

    English words are extracted from both columns, classified with the
    English NER pipeline, and each classified word is passed to
    ``convert_word`` together with the nouns ``mecab`` finds in the combined
    text.  The converted texts are stored in ``row['new_dialogue']`` and
    ``row['new_summary']``; the original columns are not modified.

    Args:
        row: A mapping / pandas row with 'dialogue' and 'summary' strings.

    Returns:
        The same ``row`` with the two ``new_*`` entries added.
    """
    dialogue = row['dialogue']
    summary = row['summary']

    english_words_dialogue = extract_english_words(dialogue)
    print(english_words_dialogue)
    english_words_summary = extract_english_words(summary)
    print(english_words_summary)
    english_words = list(set(english_words_dialogue + english_words_summary))
    entities = classify_entities(english_words, ner_en)  # English NER

    context_words = mecab.nouns(dialogue + " " + summary)
    print(context_words)

    # Sets make the per-word membership tests O(1).
    dialogue_vocab = set(english_words_dialogue)
    summary_vocab = set(english_words_summary)

    for word, entity_type in entities.items():
        is_common = word in dialogue_vocab and word in summary_vocab
        converted_word = convert_word(word, entity_type, context_words, is_common)
        if converted_word == word:
            continue

        # Replace whole words only: a plain str.replace() of "John" would
        # also rewrite the start of "Johnson".  re.ASCII keeps Hangul from
        # counting as a word character, so "John은" is still replaced.
        pattern = re.compile(rf'\b{re.escape(word)}\b', flags=re.ASCII)
        # A callable replacement keeps backslashes in the text literal.
        dialogue = pattern.sub(lambda _m, repl=converted_word: repl, dialogue)
        summary = pattern.sub(lambda _m, repl=converted_word: repl, summary)

    row['new_dialogue'] = dialogue
    row['new_summary'] = summary
    return row

# English word extraction.
def extract_english_words(text):
    """Return every standalone run of ASCII letters in ``text``, in order.

    A run counts as standalone when it is not directly adjacent to an ASCII
    letter, digit or underscore, so ``Person1`` inside ``#Person1#`` is not
    returned.  Duplicates are kept.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The matched words.
    """
    # re.ASCII limits \b to ASCII word characters.  With the default Unicode
    # rules Hangul counts as a word character, so a name followed directly by
    # a particle ("Peter는") has no boundary after it and is never matched.
    return re.findall(r'\b[A-Za-z]+\b', text, flags=re.ASCII)


# Entity classification.
def classify_entities(words, ner_pipeline):
    """Map each word to the entity label of its first NER prediction.

    ``ner_pipeline`` is called once per word; words for which it returns an
    empty result are left out of the mapping.  The finished mapping is
    printed before being returned.
    """
    entities = {}
    for candidate in words:
        predictions = ner_pipeline(candidate)
        if not predictions:
            continue
        entities[candidate] = predictions[0]['entity']
    print(f"Extracted Entities: {entities}")  # show what the NER model produced
    return entities

# Example DataFrame: one row whose Korean dialogue and English summary both
# contain person names.
df = pd.DataFrame({
    'dialogue': [
        "#Person1#: 안녕, Peter Johnson! 어떻게 지냈어? 우리가 마지막으로 만난 지 꽤 됐네. "
        "#Person2#: 안녕, 죤 스미트! 난 잘 지냈어. 어제 뉴욕에서 돌아왔어. "
        "#Person1#: 뉴욕에서 좋은 시간 보냈어? 거기서 뭘 했어? "
        "#Person2#: 네, 정말 좋은 시간이었어. 주로 친구들을 만나고 몇 가지 비즈니스 미팅도 했어. "
        "#Person1#: 좋았겠네! 나도 뉴욕에 가본 지 오래됐어. 다음에 같이 가자. "
        "#Person2#: 좋은 생각이야. 그때는 우리 둘 다 여유를 좀 가질 수 있었으면 좋겠어. "
    ],
    'summary': [
        "Peter Johnson and John Smith Dalene catch up after not seeing each other for a while. "
        "John mentions that he recently returned from New York, where he spent time meeting friends and attending business meetings. "
        "They both agree to plan a trip to New York together in the future."
    ]
}) 

# Run the conversion on every row.
df = df.apply(process_text, axis=1)

# Inspect the result.
# NOTE: these are the input columns; process_text stores the converted text
# in 'new_dialogue' / 'new_summary'.
print(df['dialogue'])
print(df['summary'])


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['Peter', 'Johnson']
['Peter', 'Johnson', 'and', 'John', 'Smith', 'Dalene', 'catch', 'up', 'after', 'not', 'seeing', 'each', 'other', 'for', 'a', 'while', 'John', 'mentions', 'that', 'he', 'recently', 'returned', 'from', 'New', 'York', 'where', 'he', 'spent', 'time', 'meeting', 'friends', 'and', 'attending', 'business', 'meetings', 'They', 'both', 'agree', 'to', 'plan', 'a', 'trip', 'to', 'New', 'York', 'together', 'in', 'the', 'future']
Extracted Entities: {'Peter': 'I-PER', 'New': 'I-ORG', 'York': 'I-ORG', 'Johnson': 'I-PER', 'Dalene': 'I-PER', 'John': 'I-PER', 'Smith': 'I-PER'}
['우리', '마지막', '지', '죤', '스미트', '난', '뉴욕', '뉴욕', '시간', '거기', '뭘', '시간', '친구', '가지', '비즈니스', '미팅', '나', '뉴욕', '지', '다음', '생각', '그때', '우리', '둘', '여유', '수']
Korean Pronunciation of 'Peter': 피터
True
Korean Pronunciation of 'Peter': 베드로 from google
Korean Pronunciation of 'Johnson': 잔선
True
Korean Pronunciation of 'Johnson': 존슨 from google
Korean Pronunciation of 'Dalene': Dalene
False
Korean Pronunciation of 'Dale

In [15]:
# Show the converted columns written by process_text.
print(df['new_dialogue'])
print(df['new_summary'])

0    #Person1#: 안녕, 베드로 존슨! 어떻게 지냈어? 우리가 마지막으로 만난 지...
Name: new_dialogue, dtype: object
0    베드로 존슨 and 죤 스미트 달렌 catch up after not seeing ...
Name: new_summary, dtype: object


In [13]:
from konlpy.tag import Mecab
# Scratch check: list the nouns Mecab extracts from a sample dialogue.
mecab = Mecab()
topic = """
        "#Person1#: 안녕, Peter Johnson! 어떻게 지냈어? 우리가 마지막으로 만난 지 꽤 됐네. "
        "#Person2#: 안녕, 죤 스미트! 난 잘 지냈어. 어제 뉴욕에서 돌아왔어. "
        "#Person1#: 뉴욕에서 좋은 시간 보냈어? 거기서 뭘 했어? "
        "#Person2#: 네, 정말 좋은 시간이었어. 주로 친구들을 만나고 몇 가지 비즈니스 미팅도 했어. "
        "#Person1#: 좋았겠네! 나도 뉴욕에 가본 지 오래됐어. 다음에 같이 가자. "
        "#Person2#: 좋은 생각이야. 그때는 우리 둘 다 여유를 좀 가질 수 있었으면 좋겠어. 신디는? 뭐 먹어?
"""
tokens = mecab.nouns(topic)
print(tokens)

['우리', '마지막', '지', '죤', '스미트', '난', '뉴욕', '뉴욕', '시간', '거기', '뭘', '시간', '친구', '가지', '비즈니스', '미팅', '나', '뉴욕', '지', '다음', '생각', '그때', '우리', '둘', '여유', '수', '신디']


In [21]:
from transformers import ElectraTokenizer, ElectraForTokenClassification, pipeline

# Load KoELECTRA tokenizer and model for NER
# (rebinds the module-level names `tokenizer` and `model`).
tokenizer = ElectraTokenizer.from_pretrained("monologg/kocharelectra-base-modu-ner-all")
model = ElectraForTokenClassification.from_pretrained("monologg/kocharelectra-base-modu-ner-all")

# Create a pipeline for named entity recognition; with an aggregation
# strategy set, results are read below via 'word' / 'entity_group' / 'score'.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample Korean text with names to extract
text = """
        "#Person1#: 안녕, Peter Johnson! 어떻게 지냈어? 우리가 마지막으로 만난 지 꽤 됐네."
        "#Person2#: 안녕, 죤 스미트! 난 잘 지냈어. 어제 뉴욕에서 돌아왔어."
        "#Person1#: 뉴욕에서 좋은 시간 보냈어? 거기서 뭘 했어?"
        "#Person2#: 네, 정말 좋은 시간이었어. 주로 친구들을 만나고 몇 가지 비즈니스 미팅도 했어."
        "#Person1#: 좋았겠네! 나도 뉴욕에 가본 지 오래됐어. 다음에 같이 가자."
        "#Person2#: 좋은 생각이야. 그때는 우리 둘 다 여유를 좀 가질 수 있었으면 좋겠어."
"""

# Run the NER pipeline
ner_results = ner_pipeline(text)

# Print one line per recognised entity: text, label and confidence score.
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")


tokenizer_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/47.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

In [22]:
# Mapping of each special token to a temporary placeholder.
# NOTE: this dict is rebound to a plain list by the last statement of this
# cell, so after the cell runs `special_tokens` has no .items().
special_tokens = {
    '#Address#': '#임시주소153#',
    '#CarNumber#': '#임시차량번호154#',
    '#CardNumber#': '#임시카드번호155#',
    '#DateOfBirth#': '#임시생년월일156#',
    '#Email#': '#임시이메일157#',
    '#PassportNumber#': '#임시여권번호158#',
    '#Person#': '#임시인물159#',
    '#Person1#': '#임시인물160#',
    '#Person2#': '#임시인물161#',
    '#Person3#': '#임시인물162#',
    '#Person4#': '#임시인물163#',
    '#Person5#': '#임시인물164#',
    '#Person6#': '#임시인물165#',
    '#Person7#': '#임시인물166#',
    '#PhoneNumber#': '#임시전화번호167#',
    '#SSN#': '#임시주민번호168#'
}
# List form of the same token names (overwrites the dict above).
special_tokens = ['#Address#', '#CarNumber#', '#CardNumber#', '#DateOfBirth#', '#Email#', '#PassportNumber#', '#Person#', '#Person1#', '#Person2#', '#Person3#', '#Person4#', '#Person5#', '#Person6#', '#Person7#', '#PhoneNumber#', '#SSN#']

[]

In [2]:
# Names of the special tokens as a plain list; the bare name on the last line
# displays it as the cell output.
special_tokens = ['#Address#', '#CarNumber#', '#CardNumber#', '#DateOfBirth#', '#Email#', '#PassportNumber#', '#Person#', '#Person1#', '#Person2#', '#Person3#', '#Person4#', '#Person5#', '#Person6#', '#Person7#', '#PhoneNumber#', '#SSN#']
special_tokens

['#Address#',
 '#CarNumber#',
 '#CardNumber#',
 '#DateOfBirth#',
 '#Email#',
 '#PassportNumber#',
 '#Person#',
 '#Person1#',
 '#Person2#',
 '#Person3#',
 '#Person4#',
 '#Person5#',
 '#Person6#',
 '#Person7#',
 '#PhoneNumber#',
 '#SSN#']

In [16]:
# Keep only rows with at least one English word; .copy() gives an independent
# frame so later column assignments do not touch new_ordered_df.
df_filtered = new_ordered_df[new_ordered_df['english_words_count'] > 0].copy()

In [23]:
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from g2pk import G2p
import pandas as pd
from googletrans import Translator
from konlpy.tag import Mecab
# Korean morphological analyser; used below to pull nouns out of the text.
mecab = Mecab()

# Create the G2P object (called below as g2p(word) on English names).
g2p = G2p()
# Translation client; used below for en -> ko translation of single words.
translator =Translator()

# Load the English NER model and wrap it in a token-classification pipeline.
tokenizer_en = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model_en = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
ner_en = pipeline("ner", model=model_en, tokenizer=tokenizer_en)



# Original special token -> temporary placeholder.  The placeholders contain
# only '#', Hangul and digits — presumably so the English-word extraction
# below does not pick up "Person", "Address", etc.; confirm.
special_tokens = {
    '#Address#': '#임시주소153#',
    '#CarNumber#': '#임시차량번호154#',
    '#CardNumber#': '#임시카드번호155#',
    '#DateOfBirth#': '#임시생년월일156#',
    '#Email#': '#임시이메일157#',
    '#PassportNumber#': '#임시여권번호158#',
    '#Person#': '#임시인물159#',
    '#Person1#': '#임시인물160#',
    '#Person2#': '#임시인물161#',
    '#Person3#': '#임시인물162#',
    '#Person4#': '#임시인물163#',
    '#Person5#': '#임시인물164#',
    '#Person6#': '#임시인물165#',
    '#Person7#': '#임시인물166#',
    '#PhoneNumber#': '#임시전화번호167#',
    '#SSN#': '#임시주민번호168#'
}
# Temporarily swap the special tokens for their placeholders.
def replace_special_tokens(text):
    """Replace every special token found in ``text`` with its placeholder.

    Tokens and placeholders are the keys and values of ``special_tokens``.
    """
    for original, placeholder in special_tokens.items():
        text = text.replace(original, placeholder)
    return text

# Restore the original special tokens from their temporary placeholders.
def restore_special_tokens(text):
    """Swap every placeholder found in ``text`` back to its special token.

    ``special_tokens`` maps original token -> placeholder, so each value is
    searched for and replaced by its key.
    """
    for original, placeholder in special_tokens.items():
        text = text.replace(placeholder, original)
    return text

def detect_language(word):
    """Return True when ``word`` is made up solely of Hangul syllables.

    Only precomposed syllables in the range '가'..'힣' count; a single
    space, digit, Latin letter or standalone jamo makes the result False.

    Args:
        word: The string to inspect (``None`` is accepted and is not Korean).

    Returns:
        bool: True if ``word`` is non-empty and every character is a Hangul
        syllable, otherwise False.
    """
    # all() over an empty iterable is True, which would report an empty
    # (e.g. failed) translation as valid Korean text; reject it explicitly.
    if not word:
        return False
    return all('가' <= char <= '힣' for char in word)

# Code-point arithmetic that splits one Hangul syllable into its jamo.
def decompose_hangul(syllable):
    """Split a precomposed Hangul syllable into conjoining jamo.

    Returns a 3-tuple ``(initial, vowel, final)`` of single characters taken
    from the U+1100 / U+1161 / U+11A7 based ranges; ``final`` is ``None``
    when the syllable has no final consonant.
    """
    offset = ord(syllable) - 0xAC00
    # 588 = 21 vowels * 28 final slots per initial consonant.
    lead_index, within_lead = divmod(offset, 588)
    vowel_index, tail_index = divmod(within_lead, 28)

    lead = chr(0x1100 + lead_index)
    vowel = chr(0x1161 + vowel_index)
    # Final index 0 means "no final consonant".
    tail = chr(0x11A7 + tail_index) if tail_index != 0 else None
    return lead, vowel, tail

def split_syllables(text):
    """Turn ``text`` into a list with one tuple per character.

    Hangul syllables ('가'..'힣') become the 3-tuple produced by
    ``decompose_hangul``; every other character becomes a 1-tuple holding
    the character itself.
    """
    return [
        decompose_hangul(ch) if '가' <= ch <= '힣' else (ch,)
        for ch in text
    ]

# Conjoining-jamo code points: initial consonant -> matching final consonant.
# The two Unicode ranges are ordered differently (the final range contains
# clusters such as U+11AA, and has no counterpart for U+1104, U+1108 and
# U+110D), so a constant offset cannot convert between them.
_CHOSEONG_TO_JONGSEONG = {
    0x1100: 0x11A8,  # ㄱ
    0x1101: 0x11A9,  # ㄲ
    0x1102: 0x11AB,  # ㄴ
    0x1103: 0x11AE,  # ㄷ
    0x1105: 0x11AF,  # ㄹ
    0x1106: 0x11B7,  # ㅁ
    0x1107: 0x11B8,  # ㅂ
    0x1109: 0x11BA,  # ㅅ
    0x110A: 0x11BB,  # ㅆ
    0x110B: 0x11BC,  # ㅇ
    0x110C: 0x11BD,  # ㅈ
    0x110E: 0x11BE,  # ㅊ
    0x110F: 0x11BF,  # ㅋ
    0x1110: 0x11C0,  # ㅌ
    0x1111: 0x11C1,  # ㅍ
    0x1112: 0x11C2,  # ㅎ
}
_JONGSEONG_TO_CHOSEONG = {
    final: initial for initial, final in _CHOSEONG_TO_JONGSEONG.items()
}


def normalize_consonant(consonant):
    """Convert a conjoining consonant between its initial and final form.

    An initial consonant (U+1100..U+1112) is mapped to the final consonant
    for the same letter, and a final consonant (U+11A8..U+11C2) is mapped to
    the matching initial consonant.

    Args:
        consonant: A single character, or ``None``.

    Returns:
        The converted character; ``None`` for ``None``; the input unchanged
        when it has no counterpart in the other position (tense initials
        U+1104/U+1108/U+110D, final clusters, or any non-jamo character).
    """
    if consonant is None:
        return None
    code = ord(consonant)

    # Initial -> final.
    if code in _CHOSEONG_TO_JONGSEONG:
        return chr(_CHOSEONG_TO_JONGSEONG[code])

    # Final -> initial.
    if code in _JONGSEONG_TO_CHOSEONG:
        return chr(_JONGSEONG_TO_CHOSEONG[code])

    return consonant

# Syllable-by-syllable similarity, including a cross-syllable comparison
# between a final consonant and the following syllable's initial consonant.
def calculate_similarity(word1, word2):
    """Score how alike two words are, comparing them position by position.

    Both words are split with ``split_syllables`` and compared index by
    index over the length of the shorter one.  For each pair of Hangul
    syllables the score grows by 4/11 for an equal initial consonant, 1/3
    for an equal vowel and 4/11 for an equal final consonant.  When exactly
    one of the two syllables has no final consonant, the other syllable's
    final is instead compared with the initial consonant of the next
    syllable of the word lacking it (converted by ``normalize_consonant``).
    A pair involving a non-Hangul character scores the full per-syllable
    weight when the characters are identical and nothing otherwise.

    The sum is divided by the length of the longer word.  The weights add up
    to about 1.06, so identical words score slightly above 1.0.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        float: The similarity ratio; 0.0 when both words are empty.
    """
    word1_split = split_syllables(word1)
    word2_split = split_syllables(word2)

    max_count = max(len(word1_split), len(word2_split))
    if max_count == 0:
        # Two empty words: nothing to compare, and nothing to divide by.
        return 0.0

    match_count = 0
    # zip() stops at the shorter word; extra syllables only affect max_count.
    for i, (syllable1, syllable2) in enumerate(zip(word1_split, word2_split)):
        # split_syllables() yields 1-tuples for non-Hangul characters, which
        # have no vowel/final slot to index; compare them as whole characters.
        if len(syllable1) < 3 or len(syllable2) < 3:
            if syllable1 == syllable2:
                match_count += 4/11 + 1/3 + 4/11
            continue

        # Initial consonant (weighted 4/11).
        if syllable1[0] == syllable2[0]:
            match_count += 4/11

        # Vowel (weighted 1/3).
        if syllable1[1] == syllable2[1]:
            match_count += 1/3

        # Final consonant (weighted 4/11).
        final1, final2 = syllable1[2], syllable2[2]
        if final1 == final2:
            match_count += 4/11
        elif final1 is None:
            # word1 has no final here: check whether its next initial
            # consonant corresponds to word2's final consonant.
            if i + 1 < len(word1_split):
                if normalize_consonant(word1_split[i + 1][0]) == final2:
                    match_count += 4/11
        elif final2 is None:
            # Mirror case: word2 has no final here.
            if i + 1 < len(word2_split):
                if normalize_consonant(word2_split[i + 1][0]) == final1:
                    match_count += 4/11

    # Ratio against the longer word.
    return match_count / max_count

# Convert an English personal name to Hangul, preferring a spelling already
# present in the surrounding Korean text when one is similar enough.
def convert_word(word, entity_type, context_words, is_common):
    """Return the replacement text for one English word.

    Words whose ``entity_type`` is not 'B-PER' / 'I-PER' are returned
    unchanged.  For person names two candidates are produced: ``g2p(word)``
    and the en->ko result of ``translator.translate``.

    * ``is_common`` true: the translation if it is Hangul, else the g2p
      output if it is Hangul, else the translation as-is.
    * ``context_words`` non-empty: each Hangul candidate is matched against
      the context words with ``calculate_similarity``; the best-scoring
      context word is returned when its score exceeds 0.72, otherwise the
      translation (or the g2p output when only that one is Hangul).
    * no context: the first Hangul candidate (translation first), else
      ``word``.
    """
    if entity_type not in ('B-PER', 'I-PER'):
        return word  # every other entity type is left untouched

    korean_pronunciation = g2p(word)
    google_translation = translator.translate(word, src='en', dest='ko').text

    google_is_korean = detect_language(google_translation)

    # Paths that never consult the context words.
    if is_common or not context_words:
        if google_is_korean:
            return google_translation
        if detect_language(korean_pronunciation):
            return korean_pronunciation
        return google_translation if is_common else word

    g2p_is_korean = detect_language(korean_pronunciation)
    if not google_is_korean and not g2p_is_korean:
        return word

    if google_is_korean and g2p_is_korean:
        best_match = max(context_words, key=lambda w: calculate_similarity(korean_pronunciation, w), default=korean_pronunciation)
        best_match_google = max(context_words, key=lambda w: calculate_similarity(google_translation, w), default=google_translation)
        similarity_score = calculate_similarity(korean_pronunciation, best_match)
        similarity_score_google = calculate_similarity(google_translation, best_match_google)
        print(f"Best match for '{korean_pronunciation}' in context: '{best_match}' with similarity {similarity_score:.2f}")
        print(f"Best match for '{google_translation}' in context: '{best_match_google}' with similarity {similarity_score_google:.2f}")

        # Ties go to the translation-based match.
        if similarity_score <= similarity_score_google:
            best_match, similarity_score = best_match_google, similarity_score_google

        return best_match if similarity_score > 0.72 else google_translation

    if google_is_korean:
        best_match_google = max(context_words, key=lambda w: calculate_similarity(google_translation, w), default=google_translation)
        similarity_score_google = calculate_similarity(google_translation, best_match_google)
        print(f"Best match for '{google_translation}' in context: '{best_match_google}' with similarity {similarity_score_google:.2f}")

        return best_match_google if similarity_score_google > 0.72 else google_translation

    # Only the g2p output is Hangul.
    best_match = max(context_words, key=lambda w: calculate_similarity(korean_pronunciation, w), default=korean_pronunciation)
    similarity_score = calculate_similarity(korean_pronunciation, best_match)
    print(f"Best match for '{korean_pronunciation}' in context: '{best_match}' with similarity {similarity_score:.2f}")

    return best_match if similarity_score > 0.72 else korean_pronunciation

# Main per-row function: recognise entities, then convert them.
def process_text(row):
    """Convert English person names in one row's dialogue and summary.

    English words are extracted from both columns, classified with the
    English NER pipeline, and each classified word is passed to
    ``convert_word`` together with the nouns ``mecab`` finds in the combined
    text.  The converted texts are stored in ``row['new_dialogue']`` and
    ``row['new_summary']``; the original columns are not modified.

    Args:
        row: A mapping / pandas row with 'dialogue' and 'summary' strings.

    Returns:
        The same ``row`` with the two ``new_*`` entries added.
    """
    dialogue = row['dialogue']
    summary = row['summary']

    english_words_dialogue = extract_english_words(dialogue)
    english_words_summary = extract_english_words(summary)
    english_words = list(set(english_words_dialogue + english_words_summary))
    entities = classify_entities(english_words, ner_en)  # English NER

    context_words = mecab.nouns(dialogue + " " + summary)

    # Sets make the per-word membership tests O(1).
    dialogue_vocab = set(english_words_dialogue)
    summary_vocab = set(english_words_summary)

    for word, entity_type in entities.items():
        is_common = word in dialogue_vocab and word in summary_vocab
        converted_word = convert_word(word, entity_type, context_words, is_common)
        if converted_word == word:
            continue

        # Replace whole words only: a plain str.replace() of "John" would
        # also rewrite the start of "Johnson".  re.ASCII keeps Hangul from
        # counting as a word character, so "John은" is still replaced.
        pattern = re.compile(rf'\b{re.escape(word)}\b', flags=re.ASCII)
        # A callable replacement keeps backslashes in the text literal.
        dialogue = pattern.sub(lambda _m, repl=converted_word: repl, dialogue)
        summary = pattern.sub(lambda _m, repl=converted_word: repl, summary)

    row['new_dialogue'] = dialogue
    row['new_summary'] = summary
    return row

# English word extraction.
def extract_english_words(text):
    """Return every standalone run of ASCII letters in ``text``, in order.

    A run counts as standalone when it is not directly adjacent to an ASCII
    letter, digit or underscore, so digits glued to letters (``abc123``)
    produce no match.  Duplicates are kept.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The matched words.
    """
    # re.ASCII limits \b to ASCII word characters.  With the default Unicode
    # rules Hangul counts as a word character, so a name followed directly by
    # a particle ("Peter는") has no boundary after it and is never matched.
    return re.findall(r'\b[A-Za-z]+\b', text, flags=re.ASCII)


# Entity classification.
def classify_entities(words, ner_pipeline):
    """Map each word to the entity label of its first NER prediction.

    ``ner_pipeline`` is called once per word; words for which it returns an
    empty result are left out of the mapping.
    """
    entities = {}
    for candidate in words:
        predictions = ner_pipeline(candidate)
        if not predictions:
            continue
        entities[candidate] = predictions[0]['entity']
    return entities




# Swap the special tokens for placeholders before the conversion runs.
df_filtered['dialogue'] = df_filtered['dialogue'].apply(replace_special_tokens)
df_filtered['summary'] = df_filtered['summary'].apply(replace_special_tokens)

# Convert every row; adds the 'new_dialogue' and 'new_summary' columns.
df_filtered = df_filtered.apply(process_text, axis=1)

# Restore the special tokens in the original columns...
df_filtered['dialogue'] = df_filtered['dialogue'].apply(restore_special_tokens)
df_filtered['summary'] = df_filtered['summary'].apply(restore_special_tokens)

# ...and in the converted columns.
df_filtered['new_dialogue'] = df_filtered['new_dialogue'].apply(restore_special_tokens)
df_filtered['new_summary'] = df_filtered['new_summary'].apply(restore_special_tokens)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Best match for '저스턴' in context: '점수' with similarity 0.47
Best match for '저스틴' in context: '점수' with similarity 0.47
Best match for '코렐' in context: '고려' with similarity 0.53
Best match for '코렐' in context: '고려' with similarity 0.53
Best match for '폴' in context: '폴' with similarity 1.06
Best match for '폴' in context: '폴' with similarity 1.06
Best match for '헤이필드' in context: '헤이필드' with similarity 1.06
Best match for '마거릳' in context: '마무리' with similarity 0.71
Best match for '마가렛' in context: '마무리' with similarity 0.60
Best match for '킴벌' in context: '며칠' with similarity 0.36
Best match for '킴블' in context: '며칠' with similarity 0.36
Best match for '탐' in context: '임시' with similarity 0.18
Best match for '데이비드' in context: '토요일' with similarity 0.36
Best match for '데이비드' in context: '토요일' with similarity 0.36
Best match for '제인' in context: '즈이' with similarity 0.71
Best match for '계집애' in context: '거기' with similarity 0.35
Best match for '마크' in context: '나라' with similarity 0.53
Be

In [25]:
# Work on a copy so df_filtered keeps both the original and converted columns.
check = df_filtered.copy()

In [26]:
# Overwrite the original dialogue with the converted text.
check['dialogue'] = check['new_dialogue']

In [27]:
# Overwrite the original summary with the converted text.
check['summary'] = check['new_summary']

In [28]:
# Write the converted dialogue/summary back into the full frame in place;
# only the rows present in `check` are affected (update matches on index).
new_ordered_df.update(check[['dialogue', 'summary']])

# Drop the temporary columns 'new_dialogue' and 'new_summary' from 'check'
check.drop(columns=['new_dialogue', 'new_summary'], inplace=True)

In [29]:
# Display the updated frame as the cell output.
new_ordered_df

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,fname,dialogue,summary,topic,combined_text,similarity,topic_nouns,topic_percent,topic_similarity,combined_similarity,k_percent,k_percent_dialogue,topic_dialogue_similarity,english_words,english_words_count
0,0,0,0,train_0,"#Person1#: 안녕하세요, 스미스씨. 저는 호킨스 의사입니다. 오늘 왜 오셨나...","스미스씨가 건강검진을 받고 있고, 호킨스 의사는 매년 건강검진을 받는 것을 권장합니...",건강검진 받기,"건강검진 받기 #Person1#: 안녕하세요, 스미스씨. 저는 호킨스 의사입니다. ...",0.590560,"['건강', '검진']",1.0,0.438091,1.028651,1.000000,0.896552,0.322930,[],0
1,1,1,1,train_1,"#Person1#: 안녕하세요, 파커 부인, 어떻게 지내셨나요?\n#Person2#...",파커 부인이 리키를 데리고 백신 접종을 하러 갔다. 피터스 박사는 기록을 확인한 후...,백신,"백신 #Person1#: 안녕하세요, 파커 부인, 어떻게 지내셨나요?\n#Perso...",0.645074,['백신'],1.0,0.322626,0.967700,1.000000,0.906250,0.314679,"['B', 'A']",2
2,2,2,2,train_2,"#Person1#: 실례합니다, 열쇠 한 묶음 보셨나요?\n#Person2#: 어떤...","#Person1#은 열쇠 한 묶음을 찾고 있고, 그것을 찾기 위해 #Person2#...",열쇠 찾기,"열쇠 찾기 #Person1#: 실례합니다, 열쇠 한 묶음 보셨나요?\n#Person...",0.669059,['열쇠'],1.0,0.629212,1.298272,0.846154,0.836066,0.403980,[],0
3,3,3,3,train_3,#Person1#: 왜 너는 여자친구가 있다는 걸 말해주지 않았어?\n#Person...,#Person1#은 #Person2#가 여자친구가 있고 그녀와 결혼할 것이라는 사실...,여자친구가 있다,여자친구가 있다 #Person1#: 왜 너는 여자친구가 있다는 걸 말해주지 않았어?...,0.756981,"['여자', '친구']",1.0,0.489802,1.246783,0.785714,0.857143,0.275326,[],0
4,4,4,4,train_4,"#Person1#: 안녕, 숙녀분들! 오늘 밤 당신들은 정말 멋져 보여. 이 춤을 ...",말릭이 니키에게 춤을 요청한다. 말릭이 발을 밟는 것을 신경 쓰지 않는다면 니키는 ...,댄스,"댄스 #Person1#: 안녕, 숙녀분들! 오늘 밤 당신들은 정말 멋져 보여. 이 ...",0.399786,['댄스'],0.0,0.316758,0.716544,1.000000,0.898551,0.251543,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12452,12452,12452,12452,train_12455,#Person1#: 실례합니다. 맨체스터 출신의 그린 씨이신가요?\n#Person2...,탄 링은 흰머리와 수염으로 쉽게 인식되는 그린 씨를 만나 호텔로 데려갈 예정입니다....,누군가를 태우다,누군가를 태우다 #Person1#: 실례합니다. 맨체스터 출신의 그린 씨이신가요?\...,0.443081,['누군가'],0.0,0.181848,0.624929,1.000000,0.907407,0.218529,[],0
12453,12453,12453,12453,train_12456,#Person1#: 이윙 씨가 우리가 컨퍼런스 센터에 오후 4시에 도착해야 한다고 ...,#Person1#과 #Person2#는 이윙 씨가 늦지 않도록 요청했기 때문에 컨퍼...,컨퍼런스 센터,컨퍼런스 센터 #Person1#: 이윙 씨가 우리가 컨퍼런스 센터에 오후 4시에 도...,0.657182,"['컨퍼런스', '센터']",1.0,0.335172,0.992354,0.857143,0.942857,0.156387,[],0
12454,12454,12454,12454,train_12457,#Person1#: 오늘 어떻게 도와드릴까요?\n#Person2#: 차를 빌리고 싶...,#Person2#는 #Person1#의 도움으로 5일 동안 소형 차를 빌립니다.,차 렌트,차 렌트 #Person1#: 오늘 어떻게 도와드릴까요?\n#Person2#: 차를 ...,0.715181,"['차', '렌트']",0.5,0.476258,1.191439,0.750000,0.848485,0.347420,[],0
12455,12455,12455,12455,train_12458,#Person1#: 오늘 좀 행복해 보이지 않아. 무슨 일 있어?\n#Person2...,#Person2#의 엄마가 일자리를 잃었다. #Person2#는 엄마가 우울해하지 ...,실직,실직 #Person1#: 오늘 좀 행복해 보이지 않아. 무슨 일 있어?\n#Pers...,0.689397,['실직'],0.0,0.154494,0.843891,0.764706,0.904762,0.143657,[],0


In [30]:
# Save the full frame.  index=False keeps the row index out of the file:
# writing it adds one more unnamed column on every save/reload cycle, which
# is how the 'Unnamed: 0', 'Unnamed: 0.1', ... columns piled up in this frame.
new_ordered_df.to_csv('/dj2/new_transformed.csv', index=False)

In [31]:
# Keep only the four columns needed for the training file.
new_df = new_ordered_df[['fname', 'dialogue', 'summary', 'topic']]

# Save this new dataframe to a CSV file (without the row index).
new_df.to_csv('/dj2/new_transformed_train.csv', index=False)