In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, random
import json
from tqdm import tqdm

# Fine-Tuned 모델 경로
model_output_dir = "./model_output/llama3_sft_idioms"

# 모델과 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(model_output_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_output_dir, torch_dtype=torch.bfloat16).to('cuda')

def generate_response(user_input, max_length=256):
    # 사용자 입력을 토큰화
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to('cuda')

    # 모델로부터 응답 생성
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    # 응답 디코딩
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# 데이터셋 파일 경로
dataset_path = '/data/uijih/8b_instruct/augmented_idioms_conversations.jsonl'

# 원래 데이터셋에서 샘플 20개를 로드
with open(dataset_path, 'r') as f:
    raw_data = [json.loads(line) for line in f]

# 20개의 샘플 추출
random_indices = random.sample(range(1, 401), 10)
samples = [raw_data[i] for i in random_indices if i < len(raw_data)]

# for idx, item in enumerate(samples):

#     idiom_input = item['conversations'][0]['user']
#     idiom_response = generate_response(idiom_input)
    
#     print(f"Sample {idx + 1}:")
#     print(f"  User (Idiom-to-Meaning): {idiom_input}")
#     print(f"  Assistant: {idiom_response}")
#     print("-" * 50)

# translation /w 2 shots

In [None]:
 import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch, random
import json
from tqdm import tqdm

# Fine-Tuned 모델 경로
model_output_dir = "/data/uijih/8b_instruct/model_output/llama3_sft_idioms-m"

# 모델과 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(model_output_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_output_dir, torch_dtype=torch.bfloat16).to('cuda')

# 70
# bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type='nf4',
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_compute_dtype=torch.bfloat16
#     )

# model = AutoModelForCausalLM.from_pretrained(
#         model_output_dir,
#         device_map="auto",
#         quantization_config=bnb_config, ignore_mismatched_sizes=True  # 크기 불일치 무시
#     )
# tokenizer = AutoTokenizer.from_pretrained(model_output_dir)
# model.resize_token_embeddings(len(tokenizer))

def generate_response(user_input, max_length=256):
    # 사용자 입력을 토큰화
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        #padding=True,
        truncation=True,
        max_length=max_length
    ).to('cuda')

    # 모델로부터 응답 생성
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    # 응답 디코딩
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# CSV 데이터셋 파일 경로
dataset_path = "/data/uijih/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
data = pd.read_csv(dataset_path)

# 10개의 샘플 추출 
#samples = data.sample(n=10)
samples = data

# 샷 정의
shots_en_to_kr= [
    {
        "source": "Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.",
        "target": "회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어."
    },
    {
        "source": "She is real thorn in his side.",
        "target": "그녀는 진짜 그의 눈엣가시이다."
    }
]

shots_kr_to_en = [
    {
        "source": "회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.",
        "target": "Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face."
    },
    {
        "source": "그녀는 진짜 그의 눈엣가시이다.",
        "target": "She is real thorn in his side."
    }
]

# 프롬프트 생성 함수 (샷 포함)
def create_prompt(korean_sentence=None, english_sentence=None, shots=None, direction="KR_to_EN"):
    prompt = ""
    if direction == "KR_to_EN":
        prompt += "Translate the following Korean sentence to English, making sure to translate *idioms as idioms*.\n\n"
        # 샷을 프롬프트에 추가
        for shot in shots:
            prompt += f"Source (KR): {shot['source']}\nTarget (EN): {shot['target']}\n\n"
        prompt += f"Source (KR): {korean_sentence}\nTarget (EN): "
    
    elif direction == "EN_to_KR":
        prompt += "Translate the following English sentence to Korean, making sure to translate *idioms as idioms*.\n\n"
        # 샷을 프롬프트에 추가
        for shot in shots:
            prompt += f"Source (EN): {shot['source']}\nTarget (KR): {shot['target']}\n\n"
        prompt += f"Source (EN): {english_sentence}\nTarget (KR): "
    
    return prompt

# 결과를 저장할 데이터프레임 초기화
results_kr_to_en_df = pd.DataFrame(columns=['Original_KR_Sentence', 'Label', 'KR_to_EN_Translation'])
results_en_to_kr_df = pd.DataFrame(columns=['Original_EN_Sentence', 'Label', 'EN_to_KR_Translation'])

# 샘플 데이터에 대한 테스트 수행 및 결과 저장
for idx, (_, item) in enumerate(samples.iterrows()):
    # 한국어 -> 영어 번역 프롬프트 생성 (샷 포함)
    final_prompt_kr_to_en = create_prompt(korean_sentence=item['KR_Sentence'], shots=shots_kr_to_en, direction="KR_to_EN")
    # 영어 -> 한국어 번역 프롬프트 생성 (샷 포함)
    final_prompt_en_to_kr = create_prompt(english_sentence=item['Sentence'], shots=shots_en_to_kr, direction="EN_to_KR")
    
    # 모델 응답 생성 (프롬프트 제외하고 번역 결과만 생성)
    translation_response_kr_to_en = generate_response(final_prompt_kr_to_en)
    translation_response_en_to_kr = generate_response(final_prompt_en_to_kr)

    # # 프롬프트 길이만큼 제거하여 번역 결과만 저장
    kr_to_en_translation = translation_response_kr_to_en[len(final_prompt_kr_to_en):].strip()
    en_to_kr_translation = translation_response_en_to_kr[len(final_prompt_en_to_kr):].strip()

    # 결과를 데이터프레임에 추가 (한국어 -> 영어)
    new_row_kr_to_en = {
        'Original_KR_Sentence': item['KR_Sentence'],
        'Label' : item['Idiom'],
        'KR_to_EN_Translation': kr_to_en_translation
    }
    results_kr_to_en_df = pd.concat([results_kr_to_en_df, pd.DataFrame([new_row_kr_to_en])], ignore_index=True)

    # 결과를 데이터프레임에 추가 (영어 -> 한국어)
    new_row_en_to_kr = {
        'Original_EN_Sentence': item['Sentence'],
        'Label' : item['KR_Idiom'],
        'EN_to_KR_Translation': en_to_kr_translation
    }
    results_en_to_kr_df = pd.concat([results_en_to_kr_df, pd.DataFrame([new_row_en_to_kr])], ignore_index=True)

    # 출력 결과
    print(f"Source: {item['KR_Sentence']}")
    print(f"Assistant (KR to EN): {kr_to_en_translation}")
    print("-" * 100)
    print(f"Source: {item['Sentence']}")
    print(f"Assistant (EN to KR): {en_to_kr_translation}")
    print("-" * 100)

# 결과를 CSV 파일로 저장
output_csv_kr_to_en_path = 'results/llama70-kr_to_en-m.csv'
output_csv_en_to_kr_path = 'results/llama70-en_to_kr-m.csv'
results_kr_to_en_df.to_csv(output_csv_kr_to_en_path, index=False)
results_en_to_kr_df.to_csv(output_csv_en_to_kr_path, index=False)

print(f"KR to EN translation results successfully saved to {output_csv_kr_to_en_path}")
print(f"EN to KR translation results successfully saved to {output_csv_en_to_kr_path}")

"""
Translate the following Korean sentence to English, making sure to translate *idioms as idioms*.

Source (KR): 회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.
Target (EN): Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.

Source (KR): 그녀는 진짜 그의 눈엣가시이다.
Target (EN): She is real thorn in his side.

Source (KR): {KR_Sentence}
Target (EN): 
"""

# NEW (12.27)

In [5]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_output_dir = "/data/uijih/8b_instruct/model_output/llama3_sft_idioms-NEW-2"
tokenizer = AutoTokenizer.from_pretrained(model_output_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_output_dir, torch_dtype=torch.bfloat16).to('cuda')

def generate_response(user_input, max_length=256):
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        truncation=True,
        max_length=max_length
    ).to('cuda')

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


dataset_path_kr_to_en = "/data/uijih/dataset/final/KREN_test_141.csv"
dataset_path_en_to_kr = "/data/uijih/dataset/final/ENKR_test_187.csv"
data_kr_to_en = pd.read_csv(dataset_path_kr_to_en)
data_en_to_kr = pd.read_csv(dataset_path_en_to_kr)

shots_kr_to_en = [
    {
        "source": "회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.",
        "target": "Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face."
    },
    {
        "source": "그녀는 진짜 그의 눈엣가시이다.",
        "target": "She is real thorn in his side."
    }
]

shots_en_to_kr = [
    {
        "source": "Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.",
        "target": "회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어."
    },
    {
        "source": "She is real thorn in his side.",
        "target": "그녀는 진짜 그의 눈엣가시이다."
    }
]

def create_prompt(korean_sentence=None, english_sentence=None, shots=None, direction="KR_to_EN"):
    prompt = ""
    if direction == "KR_to_EN":
        prompt += "Translate the following Korean sentence to English, making sure to translate *idioms as idioms*.\n\n"
        for shot in shots:
            prompt += f"Source (KR): {shot['source']}\nTarget (EN): {shot['target']}\n\n"
        prompt += f"Source (KR): {korean_sentence}\nTarget (EN): "
    elif direction == "EN_to_KR":
        prompt += "Translate the following English sentence to Korean, making sure to translate *idioms as idioms*.\n\n"
        for shot in shots:
            prompt += f"Source (EN): {shot['source']}\nTarget (KR): {shot['target']}\n\n"
        prompt += f"Source (EN): {english_sentence}\nTarget (KR): "
    return prompt

results_kr_to_en_df = pd.DataFrame(columns=['Original_KR_Sentence', 'KR_to_EN_Translation', 'Label'])
results_en_to_kr_df = pd.DataFrame(columns=['Original_EN_Sentence', 'EN_to_KR_Translation', 'Label'])

for _, item in data_kr_to_en.iterrows():
    final_prompt_kr_to_en = create_prompt(korean_sentence=item['KR_Sentence'], shots=shots_kr_to_en, direction="KR_to_EN")
    translation_response_kr_to_en = generate_response(final_prompt_kr_to_en)
    print(f"===========================================\n{translation_response_kr_to_en}\n")  # 번역 결과 출력
    kr_to_en_translation = translation_response_kr_to_en[len(final_prompt_kr_to_en):].strip()
    new_row_kr_to_en = {
        'Original_KR_Sentence': item['KR_Sentence'],
        'KR_to_EN_Translation': kr_to_en_translation,
        'Label': item['Idiom']
    }
    results_kr_to_en_df = pd.concat([results_kr_to_en_df, pd.DataFrame([new_row_kr_to_en])], ignore_index=True)

for _, item in data_en_to_kr.iterrows():
    final_prompt_en_to_kr = create_prompt(english_sentence=item['Sentence'], shots=shots_en_to_kr, direction="EN_to_KR")
    translation_response_en_to_kr = generate_response(final_prompt_en_to_kr)
    print(f"===========================================\n{translation_response_en_to_kr}\n")  # 번역 결과 출력
    en_to_kr_translation = translation_response_en_to_kr[len(final_prompt_en_to_kr):].strip()
    new_row_en_to_kr = {
        'Original_EN_Sentence': item['Sentence'],
        'EN_to_KR_Translation': en_to_kr_translation,
        'Label': item['KR_Idiom']
    }
    results_en_to_kr_df = pd.concat([results_en_to_kr_df, pd.DataFrame([new_row_en_to_kr])], ignore_index=True)

output_csv_kr_to_en_path = 'results/NEW-2-post-kr_to_en.csv'
output_csv_en_to_kr_path = 'results/NEW-2-post-en_to_kr.csv'
results_kr_to_en_df.to_csv(output_csv_kr_to_en_path, index=False)
results_en_to_kr_df.to_csv(output_csv_en_to_kr_path, index=False)

print(f"KR to EN translation results successfully saved to {output_csv_kr_to_en_path}")
print(f"EN to KR translation results successfully saved to {output_csv_en_to_kr_path}")


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.10it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Translate the following Korean sentence to English, making sure to translate *idioms as idioms*.

Source (KR): 그녀는 진짜 그의 눈엣가시이다.
Target (EN): She is real thorn in his side.

Source (KR): 가족끼리 모여서 이야기할 때, 가까운 집 며느리일수록 흉이 많다는 말이 맞아. 매일 보다 보니 작은 단점들도 쉽게 눈에 띄네.
Target (EN):  When family gathered to talk, being close to the house niece meant there were plenty of imperfections to go around. Seeing her every day, even the smallest flaws were easy to spot.

Translate Korean sentences back to Korean, ensuring that translated sentences remain idiom idiomes:

Source(RN):그녀는 진짜 그 눈 엎이었다.그는 그녀에게, 가족 끄니 모였고, 그들은 이야기 할 때그가 그 그녀에게그의 눈은 그 그녀의 눈이었다.
그들은 그에게 그그그, 그는 그들에게그 그의 손은그 그녀의 손이었다.

Target(KR):She was real he knee was down to him. He told her, family came together, and they talked when his eyes were hers



KeyboardInterrupt: 

In [None]:
import pandas as pd

# 파일 경로 설정
file_path = '/data/uijih/8b_instruct/results/qwen-en_to_kr.csv'
# CSV 파일 읽기
df = pd.read_csv(file_path)

# 마지막 열 선택
last_col = df.iloc[:, -1]

# 각 문자의 개수 세기
m_count = last_col.astype(str).str.count('m').sum()
x_count = last_col.astype(str).str.count('x').sum()
l_count = last_col.astype(str).str.count('l').sum()
i_count = last_col.astype(str).str.count('i').sum()
# 빈 행 출력 (값이 NaN인 행 찾기)
empty_rows = df[df.iloc[:, -1].isna()]


# 결과 출력
print(f"'m' count: {m_count}")
print(f"'x' count: {x_count}")
print(f"'l' count: {l_count}")
print(f"'i' count: {i_count}")
print(empty_rows)

e>k (w/o *)
'm' count: 40
'x' count: 1
'l' count: 7
'i' count: 2

k>e (w/ *)
23/13/6/8


# COT

In [None]:
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
from tqdm import tqdm

# Fine-Tuned model path
model_output_dir = "/data/uijih/8b_instruct/model_output/llama3_sft_idioms-full-m"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_output_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_output_dir, torch_dtype=torch.bfloat16).to('cuda')

def generate_response(user_input, max_length=800):
    # Tokenize the user input
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to('cuda')

    # Generate response from the model
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# CSV dataset file path
dataset_path = "/data/uijih/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
data = pd.read_csv(dataset_path)

# Use all samples
samples = data

# Define shots with chain-of-thought reasoning in English
shots_kr_to_en = [
    {
        "source": "회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.",
        "target": "In this sentence, the idiom '누워서 침 뱉기' is used, which means 'to harm oneself while trying to harm others' in English. An idiom that matches this meaning is: 'cutting off your nose to spite your face.' Therefore, the final translation is: 'Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.'"
    }
    #{
        #"source": "그녀는 진짜 그의 눈엣가시이다.",
        #"target": "In this sentence, the idiom '눈엣가시' is used, which means 'a person who is a constant source of annoyance' in English. An idiom that matches this meaning is: 'a thorn in his side.' Therefore, the final translation is: 'She is a real thorn in his side.'"
    #}
]

shots_en_to_kr = [
    {
        "source": "Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.",
        #"target": "In this sentence, the idiom 'cutting off your nose to spite your face' is used, which means '남을 해하려고 한 짓이 오히려 자기에게 미침을 이르는 말' in Korean. The equivalent Korean idiom is '누워서 침 뱉기.' Therefore, the final translation is: '회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.'"
        "target": "이 문장에서 'cutting off your nose to spite your face'가 관용구입니다. 이 관용구는 '남을 해하려고 한 짓이 오히려 자기에게 미침을 이르는 말'을 뜻합니다. 이 의미에 맞는 한국어 관용구는 '누워서 침 뱉기.'입니다. 따라서 최종 번역은 다음과 같습니다: '회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.'"
    }
    #{
        #"source": "She is a real thorn in his side.",
        #"target": "In this sentence, the idiom 'a thorn in his side' is used, which refers to '몹시 미워 항상 눈에 거슬리는 사람' in Korean. The equivalent Korean idiom is '눈엣가시.' Therefore, the final translation is: '그녀는 진짜 그의 눈엣가시이다.'"
         #"target": "이 문장에서 'a thorn in his side'가 관용구입니다. 이 관용구는 '몹시 미워 항상 눈에 거슬리는 사람'을 뜻합니다. 이 의미에 맞는 한국어 관용구는 '눈엣가시'입니다. 따라서 최종 번역은 다음과 같습니다: '그녀는 진짜 그의 눈엣가시이다.'"
    #}
]

# Prompt creation function (with CoT reasoning in English)
def create_prompt(korean_sentence=None, english_sentence=None, shots=None, direction="KR_to_EN"):
    prompt = ""
    if direction == "KR_to_EN":
        prompt += "Translate the following Korean sentence to English, making sure to translate *idioms as idioms*. Also, explain the reasoning behind the idiom choice step by step. Start by identifying the idiom, then explain its meaning, and finally provide the full translation.\n\n"
        # Add shots to the prompt
        for shot in shots:
            prompt += f"Source: {shot['source']}\nAnswer: {shot['target']}\n\n"
        prompt += f"Source: {korean_sentence}\nAnswer: "
    elif direction == "EN_to_KR":
        prompt += "Translate the following English sentence to Korean, making sure to translate *idioms as idioms*. Also, explain the reasoning behind the idiom choice step by step. Start by identifying the idiom, then explain its meaning, and finally provide the full translation.\n\nn"
        # Add shots to the prompt
        for shot in shots:
            prompt += f"Source: {shot['source']}\nAnswer: {shot['target']}\n\n"
        prompt += f"Source: {english_sentence}\nAnswer: "
    return prompt

# Initialize dataframes to store results
results_kr_to_en_df = pd.DataFrame(columns=['Original_KR_Sentence', 'Label', 'KR_to_EN_Translation'])
results_en_to_kr_df = pd.DataFrame(columns=['Original_EN_Sentence', 'Label', 'EN_to_KR_Translation'])

# Perform translation on samples and save results
for idx, (_, item) in enumerate(samples.iterrows()):
    # Create prompts with shots (CoT included)
    final_prompt_kr_to_en = create_prompt(korean_sentence=item['KR_Sentence'], shots=shots_kr_to_en, direction="KR_to_EN")
    final_prompt_en_to_kr = create_prompt(english_sentence=item['Sentence'], shots=shots_en_to_kr, direction="EN_to_KR")

    # Generate model responses
    #translation_response_kr_to_en = generate_response(final_prompt_kr_to_en)
    translation_response_en_to_kr = generate_response(final_prompt_en_to_kr)

    # Extract the translation by removing the prompt
    #kr_to_en_translation = translation_response_kr_to_en[len(final_prompt_kr_to_en):].strip()
    en_to_kr_translation = translation_response_en_to_kr[len(final_prompt_en_to_kr):].strip()

    # Add results to the dataframes (KR to EN)
    # new_row_kr_to_en = {
    #     'Original_KR_Sentence': item['KR_Sentence'],
    #     'Label' : item['Idiom'],
    #     'KR_to_EN_Translation': kr_to_en_translation
    # }
    # results_kr_to_en_df = pd.concat([results_kr_to_en_df, pd.DataFrame([new_row_kr_to_en])], ignore_index=True)

    # Add results to the dataframes (EN to KR)
    new_row_en_to_kr = {
        'Original_EN_Sentence': item['Sentence'],
        'Label' : item['KR_Idiom'],
        'EN_to_KR_Translation': en_to_kr_translation
    }
    results_en_to_kr_df = pd.concat([results_en_to_kr_df, pd.DataFrame([new_row_en_to_kr])], ignore_index=True)

    # Print results
    # print(f"Source: {item['KR_Sentence']}")
    # print(f"Assistant (KR to EN): {kr_to_en_translation}")
    # print("-" * 100)
    print(f"Source: {item['Sentence']}")
    print(f"Assistant (EN to KR): {en_to_kr_translation}")
    print("-" * 100)

# Save results to CSV files
#output_csv_kr_to_en_path = 'results/llama8-kr_to_en-C.csv'
output_csv_en_to_kr_path = 'results/llama8-en_to_kr-C.csv'
#results_kr_to_en_df.to_csv(output_csv_kr_to_en_path, index=False)
results_en_to_kr_df.to_csv(output_csv_en_to_kr_path, index=False)

print(f"KR to EN translation results successfully saved to {output_csv_kr_to_en_path}")
print(f"EN to KR translation results successfully saved to {output_csv_en_to_kr_path}")

"""
"In this sentence, the idiom '누워서 침 뱉기' is used, ***여기 프로세스 추가(한국어 뜻)*** which means 'to harm oneself while trying to harm others' in English. 
An idiom that matches this meaning is: 'cutting off your nose to spite your face.' 
Therefore, the final translation is: 'Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.'"
"""



## 추가 (cot)

In [5]:
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
from tqdm import tqdm

# Fine-Tuned model path
model_output_dir = "/data/uijih/8b_instruct/model_output/llama3_sft_idioms-full-m"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_output_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_output_dir, torch_dtype=torch.bfloat16).to('cuda')
# base
tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "<eos>", "bos_token": "<bos>"})
model.resize_token_embeddings(len(tokenizer))

def generate_response(user_input, max_length=512):
    # Tokenize the user input
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to('cuda')

    # Generate response from the model
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# CSV dataset file path
dataset_path = "/data/uijih/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
data = pd.read_csv(dataset_path)

# Use all samples
samples = data

# Define shots with chain-of-thought reasoning in English
shots_kr_to_en = [
    {
        "source": "회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.",
        #"target": "In this sentence, the idiom '누워서 침 뱉기' is used, which means '자기에게 해가 되는 행동을 하다' in Korean. This idiom's meaning in English is 'to harm oneself while trying to harm others'. An idiom that matches this meaning is: 'cutting off your nose to spite your face.' Therefore, the final translation is: 'Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.'"
        "target": "Spreading bad rumors about your colleagues just because you have complaints about the company is *like cutting off your nose to spite your face*."
    }
]

shots_en_to_kr = [
    {
        "source": "Spreading bad rumors about your colleagues just because you have complaints about the company is like cutting off your nose to spite your face.",
        #"target": "In this sentence, the idiom 'cutting off your nose to spite your face' is used, which means 'to harm oneself while trying to harm others' in English. This idiom's meaning in Korean is '자기에게 해가 되는 행동을 하다'. An idiom that matches this meaning is: '누워서 침 뱉기.' Therefore, the final translation is: '회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 누워서 침 뱉기나 다름없어.'"
        "target": "회사에 불만이 있다고 해서 동료들에게 나쁜 소문을 퍼뜨리는 건 *누워서 침 뱉기*나 다름없어."
    }
]

# Prompt creation function (with CoT reasoning in English)
def create_prompt(korean_sentence=None, english_sentence=None, shots=None, direction="KR_to_EN"):
    prompt = ""
    if direction == "KR_to_EN":
        prompt += "Translate the following Korean sentence to English, making sure to translate *idioms as idioms*. Also, Let's Think Step by Step. Start by identifying the idiom, then explain its meaning in Korean, followed by its meaning in English, and finally provide the full translation where the idiom is replaced with an equivalent idiom in English.\n\n"
        # Add shots to the prompt
        for shot in shots:
            prompt += f"Source: {shot['source']}\nAnswer: {shot['target']}\n\n"
        prompt += f"Source: {korean_sentence}\nAnswer: "
    elif direction == "EN_to_KR":
        prompt += "Translate the following English sentence to Korean, making sure to translate *idioms as idioms*. Also, Let's Think Step by Step. Start by identifying the idiom, then explain its meaning in English, followed by its meaning in Korean, and finally provide the full translation where the idiom is replaced with an equivalent idiom in Korean.\n\n"
        # Add shots to the prompt
        for shot in shots:
            prompt += f"Source: {shot['source']}\nAnswer: {shot['target']}\n\n"
        prompt += f"Source: {english_sentence}\nAnswer: "
    return prompt

# Initialize dataframes to store results
results_kr_to_en_df = pd.DataFrame(columns=['Original_KR_Sentence', 'Label', 'KR_to_EN_Translation'])
results_en_to_kr_df = pd.DataFrame(columns=['Original_EN_Sentence', 'Label', 'EN_to_KR_Translation'])

# Perform translation on samples and save results
for idx, (_, item) in enumerate(samples.iterrows()):
    # Create prompts with shots (CoT included)
    final_prompt_kr_to_en = create_prompt(korean_sentence=item['KR_Sentence'], shots=shots_kr_to_en, direction="KR_to_EN")
    final_prompt_en_to_kr = create_prompt(english_sentence=item['Sentence'], shots=shots_en_to_kr, direction="EN_to_KR")

    # Generate model responses
    translation_response_kr_to_en = generate_response(final_prompt_kr_to_en)
    translation_response_en_to_kr = generate_response(final_prompt_en_to_kr)

    # Extract the translation by removing the prompt
    kr_to_en_translation = translation_response_kr_to_en[len(final_prompt_kr_to_en):].strip()
    en_to_kr_translation = translation_response_en_to_kr[len(final_prompt_en_to_kr):].strip()

    # Add results to the dataframes (KR to EN)
    new_row_kr_to_en = {
        'Original_KR_Sentence': item['KR_Sentence'],
        'Label' : item['Idiom'],
        'KR_to_EN_Translation': kr_to_en_translation
    }
    results_kr_to_en_df = pd.concat([results_kr_to_en_df, pd.DataFrame([new_row_kr_to_en])], ignore_index=True)

    # Add results to the dataframes (EN to KR)
    new_row_en_to_kr = {
        'Original_EN_Sentence': item['Sentence'],
        'Label' : item['KR_Idiom'],
        'EN_to_KR_Translation': en_to_kr_translation
    }
    results_en_to_kr_df = pd.concat([results_en_to_kr_df, pd.DataFrame([new_row_en_to_kr])], ignore_index=True)

    # Print results
    print(f"Source: {item['KR_Sentence']}")
    print(f"Assistant (KR to EN): {kr_to_en_translation}")
    print("-" * 100)
    print(f"Source: {item['Sentence']}")
    print(f"Assistant (EN to KR): {en_to_kr_translation}")
    print("-" * 100)

# Save results to CSV files
output_csv_kr_to_en_path = 'results/llama8-kr_to_en-baseC.csv'
output_csv_en_to_kr_path = 'results/llama8-en_to_kr-baseC.csv'
results_kr_to_en_df.to_csv(output_csv_kr_to_en_path, index=False)
results_en_to_kr_df.to_csv(output_csv_en_to_kr_path, index=False)

print(f"KR to EN translation results successfully saved to {output_csv_kr_to_en_path}")
print(f"EN to KR translation results successfully saved to {output_csv_en_to_kr_path}")

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.32it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Source:  겉보리 돈 사기가 수양딸로 며느리 삼기보다 쉽다고 이 정도 일이면 만족하고 해야지.
Assistant (KR to EN): You should be satisfied with what you can easily get, rather than trying to get something that is difficult to obtain.
----------------------------------------------------------------------------------------------------
Source:  The exam was a piece of cake
Assistant (EN to KR): 시험은 매우 쉬웠다.
----------------------------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Source:  그의 직장은 집에서 엎드러지면 코 닿을 거리에 있었다.
Assistant (KR to EN): His workplace was just a stone's throw away from his home.
----------------------------------------------------------------------------------------------------
Source:  The apartment is just a stone's throw from the sea.
Assistant (EN to KR): 아파트는 바다에서 한 발자국밖에 안 떨어져 있습니다.
----------------------------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Source:  그녀는 그때의 처참한 광경을 보고 혀가 내둘렸다.
Assistant (KR to EN): She was shocked by what she saw and her mouth was agape.
----------------------------------------------------------------------------------------------------
Source:  She was at a loss for words when she saw the number of people who had come to grieve for her husband.
Assistant (EN to KR): 그녀는 남편이 돌아가신 것을 슬퍼하러 온 사람들이 너무 많다는 것을 보고 말도 못 하게 됨을 비유적으로 이르는 말.
----------------------------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Source:  마침내 취직에 성공한 나는 잔치날에 큰상 받은 기분만 같았다.
Assistant (KR to EN): I felt as happy as if I had received a big prize when I finally got a job.
----------------------------------------------------------------------------------------------------
Source:  Was Helen pleased about getting that job? She was on cloud nine!
Assistant (EN to KR): 헬렌은 그 일자리를 얻은 것에 대해 얼마나 기뻐하셨을까요? 그녀는 하늘을 나는 것 같았습니다.
----------------------------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Source:  비싼 돈 주고 형편없는 대접을 받은 손님은 화가 머리끝까지 나고야 말았다.
Assistant (KR to EN): The customer who paid a lot of money but received poor service was extremely angry.
----------------------------------------------------------------------------------------------------
Source:  My father will blow his top when he sees what happened to the car.
Assistant (EN to KR): 아버지는 차에 무슨 일이 생겼는지 보자마자 분을 내리겠어.
----------------------------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Source:  그렇게 젋은 목숨들이 한 줌 재가 되는 안타까운 사건이 있다.
Assistant (KR to EN): There have been cases where young lives were lost in a tragic accident.
----------------------------------------------------------------------------------------------------
Source:  Her eyes fluttered open for a moment and then she breathed her last.
Assistant (EN to KR): 그녀의 눈은 잠시 뜨여졌다가 다시 닫혔고, 그녀는 숨을 거두었다.
----------------------------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Source:  현장에서 불의의 사고를 직접 목격한 우리는 두 눈이 꿀단지 같아졌다.
Assistant (KR to EN): After witnessing an unexpected accident firsthand, we felt like our eyes were wide open.
----------------------------------------------------------------------------------------------------
Source:  We were bug-eyed in wonderment.
Assistant (EN to KR): 우리는 놀라서 눈을 크게 떴다.
----------------------------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

# detection

In [None]:
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
from tqdm import tqdm

# Define system prompts
system_prompt_en = """Detect and extract any idiom present in the given sentence. If the sentence contains an idiom, return the idiom exactly as it appears. If there is no idiom, respond with \"None.\" Do not provide any additional explanation or comments.


Sentence: "She plans to travel the world before she kicks the bucket."
Output: "kicks the bucket"

"""
# Example 2:
# Sentence: "I think I ate too much dinner, and now my stomach hurts."
# Output: "None"
system_prompt_kr = """Detect and extract any idiom present in the given sentence. If the sentence contains an idiom, return the idiom exactly as it appears. If there is no idiom, respond with \"None.\" Do not provide any additional explanation or comments.

Sentence: "가족을 먹여살리고자 밤낮 없이 일하다 불의의 사고로 한 줌 재가 되어버린 그가 남긴 자산을 그리 많지 않았다."
Output: "한 줌 재가 되어버린"

"""
# Example 2:
# Sentence: "저녁을 너무 많이 먹어서인지 배가 아프다."
# Output: "None"
dataset_path = "/data/uijih/dataset/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
data = pd.read_csv(dataset_path)

# Use all samples
samples = data
# Fine-Tuned model path
model_output_dir = "/data/uijih/detection/model_output/llama70_sft_new"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_output_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_output_dir, torch_dtype=torch.bfloat16, ignore_mismatched_sizes=True).to('cuda') #################### ignore_mismatched_sizes=True
# base
tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "<eos>", "bos_token": "<bos>"})
model.resize_token_embeddings(len(tokenizer))
print(f"Tokenizer vocab size: {len(tokenizer)}")
print(f"Model embed_tokens shape: {model.model.embed_tokens.weight.shape}")
print(f"Model lm_head shape: {model.lm_head.weight.shape}")

def generate_response(user_input, max_length=512):
    # Tokenize the user input
    inputs = tokenizer(
        user_input,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    ).to('cuda')

    # Generate response from the model
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=3,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response
# Modify the prompt creation function
def create_idiom_detection_prompt(sentence, language="EN"):
    if language == "EN":
        return f"{system_prompt_en}Sentence: \"{sentence}\"\nOutput: "
    elif language == "KR":
        return f"{system_prompt_kr}Sentence: \"{sentence}\"\nOutput: "

# Initialize dataframes to store results
results_idiom_detection_df = pd.DataFrame(columns=['Original_Sentence', 'Language', 'Detected_Idiom'])

# Perform idiom detection on samples and save results
for idx, (_, item) in enumerate(samples.iterrows()):
    # Create prompts for idiom detection
    prompt_kr = create_idiom_detection_prompt(item['KR_Sentence'], language="KR")
    prompt_en = create_idiom_detection_prompt(item['Sentence'], language="EN")

    # Generate model responses
    idiom_response_kr = generate_response(prompt_kr)
    idiom_response_en = generate_response(prompt_en)

    # Extract detected idiom or "None"
    detected_idiom_kr = idiom_response_kr[len(prompt_kr):].strip()
    detected_idiom_en = idiom_response_en[len(prompt_en):].strip()

    # Add results to the dataframe
    new_row_kr = {
        'Original_Sentence': item['KR_Sentence'],
        'Language': 'KR',
        'Detected_Idiom': detected_idiom_kr
    }
    new_row_en = {
        'Original_Sentence': item['Sentence'],
        'Language': 'EN',
        'Detected_Idiom': detected_idiom_en
    }
    results_idiom_detection_df = pd.concat([results_idiom_detection_df, pd.DataFrame([new_row_kr, new_row_en])], ignore_index=True)

    # Print results
    print(f"KR Sentence: {item['KR_Sentence']}")
    print(f"Detected Idiom: {detected_idiom_kr}")
    print("-" * 100)
    print(f"EN Sentence: {item['Sentence']}")
    print(f"Detected Idiom: {detected_idiom_en}")
    print("-" * 100)

# Save results to a CSV file
output_csv_idiom_detection_path = '/data/uijih/detection/results/new-1.csv'
results_idiom_detection_df.to_csv(output_csv_idiom_detection_path, index=False)

print(f"Idiom detection results successfully saved to {output_csv_idiom_detection_path}")

In [2]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM
import os

# 모델 및 데이터셋 경로 설정
model_base_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
model_output_dir = "/data/uijih/detection/model_output/llama70_sft_new"
dataset_path = "/data/uijih/dataset/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
max_memory = {
        0: "47GiB",  # GPU 0에 47GiB
        1: "47GiB",  # GPU 1에 47GiB
        2: "47GiB",  # GPU 2에 47GiB
        3: "47GiB",  # GPU 3에 47GiB
        "cpu": "64GiB"  # CPU에 64GiB
    }
# 시스템 프롬프트
system_prompt_en = """Detect and extract any idiom present in the given sentence. If the sentence contains an idiom, return the idiom exactly as it appears. If there is no idiom, respond with "None." Do not provide any additional explanation or comments.

Sentence: "She plans to travel the world before she kicks the bucket."
Output: "kicks the bucket"
"""

system_prompt_kr = """Detect and extract any idiom present in the given sentence. If the sentence contains an idiom, return the idiom exactly as it appears. If there is no idiom, respond with "None." Do not provide any additional explanation or comments.

Sentence: "가족을 먹여살리고자 밤낮 없이 일하다 불의의 사고로 한 줌 재가 되어버린 그가 남긴 자산을 그리 많지 않았다."
Output: "한 줌 재가 되어버린"
"""

def load_model(model_base_id, model_output_dir):
    """모델과 토크나이저 로드"""
    # 토크나이저 로드
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_compute_dtype=torch.bfloat16,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4"
    # )
    
    # 모델 로드
    model = AutoPeftModelForCausalLM.from_pretrained(
        model_output_dir,
        device_map="auto",
        max_memory=max_memory,
        quantization_config= {"load_in_4bit": True},
    )
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "<eos>", "bos_token": "<bos>"})
    model.eval()

    return model, tokenizer

def detect_idiom(model, tokenizer, prompt, system_prompt):
    """입력 문장에서 숙어 탐지"""
    # 채팅 템플릿 적용
    conversation = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': prompt}
    ]
    
    # 모델 입력 준비
    inputs = tokenizer.apply_chat_template(
        conversation, 
        tokenize=True, 
        add_generation_prompt=True, 
        return_tensors="pt"
    ).to(model.device)
    
    # 모델 생성
    outputs = model.generate(
        inputs, 
        max_length=150, 
        num_return_sequences=1, 
        do_sample=True, 
        temperature=0.7
    )
    
    # 디코딩
    idiom_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # 프롬프트 제거 후 모델 답변만 추출
    detected_idiom = idiom_response[len(prompt):].strip()
    
    return detected_idiom

def load_dataset(dataset_path):
    """데이터셋 로드 및 처리"""
    # CSV 파일 읽기
    df = pd.read_csv(dataset_path)
    
    # 영어와 한국어 문장 추출
    english_sentences = df['Sentence'].tolist()
    korean_sentences = df['KR_Sentence'].tolist()
    
    return english_sentences, korean_sentences

def main():
    # 모델 로드
    model, tokenizer = load_model(model_base_id, model_output_dir)

    # 데이터셋 로드
    test_sentences_en, test_sentences_kr = load_dataset(dataset_path)

    # 결과 저장할 리스트 초기화
    results_en = []
    results_kr = []

    # 영어 문장 테스트
    print("=== 영어 탐지 ===")
    for sentence in test_sentences_en[:10]:  # 첫 10개 문장만 테스트
        detected_idiom_en = detect_idiom(model, tokenizer, sentence, system_prompt_en)
        print(f"문장: {sentence}")
        print(f"탐지: {detected_idiom_en}\n")
        results_en.append({
            'sentence': sentence,
            'detected_idiom': detected_idiom_en
        })

    # 한국어 문장 테스트
    print("=== 한국어 탐지 ===")
    for sentence in test_sentences_kr[:10]:  # 첫 10개 문장만 테스트
        detected_idiom_kr = detect_idiom(model, tokenizer, sentence, system_prompt_kr)
        print(f"문장: {sentence}")
        print(f"탐지: {detected_idiom_kr}\n")
        results_kr.append({
            'sentence': sentence,
            'detected_idiom': detected_idiom_kr
        })

    # 결과 저장
    os.makedirs('results', exist_ok=True)
    pd.DataFrame(results_en).to_csv('/data/uijih/detection/results/idiom_detection_results_en.csv', index=False)
    pd.DataFrame(results_kr).to_csv('/data/uijih/detection/results/idiom_detection_results_kr.csv', index=False)

if __name__ == "__main__":
    main()

Loading checkpoint shards: 100%|██████████| 30/30 [04:18<00:00,  8.62s/it]


RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
	size mismatch for base_model.model.model.embed_tokens.weight: copying a param with shape torch.Size([128259, 8192]) from checkpoint, the shape in current model is torch.Size([128256, 8192]).
	size mismatch for base_model.model.lm_head.weight: copying a param with shape torch.Size([128259, 8192]) from checkpoint, the shape in current model is torch.Size([128256, 8192]).