In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and its tokenizer from a local directory.
model_path = "./saveded_instruct-full-1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True, device_map="auto")
model.eval()

# Llama 3 style chat template. The assistant header must be closed with
# "<|end_header_id|>" (the closing "|" was missing before, so the header was
# not a single special token) and is followed by a blank line, exactly like
# the system and user headers above it.
# NOTE(review): if the checkpoint was fine-tuned on prompts containing the
# malformed header, keep training and inference templates in sync.
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# System prompts
system_prompt_idiom_en = """
You are an expert with deep knowledge of idioms. Your role is to provide the user with an accurate and detailed explanation of the idiom.
"""
system_prompt_guess_idiom_en = """
You are an expert with deep knowledge of idioms. Your role is to provide the user with the correct idiom.
"""


data_path = "/data/uijih/previous/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
data = pd.read_csv(data_path)

# Extract relevant columns
idioms_en = data['Idiom'].tolist()        # English idioms
meanings_en = data['Meaning'].tolist()    # English meanings
idioms_kr = data['KR_Idiom'].tolist()     # Korean idioms
meanings_kr = data['KR_Meaning'].tolist() # Korean meanings


def generate_response(system_prompt, user_input, max_new_tokens=150):
    """Run one chat turn through the model and return only the generated reply.

    Args:
        system_prompt: Text placed in the system section of the template.
        user_input: Text placed in the user section of the template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    input_prompt = prompt_template.format(system_prompt=system_prompt, user_input=user_input)
    # The template already starts with <|begin_of_text|>, so the tokenizer is
    # told not to add special tokens a second time. Inputs follow the model's
    # own device instead of a hard-coded "cuda" because of device_map="auto".
    inputs = tokenizer(input_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; slicing off the prompt tokens
    # is independent of any marker text surviving the decode step.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


# Initialize result lists
result_idiom_to_meaning = []
result_meaning_to_idiom = []

# Inference for idiom to meaning and vice versa
for idiom_en, meaning_en, idiom_kr, meaning_kr in zip(idioms_en, meanings_en, idioms_kr, meanings_kr):
    # 1. Idiom to Meaning (English)
    result_idiom_to_meaning.append({
        "Idiom": idiom_en,
        "Generated Meaning": generate_response(
            system_prompt_idiom_en, f'What does the idiom "{idiom_en}" mean?'
        ),
        "Label": meaning_en
    })

    # 2. Meaning to Idiom (English)
    result_meaning_to_idiom.append({
        "Meaning": meaning_en,
        "Generated Idiom": generate_response(
            system_prompt_guess_idiom_en, f'What is the idiom that means "{meaning_en}"?'
        ),
        "Label": idiom_en
    })

    # 3. Idiom to Meaning (Korean)
    # Previously this step reused the prompt of step 2 unchanged, so the
    # Korean idiom was never shown to the model.
    # NOTE(review): only English system prompts are defined in this cell, so
    # they are reused here with the Korean text; swap in Korean prompts if the
    # model was trained with them.
    result_idiom_to_meaning.append({
        "KR_Idiom": idiom_kr,
        "Generated KR Meaning": generate_response(
            system_prompt_idiom_en, f'What does the idiom "{idiom_kr}" mean?'
        ),
        "Label": meaning_kr
    })

    # 4. Meaning to Idiom (Korean)
    result_meaning_to_idiom.append({
        "KR_Meaning": meaning_kr,
        "Generated KR Idiom": generate_response(
            system_prompt_guess_idiom_en, f'What is the idiom that means "{meaning_kr}"?'
        ),
        "Label": idiom_kr
    })

# Save results to CSV. English and Korean rows use different column names, so
# each frame holds both sets of columns, with empty cells where a row does not
# have that key.
df_idiom_to_meaning = pd.DataFrame(result_idiom_to_meaning)
df_meaning_to_idiom = pd.DataFrame(result_meaning_to_idiom)

df_idiom_to_meaning.to_csv("idiom_to_meaning_results.csv", index=False, encoding="utf-8-sig")
df_meaning_to_idiom.to_csv("meaning_to_idiom_results.csv", index=False, encoding="utf-8-sig")

print("Inference completed and results saved successfully.")


# Plain translation (no idiom-specific instruction)

In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and its tokenizer.
model_path = "./saveded_instruct-full-1"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True, device_map="auto")
model.eval()

# Load the data.
data_path = "/data/uijih/previous/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
data = pd.read_csv(data_path)
# Drop rows without a source sentence; a missing value would otherwise be
# formatted into the prompt as the literal text "nan".
df = data.dropna(subset=['Sentence', 'KR_Sentence'])
sampled_df = df

# Keep the needed columns as lists.
en_sentences = sampled_df['Sentence'].tolist()  # English example sentences
kr_sentences = sampled_df['KR_Sentence'].tolist() # Korean example sentences
kr_idioms = sampled_df['KR_Idiom'].tolist()     # Korean idioms
kr_meanings = sampled_df['KR_Meaning'].tolist() # Meanings of the Korean idioms
en_idioms = sampled_df['Idiom'].tolist()        # English idioms

# Llama 3 style chat template. The assistant header must be closed with
# "<|end_header_id|>" (the closing "|" was missing before) and is followed by
# a blank line, like the system and user headers.
# NOTE(review): if the checkpoint was fine-tuned on prompts containing the
# malformed header, keep training and inference templates in sync.
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# System prompts.
system_prompt_translate_kr_to_en = "You are a professional translator proficient in Korean and English. Please translate the given Korean sentence into English accurately."
system_prompt_translate_en_to_kr = "You are a professional translator proficient in Korean and English. Please translate the given English sentence into Korean accurately."


def generate_response(system_prompt, user_input, max_new_tokens=150):
    """Run one chat turn through the model and return only the generated reply.

    Args:
        system_prompt: Text placed in the system section of the template.
        user_input: Text placed in the user section of the template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    input_prompt = prompt_template.format(system_prompt=system_prompt, user_input=user_input)
    # The template already starts with <|begin_of_text|>, so the tokenizer is
    # told not to add special tokens a second time. Inputs follow the model's
    # own device instead of a hard-coded "cuda" because of device_map="auto".
    inputs = tokenizer(input_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


# Result accumulators.
result_kr_to_en = []
result_en_to_kr = []

# Run inference.
for en_sentence, kr_sentence, kr_idiom, en_idiom in zip(en_sentences, kr_sentences, kr_idioms, en_idioms):
    # 1. Translate the English sentence into Korean.
    assistant_response = generate_response(
        system_prompt_translate_en_to_kr,
        f'Translate the sentence "{en_sentence}" into Korean.',
    )

    result_en_to_kr.append({
        "Source EN Sentence": en_sentence,
        "Generated KR Translation": assistant_response,
        "label": kr_idiom
    })
    # Progress output.
    print(f"EN Sentence: {en_sentence}")
    print(f"KR Translation: {assistant_response}")
    print(f"label: {kr_idiom}")
    print("-" * 50)

    # 2. Translate the Korean sentence into English.
    assistant_response = generate_response(
        system_prompt_translate_kr_to_en,
        f'Translate the sentence "{kr_sentence}" into English.',
    )

    result_kr_to_en.append({
        "Source KR Sentence": kr_sentence,
        "Generated EN Translation": assistant_response,
        "label": en_idiom
    })
    # Progress output.
    print(f"KR Sentence: {kr_sentence}")
    print(f"EN Translation: {assistant_response}")
    print(f"label: {en_idiom}")
    print("-" * 50)

# 3. Save the results as CSV files.
df_kr_to_en = pd.DataFrame(result_kr_to_en)
df_en_to_kr = pd.DataFrame(result_en_to_kr)

df_kr_to_en.to_csv("kr_sentence_to_en_translation.csv", index=False, encoding="utf-8-sig")
df_en_to_kr.to_csv("en_sentence_to_kr_translation.csv", index=False, encoding="utf-8-sig")

print("CSV files have been saved successfully.")

# Translation with an idiom-aware instruction added to the system prompt

In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE: this cell reuses `model` and `tokenizer` created by an earlier cell.

# Load the data.
data_path = "/data/uijih/previous/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
data = pd.read_csv(data_path)
# Drop rows without a source sentence; a missing value would otherwise be
# formatted into the prompt as the literal text "nan".
df = data.dropna(subset=['Sentence', 'KR_Sentence'])

sampled_df = df

# Keep the needed columns as lists (the duplicated second copy of this
# section and of the template was removed).
en_sentences = sampled_df['Sentence'].tolist()  # English example sentences
kr_sentences = sampled_df['KR_Sentence'].tolist() # Korean example sentences
kr_idioms = sampled_df['KR_Idiom'].tolist()     # Korean idioms
kr_meanings = sampled_df['KR_Meaning'].tolist() # Meanings of the Korean idioms
en_idioms = sampled_df['Idiom'].tolist()        # English idioms

# Llama 3 style chat template. The assistant header must be closed with
# "<|end_header_id|>" (the closing "|" was missing before) and is followed by
# a blank line, like the system and user headers.
# NOTE(review): if the checkpoint was fine-tuned on prompts containing the
# malformed header, keep training and inference templates in sync.
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# System prompts.
system_prompt_translate_kr_to_en = """
You are a professional idiom translator proficient in Korean Idioms and English Idioms. Please translate the given Korean sentence into English accurately, ensuring that any Korean idiom is translated into its correct English idiom equivalent. Non-idiomatic parts of the sentence should be translated naturally and fluently.
"""
system_prompt_translate_en_to_kr = """
You are a professional idiom translator proficient in English Idioms and Korean Idioms. Please translate the given English sentence into Korean accurately, ensuring that any English idiom is translated into its correct Korean idiom equivalent. Non-idiomatic parts of the sentence should be translated naturally and fluently.
"""


def generate_response(system_prompt, user_input, max_new_tokens=150):
    """Run one chat turn through the model and return only the generated reply.

    Args:
        system_prompt: Text placed in the system section of the template.
        user_input: Text placed in the user section of the template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    input_prompt = prompt_template.format(system_prompt=system_prompt, user_input=user_input)
    # The template already starts with <|begin_of_text|>, so the tokenizer is
    # told not to add special tokens a second time. Inputs follow the model's
    # own device instead of a hard-coded "cuda".
    inputs = tokenizer(input_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


# Result accumulators.
result_kr_to_en = []
result_en_to_kr = []

# Run inference.
for en_sentence, kr_sentence, kr_idiom, en_idiom in zip(en_sentences, kr_sentences, kr_idioms, en_idioms):
    # 1. Translate the English sentence into Korean.
    assistant_response = generate_response(
        system_prompt_translate_en_to_kr,
        f'Translate the sentence "{en_sentence}" into Korean.',
    )

    result_en_to_kr.append({
        "Source EN Sentence": en_sentence,
        "Generated KR Translation": assistant_response,
        "label": kr_idiom
    })
    # Progress output.
    print(f"EN Sentence: {en_sentence}")
    print(f"KR Translation: {assistant_response}")
    print(f"label: {kr_idiom}")
    print("-" * 50)

    # 2. Translate the Korean sentence into English.
    assistant_response = generate_response(
        system_prompt_translate_kr_to_en,
        f'Translate the sentence "{kr_sentence}" into English.',
    )

    result_kr_to_en.append({
        "Source KR Sentence": kr_sentence,
        "Generated EN Translation": assistant_response,
        "label": en_idiom
    })
    # Progress output.
    print(f"KR Sentence: {kr_sentence}")
    print(f"EN Translation: {assistant_response}")
    print(f"label: {en_idiom}")
    print("-" * 50)

# 3. Save the results as CSV files.
df_kr_to_en = pd.DataFrame(result_kr_to_en)
df_en_to_kr = pd.DataFrame(result_en_to_kr)

df_kr_to_en.to_csv("kr_sentence_to_en_translation_i.csv", index=False, encoding="utf-8-sig")
df_en_to_kr.to_csv("en_sentence_to_kr_translation_i.csv", index=False, encoding="utf-8-sig")

print("CSV files have been saved successfully.")

# Translation with few-shot examples in the system prompt

In [None]:
# NOTE: this cell has no imports of its own; it reuses `pd`, `torch`, `model`
# and `tokenizer` from earlier cells.

# Load the data.
data_path = '/data/uijih/previous/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv'
data = pd.read_csv(data_path)

# Drop rows with a missing sentence in either language; both columns are fed
# to the model below, and a missing value would be prompted as "nan".
df = data.dropna(subset=['Sentence', 'KR_Sentence'])

sampled_df = df.head(25)  # use the first 25 samples

# Keep the needed columns as lists.
en_sentences = sampled_df['Sentence'].tolist()  # English example sentences
kr_sentences = sampled_df['KR_Sentence'].tolist() # Korean example sentences
kr_idioms = sampled_df['KR_Idiom'].tolist()     # Korean idioms
kr_meanings = sampled_df['KR_Meaning'].tolist() # Meanings of the Korean idioms
en_idioms = sampled_df['Idiom'].tolist()        # English idioms

# Llama 3 style chat template. The assistant header must be closed with
# "<|end_header_id|>" (the closing "|" was missing before) and is followed by
# a blank line, like the system and user headers.
# NOTE(review): if the checkpoint was fine-tuned on prompts containing the
# malformed header, keep training and inference templates in sync.
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# System prompts with two worked examples each.
system_prompt_translate_kr_to_en = """
You are a professional idiom translator proficient in Korean Idioms and English Idioms. Please translate the given Korean sentence into English accurately, ensuring that any Korean idiom is translated into its correct English idiom equivalent. Non-idiomatic parts of the sentence should be translated naturally and fluently.

Example 1:
Korean: "그녀는 그녀의 시험 결과에 입이 가로 터졌다."
English: "She was over the moon with her exam results."

Example 2:
Korean: "그들이 아무리 노력해봐도, 그녀는 절대 무릎 꿇지 않았다."
English: "No matter how hard they tried, she wouldn't say uncle."

Now, translate the following sentence.
"""

system_prompt_translate_en_to_kr = """
You are a professional idiom translator proficient in English Idioms and Korean Idioms. Please translate the given English sentence into Korean accurately, ensuring that any English idiom is translated into its correct Korean idiom equivalent. Non-idiomatic parts of the sentence should be translated naturally and fluently.

Example 1:
English: "She was over the moon with her exam results."
Korean: "그녀는 그녀의 시험 결과에 입이 가로 터졌다."

Example 2:
English: "No matter how hard they tried, she wouldn't say uncle."
Korean: "그들이 아무리 노력해봐도, 그녀는 절대 무릎 꿇지 않았다."

Now, translate the following sentence.
"""


def generate_response(system_prompt, user_input, max_new_tokens=150):
    """Run one chat turn through the model and return only the generated reply.

    Args:
        system_prompt: Text placed in the system section of the template.
        user_input: Text placed in the user section of the template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    input_prompt = prompt_template.format(system_prompt=system_prompt, user_input=user_input)
    # The template already starts with <|begin_of_text|>, so the tokenizer is
    # told not to add special tokens a second time. Inputs follow the model's
    # own device instead of a hard-coded "cuda".
    inputs = tokenizer(input_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; keep only the continuation so
    # the few-shot examples in the prompt never leak into the stored answer.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


# Result accumulators.
result_kr_to_en = []
result_en_to_kr = []

# Run inference.
for en_sentence, kr_sentence, kr_idiom, en_idiom in zip(en_sentences, kr_sentences, kr_idioms, en_idioms):
    # 1. Translate the English sentence into Korean.
    assistant_response = generate_response(
        system_prompt_translate_en_to_kr,
        f'Translate the sentence "{en_sentence}" into Korean.',
    )

    result_en_to_kr.append({
        "Source EN Sentence": en_sentence,
        "Generated KR Translation": assistant_response,
        "label": kr_idiom
    })
    # Progress output.
    print(f"EN Sentence: {en_sentence}")
    print(f"KR Translation: {assistant_response}")
    print(f"label: {kr_idiom}")
    print("-" * 50)

    # 2. Translate the Korean sentence into English.
    assistant_response = generate_response(
        system_prompt_translate_kr_to_en,
        f'Translate the sentence "{kr_sentence}" into English.',
    )

    result_kr_to_en.append({
        "Source KR Sentence": kr_sentence,
        "Generated EN Translation": assistant_response,
        "label": en_idiom
    })
    # Progress output.
    print(f"KR Sentence: {kr_sentence}")
    print(f"EN Translation: {assistant_response}")
    print(f"label: {en_idiom}")
    print("-" * 50)

# 3. Save the results as CSV files.
df_kr_to_en = pd.DataFrame(result_kr_to_en)
df_en_to_kr = pd.DataFrame(result_en_to_kr)

df_kr_to_en.to_csv("kr_sentence_to_en_translation_p.csv", index=False, encoding="utf-8-sig")
df_en_to_kr.to_csv("en_sentence_to_kr_translation_p.csv", index=False, encoding="utf-8-sig")

print("CSV files have been saved successfully.")

# Translation with chain-of-thought (CoT) prompting

In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model and tokenizer loading is disabled here; this cell reuses the `model`
# and `tokenizer` objects created by an earlier cell.
#model_path = "/data/uijih/instruct/llama3.1-8b-Instruct-full/checkpoint-385"

#tokenizer = AutoTokenizer.from_pretrained(model_path)
#model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True, device_map="auto")
model.eval()

# Register the chat control tokens as special tokens and resize the embedding
# matrix to the tokenizer size.
# NOTE(review): presumably these tokens already exist in the checkpoint's
# vocabulary, in which case this is a no-op — confirm.
special_tokens_dict = {
    'additional_special_tokens': [
        '<|begin_of_text|>', '<|end_of_text|>', '<|finetune_right_pad_id|>',
        '<|start_header_id|>', '<|end_header_id|>', '<|eom_id|>', '<|eot_id|>',
        '<|python_tag|>'
    ]
}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

# Load the data.
data_path = '/data/uijih/previous/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv'
data = pd.read_csv(data_path)

# Drop rows with a missing sentence in either language; both columns are fed
# to the model below, and a missing value would be prompted as "nan".
df = data.dropna(subset=['Sentence', 'KR_Sentence'])

sampled_df = df.head(25)  # use the first 25 samples

# Keep the needed columns as lists.
en_sentences = sampled_df['Sentence'].tolist()  # English example sentences
kr_sentences = sampled_df['KR_Sentence'].tolist() # Korean example sentences
kr_idioms = sampled_df['KR_Idiom'].tolist()     # Korean idioms
kr_meanings = sampled_df['KR_Meaning'].tolist() # Meanings of the Korean idioms
en_idioms = sampled_df['Idiom'].tolist()        # English idioms

# Llama 3 style chat template. The assistant header must be closed with
# "<|end_header_id|>" (the closing "|" was missing before) and is followed by
# a blank line, like the system and user headers.
# NOTE(review): if the checkpoint was fine-tuned on prompts containing the
# malformed header, keep training and inference templates in sync.
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

# Chain-of-thought system prompts.
system_prompt_translate_en_to_kr_cot = """
You are a professional idiom translator proficient in English Idioms and Korean Idioms. Follow a step-by-step process to translate the given English sentence into Korean, ensuring that any English idiom is accurately translated into its corresponding Korean idiom equivalent. Translate non-idiomatic parts naturally and fluently.

Step 1: Detect the idiom in the given sentence.
Step 2: Explain the meaning of the detected idiom in English.
Step 3: Translate the meaning into Korean.
Step 4: Find a Korean idiom that conveys the same meaning.
Step 5: Translate the entire sentence into Korean, using the identified Korean idiom.

Example 1:
English: "She was over the moon with her exam results."
Step 1: Detected idiom: "over the moon"
Step 2: Meaning: "extremely happy or delighted"
Step 3: Translated meaning: "매우 행복하거나 기쁘다"
Step 4: Korean idiom: "입이 가로 터지다"
Step 5: Translated sentence: "그녀는 그녀의 시험 결과에 입이 가로 터졌다."

Example 2:
English: "No matter how hard they tried, she wouldn't say uncle."
Step 1: Detected idiom: "say uncle"
Step 2: Meaning: "to admit defeat or surrender"
Step 3: Translated meaning: "패배를 인정하거나 항복하다"
Step 4: Korean idiom: "무릎을 꿇다"
Step 5: Translated sentence: "그들이 아무리 노력해봐도, 그녀는 절대 무릎 꿇지 않았다."

Now, follow the steps to translate the following sentence.
"""


system_prompt_translate_kr_to_en_cot = """
You are a professional idiom translator proficient in Korean Idioms and English Idioms. Follow a step-by-step process to translate the given Korean sentence into English, ensuring that any Korean idiom is accurately translated into its corresponding English idiom equivalent. Translate non-idiomatic parts naturally and fluently.

Step 1: Detect the idiom in the given sentence.
Step 2: Explain the meaning of the detected idiom in Korean.
Step 3: Translate the meaning into English.
Step 4: Find an English idiom that conveys the same meaning.
Step 5: Translate the entire sentence into English, using the identified English idiom.

Example 1:
Korean: "그녀는 그녀의 시험 결과에 입이 가로 터졌다."
Step 1: Detected idiom: "입이 가로 터지다"
Step 2: Meaning: "매우 기쁘거나 행복하다"
Step 3: Translated meaning: "extremely happy or delighted"
Step 4: English idiom: "over the moon"
Step 5: Translated sentence: "She was over the moon with her exam results."

Example 2:
Korean: "그들이 아무리 노력해봐도, 그녀는 절대 무릎 꿇지 않았다."
Step 1: Detected idiom: "무릎을 꿇다"
Step 2: Meaning: "패배를 인정하거나 항복하다"
Step 3: Translated meaning: "to admit defeat or surrender"
Step 4: English idiom: "say uncle"
Step 5: Translated sentence: "No matter how hard they tried, she wouldn't say uncle."

Now, follow the steps to translate the following sentence.
"""

# The five-step answer format is much longer than a bare translation, so the
# 150-token budget used for the other experiments would likely cut the answer
# off before the final translated sentence — tune as needed.
MAX_NEW_TOKENS_COT = 512


def generate_response(system_prompt, user_input, max_new_tokens=MAX_NEW_TOKENS_COT):
    """Run one chat turn through the model and return only the generated reply.

    Args:
        system_prompt: Text placed in the system section of the template.
        user_input: Text placed in the user section of the template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    input_prompt = prompt_template.format(system_prompt=system_prompt, user_input=user_input)
    # The template already starts with <|begin_of_text|>, so the tokenizer is
    # told not to add special tokens a second time. Inputs follow the model's
    # own device instead of a hard-coded "cuda".
    inputs = tokenizer(input_prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            repetition_penalty=1.1,
        )

    # generate() returns prompt + continuation; keep only the continuation so
    # the worked examples in the prompt never leak into the stored answer.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


# Result accumulators.
result_kr_to_en = []
result_en_to_kr = []

# Run inference.
for en_sentence, kr_sentence, kr_idiom, en_idiom in zip(en_sentences, kr_sentences, kr_idioms, en_idioms):
    # 1. Translate the English sentence into Korean.
    # Use the CoT prompt defined in this cell; the loop previously referenced
    # the non-CoT prompt variables left over from an earlier cell.
    assistant_response = generate_response(
        system_prompt_translate_en_to_kr_cot,
        f'Translate the sentence "{en_sentence}" into Korean.',
    )

    result_en_to_kr.append({
        "Source EN Sentence": en_sentence,
        "Generated KR Translation": assistant_response,
        "label": kr_idiom
    })
    # Progress output.
    print(f"EN Sentence: {en_sentence}")
    print(f"KR Translation: {assistant_response}")
    print(f"label: {kr_idiom}")
    print("-" * 50)

    # 2. Translate the Korean sentence into English (CoT prompt).
    assistant_response = generate_response(
        system_prompt_translate_kr_to_en_cot,
        f'Translate the sentence "{kr_sentence}" into English.',
    )

    result_kr_to_en.append({
        "Source KR Sentence": kr_sentence,
        "Generated EN Translation": assistant_response,
        "label": en_idiom
    })
    # Progress output.
    print(f"KR Sentence: {kr_sentence}")
    print(f"EN Translation: {assistant_response}")
    print(f"label: {en_idiom}")
    print("-" * 50)

# 3. Save the results as CSV files.
df_kr_to_en = pd.DataFrame(result_kr_to_en)
df_en_to_kr = pd.DataFrame(result_en_to_kr)

df_kr_to_en.to_csv("kr_sentence_to_en_translation_c.csv", index=False, encoding="utf-8-sig")
df_en_to_kr.to_csv("en_sentence_to_kr_translation_c.csv", index=False, encoding="utf-8-sig")

print("CSV files have been saved successfully.")