In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from tqdm import tqdm
import csv

def setup_model_and_tokenizer(model_path):
    """Load a PEFT adapter together with its base model and tokenizer.

    The base checkpoint name is read from the adapter's ``PeftConfig``. The
    tokenizer gets ``<pad>``, ``<eos>`` and ``<bos>`` registered as its pad,
    EOS and BOS tokens, and the base model's embedding matrix is resized to
    the resulting vocabulary size before the adapter is attached.

    Args:
        model_path: Directory (or hub id) of the saved PEFT adapter.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the ``PeftModel``
        wrapping the 4-bit, bfloat16 base model.
    """
    peft_config = PeftConfig.from_pretrained(model_path)
    base_name = peft_config.base_model_name_or_path

    tokenizer = AutoTokenizer.from_pretrained(base_name)
    # presumably mirrors the special tokens used when the adapter was
    # trained -- TODO confirm against the training script
    tokenizer.add_special_tokens(
        {"pad_token": "<pad>", "eos_token": "<eos>", "bos_token": "<bos>"}
    )

    load_kwargs = dict(
        device_map="auto",
        torch_dtype=torch.bfloat16,
        load_in_4bit=True,
    )
    backbone = AutoModelForCausalLM.from_pretrained(base_name, **load_kwargs)
    # The embedding table must cover the special tokens registered above.
    backbone.resize_token_embeddings(len(tokenizer))

    adapted = PeftModel.from_pretrained(backbone, model_path)
    return adapted, tokenizer

def generate_response(prompt, model, tokenizer, max_new_tokens=100, max_input_length=200):
    """Sample a completion for ``prompt`` and return only the generated text.

    The completion is isolated by dropping the prompt *tokens* from the
    generated sequence before decoding. Slicing the decoded string by
    ``len(prompt)`` is unreliable: the decoded prompt need not have the same
    length as the original string (tokenizer normalisation, removed special
    tokens, or a prompt that was truncated to ``max_input_length``).

    Args:
        prompt: Full prompt text fed to the model.
        model: Model exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer exposing ``decode``, ``pad_token_id``
            and ``eos_token_id``.
        max_new_tokens: Upper bound on the number of generated tokens.
        max_input_length: Token budget for the prompt; longer prompts are
            truncated by the tokenizer.

    Returns:
        The decoded completion, special tokens skipped and whitespace stripped.
    """
    # NOTE(review): with the usual right-side truncation, an over-long prompt
    # loses its *end* (typically the part carrying the actual query). Raise
    # max_input_length if prompts get close to the limit.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens; keep only what
    # was generated after them.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

def process_csv(input_path, output_path, system_prompt, sentence_column, model, tokenizer):
    """Run idiom detection over one CSV column and write the results to a new CSV.

    For every value in ``sentence_column`` the prompt is built as
    ``system_prompt`` followed by a ``Sentence: "..."`` / ``Output:`` suffix,
    sent through ``generate_response``, echoed to stdout and appended to the
    output file as a ``[sentence, detected_idiom]`` row.

    Args:
        input_path: CSV file to read.
        output_path: CSV file to (over)write; its header row is
            ``[sentence_column, 'detected_idiom']``.
        system_prompt: Instruction text placed before each sentence.
        sentence_column: Name of the column holding the sentences.
        model: Model handed to ``generate_response``.
        tokenizer: Tokenizer handed to ``generate_response``.
    """
    frame = pd.read_csv(input_path)
    progress_label = f"Processing sentences in {sentence_column}"

    with open(output_path, 'w', newline='', encoding='utf-8') as out_file:
        rows = csv.writer(out_file)
        rows.writerow([sentence_column, 'detected_idiom'])

        for sentence in tqdm(frame[sentence_column], desc=progress_label):
            query_suffix = f"\nSentence: \"{sentence}\"\nOutput:"
            detected_idiom = generate_response(system_prompt + query_suffix, model, tokenizer)
            print(f"{sentence} : {detected_idiom}")
            rows.writerow([sentence, detected_idiom])

def main():
    """Run few-shot idiom detection over the seed CSV, once per language.

    Loads the adapter-backed model once, then processes the English
    (``Sentence``) and Korean (``KRSentence``) columns with their respective
    few-shot prompts, writing one output CSV per language.
    """
    model_path = "/data/uijih/detection/saveded_instruct-70-detection-new"
    input_csv_path = '/data/uijih/uiji_seed.csv'

    # Prompts kept verbatim, exactly as requested by the user.
    system_prompt_en = """The task is to detect any idiom present in the given sentence and return it exactly as it appears in the sentence. If there is no idiom, respond with \"None.\" and provide no additional text or explanation. Follow the examples below:

    Example 1:
    Sentence: "She plans to travel the world before she kicks the bucket."
    Output: "kicks the bucket"

    Example 2:
    Sentence: "I think I ate too much dinner, and now my stomach hurts."
    Output: "None"

    Now, process the following sentence.
    """

    system_prompt_kr = """The task is to detect any idiom present in the given sentence and return it exactly as it appears in the sentence. If there is no idiom, respond with \"None.\" and provide no additional text or explanation. Follow the examples below:

    Example 1:
    Sentence: "가족을 먹여살리고자 밤낮 없이 일하다 불의의 사고로 한 줌 재가 되어버린 그가 남긴 자산을 그리 많지 않았다."
    Output: "한 줌 재가 되어버린"

    Example 2:
    Sentence: "저녁을 너무 많이 먹어서인지 배가 아프다."
    Output: "None"

    Now, process the following sentence.
    """

    # One entry per pass: (status message, output CSV, prompt, input column).
    runs = (
        ("Processing with English system prompt...", "./70b-inst-en-none.csv", system_prompt_en, 'Sentence'),
        ("Processing with Korean system prompt...", "./70b-inst-kr-none.csv", system_prompt_kr, 'KRSentence'),
    )

    print("Loading model and tokenizer...")
    model, tokenizer = setup_model_and_tokenizer(model_path)

    for status_message, output_csv_path, system_prompt, column in runs:
        print(status_message)
        process_csv(input_csv_path, output_csv_path, system_prompt, column, model, tokenizer)

    print("Inference completed successfully!")


if __name__ == "__main__":
    main()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import random

# 1. 추론 함수 정의
def inference(model_path, tokenizer, input_data, system_prompts_en, labels=None, language='en'):
    """Run beam-search idiom detection over ``input_data`` and return the predictions.

    Each sentence is wrapped in an English user question, combined with a
    system prompt drawn at random from ``system_prompts_en`` and rendered
    into a Llama-3-style chat prompt. Only the tokens generated after the
    prompt are decoded.

    Args:
        model_path: Checkpoint passed to ``AutoModelForCausalLM.from_pretrained``.
        tokenizer: Tokenizer matching the checkpoint.
        input_data: Iterable of sentences to analyse.
        system_prompts_en: Non-empty sequence of system prompts.
        labels: Optional sequence aligned with ``input_data``; when truthy,
            sentence, label and prediction are printed for every item.
        language: Language of the user question. Only ``'en'`` is implemented.

    Returns:
        List of prediction strings, one per input sentence.

    Raises:
        ValueError: If ``language`` is not ``'en'``. Previously this surfaced
            as a ``NameError`` on ``user_input`` inside the loop.
    """
    # Fail fast, before the (expensive) model load.
    if language != 'en':
        raise ValueError(
            f"Unsupported language {language!r}: only the English ('en') prompt is implemented."
        )

    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    model.eval()

    # English chat prompt template.
    # NOTE(review): the assistant header below ends with '<|end_header_id>'
    # (no closing '|'), so it is not the Llama 3 special token. It is left
    # untouched because this file does not show whether the fine-tuning data
    # used the same spelling -- confirm before changing it.
    prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|>
<|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|>
<|start_header_id|>assistant<|end_header_id>
"""

    results = []
    for idx, sentence in enumerate(input_data):
        system_prompt = random.choice(system_prompts_en)
        user_input = f'Is there an idiom in the sentence "{sentence}"? If yes, return only the detected idiom. Generate just the idiom in its original form without any additional explanation or text. If there is no idiom, answer "none".'

        # Insert the system prompt and the user question into the template.
        input_prompt = prompt_template.format(
            system_prompt=system_prompt,
            user_input=user_input
        )

        # Follow the model's own device instead of assuming "cuda".
        inputs = tokenizer(input_prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                # max_length would also count the prompt tokens and shrink
                # (or exhaust) the generation budget for long sentences.
                max_new_tokens=64,
                do_sample=False,
                num_beams=5,
                early_stopping=True,
                repetition_penalty=1.1
            )

        # Decode only the newly generated tokens. This does not depend on a
        # textual marker surviving the decode step.
        cleaned_output = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()
        results.append(cleaned_output)

        # Print progress immediately when labels are available.
        if labels:
            print(f"\nSentence: {sentence}")
            print(f"Label: {labels[idx]}")
            print(f"Prediction: {cleaned_output}")

    return results

# 2. 추론 및 결과 저장 함수
def run_inference_and_save_results(input_csv_path, output_csv_path, model_path, system_prompts_en, language='en'):
    """Run idiom detection on one language's columns of a CSV and save the predictions.

    Args:
        input_csv_path: CSV containing ``Sentence``/``Idiom`` columns (used
            when ``language == 'en'``) or ``KRSentence``/``KRIdiom`` columns
            (any other value).
        output_csv_path: Destination CSV with ``Sentence``, ``Label`` and
            ``Model_Prediction`` columns.
        model_path: Checkpoint used for both the tokenizer and the model.
        system_prompts_en: System prompts forwarded to ``inference``.
        language: Selects which column pair is read.

    Returns:
        The results ``DataFrame`` that was written to ``output_csv_path``.
    """
    frame = pd.read_csv(input_csv_path)

    # Pick the sentence / gold-label column pair for the requested language.
    if language == 'en':
        sentence_col, label_col = 'Sentence', 'Idiom'
    else:
        sentence_col, label_col = 'KRSentence', 'KRIdiom'
    sentences = frame[sentence_col].tolist()
    labels = frame[label_col].tolist()

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    print(f"Running inference with model: {model_path} ({language})")
    # The prompt language is always 'en' (the same as during training),
    # whichever column pair was selected above.
    model_predictions = inference(
        model_path, tokenizer, sentences, system_prompts_en, labels, 'en'
    )

    result_df = pd.DataFrame(
        {
            'Sentence': sentences,
            'Label': labels,
            'Model_Prediction': model_predictions,
        }
    )
    result_df.to_csv(output_csv_path, index=False)
    print(f"\nResults saved to {output_csv_path}")
    return result_df

# 3. 실행
if __name__ == "__main__":
    # Checkpoint under evaluation; alternatives are kept for reference.
    model_path = "/data/uijih/detection/saveded_instruct-full-detection-1"
    #model_path = "./saveded_instruct-70-detection-l"
    #model_path = "meta-llama/Meta-Llama-3.1-70B-Instruct"

    # Input data; the previous seed file is kept for reference.
    #input_csv_path = "/data/uijih/previous/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
    input_csv_path = "/data/uijih/uiji_seed.csv"

    output_csv_path_en, output_csv_path_kr = "./8b-inst-full-en.csv", "./8b-inst-full-kr.csv"

    # Only the English system prompt used during training (a single one for now).
    system_prompts_en = [
        "Detect if there is an idiom in the following sentence. If there is, return only the detected idiom in its original form. If there are no idioms, answer 'none'.",
        # "Check if the following sentence contains any idioms. If so, return the idiom only. If none, respond with 'none'.",
        # "Look for an idiom in the following sentence and return the idiom only if found. Otherwise, return 'none'."
    ]

    # English columns first, then the Korean columns of the same file.
    result_df_en = run_inference_and_save_results(
        input_csv_path=input_csv_path,
        output_csv_path=output_csv_path_en,
        model_path=model_path,
        system_prompts_en=system_prompts_en,
        language='en',
    )
    result_df_kr = run_inference_and_save_results(
        input_csv_path=input_csv_path,
        output_csv_path=output_csv_path_kr,
        model_path=model_path,
        system_prompts_en=system_prompts_en,
        language='kr',
    )

    # Display the DataFrames for visualization in Jupyter
    # import ace_tools as tools
    # tools.display_dataframe_to_user(name="English Inference Results", dataframe=result_df_en)
    # tools.display_dataframe_to_user(name="Korean Inference Results", dataframe=result_df_kr)


## 8b

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import random

# 1. 추론 함수 정의
def inference(model_path, tokenizer, input_data, system_prompts_en, labels=None, language='en'):
    """Run completion-style idiom detection with a plain prompt and return the predictions.

    Each sentence is inserted into a fixed English instruction prompt ending
    in ``# Detected Idiom:``. Only the tokens generated after the prompt are
    decoded, and the first line of that completion is kept as the prediction.

    Args:
        model_path: Checkpoint passed to ``AutoModelForCausalLM.from_pretrained``.
        tokenizer: Tokenizer matching the checkpoint.
        input_data: Iterable of sentences to analyse.
        system_prompts_en: Unused by this variant; kept so the signature
            matches the callers.
        labels: Optional sequence aligned with ``input_data``; when truthy,
            sentence, label and prediction are printed for every item.
        language: Prompt language. Only ``'en'`` is implemented.

    Returns:
        List of prediction strings, one per input sentence.

    Raises:
        ValueError: If ``language`` is not ``'en'``. Previously this surfaced
            as a ``NameError`` on ``input_prompt`` inside the loop.
    """
    # Fail fast, before the (expensive) model load.
    if language != 'en':
        raise ValueError(
            f"Unsupported language {language!r}: only the English ('en') prompt is implemented."
        )

    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    model.eval()

    # English prompt template.
    prompt_template_en = """
    Detect if there is an idiom in the following sentence. If there is, return only the detected idiom. Generate just the idiom without any additional explanation or text. If there are no idioms, answer 'none'.

    # Sentence:
    {user_input}
    
    # Detected Idiom:
    """
    results = []
    for idx, sentence in enumerate(input_data):
        input_prompt = prompt_template_en.format(user_input=sentence)

        # Follow the model's own device instead of assuming "cuda".
        inputs = tokenizer(input_prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                # max_length would also count the prompt tokens and shrink
                # (or exhaust) the generation budget for long sentences.
                max_new_tokens=64,
                do_sample=False,
                num_beams=5,
                early_stopping=True,
                repetition_penalty=1.1
            )

        # Decode only the continuation. Splitting the full text on
        # '# Detected Idiom:' and taking the last piece picks up whatever the
        # model wrote after a *repeated* marker when it keeps extending the
        # pattern, i.e. the answer to an invented sentence.
        completion = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()
        # The answer is expected on the first line; anything after it is the
        # model continuing the pattern.
        cleaned_output = completion.splitlines()[0].strip() if completion else ""
        results.append(cleaned_output)

        # Print progress immediately when labels are available.
        if labels:
            print(f"\nSentence: {sentence}")
            print(f"Label: {labels[idx]}")
            print(f"Prediction: {cleaned_output}")

    return results

# 2. 추론 및 결과 저장 함수
def run_inference_and_save_results(input_csv_path, output_csv_path, model_path, system_prompts_en, language='en'):
    """Predict idioms for one language's columns of a CSV and write a results CSV.

    Args:
        input_csv_path: CSV containing ``Sentence``/``Idiom`` columns (used
            when ``language == 'en'``) or ``KR_Sentence``/``KR_Idiom``
            columns (any other value).
        output_csv_path: Destination CSV with ``Sentence``, ``Label`` and
            ``Model_Prediction`` columns.
        model_path: Checkpoint used for both the tokenizer and the model.
        system_prompts_en: Forwarded to ``inference``.
        language: Selects which column pair is read.

    Returns:
        The results ``DataFrame`` that was written to ``output_csv_path``.
    """
    columns_by_language = {'en': ('Sentence', 'Idiom')}
    sentence_col, label_col = columns_by_language.get(language, ('KR_Sentence', 'KR_Idiom'))

    table = pd.read_csv(input_csv_path)
    sentences = table[sentence_col].tolist()  # inputs to run through the model
    labels = table[label_col].tolist()  # gold idioms, reported next to predictions

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    print(f"Running inference with model: {model_path} ({language})")
    # The prompt language handed to inference() is always 'en', whichever
    # column pair was selected above.
    model_predictions = inference(model_path, tokenizer, sentences, system_prompts_en, labels, 'en')

    output_columns = {
        'Sentence': sentences,
        'Label': labels,
        'Model_Prediction': model_predictions,
    }
    result_df = pd.DataFrame(output_columns)
    result_df.to_csv(output_csv_path, index=False)
    print(f"\nResults saved to {output_csv_path}")

    return result_df

# 3. 실행
if __name__ == "__main__":
    # Baseline run: the base (non-instruct) 8B checkpoint on the earlier seed file.
    model_path = "meta-llama/Meta-Llama-3.1-8B"
    input_csv_path = "/data/uijih/previous/Seed50_for_Parallel_Dataset_ENKR_idiomKB_0.8_example.csv"
    output_csv_path_en = "./8b-base-en.csv"
    output_csv_path_kr = "./8b-base-kr.csv"

    # English system prompt.
    system_prompts_en = [
        "Detect if there is an idiom in the following sentence. If there is, return only the detected idiom. If there are no idioms, answer 'none'.",
    ]

    # Run inference on the English columns, then on the Korean columns.
    result_df_en = run_inference_and_save_results(input_csv_path, output_csv_path_en, model_path, system_prompts_en, language='en')
    result_df_kr = run_inference_and_save_results(input_csv_path, output_csv_path_kr, model_path, system_prompts_en, language='kr')

    # `ace_tools` only exists inside the ChatGPT code sandbox. Importing it
    # unconditionally raised ModuleNotFoundError everywhere else, after both
    # runs had already finished. Fall back to a plain preview instead.
    try:
        import ace_tools as tools
    except ImportError:
        print("\nEnglish Inference Results")
        print(result_df_en.head())
        print("\nKorean Inference Results")
        print(result_df_kr.head())
    else:
        tools.display_dataframe_to_user(name="English Inference Results", dataframe=result_df_en)
        tools.display_dataframe_to_user(name="Korean Inference Results", dataframe=result_df_kr)


In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, json, re
from datasets import Dataset


# 8B 모델 로드
# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# tokenizer.add_special_tokens({"pad_token": "<pad>", "eos_token": "<eos>", "bos_token": "<bos>"})
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto").eval()
# # 토크나이저에 새로 추가된 토큰 적용
# model.resize_token_embeddings(len(tokenizer))

# 데이터셋 로드 함수
def load_dataset(jsonl_path, tokenizer, num_samples=1):
    """Build a chat-templated text dataset from the first records of a JSONL file.

    Every record must carry a ``"conversations"`` list whose entries have
    ``"user"`` and ``"assistant"`` keys. Each record is rendered into a single
    string with the tokenizer's chat template: one system message followed by
    the record's user/assistant turns.

    Args:
        jsonl_path: Path to the UTF-8 encoded JSONL file.
        tokenizer: Tokenizer providing ``apply_chat_template``.
        num_samples: Maximum number of records to load (non-negative), or
            ``None`` to load every record.

    Returns:
        A ``Dataset`` with a single ``text`` column.
    """
    system_message = {"role": "system", "content": "You are an expert in detecting idioms. Identify only the idiom exactly as it appears in the given sentence. Do not provide additional explanations or context interpretation. If there is no idiom, respond with 'None.'"}

    texts = []
    # Explicit encoding: the data contains Korean text and the platform
    # default is not guaranteed to be UTF-8.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Stop reading as soon as enough samples were collected instead
            # of parsing the whole file first.
            if num_samples is not None and len(texts) >= num_samples:
                break
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            item = json.loads(line)

            # The system message belongs once at the start of a conversation,
            # not in front of every turn.
            conversation = [system_message]
            for entry in item["conversations"]:
                conversation.append({'role': 'user', 'content': entry['user']})
                conversation.append({'role': 'assistant', 'content': entry['assistant']})

            # padding / truncation / max_length only apply when tokenizing,
            # so they are not passed with tokenize=False.
            texts.append(tokenizer.apply_chat_template(conversation, tokenize=False))

    return Dataset.from_dict({'text': texts})

# Load the evaluation dataset (at most 3 samples).
# NOTE: `tokenizer` is not created in this cell -- the loading code above is
# commented out -- so it must already exist from an earlier cell.
test_dataset = load_dataset("test_idioms_detection_dataset.jsonl", tokenizer, num_samples=3)

# 평가 콜백 클래스
class AdvancedIdiomEvalCallback:
    """Print model predictions next to the ground truth for a few samples.

    Each dataset item is expected to expose a ``"text"`` field holding a
    conversation rendered with Llama-3-style header / end-of-turn markers.
    The last user turn is re-sent to the model and the generated answer is
    printed together with the last assistant turn found in the text.
    """

    _SYSTEM_PROMPT = "You are an expert in detecting idioms. Identify only the idiom exactly as it appears in the given sentence. Do not provide additional explanations or context interpretation. If there is no idiom, respond with 'None.'"
    _USER_HEADER = "<|start_header_id|>user<|end_header_id|>"
    _ASSISTANT_HEADER = "<|start_header_id|>assistant<|end_header_id|>"
    _END_OF_TURN = "<|eot_id|>"

    def __init__(self, model, eval_dataset, tokenizer, num_examples=3):
        """Store the model, dataset and tokenizer used by ``evaluate``.

        Args:
            model: Causal LM exposing ``device`` and ``generate``.
            eval_dataset: Indexable collection of items with a ``"text"`` field.
            tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
            num_examples: Maximum number of samples to evaluate.
        """
        self.model = model
        self.eval_dataset = eval_dataset
        self.tokenizer = tokenizer
        self.num_examples = num_examples

    def _extract_last_turn(self, full_text, header):
        """Return the content of the last turn opened by ``header``, or None if absent."""
        _, found, tail = full_text.rpartition(header)
        if not found:
            return None
        return tail.partition(self._END_OF_TURN)[0].strip()

    def extract_user_input(self, full_text):
        """
        Extracts the 'user' content from the text based on markers.

        Falls back to the stripped full text when no user header is present.
        (The previous ``except IndexError`` fallback could never trigger,
        because ``str.split`` always returns at least one element.)
        """
        content = self._extract_last_turn(full_text, self._USER_HEADER)
        return full_text.strip() if content is None else content

    def extract_ground_truth(self, full_text):
        """
        Extracts the 'assistant' content from the text based on markers.

        Returns "None" when no assistant header is present.
        """
        content = self._extract_last_turn(full_text, self._ASSISTANT_HEADER)
        return "None" if content is None else content

    def evaluate(self):
        """Generate and print a prediction for up to ``num_examples`` samples."""
        # generate() needs some pad id; fall back to EOS when the tokenizer
        # does not define a pad token.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        for i in range(min(self.num_examples, len(self.eval_dataset))):
            full_text = self.eval_dataset[i]["text"]
            user_input = self.extract_user_input(full_text)
            ground_truth = self.extract_ground_truth(full_text)

            eval_conversation = [
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": user_input},
            ]

            # add_generation_prompt=True appends the assistant header, so the
            # model answers instead of first having to open the turn itself.
            eval_text = self.tokenizer.apply_chat_template(
                eval_conversation, tokenize=False, add_generation_prompt=True
            )
            # The templated text already contains the special tokens; adding
            # them again would prepend a second BOS. No padding is requested
            # because a single sequence is encoded.
            inputs = self.tokenizer(
                eval_text,
                return_tensors="pt",
                truncation=True,
                add_special_tokens=False,
            ).to(self.model.device)
            prompt_length = inputs["input_ids"].shape[1]

            print("\n============= 샘플 평가 =============")
            # Show the actual sentence rather than the whole templated prompt.
            print(f"입력 문장: {user_input}")

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,  # passes the attention mask along with the ids
                    # max_length would also count the prompt tokens.
                    max_new_tokens=64,
                    num_return_sequences=1,
                    do_sample=False,
                    pad_token_id=pad_token_id,
                )

            # Decode only the generated tokens. Splitting the decoded text on
            # the word "assistant" breaks as soon as the sentence or the
            # answer contains that word.
            detected_idiom = self.tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            ).strip()

            print(f"Ground Truth: {ground_truth}")
            print(f"모델 예측: {detected_idiom}")



# Run the evaluation callback on the loaded test samples.
# NOTE: `model` and `tokenizer` are not created in this cell; they must
# already exist from an earlier cell.
callback = AdvancedIdiomEvalCallback(model=model, eval_dataset=test_dataset, tokenizer=tokenizer)
callback.evaluate()


입력 문장: 그는 상을 받아 잔치날에 큰상 받는 기분이었다.
Ground Truth: 잔치날에 큰상 받는 기분
모델 예측: 큰 상

입력 문장: 그는 고슴도치 외 따 지듯 빚을 졌다.
Ground Truth: 고슴도치 외 따 지듯
모델 예측: 고슴도치 외 따 지듯

입력 문장: 그 집은 아름다운 정원이 있어요.
Ground Truth: None
모델 예측: None
