## Library

In [None]:
import os
import time
import json
import logging
import warnings
from datetime import datetime

import pandas as pd
import torch
from dotenv import load_dotenv
from tqdm import tqdm
from pydantic import BaseModel, Field
from typing import Literal

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from sklearn.model_selection import train_test_split

from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.callbacks import get_openai_callback

from sentence_transformers import SentenceTransformer

# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Load settings from a local .env file (python-dotenv).
load_dotenv()

In [None]:
# Experiment configuration read by the cells below.
seed: int = 1  # selects the ../seed{seed}/ CSV splits and the matching FAISS index
num_k: int = 5  # `k` handed to the FAISS retriever through search_kwargs
weight_config: list = [0.4, 0.6]  # EnsembleRetriever weights, in order [BM25, FAISS]

## Data

In [None]:
# Read the train / validation / test CSVs for the configured seed; the three
# files sit side by side in ../seed{seed}/ and differ only in the split name.
train, validation, test = (
    pd.read_csv(f'../seed{seed}/seed{seed}_{split}.csv')
    for split in ("train", "validation", "test")
)

## Vectorstore

In [None]:
# Every embedding model in this notebook is placed on the CPU.
device = torch.device("cpu")


class KUREEmbedding:
    """Wrap a SentenceTransformer behind `embed_documents` / `embed_query`."""

    def __init__(self, model_name="nlpai-lab/KURE-v1"):
        """Load `model_name` with SentenceTransformer and move it to `device`."""
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
        self.model = encoder.to(device)

    def embed_documents(self, texts):
        """Encode `texts` with the wrapped model, requesting NumPy output."""
        return self.model.encode(texts, convert_to_numpy=True)

    def embed_query(self, text):
        """Embed one string by encoding a single-item batch and taking its first row."""
        batch = self.embed_documents([text])
        return batch[0]


class KoE5Embedding(KUREEmbedding):
    """Same wrapper as KUREEmbedding, defaulting to the `nlpai-lab/KoE5` model."""

    def __init__(self, model_name="nlpai-lab/KoE5"):
        super().__init__(model_name)

In [None]:
# On-disk location of the saved FAISS index for this seed.
vectorstore_path = f"../seed{seed}/faiss_index_seed{seed}_koe5"

embeddings = KoE5Embedding()

# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized (presumably pickled) data stored with the index -- only point this
# at index files you trust.
# The bound `embed_query` method, not the wrapper object, is passed as the
# embedding function.
vectorstore = FAISS.load_local(
    vectorstore_path,
    embeddings.embed_query,
    allow_dangerous_deserialization=True,
)
faiss_retriever = vectorstore.as_retriever(search_kwargs=dict(k=num_k))

## Classifier

In [None]:
class SentenceClassification(BaseModel):
    """Structured-output schema for the binary sentence classifier."""

    # Literal (rather than a free-form str) restricts the field to exactly the
    # two labels that the evaluation cell maps to 1/0, so an off-schema answer
    # is rejected at parse time instead of surfacing later as a NaN label.
    prediction: Literal["True", "False"] = Field(
        description="Classification of the sentence: True, False"
    )


# temperature=0 requests the least random decoding from the model.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
# The runnable returned here yields SentenceClassification objects.
structured_llm_grader = llm.with_structured_output(SentenceClassification)

## Prompt & Chain

In [None]:
# Two-message prompt: the system message fixes the task and the JSON answer
# shape; the human message carries the retrieved references and the sentence
# to classify. Doubled braces are literal braces in the template.
classification_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Classify the sentence into one of : True , False. Judge as True if it is unethical.\n"
            'Return in JSON: {{"prediction": "True" or "False"}}',
        ),
        (
            "human",
            "Reference sentence: \n\n {sentences} \n\n Input sentence: {question}",
        ),
    ]
)

# Prompt piped into the structured-output LLM.
classification_grader = classification_prompt | structured_llm_grader

## Hybrid Search

In [None]:
# Training sentences and their labels, used as the retrieval corpus.
texts = train["문장"].tolist()
labels = train["비도덕여부"].tolist()
# NOTE(review): `docs` (sentence + label) is built but no retriever in this
# cell consumes it -- BM25 below indexes the bare `texts`. Confirm whether the
# labelled form was meant to be indexed.
docs = [f"{text} (Label: {label})" for text, label in zip(texts, labels)]

# BM25 (lexical) retriever over the raw training sentences.
bm25_retriever = BM25Retriever.from_texts(texts)
# Use the same top-k as the FAISS retriever; otherwise BM25 keeps its own
# default and `num_k` only affects one half of the ensemble.
bm25_retriever.k = num_k

# Hybrid retriever: combines BM25 and FAISS results using `weight_config`
# (one weight per retriever, in the same order as `retrievers`).
retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=weight_config,
)

## Prediction

In [None]:
# Classify every test sentence with retrieval-augmented prompting and collect
# (question, gold label, predicted label) rows into `df_results`.
results = []

# Only two columns are needed, so iterate them in lockstep instead of
# materialising a full row Series per record with iterrows().
for question, actual_label in tqdm(
    zip(test["문장"], test["비도덕여부"]), total=len(test), desc="Processing"
):
    retrieved_sentences = retriever.invoke(question)

    # Send the retrieved text itself to the prompt. Interpolating the list of
    # Document objects would insert their Python repr (class name, metadata,
    # quoting) into the prompt instead of clean reference sentences.
    reference_text = "\n".join(doc.page_content for doc in retrieved_sentences)

    prediction = classification_grader.invoke(
        {"question": question, "sentences": reference_text}
    )

    results.append(
        {
            "question": question,
            "answer": actual_label,
            "prediction": prediction.prediction,
        }
    )

df_results = pd.DataFrame(results)

## Evaluation

In [None]:
# Convert gold labels and model predictions to 1 (True) / 0 (False).
y_true = df_results["answer"].map({True: 1, False: 0})

# Normalise case and surrounding whitespace before mapping so that harmless
# variants such as "true" or " True" are not turned into NaN.
normalized_prediction = df_results["prediction"].astype(str).str.strip().str.lower()
y_pred = normalized_prediction.map({"true": 1, "false": 0})

# Fail with an explicit message when a label or prediction cannot be mapped;
# otherwise the NaN would only surface as an opaque error inside scikit-learn.
unmapped = y_true.isna() | y_pred.isna()
if unmapped.any():
    examples = df_results.loc[unmapped, ["answer", "prediction"]].head().to_dict("records")
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have an answer/prediction that is not "
        f"True/False, e.g. {examples}"
    )
y_true = y_true.astype(int)
y_pred = y_pred.astype(int)

accuracy = accuracy_score(y_true, y_pred)
precision_macro = precision_score(y_true, y_pred, average="macro")
recall_macro = recall_score(y_true, y_pred, average="macro")
f1_macro = f1_score(y_true, y_pred, average="macro")
f1_weighted = f1_score(y_true, y_pred, average="weighted")

conf_matrix = confusion_matrix(y_true, y_pred)
# Dict form of the report, kept for programmatic access.
classification_rep = classification_report(y_true, y_pred, output_dict=True)

print("\n===== Classification Performance Results =====")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Recall (Macro): {recall_macro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Weighted): {f1_weighted:.4f}")

print("\n===== Classification Confusion Matrix =====")
print(conf_matrix)

print("\n===== Detailed Classification Report =====")
print(classification_report(y_true, y_pred))