## Library

In [None]:
import os
import warnings

import pandas as pd
import torch
from dotenv import load_dotenv
from tqdm import tqdm
from pydantic import BaseModel, Field

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from sklearn.model_selection import train_test_split

from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.callbacks import get_openai_callback

from sentence_transformers import SentenceTransformer

# Silence every warning category for the rest of the session. This also hides
# deprecation notices emitted by the imported libraries.
warnings.filterwarnings("ignore")

# NOTE(review): presumably loads the OpenAI credentials from a local .env file
# into the process environment — confirm the expected variable names.
load_dotenv()

In [None]:
# Random seed: passed as `random_state` to both train_test_split calls and
# embedded in the FAISS index directory name.
seed = 1
# Number of documents requested from the retriever per query
# (used as `search_kwargs={"k": num_k}`).
num_k = 3

## Data

In [None]:
# Read the three CSV splits, pool them into one frame, and re-split the pooled
# data with stratification on the "answer" column.
train_df, val_df, test_df = map(
    pd.read_csv,
    ("train_형사.csv", "validation_형사.csv", "test_형사.csv"),
)
combined_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Absolute row counts for the held-out test and validation sets.
test_size = 1000
val_size = 100

# Stage 1: hold out the test set from the pooled data.
train_data, test_data = train_test_split(
    combined_df,
    test_size=test_size,
    random_state=seed,
    stratify=combined_df["answer"],
)

# Stage 2: hold out the validation set from what remains.
train_data, validation_data = train_test_split(
    train_data,
    test_size=val_size,
    random_state=seed,
    stratify=train_data["answer"],
)

# Splits are stored per seed so they can be looked up by the seed that made them.
datasets = {
    seed: {
        "train_data": train_data,
        "test_data": test_data,
        "validation_data": validation_data,
    }
}

seed_split = datasets[seed]
train, validation, test = (
    seed_split["train_data"],
    seed_split["validation_data"],
    seed_split["test_data"],
)

## Vectorstore

In [None]:
# Device the sentence-transformer models below are moved to; pinned to CPU.
device = torch.device("cpu")


class KUREEmbedding:
    """Thin wrapper exposing a SentenceTransformer through embed_* methods.

    The model is loaded by name with ``trust_remote_code=True`` and moved to
    the module-level ``device``.
    """

    def __init__(self, model_name="nlpai-lab/KURE-v1"):
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
        self.model = encoder.to(device)

    def embed_documents(self, texts):
        """Encode a list of texts and return the model's output as-is.

        The result is whatever ``encode(..., convert_to_numpy=True)`` yields.
        """
        return self.model.encode(texts, convert_to_numpy=True)

    def embed_query(self, text):
        """Encode a single text and return the first (only) embedding."""
        batch = self.embed_documents([text])
        return batch[0]


class KoE5Embedding(KUREEmbedding):
    """Same wrapper as KUREEmbedding, defaulting to the ``nlpai-lab/KoE5`` model."""

    def __init__(self, model_name="nlpai-lab/KoE5"):
        super().__init__(model_name=model_name)

In [None]:
# Location of the pre-built FAISS index for the KoE5 embeddings of this seed.
vectorstore_path = f"embedding_comparison/faiss_index_koe5_seed{seed}"

# Fail fast when the index is missing. Otherwise no `retriever` is defined and
# the notebook only breaks later with a NameError inside
# predict_label_with_hyde_rag, after an LLM call has already been made.
if not os.path.exists(vectorstore_path):
    raise FileNotFoundError(
        f"FAISS index not found at '{vectorstore_path}'. "
        "Build the KoE5 index for this seed before running the notebook."
    )

embeddings = KoE5Embedding()

# The bound `embed_query` method is passed as the embedding function because
# KoE5Embedding is a plain class, not a LangChain Embeddings subclass.
# NOTE(security): allow_dangerous_deserialization=True makes load_local
# unpickle data from disk. Only load index directories you created yourself;
# never point this at files from an untrusted source.
vectorstore = FAISS.load_local(
    vectorstore_path,
    embeddings.embed_query,
    allow_dangerous_deserialization=True,
)

# Retriever returning the `num_k` nearest documents for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": num_k})

## Classifier

In [None]:
# Structured-output schema for the classifier: one string field, `prediction`.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — a docstring would presumably be forwarded to the model as the
# schema description by `with_structured_output`; confirm before adding one.
class CategoryClassification(BaseModel):
    # Declared as a free-form `str`: the description asks for "True" or "False",
    # but nothing in the type restricts the value to those two strings.
    prediction: str = Field(description="Classification of the document: True, False")


# Chat model shared by the HyDE generation chain and the classification chain;
# temperature is explicitly set to 0.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
# The same model bound to the CategoryClassification schema; downstream code
# reads the `.prediction` attribute of what it returns.
structured_llm_grader = llm.with_structured_output(CategoryClassification)

## Prompt & Chain 

In [None]:
# Classification prompt: the system message fixes the two allowed labels and
# the JSON shape; the human message carries the query and the retrieved cases.
classification_messages = [
    ("system", "Classify the query into one of : True , False \n Return in JSON: {{\"prediction\": \"category\"}}"),
    ("human", "Query: {query}\nRelevant cases: {similar_cases}"),
]
classification_prompt = ChatPromptTemplate.from_messages(classification_messages)

# HyDE prompt: asks the model for a short hypothetical passage for the query.
hyde_system_message = (
    "Generate a hypothetical passage that is relevant to the given query. \n"
    "Limit the response to a maximum of 300 characters."
)
hyde_messages = [
    ("system", hyde_system_message),
    ("human", "User query: {query}"),
]
hyde_prompt = ChatPromptTemplate.from_messages(hyde_messages)

# Chains: the classifier uses the structured-output model, HyDE the plain one.
classification_grader = classification_prompt | structured_llm_grader
hyde_chain = hyde_prompt | llm

## HyDE 

In [None]:
def predict_label_with_hyde_rag(user_query: str):
    """Classify a query using HyDE-augmented retrieval.

    The query is first expanded into a hypothetical passage by ``hyde_chain``.
    That passage, not the raw query, is used to search ``retriever``. The
    retrieved documents are joined with newlines and passed, together with the
    original query, to ``classification_grader``.

    Args:
        user_query: The question text to classify.

    Returns:
        The object produced by ``classification_grader``; callers read its
        ``prediction`` attribute.
    """
    hypothetical_passage = hyde_chain.invoke({"query": user_query}).content.strip()

    # An empty generation carries no retrieval signal, so fall back to the raw
    # query rather than embedding an empty string.
    search_text = hypothetical_passage or user_query

    # `invoke` is the Runnable entry point that supersedes the deprecated
    # `get_relevant_documents` method.
    similar_docs = retriever.invoke(search_text)

    if similar_docs:
        retrieved_texts = "\n".join(doc.page_content for doc in similar_docs)
    else:
        retrieved_texts = "No similar cases found."

    return classification_grader.invoke(
        {"query": user_query, "similar_cases": retrieved_texts}
    )

## Prediction

In [None]:
# Run the HyDE-RAG classifier over every test row and collect, per row, the
# question, its ground-truth answer, and the predicted label string.
results = []

test_rows = tqdm(test.iterrows(), total=len(test), desc="Processing")
for _, sample in test_rows:
    query_text, gold_label = sample["question"], sample["answer"]
    outcome = predict_label_with_hyde_rag(query_text)
    results.append(
        dict(
            question=query_text,
            answer=gold_label,
            prediction=outcome.prediction,
        )
    )

df_results = pd.DataFrame(results)

## Evaluation

In [None]:
# Encode the ground truth as 1/0. The mapping keys assume the "answer" column
# holds booleans.
y_true = df_results["answer"].map({True: 1, False: 0})

# The model returns free-form strings, so normalise case and surrounding
# whitespace before encoding ("true", " False" ...). An exact-match lookup
# would turn such variants into NaN.
normalized_predictions = df_results["prediction"].astype(str).str.strip().str.lower()
y_pred = normalized_predictions.map({"true": 1, "false": 0})

# Surface labels that still cannot be parsed with a descriptive error instead
# of letting scikit-learn fail on NaN input.
unparsed_mask = y_pred.isna()
if unparsed_mask.any():
    bad_values = sorted(df_results.loc[unparsed_mask, "prediction"].astype(str).unique())
    raise ValueError(
        f"{int(unparsed_mask.sum())} prediction(s) are neither 'True' nor 'False': {bad_values}"
    )
y_pred = y_pred.astype(int)

# Binary metrics treat label 1 (True) as the positive class (sklearn default).
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1_final = f1_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average="macro")
f1_weighted = f1_score(y_true, y_pred, average="weighted")

conf_matrix = confusion_matrix(y_true, y_pred)

print("\n===== Classification Performance Results =====")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_final:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")
print(f"F1-score (Weighted): {f1_weighted:.4f}")

print("\n===== Classification Confusion Matrix =====")
print(conf_matrix)

print("\n===== Detailed Classification Report =====")
print(classification_report(y_true, y_pred))