## Library

In [None]:
import os
import time
import logging
import warnings
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from pydantic import BaseModel, Field
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.chains import LLMChain
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.callbacks import get_openai_callback
from langchain_huggingface import HuggingFaceEmbeddings

from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the OpenAI credentials used by ChatOpenAI below — confirm).
load_dotenv()


# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Experiment configuration shared by the cells below.
seed = 1        # random_state of the dataset splits; also part of the FAISS index path
retrieve_k = 3  # "k" handed to the vector-store retriever via search_kwargs

## Data

In [None]:
# Load both labelled CSV files and pool them; fresh splits are drawn below.
# NOTE(review): `test_df` is read from the *validation* CSV — confirm intended.
train_df = pd.read_csv("민원_train.csv", encoding="utf-8")
test_df = pd.read_csv("민원_validation.csv", encoding="utf-8")

combined_df = pd.concat([train_df, test_df], ignore_index=True)
# Keep only the first occurrence of each question text.
combined_df = combined_df.drop_duplicates(subset="Q_refined", keep="first")

# A category is kept only if it has at least `min_threshold` rows; every kept
# category is down-sampled to exactly `fixed_sample_size` rows.
min_threshold = 1600
fixed_sample_size = 1600

category_counts = combined_df['predication'].value_counts()

# Collect the per-category samples first and concatenate once, instead of
# re-copying a growing DataFrame on every iteration.
balanced_parts = []
for category, count in category_counts.items():
    if count < min_threshold:
        continue
    subset = combined_df[combined_df['predication'] == category]
    balanced_parts.append(subset.sample(fixed_sample_size, random_state=42))

if not balanced_parts:
    # Without this guard the stratified split below fails with an opaque
    # KeyError on an empty, column-less DataFrame.
    raise ValueError(
        f"No category has at least {min_threshold} rows; cannot build a balanced dataset."
    )

balanced_df = pd.concat(balanced_parts)

# Stratified hold-out: 1000 test rows, then 200 validation rows from the rest.
test_size = 1000
train_data, test_data = train_test_split(
    balanced_df,
    test_size=test_size,
    stratify=balanced_df['predication'],
    random_state=seed,
)

val_size = 200
train_data, validation_data = train_test_split(
    train_data,
    test_size=val_size,
    stratify=train_data['predication'],
    random_state=seed,
)

# Independent copies so later cells can modify them freely.
train = train_data.copy()
validation = validation_data.copy()
test = test_data.copy()

## Vectorstore

In [None]:
# Device the SentenceTransformer models below are moved to (CPU only).
device = torch.device("cpu")

class KUREEmbedding:
    """Thin wrapper exposing `embed_documents` / `embed_query` on a SentenceTransformer."""

    def __init__(self, model_name="nlpai-lab/KURE-v1"):
        """Load `model_name` and move it to the module-level `device`."""
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
        self.model = encoder.to(device)

    def embed_documents(self, texts):
        """Encode a list of texts and return the result as a NumPy array."""
        return self.model.encode(texts, convert_to_numpy=True)

    def embed_query(self, text):
        """Encode a single text by delegating to `embed_documents`; return its vector."""
        vectors = self.embed_documents([text])
        return vectors[0]

class KoE5Embedding(KUREEmbedding):
    """Same wrapper as `KUREEmbedding`, defaulting to the `nlpai-lab/KoE5` checkpoint."""

    def __init__(self, model_name="nlpai-lab/KoE5"):
        super().__init__(model_name=model_name)

In [None]:
# Location of the FAISS index for this seed.
vectorstore_path = f"../seed{seed}/faiss_index_koe5_seed{seed}"

# Fail here with a clear message: otherwise `vectorstore` / `retriever` stay
# undefined and a later cell dies with an unrelated-looking NameError.
if not os.path.exists(vectorstore_path):
    raise FileNotFoundError(
        f"FAISS index not found at '{vectorstore_path}'; build it before running this notebook."
    )

embeddings = KoE5Embedding()

# SECURITY: `allow_dangerous_deserialization=True` lets FAISS unpickle the
# stored docstore. Only load index directories you created yourself.
# The query-embedding callable is passed because the wrapper class is not a
# LangChain `Embeddings` subclass.
vectorstore = FAISS.load_local(
    vectorstore_path,
    embeddings.embed_query,
    allow_dangerous_deserialization=True,
)

# Retriever returning the `retrieve_k` nearest stored cases per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": retrieve_k})

## Classifier

In [None]:
# Structured-output schema with a single field holding the predicted label.
# (Deliberately documented with comments, not a class docstring: pydantic
# would add a docstring to the schema description sent to the model.)
class StatementClassification(BaseModel):
    prediction: str = Field(
        description=(
            "Predicted classification label for the statement. "
            "Possible labels: 요청/개선, 문의(질의), 건의/제기, 항의, 고충/토로, 협조, 감사."
        )
    )


# Chat model used for both query rewriting and classification.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)
# Same model, constrained to return a StatementClassification instance.
structured_llm_labeler = llm.with_structured_output(StatementClassification)

## Prompt & Chain

In [None]:
# System instruction shared by both query-rewriting prompts below.
_MULTIQUERY_SYSTEM_MESSAGE = (
    "You are an AI assistant that generates multiple rephrased versions of a given user query for better search retrieval."
    "Generate three alternative versions of the given query, ensuring that all queries are written in Korean only."
    "Limit the response to a maximum of 50 characters."
)

# Prompt formatted manually (with `query=`) by the prediction function.
multiquery_prompt = ChatPromptTemplate.from_messages([
    ("system", _MULTIQUERY_SYSTEM_MESSAGE),
    ("human", "Original query: {query}"),
])

# MultiQueryRetriever invokes its chain with the input key "question", so the
# prompt handed to it must declare `{question}`; the `{query}` template above
# would raise a missing-variable error on the first retrieval.
multiquery_retriever_prompt = ChatPromptTemplate.from_messages([
    ("system", _MULTIQUERY_SYSTEM_MESSAGE),
    ("human", "Original query: {question}"),
])

# Use the retriever configured with `retrieve_k` rather than a fresh
# default-k retriever, so the configured neighbour count is honoured.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retriever,
    llm=llm,
    prompt=multiquery_retriever_prompt,
)

# Final classification prompt. The doubled braces render a literal JSON
# example instead of being treated as template variables.
classification_prompt = ChatPromptTemplate.from_messages([
    ("system", "Classify the query into one of: 요청/개선, 문의(질의), 건의/제기, 항의, 고충/토로, 협조, 감사.\n"
               "Return in JSON: {{\"prediction\": \"category\"}}"),
    ("human", "Query: {query}\nRelevant cases: {similar_cases}"),
])

## Multi-Query

In [None]:
def predict_label_with_multiquery_rag(statement: str, idx: int, total: int, max_docs: int = 5):
    """Classify `statement` using retrieved similar cases as context.

    Steps: ask the LLM for rephrasings of the statement, retrieve documents
    for the original and each rephrasing, de-duplicate them by text, and pass
    the statement plus the retrieved texts to the structured classifier.

    Args:
        statement: The text to classify.
        idx: Position of the statement in the batch (currently unused; kept
            for interface compatibility).
        total: Batch size (currently unused; kept for interface compatibility).
        max_docs: Maximum number of retrieved documents kept per query.

    Returns:
        The object produced by `structured_llm_labeler`.
    """
    # format_messages keeps the system/human roles; `.format()` would flatten
    # the chat prompt into one plain string.
    rewrite_messages = multiquery_prompt.format_messages(query=statement)
    raw_rewrites = llm.invoke(rewrite_messages).content

    # Drop blank lines so empty strings are never sent to the retriever.
    queries = [statement]
    queries.extend(line.strip() for line in raw_rewrites.splitlines() if line.strip())

    # Key by page content: first-seen order is kept, duplicates collapse.
    docs_by_text = {}
    for query in queries:
        for doc in retriever_from_llm.invoke(query)[:max_docs]:
            docs_by_text[doc.page_content] = doc

    retrieved_texts_str = "\n".join(docs_by_text)

    classification_messages = classification_prompt.format_messages(
        query=statement,
        similar_cases=retrieved_texts_str,
    )
    return structured_llm_labeler.invoke(classification_messages)

## Prediction

In [None]:
# Run the classifier over every test row and collect gold vs. predicted labels.
results = []

# enumerate() supplies a 1-based position; the DataFrame index label is an
# arbitrary leftover of sampling and is not a progress counter.
progress = tqdm(test.iterrows(), total=len(test), desc="Processing")
for position, (_, row) in enumerate(progress, start=1):
    question = row["Q_refined"]
    actual_label = row["predication"]

    prediction = predict_label_with_multiquery_rag(question, position, len(test))

    results.append(
        {"question": question,
         "actual_label": actual_label,
         # The schema's field is `prediction`; `predicted_label` does not
         # exist and raised AttributeError.
         "prediction_label": prediction.prediction,
        }
    )

df_results = pd.DataFrame(results)

In [None]:
# Post-processing: expand a half label returned by the model (exact match
# only) to the full two-part category name.
_partial_to_full_label = {
    '고충': '고충/토로', '토로': '고충/토로',
    '요청': '요청/개선', '개선': '요청/개선',
    '건의': '건의/제기', '제기': '건의/제기',
    '문의': '문의(질의)', '질의': '문의(질의)',
}
df_results['prediction_label'] = df_results['prediction_label'].replace(_partial_to_full_label)

## Evaluation

In [None]:
# Gold labels and model predictions from the results table.
y_true, y_pred = df_results["actual_label"], df_results["prediction_label"]

# Headline metrics.
accuracy = accuracy_score(y_true, y_pred)
precision_macro, recall_macro = (
    scorer(y_true, y_pred, average="macro") for scorer in (precision_score, recall_score)
)
f1_macro, f1_weighted = (
    f1_score(y_true, y_pred, average=mode) for mode in ("macro", "weighted")
)

# Per-class breakdowns; the dict form of the report is kept for later use.
conf_matrix = confusion_matrix(y_true, y_pred)
classification_rep = classification_report(y_true, y_pred, output_dict=True)

print("\n===== Classification Performance Results =====")
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision (Macro): {precision_macro:.4f}",
    f"Recall (Macro): {recall_macro:.4f}",
    f"F1-score (Macro): {f1_macro:.4f}",
    f"F1-score (Weighted): {f1_weighted:.4f}",
    sep="\n",
)

print("\n===== Classification Confusion Matrix =====")
print(conf_matrix)

print("\n===== Detailed Classification Report =====")
print(classification_report(y_true, y_pred))