## Library

In [None]:
from dotenv import load_dotenv
import os
import time
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm 
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

import torch
from transformers import (BertModel, BertTokenizer, ElectraModel, ElectraTokenizer)
from kobert_transformers import get_tokenizer
from sentence_transformers import SentenceTransformer

from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

load_dotenv()

In [None]:
# Random seed shared by the train/validation/test splits below; it is also
# embedded in the saved FAISS index names and reported in the results table.
seed = 1

In [None]:
# import shutil
# cache_dir = os.path.expanduser("~/.cache/huggingface")
# if os.path.exists(cache_dir):
#     shutil.rmtree(cache_dir)
#     print("Hugging Face 캐시 삭제 완료.")
# else:
#     print("Hugging Face 캐시가 이미 삭제되었거나 존재하지 않습니다.")

## Data

In [None]:
# Load both provided CSV files; they are merged and re-split below.
train_df = pd.read_csv("민원_train.csv", encoding="utf-8")
test_df = pd.read_csv("민원_validation.csv", encoding="utf-8")

# Merge the two files and keep only the first row for each distinct question text.
combined_df = pd.concat([train_df, test_df], ignore_index=True)
combined_df = combined_df.drop_duplicates(subset="Q_refined", keep="first")

# Categories with fewer than `min_threshold` rows are dropped entirely; every
# remaining category is downsampled to exactly `fixed_sample_size` rows.
min_threshold = 1600
fixed_sample_size = 1600

category_counts = combined_df['predication'].value_counts()

# Sample each qualifying category, then concatenate once. Iterating in
# value_counts() order keeps the row order (and therefore the seeded splits
# below) the same as when the frame was grown one category at a time.
# NOTE: the per-category sampling uses a fixed random_state of 42, not `seed`.
sampled_subsets = [
    combined_df[combined_df['predication'] == category].sample(fixed_sample_size, random_state=42)
    for category, count in category_counts.items()
    if count >= min_threshold
]
if not sampled_subsets:
    # Fail here with a clear message instead of a KeyError inside train_test_split.
    raise ValueError(
        f"No category in 'predication' has at least {min_threshold} rows; nothing to sample."
    )
balanced_df = pd.concat(sampled_subsets)

# Hold out a stratified test set of `test_size` rows.
test_size = 1000
train_data, test_data = train_test_split(
    balanced_df,
    test_size=test_size,
    stratify=balanced_df['predication'],
    random_state=seed,
)

# Carve a stratified validation set of `val_size` rows out of the remaining training rows.
val_size = 200
train_data, validation_data = train_test_split(
    train_data,
    test_size=val_size,
    stratify=train_data['predication'],
    random_state=seed,
)

# Independent copies so later cells can modify them without touching the split frames.
train = train_data.copy()
validation = validation_data.copy()
test = test_data.copy()

## Embedding Models

In [None]:
# Device that every locally loaded model (and its tokenized inputs) is moved to.
device = torch.device("cpu")

# Jina-v3
class JinaEmbedding:
    """Embedding wrapper around a SentenceTransformer checkpoint.

    Defaults to the ``jinaai/jina-embeddings-v3`` checkpoint.
    """

    def __init__(self, model_name="jinaai/jina-embeddings-v3"):
        # trust_remote_code=True is forwarded to SentenceTransformer when loading.
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
        self.model = encoder.to(device)

    def embed_documents(self, texts):
        """Encode ``texts`` and return the embeddings as a NumPy array."""
        return self.model.encode(texts, convert_to_numpy=True)

    def embed_query(self, text):
        """Embed one text by encoding a one-element batch and returning its only row."""
        vectors = self.embed_documents([text])
        return vectors[0]

# KoBERT
class KoBERTEmbedding:
    """Sentence embeddings from a KoBERT encoder via attention-masked mean pooling."""

    def __init__(self, model_name="monologg/kobert"):
        # get_tokenizer() is called without arguments, so `model_name` only
        # selects the encoder weights, not the tokenizer.
        self.tokenizer = get_tokenizer()
        self.model = BertModel.from_pretrained(model_name).to(device)

    def embed_documents(self, texts):
        """Embed a list of texts.

        Args:
            texts: list of strings to embed as one padded batch.

        Returns:
            NumPy array with one row per input text.
        """
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        # Average only over real tokens. Without the mask, padding positions
        # are averaged in and a text's embedding changes with the length of
        # whatever else is in the batch (so embed_query and a batched
        # embed_documents would disagree for the same text).
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        embeddings = (hidden * mask).sum(dim=1) / token_counts

        # .cpu() is a no-op on CPU and keeps this working if `device` is ever a GPU.
        return embeddings.cpu().numpy()

    def embed_query(self, text):
        """Embed a single text and return its vector."""
        return self.embed_documents([text])[0]

# KoELECTRA
class KoELECTRAEmbedding:
    """Sentence embeddings from a KoELECTRA encoder via attention-masked mean pooling."""

    def __init__(self, model_name="monologg/koelectra-base-v3-discriminator"):
        # Tokenizer and encoder are both loaded from the same checkpoint name.
        self.tokenizer = ElectraTokenizer.from_pretrained(model_name)
        self.model = ElectraModel.from_pretrained(model_name).to(device)

    def embed_documents(self, texts):
        """Embed a list of texts.

        Args:
            texts: list of strings to embed as one padded batch.

        Returns:
            NumPy array with one row per input text.
        """
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        # Average only over real tokens. Without the mask, padding positions
        # are averaged in and a text's embedding changes with the length of
        # whatever else is in the batch (so embed_query and a batched
        # embed_documents would disagree for the same text).
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        embeddings = (hidden * mask).sum(dim=1) / token_counts

        # .cpu() is a no-op on CPU and keeps this working if `device` is ever a GPU.
        return embeddings.cpu().numpy()

    def embed_query(self, text):
        """Embed a single text and return its vector."""
        return self.embed_documents([text])[0]

# KURE-V1
class KUREEmbedding:
    """Embedding wrapper around a SentenceTransformer checkpoint.

    Defaults to the ``nlpai-lab/KURE-v1`` checkpoint.
    """

    def __init__(self, model_name="nlpai-lab/KURE-v1"):
        # trust_remote_code=True is forwarded to SentenceTransformer when loading.
        encoder = SentenceTransformer(model_name, trust_remote_code=True)
        self.model = encoder.to(device)

    def embed_documents(self, texts):
        """Encode ``texts`` and return the embeddings as a NumPy array."""
        return self.model.encode(texts, convert_to_numpy=True)

    def embed_query(self, text):
        """Embed one text by encoding a one-element batch and returning its only row."""
        vectors = self.embed_documents([text])
        return vectors[0]

# KoE5
class KoE5Embedding(KUREEmbedding):
    """Same wrapper as KUREEmbedding, defaulting to the ``nlpai-lab/KoE5`` checkpoint."""

    def __init__(self, model_name="nlpai-lab/KoE5"):
        # Loading and encoding are inherited unchanged; only the default checkpoint differs.
        super().__init__(model_name=model_name)

In [None]:
# Registry of every embedding backend under comparison, keyed by the name used
# in saved index paths and in the results table. All models are instantiated
# eagerly when this cell runs.
EMBEDDING_MODELS = {
    "text-embedding-3-small": OpenAIEmbeddings(model="text-embedding-3-small"),
    # Name the model explicitly so this entry keeps matching its key even if
    # the library's default embedding model changes.
    "text-embedding-ada-002": OpenAIEmbeddings(model="text-embedding-ada-002"),
    "jina-v3": JinaEmbedding(),
    "kobert": KoBERTEmbedding(),
    "koelectra": KoELECTRAEmbedding(),
    "kure-v1": KUREEmbedding(),
    "koe5": KoE5Embedding(),
}

## Create Vectorstore

In [None]:
def create_vectorstore(df, embedding_model, seed):
    """Embed the rows of ``df`` with one registered model and save a FAISS index.

    Args:
        df: DataFrame providing the ``Q_refined`` and ``predication`` columns.
        embedding_model: key into ``EMBEDDING_MODELS``.
        seed: value embedded in the saved index's directory name.

    Returns:
        The FAISS vectorstore that was built and saved.
    """
    questions = df["Q_refined"].tolist()
    categories = df["predication"].tolist()
    # Each stored document carries its label as a text suffix.
    docs = [f"{question} (Label: {category})" for question, category in zip(questions, categories)]

    model = EMBEDDING_MODELS[embedding_model]

    # Embed in fixed-size chunks and flatten the per-chunk results into one list.
    chunk_size = 16
    vectors = [
        vector
        for start in range(0, len(docs), chunk_size)
        for vector in model.embed_documents(docs[start : start + chunk_size])
    ]

    store = FAISS.from_embeddings(list(zip(docs, vectors)), model)

    out_dir = "embedding_comparison2"
    os.makedirs(out_dir, exist_ok=True)
    store.save_local(os.path.join(out_dir, f"faiss_index_{embedding_model}_seed{seed}"))

    return store

In [None]:
def create_all_vectorstores():
    """Build and save a FAISS index over ``train`` for every model in ``EMBEDDING_MODELS``."""
    for name in EMBEDDING_MODELS:
        create_vectorstore(train, name, seed)

create_all_vectorstores()

## Evaluation

In [None]:
def load_vectorstore(embedding_model, seed, folder_path="embedding_comparison2"):
    """Load a previously saved FAISS index for one embedding model.

    Args:
        embedding_model: key into ``EMBEDDING_MODELS``; also part of the index name.
        seed: seed value that was embedded in the index name when it was saved.
        folder_path: directory containing the saved indexes. The default matches
            the directory ``create_vectorstore`` writes to, so a fresh run loads
            the indexes it has just built rather than ones from another folder.

    Returns:
        The loaded FAISS vectorstore, using the model's ``embed_query`` to embed queries.

    Raises:
        FileNotFoundError: if no saved index exists at the expected path.
    """
    load_path = os.path.join(folder_path, f"faiss_index_{embedding_model}_seed{seed}")

    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Cannot find Vectorstore: {load_path}")

    # SECURITY: allow_dangerous_deserialization=True lets LangChain unpickle the
    # stored docstore. Only load indexes that were created locally and are trusted.
    return FAISS.load_local(
        load_path,
        EMBEDDING_MODELS[embedding_model].embed_query,
        allow_dangerous_deserialization=True,
    )

In [None]:
def _label_from_page_content(page_content):
    """Return the label stored in a document's trailing ``(Label: ...)`` suffix."""
    label = page_content.rpartition("(Label: ")[2]
    # Remove only the single closing parenthesis of the suffix, so labels that
    # themselves end in ")" are not damaged.
    return label[:-1] if label.endswith(")") else label


def evaluate_model_retrieval(df_test, embedding_model, seed, k_values=(1, 3, 5, 10)):
    """Score label retrieval for one embedding model on a test set.

    For each row of ``df_test`` the question text is used as a query against the
    model's saved FAISS index, and the labels of the retrieved documents are
    compared with the row's true label.

    Args:
        df_test: DataFrame providing the ``Q_refined`` and ``predication`` columns.
        embedding_model: key into ``EMBEDDING_MODELS``.
        seed: seed value identifying which saved index to load.
        k_values: cut-offs to evaluate; duplicates are ignored.

    Returns:
        ``{k: {"Recall": ..., "MRR": ...}}``. "Recall" is the fraction of queries
        whose true label appears in the top-k retrieved labels; "MRR" is the mean
        of 1/rank of the first matching label within the top k (0 when absent).
        Both are 0.0 when ``df_test`` is empty.
    """
    vectorstore = load_vectorstore(embedding_model, seed)

    # De-duplicate while keeping order: a repeated k would otherwise be counted
    # twice per query and push Recall above 1.
    k_values = list(dict.fromkeys(k_values))
    if not k_values:
        return {}
    max_k = max(k_values)

    reciprocal_ranks = {k: [] for k in k_values}
    recall_counts = {k: 0 for k in k_values}

    for query, true_label in tqdm(zip(df_test["Q_refined"], df_test["predication"]), total=len(df_test), desc="Evaluating"):
        similar_docs = vectorstore.similarity_search(query, k=max_k)
        retrieved_labels = [_label_from_page_content(doc.page_content) for doc in similar_docs]

        # Labels parsed from documents are strings, so compare against the
        # string form of the true label (non-string labels never matched before).
        expected = str(true_label)
        try:
            first_hit_rank = retrieved_labels.index(expected) + 1
        except ValueError:
            first_hit_rank = None

        # The first match within the top k is the first match overall, if its rank <= k.
        for k in k_values:
            if first_hit_rank is not None and first_hit_rank <= k:
                recall_counts[k] += 1
                reciprocal_ranks[k].append(1 / first_hit_rank)
            else:
                reciprocal_ranks[k].append(0)

    n_queries = len(df_test)
    results = {}
    for k in k_values:
        if n_queries == 0:
            # Avoid ZeroDivisionError / NaN for an empty test set.
            results[k] = {"Recall": 0.0, "MRR": 0.0}
        else:
            results[k] = {
                "Recall": recall_counts[k] / n_queries,
                "MRR": np.mean(reciprocal_ranks[k]),
            }

    return results

In [None]:
# Evaluate every registered embedding model on the held-out test split.
results = {}
for name in EMBEDDING_MODELS:
    results[name] = evaluate_model_retrieval(test, name, seed, k_values=[1, 3, 5, 10])

# Flatten the nested {model: {k: {"Recall", "MRR"}}} metrics into one row per model.
final_results = []
for name, per_k in results.items():
    row = {"Seed": seed, "Model": name}
    row.update({f"Recall@{k}": per_k[k]["Recall"] for k in [1, 3, 5, 10]})
    row.update({f"MRR@{k}": per_k[k]["MRR"] for k in [1, 3, 5, 10]})
    final_results.append(row)

final_results = pd.DataFrame(final_results)