## Library

In [None]:
from dotenv import load_dotenv

import os
import torch
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
from pydantic import BaseModel, Field

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from sklearn.model_selection import train_test_split

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.callbacks import get_openai_callback
from langchain.vectorstores import FAISS

from sentence_transformers import SentenceTransformer

load_dotenv()

In [None]:
# Seed index; it selects the seed{N} directory and file-name prefix used when
# the train/validation/test CSVs are loaded.
seed = 1

## Data

In [None]:
# Read the three pre-split CSV files for the selected seed, in the order
# train -> validation -> test, and bind each to its own DataFrame.
train, validation, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"../seed{seed}/seed{seed}_train_쇼핑.csv",
        f"../seed{seed}/seed{seed}_validation_쇼핑.csv",
        f"../seed{seed}/seed{seed}_test_쇼핑.csv",
    )
)

## Classifier

In [None]:
# Structured-output schema for the classifier: one string field holding the
# predicted category. The class is passed to `llm.with_structured_output`.
# NOTE(review): documented with `#` comments rather than a docstring on
# purpose — presumably a class docstring would be forwarded to the model as
# part of the schema description; confirm before adding one.
class CategoryClassification(BaseModel):
    # Plain `str`: nothing in this schema restricts the value to the category
    # list given in the system prompt.
    prediction: str = Field(description="Predicted category of the user query")


# Chat model used for classification, sampled at temperature 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo",
)

# Same model, but with responses parsed into CategoryClassification objects.
structured_llm_grader = llm.with_structured_output(
    CategoryClassification,
)

## Prompt & Chain

In [None]:
# System message: lists the allowed categories and the expected JSON shape.
# The doubled braces are literal braces in the rendered prompt.
system_message = (
    "system",
    "Classify the query into one of: 제품, 배송, 교환/반품/환불, 행사, AS, 포장, 구매, 웹사이트.\n"
    "Return only one category exactly as listed above. No other categories or explanations.\n"
    'Return in JSON: {{"prediction": "category"}}',
)

# Human message: carries the two template variables, {query} and
# {similar_cases}; both must be supplied when the chain is invoked.
human_message = ("human", "Query: {query}\nRelevant cases: {similar_cases}")

classification_prompt = ChatPromptTemplate.from_messages(
    [system_message, human_message]
)

# Chain: render the prompt, then send it to the structured-output model.
classification_grader = classification_prompt.pipe(structured_llm_grader)

## Prediction

In [None]:
# Classify every test query with the LLM chain and collect one record per
# query: the query text, its ground-truth category and the predicted category.
results = []

# Only the two needed columns are iterated; the row index is not used.
for question, actual_label in tqdm(
    zip(test["text"], test["category"]), total=len(test), desc="Processing"
):
    # The prompt template declares both {query} and {similar_cases}; leaving
    # one out makes the prompt fail on missing input variables. No retrieval
    # step is visible in this notebook, so an empty string is supplied.
    prediction = classification_grader.invoke(
        {"query": question, "similar_cases": ""}
    )

    results.append(
        {
            "question": question,
            "actual_label": actual_label,
            "prediction_label": prediction.prediction,
        }
    )

# Column names match what the post-processing and evaluation cells read
# ("actual_label" / "prediction_label"). Passing `columns` explicitly keeps
# that schema even when `test` has no rows.
df_results = pd.DataFrame(
    results, columns=["question", "actual_label", "prediction_label"]
)

In [None]:
# Post-processing: when the model answers with only one part of the combined
# exchange/return/refund category, map it back to the full category name.
# Series.replace with a dict swaps whole cell values that equal a key.
partial_to_full = dict.fromkeys(("교환", "반품", "환불"), "교환/반품/환불")
normalized_predictions = df_results["prediction_label"].replace(partial_to_full)
df_results["prediction_label"] = normalized_predictions

## Evaluation

In [None]:
# Ground-truth and predicted labels for every evaluated query.
y_true, y_pred = df_results["actual_label"], df_results["prediction_label"]

# Scalar metrics: accuracy, macro-averaged precision/recall/F1, weighted F1.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1_macro = f1_score(y_true, y_pred, average="macro")
f1_weighted = f1_score(y_true, y_pred, average="weighted")

conf_matrix = confusion_matrix(y_true, y_pred)

# Summary section, one metric per line with four decimals.
summary_lines = [
    "\n===== Classification Performance Results =====",
    f"Accuracy: {accuracy:.4f}",
    f"Precision (Macro): {precision:.4f}",
    f"Recall (Macro): {recall:.4f}",
    f"F1-score (Macro): {f1_macro:.4f}",
    f"F1-score (Weighted): {f1_weighted:.4f}",
]
print("\n".join(summary_lines))

# Confusion matrix section: header, then the matrix on the following line.
print("\n===== Classification Confusion Matrix =====", conf_matrix, sep="\n")

# Per-class report section; the report is built after its header is printed.
print("\n===== Detailed Classification Report =====")
report_text = classification_report(y_true, y_pred)
print(report_text)