# Hard vs. Soft Prediction with LLMs

In [1]:
import ast
from concurrent.futures import ThreadPoolExecutor
import os
from typing import Literal
import re

import kagglehub  # pip install kagglehub
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import tqdm
from together import Together

from prompts import (
    SYSTEM_PROMPT,
    CLASSIFY_SOFT_PROMPT_TEMPLATE,
    CLASSIFY_HARD_PROMPT_TEMPLATE
)

from dotenv import load_dotenv # type: ignore
load_dotenv();

# Dataset

[View on Kaggle](https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification)

A **subset** of the original dataset was used for computational efficiency:

- **4 classes**
- **250 samples per class**
- **1000 samples total**


In [2]:
SEED = 42  # fixes the shuffle so Restart & Run All reproduces the same row order
N = 250    # examples kept per category (hyperparameter)

data_dir = kagglehub.dataset_download("saurabhshahane/ecommerce-text-classification")

# os.listdir returns entries in arbitrary order and the directory may hold
# non-CSV files, so select the CSV explicitly and deterministically.
csv_files = sorted(f for f in os.listdir(data_dir) if f.lower().endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {data_dir}")
csv_file = csv_files[0]
csv_path = os.path.join(data_dir, csv_file)

# NOTE(review): the columns are renamed by position, which suggests the file's
# first row may not be a real header. If the CSV has no header row, read_csv
# consumes the first record as column names and that sample is lost --
# confirm, and if so pass header=None, names=["label", "text"] instead.
raw_data: pd.DataFrame = pd.read_csv(csv_path)  # type: ignore
raw_data = raw_data.rename(columns={
    raw_data.columns[0]: "label",
    raw_data.columns[1]: "text",
})

classes: np.ndarray = raw_data["label"].unique()

print("Labels: ", len(classes))
print(f"Original: {raw_data.shape}")

# Keep only the first N examples of each category. The per-class slices are
# collected first and concatenated once, instead of growing a frame in a loop.
subset_data = pd.concat(
    [raw_data[raw_data["label"] == label].head(N) for label in classes],
    ignore_index=True,
)

# Shuffle with a fixed seed so the evaluation order (and the preview below)
# is the same on every run.
data = subset_data.sample(frac=1, random_state=SEED)
print(f"Used: {data.shape}")

data.head()

Labels:  4
Original: (50424, 2)
Used: (1000, 2)


Unnamed: 0,label,text
119,Household,Generic Imported 30Pcs Assorted Hand Sewing Ne...
130,Household,Vardhman Bunny Mix 4 no. (6 pc Pack) Wool Ball...
566,Clothing & Accessories,Devil Boy's PU Leather Belt (Black) Devil is t...
302,Books,"Society Tea Premium Darjeeling Tea, 250g Once ..."
631,Clothing & Accessories,Probiker Half Finger Motorcycle Riding Gloves ...


In [3]:
# Number of prompts sent concurrently in each batch (also used as the thread-pool size).
BATCH_SIZE = 16
# Together API client; the key is read from the TOGETHER_API_KEY environment variable.
client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

def classify_and_evaluate(mode: Literal["soft", "hard"], model_name):
    """Classify every row of the global ``data`` frame with an LLM and return the accuracy.

    One chat completion is requested per row through the global ``client``,
    using ``SYSTEM_PROMPT`` plus the mode-specific template filled with the
    row's ``text``. Requests are issued in batches of ``BATCH_SIZE`` on a
    thread pool. The reply must contain ``<answer>...</answer>`` wrapping a
    Python literal, which is parsed with ``ast.literal_eval``.

    Args:
        mode: ``"hard"`` uses ``CLASSIFY_HARD_PROMPT_TEMPLATE`` and takes the
            ``"Category"`` entry of the parsed answer as the prediction;
            ``"soft"`` uses ``CLASSIFY_SOFT_PROMPT_TEMPLATE`` and takes the key
            with the largest value in the parsed answer.
        model_name: Model identifier passed to the chat-completions call. The
            part after the last ``/`` is used in the printed summary line.

    Returns:
        The fraction of rows whose prediction equals the ``label`` column,
        or ``0.0`` when ``data`` has no rows.

    Side effects:
        Prints one line per failed request/parse and a final accuracy line.
        A failed row is not retried; it receives the sentinel prediction
        ``"ERROR 239"`` and is scored like any other prediction.
    """
    error_label = "ERROR 239"  # sentinel prediction returned when a call or parse fails

    # Choose prompt based on the mode
    template = CLASSIFY_HARD_PROMPT_TEMPLATE if mode == "hard" else CLASSIFY_SOFT_PROMPT_TEMPLATE

    # Read the two columns directly instead of iterating rows with iterrows().
    all_prompts = [template.format(description=text) for text in data["text"].tolist()]
    targets = data["label"].tolist()

    def classify(prompt):
        """Return the predicted label for one prompt, or the error sentinel."""
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt}
                ]
            )
            content = response.choices[0].message.content.strip()  # type: ignore

            # Extract content between <answer> tags
            answer_match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL)
            if not answer_match:
                raise ValueError("No answer tags found in response")

            answer_content = answer_match.group(1).strip()
            answer = ast.literal_eval(answer_content)

            if mode == "soft":
                # Soft answers map each category to a score; predict the top one.
                return max(answer, key=answer.get)

            return answer["Category"]

        except Exception as e:
            # Deliberate best-effort: one bad response must not abort the run.
            print(f"Error: {e}")
            return error_label

    # Batch and parallelize. The pool is created once and reused for every
    # batch rather than being rebuilt and torn down on each iteration.
    predictions = []
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        for start in tqdm(range(0, len(all_prompts), BATCH_SIZE), desc="Processing Batches"):
            batch_prompts = all_prompts[start:start + BATCH_SIZE]
            # executor.map preserves input order, so predictions stay aligned with targets.
            predictions.extend(executor.map(classify, batch_prompts))

    n_correct: int = sum(1 for p, t in zip(predictions, targets) if p == t)
    # Guard against an empty frame instead of raising ZeroDivisionError.
    accuracy = n_correct / len(targets) if targets else 0.0

    print(f"""[{model_name.split("/")[-1]}][{mode}] Accuracy: {accuracy}""")
    return accuracy


# Predictions

In [4]:
# Prediction modes and models to evaluate; every (mode, model) pair is run once.
modes: list[Literal["hard", "soft"]] = ["hard", "soft"]
models = ["lgai/exaone-3-5-32b-instruct"]

# One [mode, model, accuracy] row per run, ordered by mode first and then by model.
results = [
    [mode, model, classify_and_evaluate(mode=mode, model_name=model)]
    for mode in modes
    for model in models
]

Processing Batches: 100%|██████████| 63/63 [01:33<00:00,  1.49s/it]


[exaone-3-5-32b-instruct][hard] Accuracy: 0.854


Processing Batches: 100%|██████████| 63/63 [02:12<00:00,  2.10s/it]

[exaone-3-5-32b-instruct][soft] Accuracy: 0.862





In [5]:
# Render the collected [mode, model, accuracy] rows as a grid table, accuracy to 3 decimals.
summary_table = tabulate(
    results,
    headers=["Mode", "Model", "Accuracy"],
    floatfmt=".3f",
    tablefmt="grid",
)
print(summary_table)

+--------+------------------------------+------------+
| Mode   | Model                        |   Accuracy |
| hard   | lgai/exaone-3-5-32b-instruct |      0.854 |
+--------+------------------------------+------------+
| soft   | lgai/exaone-3-5-32b-instruct |      0.862 |
+--------+------------------------------+------------+


# Conclusion

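Soft prediction (label smoothing) showed slightly better accuracy than hard prediction on this 1,000-sample subset: 0.862 vs. 0.854 (**+0.8 percentage points**). That difference corresponds to 8 samples in a single run, so it should be treated as indicative rather than conclusive.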