In [1]:
import ast
import os
from typing import Literal

import kagglehub  # pip install kagglehub
import numpy as np
import pandas as pd
import regex as re
from tabulate import tabulate
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer  # pip install accelerate auto-gptq optimum huggingface_hub[hf_xet]
from tqdm import tqdm

# Dataset

[View on Kaggle](https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification)

A **subset** of the original dataset was used for computational efficiency:

- **4 classes**
- 50 samples per class (the first 50 rows of each class, not a random draw)
- **200 samples total**


In [2]:
SEED = 42  # fixes the shuffle below so the evaluation order is the same on every run
N = 50     # number of examples kept per class (hyperparameter)

data_dir = kagglehub.dataset_download("saurabhshahane/ecommerce-text-classification")

# os.listdir returns entries in arbitrary order and may contain non-CSV files,
# so pick the CSV explicitly instead of trusting entry [0].
csv_files = sorted(name for name in os.listdir(data_dir) if name.lower().endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {data_dir}")
csv_file = csv_files[0]
csv_path = os.path.join(data_dir, csv_file)

# NOTE(review): the columns are renamed by position, which suggests the CSV may not
# have a header row. If so, the first record is consumed as the header here and
# `header=None` should be passed to read_csv -- confirm against the file.
raw_data = pd.read_csv(csv_path)
raw_data = raw_data.rename(columns={
    raw_data.columns[0]: "label",
    raw_data.columns[1]: "text",
})

classes = raw_data["label"].unique()

print("Labels: ", len(classes))
print(f"Original: {raw_data.shape}")

# Keep only the first N examples of each category. The per-class slices are
# concatenated once rather than growing a DataFrame inside the loop.
subset_data = pd.concat(
    [raw_data[raw_data["label"] == label].head(N) for label in classes],
    ignore_index=True,
)

# Shuffle so the classes are interleaved; seeded for reproducibility.
data = subset_data.sample(frac=1, random_state=SEED)
print(f"Used: {data.shape}")

data.head()

Labels:  4
Original: (50424, 2)
Used: (200, 2)


Unnamed: 0,label,text
104,Clothing & Accessories,Kuchipoo Girl's Cotton Regular Fit T-Shirt - P...
85,Books,The Hindu View Of Life About the Author Profes...
39,Household,Sehaz Artworks 'Our Memories' Pasted Wood Phot...
5,Household,Paper Plane Design Starry Night Vangoh Wall Ar...
133,Clothing & Accessories,FabSeasons Camouflage Polyester Multi Function...


### Load `Qwen3-0.6B-GPTQ-Int8`

In [None]:
# Hugging Face Hub id of the checkpoint used for every experiment below.
model_name = "Qwen/Qwen3-0.6B-GPTQ-Int8"

# The tokenizer and the model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="cuda" requests GPU placement, so this cell needs a CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="cuda")

### Inference code

In [4]:
def _build_examples(shot_type, monte_carlo):
    """Return the in-context example text for the prompt ("" for zero-shot or any other value)."""
    if shot_type == "one":
        if not monte_carlo:
            return """
Example:
Product description: Samsung phone with 1.5 inch display, FM radio, and 800 mAh battery.
Category: Electronics

"""
        return """
Example:
Product description: Samsung phone with 1.5 inch display, FM radio, and 800 mAh battery.
Scores: {"Household": 1, "Books": 0, "Clothing & Accessories": 0, "Electronics": 10}

"""
    if shot_type == "few":
        if not monte_carlo:
            return """
=== Examples start ===

Product description: Canvas - 24x18" ready-to-hang wall art with vibrant colors.
Category: Household

Product description: Mystery novel with 300 pages by Author Agatha Christie.
Category: Books

Product description: Wool sweater with v-neck.
Category: Clothing & Accessories

Product description: Samsung phone with 1.5 inch display, FM radio, and 800 mAh battery.
Category: Electronics

=== Examples end ===

"""
        return """
Examples:

Product description: Canvas - 24x18" ready-to-hang wall art with vibrant colors.
Scores: {"Household": 10, "Books": 0, "Clothing & Accessories": 1, "Electronics": 2}

Product description: Mystery novel with 300 pages by Author Agatha Christie.
Scores: {"Household": 1, "Books": 10, "Clothing & Accessories": 0, "Electronics": 0}

Product description: Wool sweater with v-neck.
Scores: {"Household": 0, "Books": 0, "Clothing & Accessories": 10, "Electronics": 0}

Product description: Samsung phone with 1.5 inch display, FM radio, and 800 mAh battery.
Scores: {"Household": 0, "Books": 0, "Clothing & Accessories": 1, "Electronics": 10}

"""
    return ""


def _encode_chat(system_prompt, user_prompt):
    """Render the chat template (thinking disabled) and tokenize it onto the model's device."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    return tokenizer([prompt_text], return_tensors="pt").to(model.device)


def _generate_text(model_inputs, **generate_kwargs):
    """Run generation and decode only the newly generated tokens, stripped of surrounding whitespace."""
    generated_ids = model.generate(**model_inputs, **generate_kwargs)
    prompt_length = len(model_inputs.input_ids[0])
    output_ids = generated_ids[0][prompt_length:].tolist()
    return tokenizer.decode(output_ids, skip_special_tokens=True).strip()


def _parse_scores(content, categories):
    """Parse a model reply into a normalized {category: weight} dict, or None if the reply is unusable."""
    # Tolerate extra text around the dictionary: take the first "{...}" span.
    start = content.find("{")
    if start == -1:
        return None
    end = content.find("}", start)
    if end == -1:
        return None

    try:
        # literal_eval only accepts Python literals, so text produced by the model
        # is never executed as code (the previous eval() would have run it).
        parsed = ast.literal_eval(content[start:end + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    if not isinstance(parsed, dict):
        return None

    scores = {}
    for category in categories:
        value = parsed.get(category, 0)
        # bool is a subclass of int; reject it along with non-numeric and negative scores.
        if isinstance(value, bool) or not isinstance(value, (int, float)) or value < 0:
            return None
        scores[category] = float(value)

    # Normalize over the known categories only, so unexpected extra keys cannot
    # dilute the distribution; an all-zero or infinite total is unusable.
    total = sum(scores.values())
    if not 0 < total < float("inf"):
        return None
    return {category: score / total for category, score in scores.items()}


def classify_text(text, shot_type: Literal["zero", "one", "few"] = "zero", monte_carlo: bool = False, n_samples: int = 10):
    """Classify a product description into one of the four e-commerce categories.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Product description to classify.
        shot_type: "zero" for no in-context examples, "one" for a single example,
            "few" for one example per category. Any other value behaves like "zero".
        monte_carlo: If False, the model is asked once for a category name and its
            raw reply is returned. If True, the model is asked ``n_samples`` times
            for per-category confidence scores under randomly drawn sampling
            settings; the normalized scores are summed and the best category wins.
        n_samples: Number of generations in Monte Carlo mode (ignored otherwise).

    Returns:
        In direct mode, the model's reply with surrounding whitespace removed (it
        is not validated against the category list). In Monte Carlo mode, one of
        "Household", "Books", "Clothing & Accessories", "Electronics".

    Raises:
        ValueError: If ``monte_carlo`` is True and ``n_samples`` is less than 1.

    Note:
        Generation is stochastic. The per-sample seed, temperature and top_k are
        drawn from NumPy's global RNG, so call ``np.random.seed(...)`` beforehand
        if Monte Carlo runs need to be repeatable.
    """
    examples = _build_examples(shot_type, monte_carlo)

    if not monte_carlo:
        system_prompt = """You are a product categorization expert. Your task is to classify product descriptions into exactly one of these categories:
        - Household: Home items, tools, and outdoor equipment
        - Books: Books, publications, and literature
        - Clothing & Accessories: Apparel, fashion items, and accessories
        - Electronics: Phones, devices, and electronic equipment"""

        user_prompt = f"""
        {examples}Analyze the following product description carefully and classify it into the most appropriate category.
        Return ONLY the category name without any explanation or additional text.

        Product description: {text}
        Category:"""

        model_inputs = _encode_chat(system_prompt, user_prompt)
        return _generate_text(
            model_inputs,
            max_new_tokens=30,
            # temperature/top_p only apply when sampling is on; request it explicitly
            # instead of depending on the checkpoint's default generation config.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Monte Carlo approach
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1 when monte_carlo=True")

    categories = ["Household", "Books", "Clothing & Accessories", "Electronics"]
    results = {category: 0.0 for category in categories}
    uniform_weight = 1 / len(categories)

    system_prompt = """You are a product categorization expert. Your task is to assign a confidence score from 0 to 10 for EACH of these categories:
        - Household: Home items, tools, and outdoor equipment
        - Books: Books, publications, and literature
        - Clothing & Accessories: Apparel, fashion items, and accessories
        - Electronics: Phones, devices, and electronic equipment"""

    # The prompt is identical for every sample, so it is built and tokenized once.
    # The indentation inside the literal is kept as-is so the prompt text is unchanged.
    user_prompt = f"""
            {examples}Analyze the following product description carefully and assign a confidence score from 0 to 10 for EACH category.
            The highest score means the most probable class.
            Return your answer as a dictionary with categories as keys and scores as values.
            Format your response as: {{"Household": score, "Books": score, "Clothing & Accessories": score, "Electronics": score}}
            Do not include any other text, explanations, or labels.

            Product description: {text}
            Scores:"""
    model_inputs = _encode_chat(system_prompt, user_prompt)

    for _ in range(n_samples):
        # Vary the sampling settings per draw to diversify the replies.
        seed = np.random.randint(0, 10000)
        temperature = np.random.uniform(0.5, 1.5)
        top_k = np.random.randint(5, 50)

        # Seeds torch for this draw; the seed itself comes from NumPy's global RNG.
        torch.manual_seed(seed)

        content = _generate_text(
            model_inputs,
            max_new_tokens=100,
            temperature=temperature,
            top_k=top_k,
            do_sample=True,
        )

        scores = _parse_scores(content, categories)
        for category in categories:
            # An unparseable reply contributes equal weight to every category.
            results[category] += uniform_weight if scores is None else scores[category]

    # Category with the highest aggregated score (ties resolve to the earliest in `categories`).
    return max(results, key=results.get)

### Run experiments

In [5]:
def run_experiments(use_mc: bool = False, n_samples: int = 10):
    """Evaluate zero-, one- and few-shot prompting on the notebook-level ``data`` frame.

    Every row of ``data`` ("text", "label") is classified with ``classify_text``
    for each prompting style; a prediction counts as correct when it equals the
    label, ignoring case and surrounding whitespace. A results table is printed.

    Args:
        use_mc: Forwarded to ``classify_text`` as ``monte_carlo``.
        n_samples: Forwarded to ``classify_text``; only used in Monte Carlo mode.

    Returns:
        None. Results are reported via the progress bars and the printed table.
    """
    mode = "w/ MC" if use_mc else "w/o MC"
    results = {}

    # Execution order is few -> zero -> one; the table below reports zero -> one -> few.
    for shot in ["few", "zero", "one"]:
        n_correct = 0
        n_total = 0

        # Create a progress bar
        pbar = tqdm(
            zip(data["text"].values, data["label"].values),
            total=data.shape[0],
            desc=f"{mode}, {shot} shot",
        )

        for text, label in pbar:
            # Make prediction, update counters
            prediction = classify_text(text, shot_type=shot, monte_carlo=use_mc, n_samples=n_samples)
            n_total += 1
            if prediction.strip().lower() == label.strip().lower():
                n_correct += 1

            # Update progress bar with current accuracy
            pbar.set_description(f"{mode}, {shot} shot - Acc: {n_correct / n_total:.4f}")

        # An empty dataset yields NaN instead of a ZeroDivisionError.
        accuracy = n_correct / n_total if n_total else float("nan")
        results[shot] = {"correct": n_correct, "total": n_total, "accuracy": accuracy}

    # Create and display a table with results
    table_data = [
        [
            f"{shot}-shot",
            results[shot]["correct"],
            results[shot]["total"],
            f"{results[shot]['accuracy']:.4f}",
        ]
        for shot in ["zero", "one", "few"]
    ]

    print("\nAccuracy Results:")
    print(tabulate(table_data, headers=["Shot Type", "Correct", "Total", "Accuracy"], tablefmt="grid"))

In [6]:
# Baseline: a single generation per product for each prompting style (zero/one/few-shot).
run_experiments(use_mc=False)

w/o MC, few shot - Acc: 0.5150: 100%|██████████| 200/200 [00:34<00:00,  5.84it/s]
w/o MC, zero shot - Acc: 0.5050: 100%|██████████| 200/200 [00:30<00:00,  6.57it/s]
w/o MC, one shot - Acc: 0.5800: 100%|██████████| 200/200 [00:32<00:00,  6.12it/s]


Accuracy Results:
+-------------+-----------+---------+------------+
| Shot Type   |   Correct |   Total |   Accuracy |
| zero-shot   |       101 |     200 |      0.505 |
+-------------+-----------+---------+------------+
| one-shot    |       116 |     200 |      0.58  |
+-------------+-----------+---------+------------+
| few-shot    |       103 |     200 |      0.515 |
+-------------+-----------+---------+------------+





In [7]:
# Monte Carlo variant: classify_text aggregates n_samples (default 10) generations per
# product, so this cell takes far longer than the baseline run.
run_experiments(use_mc=True)

w/ MC, few shot - Acc: 0.6300: 100%|██████████| 200/200 [53:53<00:00, 16.17s/it]
w/ MC, zero shot - Acc: 0.6000: 100%|██████████| 200/200 [53:41<00:00, 16.11s/it]
w/ MC, one shot - Acc: 0.4700: 100%|██████████| 200/200 [53:27<00:00, 16.04s/it]


Accuracy Results:
+-------------+-----------+---------+------------+
| Shot Type   |   Correct |   Total |   Accuracy |
| zero-shot   |       120 |     200 |       0.6  |
+-------------+-----------+---------+------------+
| one-shot    |        94 |     200 |       0.47 |
+-------------+-----------+---------+------------+
| few-shot    |       126 |     200 |       0.63 |
+-------------+-----------+---------+------------+





# Conclusions

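- The Monte Carlo method improved accuracy by **9.5–11.5 percentage points** for zero-shot (50.5% → 60.0%) and few-shot (51.5% → 63.0%) prompting, but reduced it by 11 points for one-shot prompting (58.0% → 47.0%). With only 200 samples and no fixed random seed, these differences are indicative rather than statistically confirmed.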

- Few-shot and one-shot prompting **outperformed** zero-shot prompting in 3 out of 4 cases.

- The anomaly where one-shot prompting underperforms zero-shot in the Monte Carlo setting is likely due to the **small dataset size** and **poor examples**.