In [None]:
!pip install kagglehub



In [None]:
import ast
from concurrent.futures import ThreadPoolExecutor
import os
from typing import Literal
import re

import kagglehub  # pip install kagglehub
import pandas as pd
import numpy as np
from tabulate import tabulate
from tqdm import tqdm
from together import Together
from transformers import pipeline
from sklearn.metrics import accuracy_score, log_loss, classification_report

In [None]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# 1. Download the dataset and find the CSV file
dataset_path = kagglehub.dataset_download("alfathterry/bbc-full-text-document-classification")
# Sort so the choice is deterministic if the download ever contains several
# CSVs, and fail with a readable error instead of a bare StopIteration.
csv_files = sorted(Path(dataset_path).glob("*.csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset at {dataset_path}")
csv_path = csv_files[0]

# 2. Load into a DataFrame and standardize column names
#    (assumes the first column holds the article text and the second the label)
df = pd.read_csv(csv_path)  # type: ignore
cols = df.columns.tolist()
df = df.rename(columns={cols[0]: "text", cols[1]: "label"})

# 3. Inspect classes and original size
labels = df["label"].unique()
print(f"Number of categories: {labels.size}")
print(f"Original dataset shape: {df.shape}")

# 4. Take only the first N samples per label
# NOTE(review): the recorded evaluation output further down shows 32 batches
# of 16 requests (~500 rows), so that run was probably produced with a smaller
# N (e.g. 100) — confirm before comparing numbers.
N = 250
df_limited = (
    df
    .groupby("label", sort=False)    # preserve original label order
    .head(N)                         # first N rows of each group
    .reset_index(drop=True)
)

# 5. Shuffle the resulting subset
df_shuffled = df_limited.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Shape after sampling {N} per class and shuffling: {df_shuffled.shape}")

# 6. Show counts per label
print(df_shuffled["label"].value_counts())

# 7. Expose the evaluation set under the name `classify_and_evaluate` reads.
#    Without this binding the notebook fails on a fresh kernel with
#    NameError: name 'data' is not defined.
data = df_shuffled


Number of categories: 5
Original dataset shape: (2225, 2)
Shape after sampling 250 per class and shuffling: (1250, 2)
label
sport            250
tech             250
business         250
politics         250
entertainment    250
Name: count, dtype: int64


In [None]:
# Category label -> short description; insertion order is the order in which
# the categories are listed in the system prompt.
NEWS_CATEGORIES = dict(
    entertainment="Movies, music, arts, celebrity news",
    business="Economy, markets, finance, companies",
    sport="Sports events, athletes, competitions",
    politics="Government, policy, elections",
    tech="Technology, gadgets, innovations",
)

# One "- <label>: <description>" bullet per category.
_category_bullets = [f"- {cat}: {desc}" for cat, desc in NEWS_CATEGORIES.items()]

# System message shared by both classification modes.
SYSTEM_PROMPT = """
You are a news categorization expert. Your task is to classify news texts into predefined categories.

Available categories:
{categories_list}

Guidelines:
- Be precise and consistent in your categorization
- Consider the main theme and context of the text
- If a text could fit multiple categories, choose the most dominant one
- For soft classification, provide probability scores that sum to 1
- For hard classification, select the single most appropriate category
""".strip().format(categories_list="\n".join(_category_bullets))

# User message for "soft" mode: one probability per category.
# Doubled braces are literal braces; {description} is filled in via .format().
CLASSIFY_SOFT_PROMPT_TEMPLATE = """
Assign a probability score (0 < score < 1) to each category so they sum to 1.
Wrap your response in <answer></answer> tags.

# Expected format:
<answer>
{{
    "entertainment": <probability>,
    "business":      <probability>,
    "sport":         <probability>,
    "politics":      <probability>,
    "tech":          <probability>
}}
</answer>

News Text:
{description}

Provide only the JSON response without any additional text or explanations.
""".strip()

# User message for "hard" mode: a single category under the "Category" key.
CLASSIFY_HARD_PROMPT_TEMPLATE = """
Select the most fitting category (among provided) for the given news text.
Wrap your response in <answer></answer> tags.

# Expected format:
<answer>
{{
    "Category": "<selected category>"
}}
</answer>

News Text:
{description}

Provide only the JSON response without any additional text or explanations.
""".strip()


In [None]:
from pathlib import Path
import os
import re
import ast
import pandas as pd
import numpy as np
from typing import Literal
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from together import Together

# Configuration
# The API key is read from the environment so that no secret is stored in the
# notebook. Any key that was previously committed in this cell should be
# treated as leaked and revoked.
API_KEY = os.environ.get("TOGETHER_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the TOGETHER_API_KEY environment variable before running this notebook."
    )
client = Together(api_key=API_KEY)
BATCH = 16  # number of prompts sent concurrently per batch

def classify_and_evaluate(
    model_name: str,
    mode: Literal["soft", "hard"],
    dataset: "pd.DataFrame | None" = None,
) -> float:
    """
    Run zero-shot classification in batches and compute accuracy.

    Args:
        model_name: Model identifier passed to the chat-completions endpoint.
        mode: "soft" asks for a score per category and predicts the key with
            the highest score; any other value uses the hard prompt and
            predicts the returned 'Category'.
        dataset: Frame with `text` and `label` columns to evaluate on. When
            omitted, the module-level `data` frame is used (the original
            behaviour), so existing calls keep working.

    Returns:
        Fraction of rows whose prediction equals the true label. A request
        that fails or returns an unparsable answer is logged and predicted as
        the string "ERROR", which is then compared like any other prediction.

    Raises:
        NameError: no `dataset` was passed and no module-level `data` exists.
        ValueError: the evaluation frame has no rows.
    """
    # Resolve the evaluation frame explicitly instead of silently depending on
    # whatever happens to be bound to `data` in the kernel.
    frame = dataset if dataset is not None else globals().get("data")
    if frame is None:
        raise NameError(
            "No evaluation data: pass `dataset=` or define a module-level "
            "`data` DataFrame with 'text' and 'label' columns."
        )

    # Select the appropriate prompt template
    template = (
        CLASSIFY_SOFT_PROMPT_TEMPLATE
        if mode == "soft"
        else CLASSIFY_HARD_PROMPT_TEMPLATE
    )

    # Build prompts and targets
    prompts = [template.format(description=text) for text in frame["text"]]
    targets = frame["label"].tolist()
    if not targets:
        # Avoids a ZeroDivisionError when computing the accuracy below.
        raise ValueError("Evaluation data is empty; nothing to classify.")

    def _call_api(text_prompt: str) -> str:
        """Send one prompt and return the predicted label, or "ERROR" on failure."""
        try:
            resp = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user",   "content": text_prompt}
                ]
            )
            body = resp.choices[0].message.content.strip()
            # Non-greedy + DOTALL: take the first <answer> block, which may span lines.
            match = re.search(r"<answer>(.*?)</answer>", body, re.DOTALL)
            if not match:
                raise ValueError("Missing <answer>…</answer> tags")
            result = ast.literal_eval(match.group(1).strip())
            return (
                max(result, key=result.get)
                if mode == "soft"
                else result["Category"]
            )
        except Exception as err:
            # Deliberate best-effort: one bad response must not abort the run.
            print(f"[{model_name}][{mode}] Error: {err}")
            return "ERROR"

    # Process in parallel batches. One pool is reused for all batches;
    # pool.map yields results in input order and extend() drains each batch
    # before the next one is submitted.
    all_preds = []
    with ThreadPoolExecutor(max_workers=BATCH) as pool:
        for start in tqdm(range(0, len(prompts), BATCH), desc="Batches"):
            chunk = prompts[start : start + BATCH]
            all_preds.extend(pool.map(_call_api, chunk))

    # Compute accuracy
    correct = sum(1 for pred, true in zip(all_preds, targets) if pred == true)
    acc = correct / len(targets)
    model_id = Path(model_name).name  # last path-like segment, e.g. "org/model" -> "model"
    print(f"[{model_id}][{mode}] Accuracy: {acc:.4f}")
    return acc


In [None]:
from itertools import product
from typing import Literal

# Prompting modes and model identifiers to evaluate
mode_options: list[Literal["hard", "soft"]] = ["hard", "soft"]
model_names = ["lgai/exaone-3-5-32b-instruct"]

# One [mode, model, accuracy] row per combination; modes vary slowest,
# models fastest.
results = []

for mode in mode_options:
    for model in model_names:
        acc = classify_and_evaluate(model_name=model, mode=mode)
        results += [[mode, model, acc]]


Batches: 100%|██████████| 32/32 [00:35<00:00,  1.09s/it]


[exaone-3-5-32b-instruct][hard] Accuracy: 0.9500


Batches: 100%|██████████| 32/32 [01:01<00:00,  1.93s/it]

[exaone-3-5-32b-instruct][soft] Accuracy: 0.9340





In [None]:
# Render the collected [mode, model, accuracy] rows as a grid table.
results_table = tabulate(
    results,
    headers=["Mode", "Model", "Accuracy"],
    floatfmt=".3f",
    tablefmt="grid",
)
print(results_table)

+--------+------------------------------+------------+
| Mode   | Model                        |   Accuracy |
| hard   | lgai/exaone-3-5-32b-instruct |      0.950 |
+--------+------------------------------+------------+
| soft   | lgai/exaone-3-5-32b-instruct |      0.934 |
+--------+------------------------------+------------+


Я менял тысячу моделек, менял размеры классов, но у меня не получилось заставить софт побить хард, но я слышал, что есть люди, у кого получилось. Возможно, мне не повезло с датасетом.