
# 🧠 Kantar All‑LLM Notebook (Azure OpenAI, Few‑Shots + Glossary)

This notebook implements an **LLM‑only** pipeline to extract:
- **sector**, **categoria**, **marca**, **size**, **unit**  
from noisy product text using **Azure OpenAI** with:
- **Augmented few‑shots:** auto‑sampled from your train data **+** hand‑picked tricky examples
- **Soft label glossary:** suggest allowed values to the model (not hard‑enforced)
- **JSON Schema enforced** outputs
- Optional **self‑consistency** (vote/median over multiple generations)
- **Weighted F1** evaluation vs. test ground truth (if present)

> ⚙️ You’ll need Azure env vars set on your machine:  
`AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_DEPLOYMENT`, and `AZURE_OPENAI_API_VERSION=2024-12-01-preview`.



## 0) Setup
Install dependencies once (uncomment the line below and run it in a notebook cell if needed):
```bash
# !pip install -U openai==1.* pandas numpy scikit-learn nbformat
```


In [None]:

import os, re, json, unicodedata, random
from typing import Dict, List, Tuple, Optional
from pathlib import Path

import numpy as np
import pandas as pd



## 1) Load data & utilities
- Normalize column names
- Pick text column (e.g., `clean_description` in train, `ocr_text` in test)


In [None]:

def normalize_col(name: str) -> str:
    """Return an accent-free, lowercase, underscore-separated column name.

    The name is NFD-decomposed so that combining marks (Unicode category
    ``Mn``) can be dropped; the result is trimmed, lowercased, and every run
    of whitespace is replaced by a single underscore.
    """
    decomposed = unicodedata.normalize('NFD', name)
    without_marks = ''.join(
        filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    )
    trimmed = without_marks.strip().lower()
    return re.sub(r'\s+', '_', trimmed)

def load_and_normalize(train_path: str, test_path: str):
    """Read the train and test CSV files and normalize their column names.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        A ``(train, test)`` tuple of DataFrames whose column names have been
        passed through ``normalize_col``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    for frame in (train_df, test_df):
        frame.columns = list(map(normalize_col, frame.columns))
    return train_df, test_df

def pick_text_col(df: pd.DataFrame) -> str:
    """Choose the column of ``df`` that holds the free-text description.

    Well-known column names are tried first, in priority order. If none of
    them is present, the string-like column with the greatest mean text
    length is returned.

    Args:
        df: Frame to inspect; the known names are compared against
            ``df.columns`` as-is.

    Returns:
        The name of the selected text column.

    Raises:
        AssertionError: If no known name matches and ``df`` has no
            string-like column.
    """
    candidates = [
        "ocr_text","ocr","text","descripcion","description","clean_description",
        "product_description","full_text","title"
    ]
    for c in candidates:
        if c in df.columns:
            return c

    # Accept classic ``object`` columns as well as pandas' dedicated string
    # dtypes, which do not compare equal to ``object``.
    str_cols = [
        c for c in df.columns
        if df[c].dtype == object or pd.api.types.is_string_dtype(df[c].dtype)
    ]
    if not str_cols:
        # Raised explicitly (same exception type as the former ``assert``)
        # so the check still runs when Python is started with -O.
        raise AssertionError("No string-like columns found for text.")
    return max(str_cols, key=lambda c: df[c].fillna("").astype(str).str.len().mean())

def map_target_columns(train: pd.DataFrame):
    """Map each canonical target name to the matching column of ``train``.

    For every target (``sector``, ``categoria``, ``marca``) the aliases are
    normalized with ``normalize_col`` and tried in order; the first one that
    is a column of ``train`` is recorded. Targets without any matching
    column are left out of the returned dict.
    """
    alias_table = {
        "sector":    ["sector"],
        "categoria": ["categoria","category","categoría"],
        "marca":     ["marca","brand"],
    }
    resolved = {}
    for target, aliases in alias_table.items():
        match = next(
            (col for col in map(normalize_col, aliases) if col in train.columns),
            None,
        )
        if match is not None:
            resolved[target] = match
    return resolved



## 2) Few‑shots (auto + hand‑picked) & soft label glossary


In [None]:

SIZE_RE = re.compile(
    r"(?P<qty>\d+(?:[\.,]\d+)?)\s*(?P<unit>ml|l|lt|litros?|ltrs?|cc|cl|g|gr|gramos?|kg|kgs?|un|uds?|u|pack|pz|pcs?)\b",
    re.IGNORECASE,
)

def _maybe_size_unit(text: str):
    if not isinstance(text, str): return None, None
    m = SIZE_RE.search(text)
    if not m: return None, None
    q = m.group("qty").replace(",", ".")
    try: qv = float(q)
    except: qv = None
    unit = m.group("unit").lower()
    unit_map = {"lt":"l","ltrs":"l","litro":"l","litros":"l","gr":"g","gramo":"g","gramos":"g","kgs":"kg","uds":"un","pz":"pcs","u":"un"}
    return qv, unit_map.get(unit, unit)

def _example_from_row(row, text_col: str, label_cols: List[str]) -> Dict:
    """Build one few-shot dict (text, non-null labels, parsed size/unit) from a row."""
    example = {"text": str(row[text_col])}
    for col in label_cols:
        if col in row and pd.notna(row[col]):
            example[col] = str(row[col])
    qty, unit = _maybe_size_unit(row[text_col])
    if qty is not None:
        example["size"] = qty
    if unit is not None:
        example["unit"] = unit
    return example

def build_auto_fewshots(train: pd.DataFrame, text_col: str, label_cols: List[str],
                        per_label: int = 2, max_examples: int = 24, seed: int = 42):
    """Sample few-shot examples from the training data.

    When one of the known label columns (categoria/category, sector,
    marca/brand, in that priority) is in ``label_cols``, up to ``per_label``
    rows are sampled for each of its classes, visiting classes in a
    seed-shuffled order, until ``max_examples`` examples are collected.
    Otherwise ``max_examples`` rows are sampled from the whole pool.

    Args:
        train: Training frame.
        text_col: Name of the text column in ``train``.
        label_cols: Label columns to copy into each example (under their own
            column names).
        per_label: Maximum rows sampled per class of the primary label.
        max_examples: Upper bound on the number of returned examples.
        seed: Seed for class shuffling and row sampling.

    Returns:
        A list of dicts with ``text``, the non-null labels, and ``size`` /
        ``unit`` when a size can be parsed from the text. Empty when
        ``max_examples`` is not positive.
    """
    if max_examples <= 0:
        # The in-loop cap is only checked after an append, so without this
        # guard a non-positive limit would still yield one example.
        return []

    rng = random.Random(seed)
    # Rows without text would otherwise become examples whose text is "nan".
    pool = train[train[text_col].notna()].copy()

    # Rows whose text contains a size are moved to the front of the pool.
    # NOTE(review): the random sampling below does not favor earlier rows, so
    # this ordering only influences which rows a given seed selects - confirm
    # whether a stronger preference for sized examples was intended.
    has_size = (
        pool[text_col].astype(str)
        .map(lambda s: SIZE_RE.search(s) is not None)
        .astype(bool)
    )
    pool = pd.concat([pool[has_size], pool[~has_size]])

    primary = next(
        (c for c in map(normalize_col, ["categoria","category","sector","marca","brand"])
         if c in label_cols),
        None,
    )

    if not primary:
        sampled = pool.sample(min(max_examples, len(pool)), random_state=seed)
        return [_example_from_row(r, text_col, label_cols) for _, r in sampled.iterrows()]

    classes = list(pool[primary].value_counts().index)
    rng.shuffle(classes)
    examples = []
    for cls in classes:
        rows = pool[pool[primary] == cls]
        if len(rows) == 0:
            continue
        picked = rows.sample(min(per_label, len(rows)), random_state=seed)
        for _, r in picked.iterrows():
            examples.append(_example_from_row(r, text_col, label_cols))
            if len(examples) >= max_examples:
                return examples
    return examples

def validate_handpicked_examples(handpicked: List[Dict]):
    """Filter and trim user-supplied few-shot examples.

    Entries that are not dicts, or whose ``text`` is missing or blank after
    stripping, are dropped. Each kept entry carries the stripped text plus
    whichever of the recognized label/size keys are present and not ``None``;
    any other key is discarded.

    Returns:
        A new list of cleaned example dicts (empty for ``None`` / empty input).
    """
    allowed_keys = ("sector","categoria","category","marca","brand","size","unit")
    valid = []
    for candidate in (handpicked or []):
        if not isinstance(candidate, dict) or "text" not in candidate:
            continue
        text = str(candidate["text"]).strip()
        if not text:
            continue
        entry = {"text": text}
        entry.update(
            {key: candidate[key] for key in allowed_keys if candidate.get(key) is not None}
        )
        valid.append(entry)
    return valid

def merge_fewshots(auto_shots: List[Dict], handpicked: List[Dict], max_total: int = 40):
    """Combine hand-picked and auto-sampled examples, dropping duplicates.

    Hand-picked examples come first so they win when the same text also
    appears among the auto-sampled ones. Two examples are duplicates when
    their ``text`` values are equal after stripping whitespace.

    Args:
        auto_shots: Auto-sampled examples (may be ``None``).
        handpicked: Hand-picked examples (may be ``None``).
        max_total: Maximum number of examples to return.

    Returns:
        The first ``max_total`` unique examples, as the original dict
        objects. Empty when ``max_total`` is not positive.
    """
    if max_total <= 0:
        # The cap below is only checked after an append; without this guard
        # a non-positive limit would still return one example.
        return []

    seen, deduped = set(), []
    for ex in list(handpicked or []) + list(auto_shots or []):
        text = ex.get("text")
        # Missing or non-string text cannot be used as a prompt; skip it.
        key = text.strip() if isinstance(text, str) else ""
        if not key or key in seen:
            continue
        seen.add(key)
        deduped.append(ex)
        if len(deduped) >= max_total:
            break
    return deduped

def glossary_block(glossary: Optional[Dict[str, List[str]]]):
    """Render the soft label glossary as a text block for the system prompt.

    Args:
        glossary: Mapping of field name to suggested values. ``None``, an
            empty mapping, or fields with no values produce no output.

    Returns:
        ``""`` when there is nothing to show; otherwise a block that starts
        and ends with a newline, with one bullet per field listing at most
        the first 30 values (followed by ``", ..."`` when truncated).
    """
    if not glossary:
        return ""
    lines = []
    for field, vals in glossary.items():
        if not vals:
            continue
        preview = ", ".join(str(v) for v in vals[:30])
        if len(vals) > 30:
            preview += ", ..."
        lines.append(f"- {field}: prefer one of [{preview}] when applicable; if none fits, output a best guess.")
    if not lines:
        return ""
    # Real newlines: the escaped "\\n" used previously put a literal
    # backslash-n sequence into the prompt instead of line breaks.
    return (
        "\nLabel glossary (soft guidance; not hard rules):\n"
        + "\n".join(lines)
        + "\n"
    )



## 3) Build messages (system with soft glossary + few‑shots)


In [None]:

def build_messages(fewshots: List[Dict], text: str, label_glossary: Optional[Dict[str, List[str]]] = None):
    """Assemble the chat messages for one extraction request.

    The list holds, in order: a system message (fixed instructions followed
    by the output of ``glossary_block``), one user/assistant pair per
    few-shot example, and a final user message carrying ``text``. Each
    assistant turn is the JSON object expected for that example; the
    ``category`` / ``brand`` keys are used when ``categoria`` / ``marca``
    are absent or falsy.
    """
    instructions = (
        "You normalize retail product descriptions from noisy OCR.\n"
        "Extract keys: sector (string), categoria (string), marca (string), size (number or null), unit (string or null).\n"
        "Guidelines:\n"
        "- Keep strings concise and canonical.\n"
        "- If a value cannot be determined, use null.\n"
        "- For multi-brand mentions, choose the primary product brand; avoid retailer or sub-brands unless clearly primary.\n"
        "- For packs, if the text implies multiples (e.g., '6x330ml'), prioritize the single-item size (330) and unit ('ml').\n"
        "- Normalize common units like ml,l,g,kg,un,pcs.\n"
    )
    messages = [
        {"role": "system", "content": instructions + glossary_block(label_glossary)}
    ]
    for shot in fewshots:
        prompt = shot["text"]
        expected = {
            "sector": shot.get("sector"),
            "categoria": shot.get("categoria") or shot.get("category"),
            "marca": shot.get("marca") or shot.get("brand"),
            "size": shot.get("size"),
            "unit": shot.get("unit"),
        }
        messages += [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": json.dumps(expected, ensure_ascii=False)},
        ]
    messages.append({"role":"user","content": text})
    return messages



## 4) Azure OpenAI client + JSON Schema enforcement


In [None]:

def get_azure_client():
    """Create an ``AzureOpenAI`` client configured from environment variables.

    ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_API_KEY`` are required (a
    missing one raises ``KeyError``); ``AZURE_OPENAI_API_VERSION`` falls back
    to ``2024-12-01-preview`` when unset.
    """
    from openai import AzureOpenAI

    env = os.environ
    azure_endpoint = env["AZURE_OPENAI_ENDPOINT"]
    api_key = env["AZURE_OPENAI_API_KEY"]
    return AzureOpenAI(
        api_version=env.get("AZURE_OPENAI_API_VERSION", "2024-12-01-preview"),
        azure_endpoint=azure_endpoint,
        api_key=api_key,
    )

def _json_schema():
    return {
        "name": "KantarExtraction",
        "schema": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "sector":    {"type": ["string","null"]},
                "categoria": {"type": ["string","null"]},
                "marca":     {"type": ["string","null"]},
                "size":      {"type": ["number","null"]},
                "unit":      {"type": ["string","null"]}
            },
            "required": ["sector","categoria","marca","size","unit"]
        },
        "strict": True
    }

def call_llm_jsonschema(client, deployment: str, messages: List[Dict],
                        temperature: float = 0.2, max_tokens: int = 600):
    """Request one schema-constrained completion and parse it into a dict.

    Args:
        client: Object exposing ``chat.completions.create`` (as returned by
            ``get_azure_client``).
        deployment: Deployment name passed as ``model``.
        messages: Chat messages to send.
        temperature: Sampling temperature.
        max_tokens: Completion token limit.

    Returns:
        The parsed JSON object. When the message content cannot be parsed,
        or parses to something other than a JSON object, a dict with all
        five keys set to ``None`` is returned instead (best-effort: one bad
        completion does not abort the run).

    Errors raised by the API call itself are not caught here.
    """
    resp = client.chat.completions.create(
        model=deployment,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1.0,
        response_format={"type":"json_schema","json_schema":_json_schema()},
    )
    content = resp.choices[0].message.content
    try:
        parsed = json.loads(content)
    except (TypeError, ValueError):
        # json.loads raises TypeError for non-string content such as None and
        # ValueError (JSONDecodeError) for malformed or truncated JSON.
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    # Callers read fields with ``.get``; never hand them a list or scalar.
    return {"sector": None, "categoria": None, "marca": None, "size": None, "unit": None}



## 5) Self‑consistency aggregation


In [None]:

def _majority_vote_str(candidates: List[Optional[str]]) -> Optional[str]:
    c = [x.strip() for x in candidates if isinstance(x, str) and x.strip()!=""]
    if not c: return None
    keys = [x.lower() for x in c]
    vc = pd.Series(keys).value_counts()
    top_key = vc.index[0]
    for orig in c:
        if orig.lower() == top_key:
            return orig
    return c[0]

def _median_number(candidates: List[Optional[float]]) -> Optional[float]:
    vals = [float(x) for x in candidates if isinstance(x,(int,float))]
    if not vals: return None
    return float(np.median(vals))



## 6) End‑to‑end All‑LLM runner


In [None]:

def weighted_f1(y_true: pd.Series, y_pred: List) -> Optional[float]:
    """Compute the weighted F1 of ``y_pred`` against the labeled rows of ``y_true``.

    Rows where ``y_true`` is null are excluded. Both sides are compared as
    strings, so a missing prediction counts as a wrong label.

    Args:
        y_true: Ground-truth labels; may contain nulls and may carry any index.
        y_pred: Predictions in the same row order as ``y_true``.

    Returns:
        The weighted F1 score, or ``None`` when ``y_true`` has no labeled row.
    """
    mask = y_true.notna()
    if mask.sum() == 0:
        return None

    # Imported only when there is something to score.
    from sklearn.metrics import f1_score

    # Predictions are positional: give them y_true's index so the boolean
    # mask lines up even when y_true does not have a default 0..n-1 index.
    pred_series = pd.Series(list(y_pred), index=y_true.index)
    return float(
        f1_score(
            y_true[mask].astype(str),
            pred_series[mask].astype(str),
            average="weighted",
            zero_division=0,
        )
    )

def score_predictions(test: pd.DataFrame, targets: Dict[str, str], preds: Dict[str, List]):
    """Score every target that has both ground truth and predictions.

    A target is scored with ``weighted_f1`` only when its mapped column
    exists in ``test`` and ``preds`` holds a ``pred_<target>`` entry; other
    targets are omitted from the result.

    Returns:
        Dict mapping target name to the value returned by ``weighted_f1``.
    """
    return {
        tgt: weighted_f1(test[col], preds[f"pred_{tgt}"])
        for tgt, col in targets.items()
        if col in test.columns and f"pred_{tgt}" in preds
    }

def save_outputs(test: pd.DataFrame, preds: Dict[str, List], out_csv: str,
                 metrics_json: str, scores: Dict[str, Optional[float]]):
    """Write the predictions CSV and the metrics JSON.

    Args:
        test: Test frame; it is copied, not modified.
        preds: Column name -> list of predictions, added as columns to the copy.
        out_csv: Destination of the CSV (test columns plus predictions).
        metrics_json: Destination of the JSON holding the scores.
        scores: Target -> score; numeric values are rounded to 4 decimals and
            ``None`` is written as ``null``.

    Missing parent directories are created for both output files.
    """
    out = test.copy()
    for column, values in preds.items():
        out[column] = values

    # Both destinations may live in directories that do not exist yet.
    for destination in (out_csv, metrics_json):
        Path(destination).parent.mkdir(parents=True, exist_ok=True)

    out.to_csv(out_csv, index=False)
    rounded = {k: (None if v is None else round(v, 4)) for k, v in scores.items()}
    with open(metrics_json, "w", encoding="utf-8") as f:
        json.dump(rounded, f, ensure_ascii=False, indent=2)

def run_all_llm(train: pd.DataFrame, test: pd.DataFrame, out_csv: str, metrics_json: str,
                handpicked_examples: Optional[List[Dict]] = None,
                label_glossary: Optional[Dict[str, List[str]]] = None,
                max_fewshots: int = 40, per_label: int = 2,
                n_candidates: int = 1, temperature: float = 0.2,
                max_tokens: int = 600, seed: int = 42):
    """Run the LLM-only extraction over ``test`` and persist the results.

    Steps:
      1. Pick the text columns and map the target columns found in ``train``.
      2. Build the few-shot set (hand-picked first, then auto-sampled).
      3. For each test text, request ``max(1, n_candidates)`` completions and
         aggregate them: majority vote for string fields, median for size.
      4. Score the predictions and write the CSV and metrics files.

    Returns:
        The scores dict produced by ``score_predictions``.

    Raises:
        RuntimeError: If ``AZURE_OPENAI_DEPLOYMENT`` is unset or empty.
    """
    train_text_col = pick_text_col(train)
    test_text_col = pick_text_col(test)
    targets = map_target_columns(train)

    sampled_shots = build_auto_fewshots(
        train, train_text_col, list(targets.values()),
        per_label=per_label, max_examples=max_fewshots, seed=seed,
    )
    curated_shots = validate_handpicked_examples(handpicked_examples or [])
    fewshots = merge_fewshots(sampled_shots, curated_shots, max_total=max_fewshots)

    client = get_azure_client()
    deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
    if not deployment:
        raise RuntimeError("Set AZURE_OPENAI_DEPLOYMENT to your chat model deployment name.")

    # Output column order follows this tuple.
    fields = ("sector", "categoria", "marca", "size", "unit")
    preds = {f"pred_{field}": [] for field in fields}
    n_calls = max(1, n_candidates)

    for raw_text in test[test_text_col].fillna("").astype(str).tolist():
        messages = build_messages(fewshots, raw_text, label_glossary=label_glossary)
        candidates = [
            call_llm_jsonschema(client, deployment, messages,
                                temperature=temperature, max_tokens=max_tokens)
            for _ in range(n_calls)
        ]
        for field in fields:
            proposals = [cand.get(field) for cand in candidates]
            if field == "size":
                agreed = _median_number(proposals)
            else:
                agreed = _majority_vote_str(proposals)
            preds[f"pred_{field}"].append(agreed)

    scores = score_predictions(test, targets, preds)
    save_outputs(test, preds, out_csv, metrics_json, scores)
    return scores



## 7) Example usage
Fill in your file paths and (optionally) handpicked tricky examples and a soft glossary.


In [None]:

# %% Example usage (uncomment to run on your machine)
# train, test = load_and_normalize('Kantar_train.csv', 'Kantar_test.csv')
#
# tricky = [
#   {"text": "Coca-Cola Zero 6x330ml Lata", "sector":"Bebidas", "categoria":"Gaseosas", "marca":"Coca-Cola", "size":330, "unit":"ml"},
#   {"text": "Leche Deslactosada Alpina 1L", "sector":"Lácteos", "categoria":"Leche", "marca":"Alpina", "size":1, "unit":"l"},
#   {"text": "Chocolate para Taza Luker 250 g Pack x2", "sector":"Alimentos", "categoria":"Chocolate", "marca":"Luker", "size":250, "unit":"g"},
#   {"text": "Café Colcafé Tradicional Frasco 170g", "sector":"Alimentos", "categoria":"Café", "marca":"Colcafé", "size":170, "unit":"g"},
#   {"text": "Yogur Griego Natural Danone 4x125g", "sector":"Lácteos", "categoria":"Yogur", "marca":"Danone", "size":125, "unit":"g"},
# ]
#
# glossary = {
#   "sector":    ["Alimentos","Bebidas","Lácteos","Higiene","Aseo","Snacks"],
#   "categoria": ["Café","Chocolate","Gaseosas","Leche","Yogur","Galletas","Cereales"],
#   "marca":     ["Coca-Cola","Pepsi","Nestlé","Luker","Colcafé","Alpina","Danone","Bimbo"]
# }
#
# scores = run_all_llm(
#   train=train,
#   test=test,
#   out_csv='predictions.csv',
#   metrics_json='metrics.json',
#   handpicked_examples=tricky,
#   label_glossary=glossary,
#   max_fewshots=40,
#   per_label=2,
#   n_candidates=3,
#   temperature=0.2,
#   max_tokens=600
# )
# print('Weighted F1:', scores)


In [None]:
# --- BASIC AZURE OPENAI LLM TEST ---
# Checks if your Azure setup, key, and deployment are working.

from openai import AzureOpenAI
import os, json

# 1️⃣ Set your Azure environment variables before running:
# export AZURE_OPENAI_ENDPOINT="https://<your-endpoint>.openai.azure.com/"
# export AZURE_OPENAI_API_KEY="<your-key>"
# export AZURE_OPENAI_DEPLOYMENT="<your-deployment-name>"
# export AZURE_OPENAI_API_VERSION="2024-12-01-preview"

# 2️⃣ Initialize the client
client = AzureOpenAI(
    api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview"),
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)

# 3) Create a simple structured prompt
messages = [
    {"role": "system", "content": "You are a product classification assistant."},
    {
        "role": "user",
        "content": "Extract the product sector, category, brand, size, and unit from this text:\n\n'Café Colcafé Tradicional Frasco 170g'",
    },
]

# 4) Call the model with a structured JSON schema response
response = client.chat.completions.create(
    model=os.environ["AZURE_OPENAI_DEPLOYMENT"],
    messages=messages,
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "ProductExtraction",
            "schema": {
                "type": "object",
                # Strict structured outputs require every object schema to
                # set additionalProperties to false.
                "additionalProperties": False,
                "properties": {
                    "sector": {"type": "string"},
                    "categoria": {"type": "string"},
                    "marca": {"type": "string"},
                    "size": {"type": "number"},
                    "unit": {"type": "string"},
                },
                "required": ["sector", "categoria", "marca", "size", "unit"],
            },
            "strict": True,
        },
    },
    temperature=0.2,
    max_tokens=200,
)

# 5) Print the raw content, then the parsed JSON
raw_content = response.choices[0].message.content
print("Raw content:")
print(raw_content)

try:
    parsed = json.loads(raw_content)
except (TypeError, ValueError) as e:
    # TypeError: content is not a string (e.g. None); ValueError: invalid JSON.
    print("\n⚠️ Could not parse response:", e)
else:
    print("\n✅ Parsed JSON:")
    print(json.dumps(parsed, indent=2, ensure_ascii=False))
