In [30]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import f1_score
from tqdm import tqdm
import csv

BASE_PATH = "/Users/santhosh/Desktop/ebay"
TRAIN_FILE = f"{BASE_PATH}/Tagged_Titles_Train.tsv"
LISTING_FILE = f"{BASE_PATH}/Listing_Titles.tsv"
SUBMISSION_FILE = f"{BASE_PATH}/submission_quiz.tsv"

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"🖥 Using device: {device}")

🖥 Using device: mps


In [35]:
df = pd.read_csv(TRAIN_FILE, sep="\t", keep_default_na=False, na_values=None)
df.columns = df.columns.str.strip().str.replace("﻿","", regex=True)
df["Tag"] = df["Tag"].str.strip()
df["Category"] = df["Category"].apply(lambda x: str(x).strip())

# Replace empty tags with most frequent tag per category, else "O"
most_freq_tag_by_cat = df[df["Tag"] != ""].groupby("Category")["Tag"].agg(lambda x: x.value_counts().idxmax()).to_dict()
df["Tag"] = df.apply(lambda r: most_freq_tag_by_cat.get(r["Category"], "O") if r["Tag"] == "" else r["Tag"], axis=1)

In [36]:
class NERDataset(Dataset):
    def __init__(self, texts, tags, tokenizer, tag2idx, max_len=64):
        self.texts, self.tags = texts, tags
        self.tokenizer, self.tag2idx = tokenizer, tag2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens, labels = self.texts[idx], self.tags[idx]
        enc = self.tokenizer(tokens, is_split_into_words=True, truncation=True,
                             padding="max_length", max_length=self.max_len, return_tensors="pt")
        word_ids = enc.word_ids(0)
        label_ids, prev_word = [], None
        for w in word_ids:
            if w is None:
                label_ids.append(-100)
            elif w != prev_word:
                label_ids.append(self.tag2idx.get(labels[w], self.tag2idx["O"]))
            else:
                label_ids.append(-100)
            prev_word = w
        enc = {k: v.squeeze() for k, v in enc.items()}
        enc["labels"] = torch.tensor(label_ids)
        return enc

In [37]:
def train_category_model(category):
    print(f"\n⚙️ Training for Category {category}")
    cat_df = df[df["Category"] == str(category)]

    # Group by Record Number
    records = []
    for rec_id, g in cat_df.groupby("Record Number"):
        tokens = g["Token"].tolist()
        tags = g["Tag"].tolist()
        records.append((tokens, tags))

    texts = [r[0] for r in records]
    labels = [r[1] for r in records]

    # Tag mappings
    unique_tags = sorted(set(t for seq in labels for t in seq))
    tag2idx = {t: i for i, t in enumerate(unique_tags)}
    idx2tag = {i: t for t, i in tag2idx.items()}

    # Split
    split = int(0.8 * len(texts))
    train_texts, val_texts = texts[:split], texts[split:]
    train_tags, val_tags = labels[:split], labels[split:]

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")
    model = AutoModelForTokenClassification.from_pretrained(
        "distilbert-base-german-cased",
        num_labels=len(tag2idx),
        id2label=idx2tag,
        label2id=tag2idx
    ).to(device)

    train_loader = DataLoader(NERDataset(train_texts, train_tags, tokenizer, tag2idx), batch_size=8, shuffle=True)
    val_loader = DataLoader(NERDataset(val_texts, val_tags, tokenizer, tag2idx), batch_size=8)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    epochs, patience = 20, 5
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps),
                                                num_training_steps=total_steps)

    best_f1, patience_counter = 0, 0
    save_dir = f"{BASE_PATH}/models/Category_{category}"
    os.makedirs(save_dir, exist_ok=True)

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Category {category} | Epoch {epoch}/{epochs}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        preds, labels_all = [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                logits = model(**batch).logits
                pred = torch.argmax(logits, dim=-1)
                for p, l, m in zip(pred.cpu(), batch["labels"].cpu(), batch["attention_mask"].cpu()):
                    active = m == 1
                    preds.extend(p[active].tolist())
                    labels_all.extend(l[active].tolist())
        f1 = f1_score(labels_all, preds, average="macro")
        print(f"Epoch {epoch} | Loss: {avg_loss:.4f} | F1: {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            patience_counter = 0
            torch.save(model.state_dict(), f"{save_dir}/best_model.pt")
            print("✅ Model improved and saved.")
        else:
            patience_counter += 1
            print(f"No improvement. Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print("⏹ Early stopping.")
                break

    print(f"🏁 Finished Category {category} | Best F1: {best_f1:.4f}")
    return model

In [38]:
model_cat1 = train_category_model(1)
model_cat2 = train_category_model(2)

print("\n🎉 Training complete for both Category 1 and Category 2.")


⚙️ Training for Category 1


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Category 1 | Epoch 1/20: 100%|██████████| 250/250 [01:30<00:00,  2.75it/s]


Epoch 1 | Loss: 1.5359 | F1: 0.3181
✅ Model improved and saved.


Category 1 | Epoch 2/20: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]


Epoch 2 | Loss: 0.3532 | F1: 0.4754
✅ Model improved and saved.


Category 1 | Epoch 3/20: 100%|██████████| 250/250 [01:29<00:00,  2.78it/s]


Epoch 3 | Loss: 0.2375 | F1: 0.5150
✅ Model improved and saved.


Category 1 | Epoch 4/20: 100%|██████████| 250/250 [01:30<00:00,  2.76it/s]


Epoch 4 | Loss: 0.1806 | F1: 0.5302
✅ Model improved and saved.


Category 1 | Epoch 5/20: 100%|██████████| 250/250 [01:21<00:00,  3.08it/s]


Epoch 5 | Loss: 0.1457 | F1: 0.5399
✅ Model improved and saved.


Category 1 | Epoch 6/20: 100%|██████████| 250/250 [01:21<00:00,  3.08it/s]


Epoch 6 | Loss: 0.1203 | F1: 0.5619
✅ Model improved and saved.


Category 1 | Epoch 7/20: 100%|██████████| 250/250 [01:21<00:00,  3.06it/s]


Epoch 7 | Loss: 0.0993 | F1: 0.5287
No improvement. Patience: 1/5


Category 1 | Epoch 8/20: 100%|██████████| 250/250 [01:21<00:00,  3.06it/s]


Epoch 8 | Loss: 0.0855 | F1: 0.5453
No improvement. Patience: 2/5


Category 1 | Epoch 9/20: 100%|██████████| 250/250 [01:26<00:00,  2.89it/s]


Epoch 9 | Loss: 0.0728 | F1: 0.5520
No improvement. Patience: 3/5


Category 1 | Epoch 10/20: 100%|██████████| 250/250 [01:26<00:00,  2.88it/s]


Epoch 10 | Loss: 0.0634 | F1: 0.5402
No improvement. Patience: 4/5


Category 1 | Epoch 11/20: 100%|██████████| 250/250 [01:26<00:00,  2.89it/s]


Epoch 11 | Loss: 0.0524 | F1: 0.5168
No improvement. Patience: 5/5
⏹ Early stopping.
🏁 Finished Category 1 | Best F1: 0.5619

⚙️ Training for Category 2


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Category 2 | Epoch 1/20: 100%|██████████| 250/250 [01:27<00:00,  2.87it/s]


Epoch 1 | Loss: 1.3721 | F1: 0.1672
✅ Model improved and saved.


Category 2 | Epoch 2/20: 100%|██████████| 250/250 [01:23<00:00,  2.98it/s]


Epoch 2 | Loss: 0.2659 | F1: 0.2869
✅ Model improved and saved.


Category 2 | Epoch 3/20: 100%|██████████| 250/250 [01:20<00:00,  3.10it/s]


Epoch 3 | Loss: 0.1703 | F1: 0.3323
✅ Model improved and saved.


Category 2 | Epoch 4/20: 100%|██████████| 250/250 [01:21<00:00,  3.09it/s]


Epoch 4 | Loss: 0.1271 | F1: 0.3271
No improvement. Patience: 1/5


Category 2 | Epoch 5/20: 100%|██████████| 250/250 [01:27<00:00,  2.86it/s]


Epoch 5 | Loss: 0.0960 | F1: 0.3355
✅ Model improved and saved.


Category 2 | Epoch 6/20: 100%|██████████| 250/250 [01:34<00:00,  2.64it/s]


Epoch 6 | Loss: 0.0797 | F1: 0.3235
No improvement. Patience: 1/5


Category 2 | Epoch 7/20: 100%|██████████| 250/250 [01:34<00:00,  2.64it/s]


Epoch 7 | Loss: 0.0598 | F1: 0.3640
✅ Model improved and saved.


Category 2 | Epoch 8/20: 100%|██████████| 250/250 [01:49<00:00,  2.29it/s]


Epoch 8 | Loss: 0.0513 | F1: 0.3446
No improvement. Patience: 1/5


Category 2 | Epoch 9/20: 100%|██████████| 250/250 [01:36<00:00,  2.60it/s]


Epoch 9 | Loss: 0.0414 | F1: 0.3986
✅ Model improved and saved.


Category 2 | Epoch 10/20: 100%|██████████| 250/250 [01:43<00:00,  2.41it/s]


Epoch 10 | Loss: 0.0327 | F1: 0.3895
No improvement. Patience: 1/5


Category 2 | Epoch 11/20: 100%|██████████| 250/250 [01:38<00:00,  2.55it/s]


Epoch 11 | Loss: 0.0293 | F1: 0.3996
✅ Model improved and saved.


Category 2 | Epoch 12/20: 100%|██████████| 250/250 [01:30<00:00,  2.77it/s]


Epoch 12 | Loss: 0.0213 | F1: 0.3906
No improvement. Patience: 1/5


Category 2 | Epoch 13/20: 100%|██████████| 250/250 [01:29<00:00,  2.80it/s]


Epoch 13 | Loss: 0.0203 | F1: 0.3742
No improvement. Patience: 2/5


Category 2 | Epoch 14/20: 100%|██████████| 250/250 [01:28<00:00,  2.83it/s]


Epoch 14 | Loss: 0.0152 | F1: 0.3798
No improvement. Patience: 3/5


Category 2 | Epoch 15/20: 100%|██████████| 250/250 [02:10<00:00,  1.91it/s]


Epoch 15 | Loss: 0.0123 | F1: 0.4271
✅ Model improved and saved.


Category 2 | Epoch 16/20: 100%|██████████| 250/250 [01:26<00:00,  2.89it/s]


Epoch 16 | Loss: 0.0109 | F1: 0.3937
No improvement. Patience: 1/5


Category 2 | Epoch 17/20: 100%|██████████| 250/250 [01:29<00:00,  2.80it/s]


Epoch 17 | Loss: 0.0093 | F1: 0.3813
No improvement. Patience: 2/5


Category 2 | Epoch 18/20: 100%|██████████| 250/250 [01:28<00:00,  2.83it/s]


Epoch 18 | Loss: 0.0079 | F1: 0.3821
No improvement. Patience: 3/5


Category 2 | Epoch 19/20: 100%|██████████| 250/250 [01:28<00:00,  2.81it/s]


Epoch 19 | Loss: 0.0065 | F1: 0.3824
No improvement. Patience: 4/5


Category 2 | Epoch 20/20: 100%|██████████| 250/250 [01:29<00:00,  2.80it/s]


Epoch 20 | Loss: 0.0057 | F1: 0.3805
No improvement. Patience: 5/5
⏹ Early stopping.
🏁 Finished Category 2 | Best F1: 0.4271

🎉 Training complete for both Category 1 and Category 2.


In [46]:
def predict_for_category(category):
    model_path = f"{BASE_PATH}/models/Category_{category}/best_model.pt"
    model_name = "distilbert-base-german-cased"

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load weights and detect label count
    state_dict = torch.load(model_path, map_location=device)
    classifier_weight = [v for k, v in state_dict.items() if "classifier.weight" in k][0]
    num_labels = classifier_weight.shape[0]

    # Build model with correct label size
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    id2label = {i: f"LABEL_{i}" for i in range(num_labels)}  # generic fallback labels

    subset = quiz_df[quiz_df["Category"] == str(category)]
    submission_rows = []

    for _, row in tqdm(subset.iterrows(), total=len(subset), desc=f"Predicting Category {category}"):
        tokens = row["Title"].split()
        enc = tokenizer(tokens, is_split_into_words=True, truncation=True,
                        padding="max_length", max_length=64, return_tensors="pt").to(device)
        with torch.no_grad():
            logits = model(**enc).logits
        preds = torch.argmax(logits, dim=2)[0].cpu().numpy()
        word_ids = enc.word_ids(0)
        seen = set()
        for i, w_id in enumerate(word_ids):
            if w_id is None or w_id in seen:
                continue
            seen.add(w_id)
            tag = id2label.get(preds[i], "O")
            token = tokens[w_id]
            if tag != "O":
                submission_rows.append([row["Record Number"], row["Category"], tag, token])
    return submission_rows

In [47]:
rows_1 = predict_for_category(1)
rows_2 = predict_for_category(2)

# Combine and save
final_rows = rows_1 + rows_2
sub_df = pd.DataFrame(final_rows, columns=["Record Number", "Category", "Aspect Name", "Aspect Value"])
sub_df.to_csv(SUBMISSION_FILE, sep="\t", index=False, quoting=csv.QUOTE_NONE, encoding="utf-8")

print("✅ Submission file created:", SUBMISSION_FILE)
print("Total predictions:", len(sub_df))

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Category 1: 100%|██████████| 12500/12500 [02:43<00:00, 76.27it/s]
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting Category 2: 100%|██████████| 12500/12500 [02:38<00:00, 78.74it/s]


✅ Submission file created: /Users/santhosh/Desktop/ebay/submission_quiz.tsv
Total predictions: 282349


In [43]:
BASE_PATH = "/Users/santhosh/Desktop/ebay"
LISTING_FILE = f"{BASE_PATH}/Listing_Titles.tsv"
SUBMISSION_FILE = f"{BASE_PATH}/submission_quiz.tsv"

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")


In [45]:
quiz_df = pd.read_csv(LISTING_FILE, sep="\t", keep_default_na=False, na_values=None)
quiz_df["Category"] = quiz_df["Category"].astype(str)
quiz_df = quiz_df[(quiz_df["Record Number"] >= 5001) & (quiz_df["Record Number"] <= 30000)]

In [48]:
unique_tags = sorted(set(t for seq in labels for t in seq))
tag2idx = {t: i for i, t in enumerate(unique_tags)}
idx2tag = {i: t for t, i in tag2idx.items()}

In [49]:
print("All label mappings:")
for i, tag in idx2tag.items():
    print(f"{i}: {tag}")

All label mappings:
0: Anwendung
1: Anzahl_Der_Einheiten
2: Besonderheiten
3: Breite
4: Bremsscheiben-Aussendurchmesser
5: Bremsscheibenart
6: Einbauposition
7: Farbe
8: Größe
9: Hersteller
10: Herstellernummer
11: Herstellungsland_Und_-Region
12: Im_Lieferumfang_Enthalten
13: Kompatible_Fahrzeug_Marke
14: Kompatibles_Fahrzeug_Jahr
15: Kompatibles_Fahrzeug_Modell
16: Länge
17: Material
18: Maßeinheit
19: Menge
20: Modell
21: O
22: Oberflächenbeschaffenheit
23: Oe/Oem_Referenznummer(N)
24: Produktart
25: Produktlinie
26: SAE_Viskosität
27: Stärke
28: Technologie
29: Zähnezahl


In [50]:
unique_tags = sorted(set(t for seq in labels for t in seq))
tag2idx = {t: i for i, t in enumerate(unique_tags)}
idx2tag = {i: t for t, i in tag2idx.items()}

# Save all labels as a simple array
label_list = [idx2tag[i] for i in range(len(idx2tag))]
print("All label values:", label_list)

All label values: ['Anwendung', 'Anzahl_Der_Einheiten', 'Besonderheiten', 'Breite', 'Bremsscheiben-Aussendurchmesser', 'Bremsscheibenart', 'Einbauposition', 'Farbe', 'Größe', 'Hersteller', 'Herstellernummer', 'Herstellungsland_Und_-Region', 'Im_Lieferumfang_Enthalten', 'Kompatible_Fahrzeug_Marke', 'Kompatibles_Fahrzeug_Jahr', 'Kompatibles_Fahrzeug_Modell', 'Länge', 'Material', 'Maßeinheit', 'Menge', 'Modell', 'O', 'Oberflächenbeschaffenheit', 'Oe/Oem_Referenznummer(N)', 'Produktart', 'Produktlinie', 'SAE_Viskosität', 'Stärke', 'Technologie', 'Zähnezahl']


In [51]:
import pandas as pd
import json

BASE_PATH = "/Users/santhosh/Desktop/ebay"
SUBMISSION_FILE = f"{BASE_PATH}/submission_quiz.tsv"

# Load the current submission file
df = pd.read_csv(SUBMISSION_FILE, sep="\t")

# ==============================================================
# Load label mappings from each category
# ==============================================================

# Category 1
with open(f"{BASE_PATH}/models/Category_1/idx2tag.json", "r") as f:
    idx2tag_1 = json.load(f)
idx2tag_1 = {f"LABEL_{k}": v for k, v in idx2tag_1.items()}

# Category 2
with open(f"{BASE_PATH}/models/Category_2/idx2tag.json", "r") as f:
    idx2tag_2 = json.load(f)
idx2tag_2 = {f"LABEL_{k}": v for k, v in idx2tag_2.items()}

# ==============================================================
# Replace label names in Aspect Name column
# ==============================================================

def map_label(row):
    label = row["Aspect Name"]
    cat = str(row["Category"])
    if cat == "1" and label in idx2tag_1:
        return idx2tag_1[label]
    elif cat == "2" and label in idx2tag_2:
        return idx2tag_2[label]
    else:
        return label  # keep unchanged if not found

df["Aspect Name"] = df.apply(map_label, axis=1)

# ==============================================================
# Save the cleaned submission
# ==============================================================

output_file = f"{BASE_PATH}/submission_quiz_clean.tsv"
df.to_csv(output_file, sep="\t", index=False)

print("✅ Cleaned submission created:", output_file)
print("Sample preview:")
print(df.head())

ParserError: Error tokenizing data. C error: EOF inside string starting at row 200449

In [52]:
import pandas as pd

df = pd.read_csv(
    SUBMISSION_FILE,
    sep="\t",
    engine="python",          # more forgiving parser
    quoting=3,                # quotechar=None equivalent
    on_bad_lines="skip",      # skip broken lines instead of dying
    encoding="utf-8",
    dtype=str                 # avoid type inference confusion
)

In [53]:
print("Rows loaded:", len(df))
print(df.head())

Rows loaded: 282349
  Record Number Category Aspect Name Aspect Value
0          5001        1    LABEL_11         OPEL
1          5001        1    LABEL_13        ASTRA
2          5001        1    LABEL_17            H
3          5001        1    LABEL_17          1.7
4          5001        1    LABEL_17     CDTI-SET


In [56]:
df["Aspect Name"] = df.apply(label_list, axis=1)

AttributeError: 'Anwendung' is not a valid function for 'Series' object

In [55]:
label_list

['Anwendung',
 'Anzahl_Der_Einheiten',
 'Besonderheiten',
 'Breite',
 'Bremsscheiben-Aussendurchmesser',
 'Bremsscheibenart',
 'Einbauposition',
 'Farbe',
 'Größe',
 'Hersteller',
 'Herstellernummer',
 'Herstellungsland_Und_-Region',
 'Im_Lieferumfang_Enthalten',
 'Kompatible_Fahrzeug_Marke',
 'Kompatibles_Fahrzeug_Jahr',
 'Kompatibles_Fahrzeug_Modell',
 'Länge',
 'Material',
 'Maßeinheit',
 'Menge',
 'Modell',
 'O',
 'Oberflächenbeschaffenheit',
 'Oe/Oem_Referenznummer(N)',
 'Produktart',
 'Produktlinie',
 'SAE_Viskosität',
 'Stärke',
 'Technologie',
 'Zähnezahl']