# Import & Config

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets transformers==4.28.0 -q
!pip install wandb -q
!pip install seqeval -q
!pip install evaluate -q
!pip install accelerate -U -q

In [3]:
import csv
import re
import pandas as pd
import pickle
import numpy as np
import os
import random
import torch
import torch.nn as nn
from transformers import AutoModel, AutoModelForTokenClassification, AutoTokenizer, AutoConfig, DataCollatorForTokenClassification, Trainer, TrainingArguments, pipeline
from datasets import load_dataset, load_metric
import wandb
import evaluate
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm

In [4]:
# if you want to use wandb, run these blocks
# wandb.login()

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(1006)

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device
# if ur running it on Mac, change it to:
# device = 'mps' if torch.backends.mps.is_available() else 'cpu'

'cuda'

# Prepare data

In [7]:
def data_preparation():
  df = pd.read_csv("/content/drive/MyDrive/2023-ebay-ml/data/Train_Tagged_Titles.tsv",
                 sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)
  Text_List = []
  Record_Number_List = list(df["Record Number"].unique())

  for number in Record_Number_List:
    sentence = ""
    append = ""
    token_list = []

    df_one_number = df[df["Record Number"] == number]
    df_one_number.reset_index(drop=True, inplace=True)

    i = 0
    while i < df_one_number.shape[0]:
      token = df_one_number.loc[i, "Token"]
      tag = df_one_number.loc[i, "Tag"]

      if pd.isna(df_one_number.loc[i, "Tag"]):
        token_list.insert(-1, token)
        if i == df_one_number.shape[0]-1:
            sentence += f"[{' '.join(token_list[:-1])}]({token_list[-1]}) "
            token_list = []
        i += 1
        continue
      if token_list != []:
        sentence += f"[{' '.join(token_list[:-1])}]({token_list[-1]}) "
      if tag == "No Tag":
        append_ = f"{token} "
        sentence += append_
        i += 1
        token_list = []
        continue
      if i == df_one_number.shape[0] - 1:
        sentence += f"[{token}]({tag}) "
        break

      token_list = [token,tag]
      i += 1

    Text_List.append(sentence[:-1])  # remove additional space at the end

  return Text_List

In [8]:
path = "/content/drive/MyDrive/2023-ebay-ml/data/"

if os.path.exists(path+"Train.pkl") and os.path.exists(path+"Eval.pkl"):
    Raw_Train_Data = pickle.load(open(path+"Train.pkl", "rb"))
    Raw_Eval_Data = pickle.load(open(path+"Eval.pkl", "rb"))

else:
    Prepared_data = data_preparation()
    #print(Prepared_data[0])

    random.shuffle(Prepared_data)  # already set seed at the beginning
    test_fraction = 0.2
    split_index = int(len(Prepared_data) * test_fraction)

    Raw_Eval_Data = Prepared_data[:split_index]
    Raw_Train_Data = Prepared_data[split_index:]

    pickle.dump(Raw_Train_Data, open(path+"Train.pkl", 'wb'))
    pickle.dump(Raw_Eval_Data, open(path+"Eval.pkl", 'wb'))

# Change format into NER

In [9]:
# using BIO tagging
def get_tokens_with_entities(raw_text: str):
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value") # [token](tag)

            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities


# customized dataset class
class NERDataMaker:
    def __init__(self, texts):
        self.unique_entities = []
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, self.unique_entities.index(ent)) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        return {v:k for k, v in self.id2label.items()}

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        tokens_with_encoded_entities = self.processed_texts[idx]
        if isinstance(idx, int):
            return _process_tokens_for_one_text(idx, tokens_with_encoded_entities)
        else:
            return [_process_tokens_for_one_text(i+idx.start, tee) for i, tee in enumerate(tokens_with_encoded_entities)]

    def as_hf_dataset(self, tokenizer):
        from datasets import Dataset, Features, Value, ClassLabel, Sequence
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
            # print("tokenized_inputs_1", tokenized_inputs.keys())

            labels = []
            for i, label in enumerate(examples[f"ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:  # Set the special tokens to -100.
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            # print("tokenized_inputs_2", tokenized_inputs.keys())
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens,pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=self.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

In [10]:
NER_Train = NERDataMaker(Raw_Train_Data)
NER_Eval = NERDataMaker(Raw_Eval_Data)

# Training

In [11]:
CFG = dict(
    model_name="bert-base-multilingual-uncased", # TODO: consider pretrained model carefully
    output_dir=path+"results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    seed=1006,
    max_length=128
)

In [12]:
tokenizer = AutoTokenizer.from_pretrained(CFG["model_name"], padding="max_length", max_length=CFG["max_length"])
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding="max_length", max_length=CFG["max_length"]) # enough for this competitions, or just make it another hyperparameter

In [13]:
model = AutoModelForTokenClassification.from_pretrained(CFG["model_name"])

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint a

In [14]:
new_classification_layer = nn.Linear(model.classifier.in_features, len(NER_Train.unique_entities))
model.classifier = new_classification_layer
model.id2label = NER_Train.id2label
model.label2id = NER_Train.label2id
model.num_labels = len(NER_Train.unique_entities)
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [15]:
# expand model's vocab
brand_names = []
with open("/content/drive/MyDrive/2023-ebay-ml/data/names.txt", "r") as f:
  for line in f:
    brand_names.append(line.strip())

In [16]:
tokenizer.add_tokens(brand_names)
model.resize_token_embeddings(len(tokenizer))

Embedding(109263, 768)

In [17]:
# let's see the outcome
tokenizer.tokenize("ABC design Beige Sneaker GR . 42") # cuz we're using uncased model

['abc design', 'bei', '##ge', 'sn', '##eak', '##er', 'gr', '.', '42']

In [18]:
train_ds = NER_Train.as_hf_dataset(tokenizer=tokenizer)
eval_ds = NER_Eval.as_hf_dataset(tokenizer=tokenizer)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [19]:
train_ds[0].keys()

dict_keys(['id', 'ner_tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [20]:
# remove unused columns before passing it to a dataloader, as it cannot deal with 'str'
train_ds = train_ds.remove_columns(["tokens", "id", "ner_tags"])
eval_ds = eval_ds.remove_columns(["tokens", "id", "ner_tags"])

In [21]:
train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=data_collator, batch_size=CFG["per_device_train_batch_size"])
eval_dataloader = DataLoader(eval_ds, shuffle=True, collate_fn=data_collator, batch_size=CFG["per_device_eval_batch_size"])

In [22]:
# customized training function
def train_fn(model, dl, optimizer):
  train_loss = 0
  for idx, batch in enumerate(tqdm(dl, total=len(dl))):
    input_ids = batch["input_ids"].to(device, dtype=torch.long)
    attention_mask = batch["attention_mask"].to(device, dtype=torch.long)
    token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
    labels = batch["labels"].to(device, dtype=torch.long)
    # print(input_ids.shape)
    # print(attention_mask.shape)
    # print(labels.shape)

    output = model(input_ids,
                   token_type_ids=None, # useless to this task
                   attention_mask=attention_mask,
                   labels=labels)

    step_loss = output[0]
    prediction = output[1]
    # print("pred:", prediction)

    step_loss.sum().backward()
    optimizer.step()
    train_loss += step_loss
    optimizer.zero_grad()

  return train_loss.sum()

In [23]:
def eval_fn(model, dl):
    model.eval()

    eval_loss = 0
    predictions = np.array([], dtype = np.int64).reshape(0, CFG["max_length"]) # pad to max_len
    true_labels = np.array([], dtype = np.int64).reshape(0, CFG["max_length"])

    with torch.no_grad():
        for idx, batch in enumerate(tqdm(dl, total=len(dl))):
            input_ids = batch["input_ids"].to(device, dtype=torch.long)
            attention_mask = batch["attention_mask"].to(device, dtype=torch.long)
            labels = batch["labels"].to(device, dtype=torch.long)

            output = model(input_ids,
                            token_type_ids=None,
                            attention_mask=attention_mask,
                            labels=labels)

            step_loss = output[0]
            eval_prediction = output[1]
            # print("pred:", eval_prediction)

            eval_loss += step_loss

            eval_prediction = np.argmax(eval_prediction.detach().to('cpu').numpy(), axis=2)
            # print("pred:", eval_prediction)
            actual = labels.to('cpu').numpy()
            # print("actual:", actual)

            predictions = np.concatenate((predictions, eval_prediction), axis=0)
            true_labels = np.concatenate((true_labels, actual), axis=0)

        return eval_loss.sum(), predictions, true_labels

In [24]:
def train_engine(model, train_dl, eval_dl):
    best_f1_score = 0
    params = model.parameters()
    optimizer = torch.optim.Adam(params, lr=CFG["learning_rate"])

    for i in range(CFG["num_train_epochs"]):
        train_loss = train_fn(model, train_dl, optimizer)
        eval_loss, eval_predictions, true_labels = eval_fn(model, eval_dl)
        # print(eval_predictions, true_labels)
        print(f"Epoch {i} , Train loss: {train_loss}, Eval loss: {eval_loss}")

        metric = evaluate.load("seqeval")  # for sequence labeling
        label_list = NER_Train.unique_entities
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(eval_predictions, true_labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(eval_predictions, true_labels)
        ]
        # need to change this based on the description of the competition
        # (it's just on the official webpage)
        results = metric.compute(predictions=true_predictions, references=true_labels)
        print("overall_f1:", results['overall_f1'])

        if results['overall_f1'] > best_f1_score:
            best_f1_score = results['overall_f1']
            print("Saving the model")
            # torch.save(model.state_dict(), '/content/drive/MyDrive/2023-ebay-ml/data/results/'+CFG['model_name'].replace('/','-')+'.pt')
            model.save_pretrained('/content/drive/MyDrive/2023-ebay-ml/data/results/'+CFG['model_name'].replace('/','-')+'/')
            tokenizer.save_pretrained('/content/drive/MyDrive/2023-ebay-ml/data/results/'+CFG['model_name'].replace('/','-')+'/')

    return model

In [25]:
model = train_engine(model, train_dataloader, eval_dataloader)

  0%|          | 0/125 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 125/125 [01:18<00:00,  1.60it/s]
100%|██████████| 32/32 [00:06<00:00,  4.60it/s]


Epoch 0 , Train loss: 119.5377197265625, Eval loss: 54.143558502197266


  _warn_prf(average, modifier, msg_start, len(result))


overall_f1: 0.6303663603172605
Saving the model


100%|██████████| 125/125 [01:21<00:00,  1.53it/s]
100%|██████████| 32/32 [00:07<00:00,  4.49it/s]


Epoch 1 , Train loss: 35.7421760559082, Eval loss: 58.441097259521484
overall_f1: 0.6405186801900116
Saving the model


100%|██████████| 125/125 [01:21<00:00,  1.53it/s]
100%|██████████| 32/32 [00:07<00:00,  4.50it/s]


Epoch 2 , Train loss: 20.01285171508789, Eval loss: 63.14146423339844
overall_f1: 0.6393411223903467


100%|██████████| 125/125 [01:21<00:00,  1.53it/s]
100%|██████████| 32/32 [00:07<00:00,  4.43it/s]


Epoch 3 , Train loss: 12.857878684997559, Eval loss: 62.42898178100586
overall_f1: 0.6455913159398209
Saving the model


100%|██████████| 125/125 [01:21<00:00,  1.53it/s]
100%|██████████| 32/32 [00:07<00:00,  4.48it/s]


Epoch 4 , Train loss: 8.328378677368164, Eval loss: 68.25306701660156
overall_f1: 0.634610472541507


# Old version

In [26]:
# training_args = TrainingArguments(
#     output_dir=CFG["output_dir"],
#     evaluation_strategy=CFG["evaluation_strategy"],
#     learning_rate=CFG["learning_rate"],
#     per_device_train_batch_size=CFG["per_device_train_batch_size"],
#     per_device_eval_batch_size=CFG["per_device_eval_batch_size"],
#     num_train_epochs=CFG["num_train_epochs"],
#     #weight_decay=CFG["weight_decay"],
#     seed=CFG["seed"],
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_ds,
#     eval_dataset=eval_ds,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

In [27]:
# run = wandb.init(
#     project='ebay-ml',
#     config=CFG,
# )

In [28]:
# trainer.train()

In [29]:
# metrics=trainer.evaluate()
# wandb.log(metrics)

# run.finish()

# Customized Inference

In [30]:
# read quiz data
path = "/content/drive/MyDrive/2023-ebay-ml/data/"

if os.path.exists(path+"Quiz.pkl"):
    df_quiz = pickle.load(open(path+"Quiz.pkl", "rb"))

else:
    df_all = pd.read_csv(path+"Listing_Titles.tsv", sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)
    df_quiz = df_all[5000:30000] # that's what the competition says
    pickle.dump(df_quiz, open(path+"Quiz.pkl", 'wb'))

In [31]:
# format: (no header, no specific ordering)
# Record Number, Aspect Name, Aspect Value

def inference(df, model, tokenizer):
  df_result = pd.DataFrame(columns=["Record Number", "Aspect Name", "Aspect Value"])

  model.eval()
  with torch.no_grad():
    for idx, row in df.iterrows():
      # print("*****************")
      id = row["Record Number"]
      title = row["Title"]
      title_list = title.split()
      # print(title_list)
      input_tensor = tokenizer(title, return_tensors="pt").to(device)
      tokens = tokenizer.convert_ids_to_tokens(input_tensor["input_ids"][0])
      logits = model(**input_tensor)["logits"]
      preds = torch.argmax(logits, dim=2).tolist()[0]

      entities = []
      current_entity = None

      for i, (tok, pred) in enumerate(zip(tokens, preds)):
        label = model.id2label[pred]
        if tok.startswith("##"):
          tokens[i] = tokens[i][2:]
          if current_entity is not None: current_entity["end"] = i
        else:
          if current_entity is not None: entities.append(current_entity)
          current_entity = {"start": i, "end": i, "label": label}

      preds_list = []
      for entity in entities[1:]:
        entity["text"] = "".join(tokens[entity["start"]:entity["end"] + 1])
        preds_list.append(entity["label"])

      # in case the preds don't follow BIO tagging rule
      # can eliminate by using different model like (BERT-BiLSTM-CRF)
      prev = preds_list[0]
      if prev.startswith('I'):
        preds_list[0] = 'B'+prev[1:]
        prev = 'B'+prev[1:]
      for i in range(1,len(preds_list)):
        if preds_list[i].startswith('I') and preds_list[i][2:] != prev[2:]:
          preds_list[i] = 'B'+preds_list[i][1:]
        prev = preds_list[i]
      # print(preds_list)

      # combine and output
      final_output = []
      current = {"word": [], "label": None}
      for word, pred in zip(title_list, preds_list):
        if pred == 'O': continue
        elif pred.startswith("B-"):
          if current["word"] == []:
            current['word'].append(word)
            current['label'] = pred[2:]
          else:
            res = pd.Series({"Record Number": id,
                              "Aspect Name": current['label'],
                              "Aspect Value": " ".join(current['word'])})
            df_result = pd.concat([df_result, res.to_frame().T], ignore_index=True)
            current['word'] = [word]
            current['label'] = pred[2:]
        elif pred.startswith("I-"):
          current['word'].append(word)
      if current['word'] != []:
        res = pd.Series({"Record Number": id,
                              "Aspect Name": current['label'],
                              "Aspect Value": " ".join(current['word'])})
        df_result = pd.concat([df_result, res.to_frame().T], ignore_index=True)

    return df_result

In [32]:
df_result = inference(df_quiz.head(5), model, tokenizer).reset_index(drop=True) # please change the ratio when submitting

In [33]:
df_result

Unnamed: 0,Record Number,Aspect Name,Aspect Value
0,5001,Marke,NIKE
1,5001,Produktlinie,FREE
2,5001,Modell,RUN
3,5001,Modell,3 SHIELD 5.0 SNEAKERS LAUFSCHUHE
4,5001,Stil,GR
5,5001,Produktart,. EU
6,5001,EU-Schuhgröße,UK 11
7,5001,US-Schuhgröße,30 CM
8,5002,Abteilung,DAMEN
9,5002,Produktart,SCHUHE
