In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
import json
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers.trainer_callback import EarlyStoppingCallback
import torch
import datetime  
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Tag vocabulary for the product NER task; a tag's position in the list is its class id.
label_list = ['O', 'I-Product', 'B-Product']
label_encoding_dict = {tag: class_id for class_id, tag in enumerate(label_list)}

# Run name (embedded in the checkpoint directory name by train_model) and the
# pretrained checkpoint that gets fine-tuned.
task = "ner_2"
model_checkpoint = "distilbert-base-uncased"

# Tokenizer matching the checkpoint; shared by preprocessing and the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def get_tokens_and_ner_tags(data):
    """Turn annotated records into a two-column DataFrame.

    Args:
        data: sequence of dicts, each providing a 'words' entry and a
            'ner_tags' entry.

    Returns:
        pandas.DataFrame with a 'tokens' column (each record's 'words') and a
        'ner_tags' column (each record's 'ner_tags'), one row per record.
    """
    columns = {'tokens': [], 'ner_tags': []}
    for record in data:
        columns['tokens'].append(record['words'])
        columns['ner_tags'].append(record['ner_tags'])
    return pd.DataFrame(columns)

def get_token_datasets(train_data, test_data):
    """Build the train and test `Dataset` objects from raw annotated records.

    Each split is first converted to a DataFrame by `get_tokens_and_ner_tags`
    and then wrapped with `Dataset.from_pandas`.

    Args:
        train_data: records for the training split.
        test_data: records for the evaluation split.

    Returns:
        Tuple `(train_dataset, test_dataset)`.
    """
    train_dataset, test_dataset = (
        Dataset.from_pandas(get_tokens_and_ner_tags(split))
        for split in (train_data, test_data)
    )
    return (train_dataset, test_dataset)

# Read the annotated training records.
with open('data/train_data.json', 'r', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

# Read the annotated evaluation records.
with open('data/test_data.json', 'r', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

# Wrap both splits as Dataset objects with 'tokens' / 'ner_tags' columns.
train_dataset, test_dataset = get_token_datasets(train_data, test_data)


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and produce one label id per sub-token.

    Args:
        examples: a batch with 'tokens' (one word list per example) and
            'ner_tags' (one tag-string list per example, parallel to the
            words).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: for
        every example, a list of label ids aligned with its sub-tokens.
        Sub-tokens that map to no word get -100, the value `compute_metrics`
        filters out.
    """
    # When True, every piece of a multi-piece word carries the word's label;
    # when False, only the first piece does and the rest get -100.
    # NOTE: with True, continuation pieces of a 'B-Product' word are labelled
    # 'B-Product' as well.
    label_all_tokens = True
    outside_id = label_encoding_dict['O']
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Sub-token with no source word (word_ids reports None for it).
                label_ids.append(-100)
            elif label[word_idx] in ('O', '0'):
                # Outside tag. The original test compared against the digit
                # '0', which never matches the letter 'O' used in the label
                # set; the digit is still accepted for backward compatibility.
                label_ids.append(outside_id)
            elif word_idx != previous_word_idx:
                # First piece of a word always carries the word's label.
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                # Continuation piece of the same word.
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize both splits and attach the aligned label ids, batch by batch.
train_tokenized_datasets, test_tokenized_datasets = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, test_dataset)
)

# Collator built from the shared tokenizer, and the seqeval metric used by
# compute_metrics. NOTE: the recorded run shows load_metric warning that
# `trust_remote_code=True` will become mandatory in a future `datasets` release.
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric.

    Args:
        p: pair `(predictions, labels)`. `predictions` is arg-maxed over
            axis 2 to obtain one class id per position; `labels` holds the
            reference class ids, with -100 marking positions to skip.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    raw_scores, reference_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, reference_id in zip(predicted_row, reference_row):
            if reference_id == -100:
                # Position excluded from scoring.
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[reference_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

def train_model(config):
    """Fine-tune a fresh token-classification model with one hyperparameter set.

    Args:
        config: mapping providing "learning_rate",
            "per_device_train_batch_size", "per_device_eval_batch_size",
            "num_train_epochs" and "weight_decay".

    Returns:
        The result of `trainer.evaluate()` after training; the grid-search
        loop reads "eval_f1" from it.
    """
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

    # Checkpoints go to a per-run directory named after the task and a timestamp.
    run_stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    checkpoint_dir = os.path.join("./output_dir_anaconda", f"test-{task}-{run_stamp}")
    # NOTE: fixed path, so every call overwrites what the previous call saved here.
    best_model_dir = "./output_dir_anaconda/best_model"

    tuned_settings = {
        name: config[name]
        for name in (
            "learning_rate",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "num_train_epochs",
            "weight_decay",
        )
    }
    args = TrainingArguments(
        output_dir=checkpoint_dir,
        # Evaluate, checkpoint and log once per epoch.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        # Checkpoint selection is driven by the "f1" value from compute_metrics.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        **tuned_settings,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tokenized_datasets,
        eval_dataset=test_tokenized_datasets,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()
    trainer.save_model(best_model_dir)
    return trainer.evaluate()

# Candidate values per hyperparameter; every combination is trained once.
search_space = {
    "learning_rate": [1e-04],
    "per_device_train_batch_size": [8],
    "per_device_eval_batch_size": [16],
    "num_train_epochs": [25],
    "weight_decay": [1e-05],
}

# Exhaustive grid search: one training run per combination.
results = []
hyperparameter_names = list(search_space)
for combination in itertools.product(*search_space.values()):
    config = dict(zip(hyperparameter_names, combination))
    print("Starting training with config:", config)  # Show which combination is being trained
    results.append((config, train_model(config)))

# Keep the run with the highest evaluation F1.
best_config, best_result = max(results, key=lambda run: run[1]["eval_f1"])
print("Best hyperparameters found:", best_config)
print("Best F1 score:", best_result["eval_f1"])



Map:   0%|          | 0/82 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Starting training with config: {'learning_rate': 0.0001, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'num_train_epochs': 25, 'weight_decay': 1e-05}


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4181,0.223836,0.488372,0.097674,0.162791,0.919664
2,0.1462,0.126587,0.674757,0.646512,0.660333,0.958319
3,0.0951,0.120732,0.69869,0.744186,0.720721,0.967059
4,0.0534,0.091361,0.752294,0.762791,0.757506,0.972773
5,0.0303,0.073465,0.872038,0.855814,0.86385,0.982857
6,0.0196,0.071403,0.871111,0.911628,0.890909,0.984874
7,0.0159,0.074001,0.805785,0.906977,0.853392,0.981513
8,0.01,0.070265,0.887387,0.916279,0.901602,0.986891
9,0.0046,0.072874,0.852814,0.916279,0.883408,0.985546
10,0.004,0.070411,0.90367,0.916279,0.909931,0.987899


Best hyperparameters found: {'learning_rate': 0.0001, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'num_train_epochs': 25, 'weight_decay': 1e-05}
Best F1 score: 0.916279069767442


In [2]:
# Reload the fine-tuned model and its tokenizer from the directory written by train_model.
model = AutoModelForTokenClassification.from_pretrained("./output_dir_anaconda/best_model")
tokenizer = AutoTokenizer.from_pretrained("./output_dir_anaconda/best_model")

# Reverse lookup: class id -> tag string.
id2tag = {class_id: tag_name for tag_name, class_id in label_encoding_dict.items()}

def _split_on_whitespace(text, max_length):
    """Cut `text` into consecutive pieces of at most `max_length` characters, preferring whitespace boundaries."""
    if max_length <= 0:
        raise ValueError("max_length must be a positive number of characters")

    pieces = []
    start = 0
    while start < len(text):
        end = min(start + max_length, len(text))
        if end < len(text) and not text[end].isspace():
            # The window would end in the middle of a word: back up to just
            # after the last whitespace character inside the window.
            boundary = end
            while boundary > start and not text[boundary - 1].isspace():
                boundary -= 1
            if boundary > start:
                end = boundary
            # Otherwise a single word is longer than the window; cut it hard.
        pieces.append(text[start:end])
        start = end
    return pieces


def predict_for_urls(json_file_path, max_length):
    """Run `predict` over the text of every page listed in a JSON file.

    The file must contain a list of objects with 'url' and 'text_content'
    entries. Each text is cut into segments of at most `max_length`
    characters. Cuts are placed at whitespace whenever possible so a word is
    not split across two segments; plain fixed-width slicing did split such
    words. The pieces still concatenate to the original text.

    Args:
        json_file_path: path of the UTF-8 JSON input file.
        max_length: maximum segment length, in characters (not tokens).

    Returns:
        List of dicts `{'url': ..., 'predictions': [...]}`, one per input
        item, where 'predictions' concatenates the results of `predict` for
        all segments of that item in order.

    Raises:
        ValueError: if `max_length` is not positive.
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    predictions_list = []
    for item in data:
        predictions = []
        for segment in _split_on_whitespace(item['text_content'], max_length):
            predictions.extend(predict(segment))
        predictions_list.append({'url': item['url'], 'predictions': predictions})

    return predictions_list

def predict(sentence):
    """Tag one text segment and return its tokens that were not tagged 'O'.

    Args:
        sentence: raw text to tag. It is encoded with truncation enabled, so
            anything beyond the tokenizer's length limit is not tagged.

    Returns:
        List of `(token, tag)` pairs, in input order, for every sub-token
        whose predicted tag is not 'O'. CLS/SEP/PAD marker tokens are never
        returned.
    """
    input_ids = tokenizer.encode(sentence, return_tensors="pt", truncation=True, padding=True)
    # Read the tokens back from the exact ids given to the model so tokens and
    # predictions line up one-to-one. The earlier decode/re-tokenize round trip
    # worked on the untruncated text and could drift from the model input.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_ids).logits
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()

    marker_tokens = {tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token}
    tagged_tokens = []
    for token, predicted_id in zip(tokens, predicted_ids):
        tag = id2tag[predicted_id]
        if tag != 'O' and token not in marker_tokens:
            tagged_tokens.append((token, tag))
    return tagged_tokens

# Upper bound, in characters, on the size of each text segment passed to the model.
max_length = 512

# Tag every page listed in the input JSON file.
predictions_list = predict_for_urls('./data/to_predict.json', max_length)

# Show one result dictionary per line.
for page_result in predictions_list:
    print(page_result)


{'url': 'https://www.myconcept.com.hk/products/moo', 'predictions': [('light', 'B-Product'), ('lamp', 'B-Product'), ('light', 'B-Product'), ('light', 'B-Product'), ('lamp', 'B-Product'), ('lamp', 'B-Product'), ('lamp', 'B-Product')]}
{'url': 'https://vauntdesign.com/products/forna-plant-stand-small', 'predictions': []}
{'url': 'https://asianteakfurniture.com/products/bali-teak-bench-atf388', 'predictions': [('bench', 'B-Product'), ('bench', 'B-Product'), ('fur', 'B-Product'), ('##nish', 'B-Product'), ('##ing', 'B-Product'), ('bench', 'B-Product')]}
{'url': 'https://homestreethome.ie/products/zinc-tray', 'predictions': [('tables', 'B-Product'), ('chairs', 'B-Product'), ('benches', 'B-Product'), ('coffee', 'B-Product'), ('table', 'B-Product'), ('book', 'B-Product'), ('##case', 'B-Product'), ('##s', 'B-Product'), ('cabinets', 'B-Product'), ('cabinets', 'B-Product'), ('dresser', 'B-Product'), ('##s', 'B-Product'), ('consoles', 'B-Product'), ('tables', 'B-Product'), ('desk', 'B-Product'), (

In [3]:
### Merge word-piece continuations (tokens starting with "##") back into whole words.
# NOTE: only tokens with a non-'O' tag reach this cell (predict drops the rest),
# so a "##" piece whose head token was tagged 'O' is attached to the previous
# kept word instead.
for prediction in predictions_list:
    new_list = []

    for word, tag in prediction['predictions']:
        if word.startswith("##"):
            if new_list:  # A continuation with nothing before it is dropped
                head_word, head_tag = new_list[-1]
                # Keep the tag of the word's first piece. Taking the
                # continuation's tag could turn a 'B-Product' word into
                # 'I-Product', which the B/I merge step would then attach to
                # the preceding entity or drop.
                new_list[-1] = (head_word + word[2:], head_tag)
        else:
            new_list.append((word, tag))

    # Replace the entry in place; later cells read predictions_list.
    prediction['predictions'] = new_list
    print(prediction)


{'url': 'https://www.myconcept.com.hk/products/moo', 'predictions': [('light', 'B-Product'), ('lamp', 'B-Product'), ('light', 'B-Product'), ('light', 'B-Product'), ('lamp', 'B-Product'), ('lamp', 'B-Product'), ('lamp', 'B-Product')]}
{'url': 'https://vauntdesign.com/products/forna-plant-stand-small', 'predictions': []}
{'url': 'https://asianteakfurniture.com/products/bali-teak-bench-atf388', 'predictions': [('bench', 'B-Product'), ('bench', 'B-Product'), ('furnishing', 'B-Product'), ('bench', 'B-Product')]}
{'url': 'https://homestreethome.ie/products/zinc-tray', 'predictions': [('tables', 'B-Product'), ('chairs', 'B-Product'), ('benches', 'B-Product'), ('coffee', 'B-Product'), ('table', 'B-Product'), ('bookcases', 'B-Product'), ('cabinets', 'B-Product'), ('cabinets', 'B-Product'), ('dressers', 'B-Product'), ('consoles', 'B-Product'), ('tables', 'B-Product'), ('desks', 'B-Product'), ('bedsides', 'B-Product'), ('mirrors', 'B-Product'), ('rugs', 'B-Product'), ('lampshades', 'B-Product'), 

In [4]:
## Fold every I-Product token into the entry that precedes it.
for page in predictions_list:
    entities = []
    for token, label in page['predictions']:
        if label != 'I-Product':
            # Any other tag starts a new entry.
            entities.append((token, label))
            continue
        if not entities:
            # An I-Product with no preceding entry is discarded.
            continue
        head_text, head_label = entities.pop()
        # Text and tag strings are both extended, separated by a space.
        entities.append((head_text + " " + token, head_label + " " + label))
    # Replace the entry in place; the export cell reads predictions_list.
    page['predictions'] = entities
    print(page)

{'url': 'https://www.myconcept.com.hk/products/moo', 'predictions': [('light', 'B-Product'), ('lamp', 'B-Product'), ('light', 'B-Product'), ('light', 'B-Product'), ('lamp', 'B-Product'), ('lamp', 'B-Product'), ('lamp', 'B-Product')]}
{'url': 'https://vauntdesign.com/products/forna-plant-stand-small', 'predictions': []}
{'url': 'https://asianteakfurniture.com/products/bali-teak-bench-atf388', 'predictions': [('bench', 'B-Product'), ('bench', 'B-Product'), ('furnishing', 'B-Product'), ('bench', 'B-Product')]}
{'url': 'https://homestreethome.ie/products/zinc-tray', 'predictions': [('tables', 'B-Product'), ('chairs', 'B-Product'), ('benches', 'B-Product'), ('coffee', 'B-Product'), ('table', 'B-Product'), ('bookcases', 'B-Product'), ('cabinets', 'B-Product'), ('cabinets', 'B-Product'), ('dressers', 'B-Product'), ('consoles', 'B-Product'), ('tables', 'B-Product'), ('desks', 'B-Product'), ('bedsides', 'B-Product'), ('mirrors', 'B-Product'), ('rugs', 'B-Product'), ('lampshades', 'B-Product'), 

In [5]:
import json

# Keep only the predicted words for each URL (tags are dropped).
predictions = [
    {'url': page['url'], 'predictions': [text for text, _ in page['predictions']]}
    for page in predictions_list
]

# Write the final predictions to disk as indented JSON.
with open('./data/final_predictions.json', 'w') as json_file:
    json.dump(predictions, json_file, indent=4)
