In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
input_walk = os.walk('/kaggle/input')
for dirname, _, filenames in input_walk:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/entities/annotated_entities2.json
/kaggle/input/entttt/annotated_entities.json


In [8]:
import json
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import os

# Disable parallelism inside the `tokenizers` library so that it does not
# emit its parallelism warning. Kept right after the imports so the variable
# is set before any tokenizer is created in the cells below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [9]:
def load_data(file_path):
    """Load annotated documents from a JSON file.

    The file must contain a JSON array of objects. Each object carries the
    document text under 'Content' and, optionally, a list under 'entities'
    whose items have 'start', 'end' and 'label' keys.

    Args:
        file_path: Path of the JSON annotation file.

    Returns:
        A ``(texts, annotations)`` tuple. ``texts`` is the list of 'Content'
        values; ``annotations`` is a parallel list where each element is the
        list of ``(start, end, label)`` tuples of that document (empty when
        the document has no entities).

    Raises:
        KeyError: If a record has no 'Content' key or an entity lacks
            'start', 'end' or 'label'.
    """
    # Decode explicitly as UTF-8 instead of the platform default: a wrong
    # codec would garble non-ASCII text and could invalidate the character
    # offsets stored in the annotations.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    texts = []
    annotations = []
    for item in data:
        texts.append(item['Content'])
        # A document without annotations may omit the key or store null.
        entities = item.get('entities') or []
        annotations.append([(ent['start'], ent['end'], ent['label']) for ent in entities])
    return texts, annotations


In [10]:
def encode_tags(texts, annotations, label_to_id, max_length):
    """Tokenize texts and derive token-level label ids from character spans.

    Args:
        texts: List of document strings.
        annotations: List parallel to ``texts``; each element is a list of
            ``(start, end, label)`` character spans, with ``end`` exclusive.
        label_to_id: Mapping from label name to integer class id.
        max_length: Number of tokens every example is truncated/padded to.

    Returns:
        The tokenizer's batch encoding with an extra ``'labels'`` entry: a
        long tensor with one row per text and the same width as
        ``input_ids``. Only the first token of each annotated span carries a
        class id; every other position is -100.

    Raises:
        KeyError: If a span's label is missing from ``label_to_id``.
    """
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    # Pad to the fixed max_length rather than to the longest text in the
    # batch. The labels are built at the same width, so input_ids and labels
    # always line up even when every text is shorter than max_length.
    encoded_inputs = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    labels = []

    for i, (_text, annotation) in enumerate(zip(texts, annotations)):
        # Start with -100 everywhere so unlabeled positions are ignored by the loss.
        # NOTE(review): tokens outside annotated spans therefore never receive
        # the 'O' id, so that class gets no supervision -- confirm this is intended.
        doc_labels = [-100] * len(encoded_inputs['input_ids'][i])

        for start, end, label in annotation:
            # Convert character positions to token positions (end is exclusive,
            # so the last character of the span is end - 1).
            start_token = encoded_inputs.char_to_token(i, start)
            end_token = encoded_inputs.char_to_token(i, end - 1)
            # Spans whose endpoints do not map to a token are skipped
            # (presumably truncated away or pointing at whitespace).
            if start_token is not None and end_token is not None:
                # Label only the first token of the span ...
                doc_labels[start_token] = label_to_id[label]
                # ... and reset the remaining tokens of the span to -100, which
                # also clears labels written by an earlier overlapping span.
                doc_labels[start_token + 1:end_token + 1] = [-100] * (end_token - start_token)

        labels.append(doc_labels)

    # Every row already has the width of input_ids, so no extra padding is needed.
    encoded_inputs['labels'] = torch.tensor(labels, dtype=torch.long)
    return encoded_inputs


In [11]:
def main(file_path='/kaggle/input/entities/annotated_entities2.json',
         model_dir='./trained_ner_model'):
    """Fine-tune BERT for token classification on the annotated entities.

    Loads the annotation file, encodes it with ``encode_tags``, trains
    ``bert-base-uncased`` for three epochs with the Hugging Face ``Trainer``
    and saves the resulting model.

    Args:
        file_path: JSON annotation file to train on. Defaults to the dataset
            originally hard-coded in this notebook.
        model_dir: Directory the fine-tuned model is saved to. ``predict``
            loads from './trained_ner_model' by default, so keep the default
            unless the same directory is passed there as well.
    """
    texts, annotations = load_data(file_path)
    print("Data is loaded")
    # Mapping labels to IDs; NON_PRODUCT is included in the dictionary as well.
    label_to_id = {'O': 0, 'PRODUCT': 1, 'NON_PRODUCT': 2}
    id_to_label = {idx: name for name, idx in label_to_id.items()}

    # Encoding data
    encoded_data = encode_tags(texts, annotations, label_to_id, max_length=128)
    dataset = Dataset.from_dict(encoded_data)
    print("Data is encoded")

    # Specify device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the pretrained model with a fresh classification head. Storing the
    # label mapping in the config keeps the class names with the saved model.
    model = BertForTokenClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(label_to_id),
        id2label=id_to_label,
        label2id=label_to_id,
    )
    model.to(device)
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10
    )

    # Training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset
    )
    trainer.train()

    # Save model
    model.save_pretrained(model_dir)


In [12]:
def predict(text, model_dir='./trained_ner_model'):
    """Tag the tokens of a text with the fine-tuned model.

    Args:
        text: The text to analyse. NOTE(review): the squeeze-based
            post-processing looks written for a single string, not a batch --
            confirm before passing a list.
        model_dir: Directory holding the saved model. Defaults to the
            location ``main`` writes to.

    Returns:
        A list of ``(token, class_id)`` pairs for every tokenizer token whose
        predicted class id is not 0, with the '[CLS]' and '[SEP]' markers
        left out.
    """
    model = BertForTokenClassification.from_pretrained(model_dir)
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Pure inference: make sure dropout is off and skip autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring class per token; squeeze drops the batch dimension of 1.
    predictions = torch.argmax(outputs.logits, dim=2).squeeze().tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().tolist())

    return [(token, prediction) for token, prediction in zip(tokens, predictions) if token not in ['[CLS]', '[SEP]'] and prediction != 0]



In [None]:
if __name__ == "__main__":
    # Train first: predict() loads the model that main() writes to disk.
    main()

    # Example inference on one sentence, printed below a separator line.
    example_text = "Check out these amazing new tables and chairs available in our store."
    product_names = predict(example_text)
    print("----------------", product_names, sep="\n")


In [16]:
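# TODO: evaluation sketch, not runnable yet -- `validation_texts` and
# `actual_labels` are not defined anywhere in this notebook (this is the
# NameError recorded in the cell output below).
# from datasets import load_metric

# metric = load_metric("seqeval")
# predictions, labels = predict(validation_texts), actual_labels(validation_texts)
# metric.compute(predictions=predictions, references=labels)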


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


NameError: name 'validation_texts' is not defined

In [15]:
! pip install seqeval

  pid, fd = os.forkpty()


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=1ae9f85b52ba97f79a35454981299014fae43710c9abd1d3e5c5a45205f73935
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
