In [9]:
# Import necessary libraries and setup environment
import pandas as pd
import json
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizer, TrainingArguments, Trainer
import torch
from datasets import Dataset, load_metric

# Check GPU availability for training
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [10]:
# Data Loading and Preprocessing
# Load the dataset and parse the JSON fields

data_path = 'output.csv'
data = pd.read_csv(data_path, delimiter='|')

def parse_json(items):
    try:
        return json.loads(items.replace("'", '"'))
    except json.JSONDecodeError:
        return []

data['Items'] = data['Items'].apply(parse_json)

# Flatten the data for processing
def flatten_items(row):
    items = row['Items']
    flattened = []
    for item in items:
        if isinstance(item, dict):
            flattened.append({
                'text': f"{item['name']} {item['quantity']} {item['price']}",
                'label': ['B-ITEM', 'I-ITEM', 'B-QTY', 'I-QTY', 'B-PRICE', 'I-PRICE']  # Example sequence of labels
            })
    return flattened

data['Flattened Items'] = data.apply(flatten_items, axis=1)
flattened_data = pd.DataFrame([item for sublist in data['Flattened Items'].tolist() for item in sublist])

# Display the flattened data to ensure it's correct
print(flattened_data.head())

labels = list(all_keys)
num_labels = len(labels)

# Display labels and the number of unique labels
print("Labels:", labels)
print("Number of labels:", num_labels)


                              text  \
0            Savoury Bread 1 24.99   
1         Spar P/Crisps 17 1 52.99   
2  Roasted Veg Falafel Wrap 1 95.0   
3           Berry Dairy Lrg 1 69.0   
4         Oreo Bisc 128.8g 1 17.99   

                                              label  
0  [B-ITEM, I-ITEM, B-QTY, I-QTY, B-PRICE, I-PRICE]  
1  [B-ITEM, I-ITEM, B-QTY, I-QTY, B-PRICE, I-PRICE]  
2  [B-ITEM, I-ITEM, B-QTY, I-QTY, B-PRICE, I-PRICE]  
3  [B-ITEM, I-ITEM, B-QTY, I-QTY, B-PRICE, I-PRICE]  
4  [B-ITEM, I-ITEM, B-QTY, I-QTY, B-PRICE, I-PRICE]  
Labels: ['quantity', 'name', 'price']
Number of labels: 3


In [6]:
# Model Loading
# Load the pre-trained LayoutLM model and tokenizer

from transformers import LayoutLMForTokenClassification, LayoutLMTokenizer

tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
model = LayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased', num_labels=num_labels)


model.safetensors:   0%|          | 0.00/451M [00:00<?, ?B/s]

Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Fine-Tuning Setup
# Define training arguments

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss'
)


In [8]:
# Model Training
# Prepare the dataset and train the model

# Here you need to convert your data['Items'] into the format expected by the model, including tokenization and labels.

# Dummy example of dataset preparation
train_dataset = Dataset.from_pandas(data)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,  # Ideally, this should be a separate validation dataset
)

trainer.train()


ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column Items with type object')

In [None]:
# Model Evaluation
# Evaluate the model's performance

eval_results = trainer.evaluate()
print(eval_results)
