In [1]:
!pip install transformers



In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
# Read the sample CSV into the working dataframe
csv_path = 'sample_data/10000_Images_Samples.csv'
df = pd.read_csv(csv_path)

# Define a custom dataset
# NOTE: a later cell defines another class with this same name; that later
# definition shadows this one once it has been executed.
class EntityDataset(Dataset):
    """Map-style dataset that turns one dataframe row into one model input.

    Each row must provide the columns ``extracted_text``, ``entity_name`` and
    ``entity_value``. The text and the entity name are joined into a single
    prompt string which is then tokenized.

    Args:
        dataframe: Table holding the three columns listed above.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")`` that returns a mapping containing
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every tokenized sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Build the sample at positional ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch axis removed) and ``labels`` (the
            row's ``entity_value`` exactly as stored in the dataframe, i.e.
            not encoded to an integer).
        """
        row = self.dataframe.iloc[index]
        extracted_text = row['extracted_text']
        entity_name = row['entity_name']
        entity_value = row['entity_value']

        # Tokenize the text.
        # NOTE(review): the entity-name prompt sits at the END of the string,
        # so truncating a very long text presumably drops it - confirm.
        input_text = f"{extracted_text} [SEP] entity_name: {entity_name}"
        inputs = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # squeeze(0) drops only the batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever max_len == 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': entity_value
        }

# Load the pretrained uncased BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the dataframe and tokenizer in the dataset (max_len keeps its default)
dataset = EntityDataset(dataframe=df, tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [4]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the raw entity values and keep the resulting integer
# codes in a new 'encoded_entity_value' column of the dataframe.
label_encoder = LabelEncoder()
encoded_values = label_encoder.fit_transform(df['entity_value'])
df['encoded_entity_value'] = encoded_values

# Redefine the dataset so that the integer 'encoded_entity_value' column is
# used as the label (this replaces any earlier class of the same name).
class EntityDataset(Dataset):
    """Map-style dataset yielding tokenized prompts with integer class labels.

    Each row must provide the columns ``extracted_text``, ``entity_name`` and
    ``encoded_entity_value``.

    Args:
        dataframe: Table holding the three columns listed above.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")`` that returns a mapping containing
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every tokenized sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Build the sample at positional ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch axis removed) and ``labels`` (a
            ``torch.long`` scalar tensor built from ``encoded_entity_value``).
        """
        row = self.dataframe.iloc[index]
        extracted_text = row['extracted_text']
        entity_name = row['entity_name']
        entity_value = row['encoded_entity_value']

        # Tokenize the text.
        # NOTE(review): the entity-name prompt sits at the END of the string,
        # so truncating a very long text presumably drops it - confirm.
        input_text = f"{extracted_text} [SEP] entity_name: {entity_name}"
        inputs = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # squeeze(0) drops only the batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever max_len == 1.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(entity_value, dtype=torch.long)
        }

# Rebuild the dataset so it is an instance of the class defined in this cell
dataset = EntityDataset(dataframe=df, tokenizer=tokenizer)


In [5]:
from torch.utils.data import DataLoader

# Serve the dataset in shuffled mini-batches for training
batch_size = 16
train_loader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)


In [6]:
from transformers import BertForSequenceClassification, AdamW
import torch

# Define the model: one output logit per distinct encoded label.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=df['encoded_entity_value'].nunique(),
)

# Training setup
epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to its device before the optimizer is constructed.
model.to(device)

# transformers.AdamW is deprecated upstream in favour of torch.optim.AdamW.
# eps and weight_decay are passed explicitly so the hyper-parameters stay at
# the former transformers defaults instead of silently switching to PyTorch's
# (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Training loop: one full pass over train_loader per epoch.
for epoch in range(epochs):
    model.train()

    total_loss = 0
    for batch in train_loader:
        # Move batch to GPU if available
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing `labels` makes the model return the loss too.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss of this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 7.2729522979736325
Epoch 2, Loss: 6.57736662979126
Epoch 3, Loss: 6.101733660125732


In [7]:
# Define the path where you want to save the model and tokenizer
output_dir = "./finetuned_bert_model"

# Record the class-index <-> entity-value mapping in the model config so it is
# written to disk with the weights. Otherwise the mapping exists only in the
# in-memory label_encoder and is lost when the kernel restarts.
model.config.id2label = {
    class_id: str(value) for class_id, value in enumerate(label_encoder.classes_)
}
model.config.label2id = {
    label: class_id for class_id, label in model.config.id2label.items()
}

# Save the fine-tuned model
model.save_pretrained(output_dir)

# Save the tokenizer (kept as the last expression so the cell still displays
# the tuple of written file paths)
tokenizer.save_pretrained(output_dir)


('./finetuned_bert_model/tokenizer_config.json',
 './finetuned_bert_model/special_tokens_map.json',
 './finetuned_bert_model/vocab.txt',
 './finetuned_bert_model/added_tokens.json')

In [1]:
!pip install transformers



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

output_dir = "./finetuned_bert_model"
# Load the fine-tuned model
model = BertForSequenceClassification.from_pretrained(output_dir)

# predict() sends its inputs to `device`, so define it in this cell and place
# the freshly loaded model there as well; otherwise the model and its inputs
# can end up on different devices.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(output_dir)

# NOTE: predict() also relies on `label_encoder`, which this cell does not
# restore; it must still be available from the training cells.


In [14]:
def predict(extracted_text, entity_name):
    """Predict the entity value for a piece of text and an entity name.

    The prompt is built exactly as in training
    (``"<text> [SEP] entity_name: <name>"``), run through the classifier, and
    the highest-scoring class index is mapped back to its original value.

    Relies on the notebook-level names ``model``, ``tokenizer``,
    ``label_encoder`` and ``torch``.

    Args:
        extracted_text: Text to extract the entity from.
        entity_name: Name of the entity to look for (e.g. ``'item_weight'``).

    Returns:
        The decoded label produced by ``label_encoder.inverse_transform`` for
        the predicted class.
    """
    model.eval()
    input_text = f"{extracted_text} [SEP] entity_name: {entity_name}"
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    )

    # Send the inputs to wherever the model's weights actually live, rather
    # than to a global `device` that may be undefined or may disagree with
    # the model's placement (e.g. right after reloading it from disk).
    model_device = next(model.parameters()).device
    input_ids = inputs['input_ids'].to(model_device)
    attention_mask = inputs['attention_mask'].to(model_device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Highest logit -> class index -> original entity value
    predicted_class = torch.argmax(outputs.logits, dim=1).cpu().item()
    predicted_value = label_encoder.inverse_transform([predicted_class])[0]

    return predicted_value

# Example: ask the model for the item weight described in a product text
sample_text = "Sabina Carson  Forever & Always crystal birthstone Carson 10mm son Material: Sterling Silver. CZ:5x5mm 10k/14k/18k Solid Gold About Weight: 3.8g or  Forever & Always Engraving name "
predicted_value = predict(sample_text, entity_name='item_weight')
print(f"Predicted Entity Value: {predicted_value}")


Predicted Entity Value: 10.0 gram
