In [1]:
import json

import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Step 1: Load data (same as before)
# The file is parsed as JSON and is used below as a mapping of
# wound description -> wound type. Decode it explicitly as UTF-8 so the
# result does not depend on the platform's default locale encoding.
# (Requires `json` to be imported in the notebook's import block.)
with open("wound_data.txt", "r", encoding="utf-8") as file:
    wound_data_loaded = json.load(file)

# Step 2: Convert data to a Hugging Face Dataset format
X = list(wound_data_loaded.keys())  # Wound descriptions
y = list(wound_data_loaded.values())  # Wound types

# Map labels to integer values
# The integer ids double as the classifier's output indices, so this mapping
# must stay stable between training and prediction.
label_dict = {'cut': 0, 'burn': 1, 'bruise': 2, 'diabetic': 3, 'surgical': 4, 'abrasion': 5, 'pressure': 6, 'venous': 7, 'laceration': 8, 'normal': 9}
# A wound type that is missing from label_dict raises KeyError here.
y = [label_dict[label] for label in y]

# One row per description: raw text plus its integer class id.
dataset = Dataset.from_dict({'text': X, 'label': y})

# Step 3: Tokenize data using BERT's tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``'text'`` column of a batch of dataset rows.

    Args:
        examples: Batch passed in by ``Dataset.map``; only ``examples['text']``
            is read.

    Returns:
        The tokenizer's output for the batch, with truncation enabled and
        ``padding="max_length"``.
    """
    # NOTE(review): no explicit max_length is given, so every row is presumably
    # padded to the tokenizer's model maximum — confirm this is intended for
    # short wound descriptions.
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding="max_length")


# Apply tokenization to the dataset, one batch of rows per call
dataset = dataset.map(tokenize_function, batched=True)

# Step 4: Split data into training and validation sets
# Hold out 20% of the rows for evaluation. The seed is fixed so that
# Restart & Run All reproduces the same partition, and the two splits are
# read by key rather than by relying on the order of .values().
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Step 5: Define the model
# Pretrained 'bert-base-uncased' with a classification head sized to the
# number of entries in label_dict.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
)

# Step 6: Setup Trainer with training arguments
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer
# steps if the dataset is small (3 epochs, batch size 8) — confirm against
# the real dataset size.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# NOTE(review): no compute_metrics callback is supplied, so evaluate()
# presumably reports loss and timing only — confirm if accuracy is wanted.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Step 7: Train the model
trainer.train()

# Step 8: Evaluate the model
# Runs on eval_dataset, the evaluation set handed to the Trainer above.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Step 9: Example usage for new text prediction
new_description = ["Minor cut on the finger, needs to be cleaned"]
inputs = tokenizer(new_description, return_tensors='pt', padding=True, truncation=True)

# The tokenizer returns CPU tensors, but Trainer places the model on its
# training device (e.g. a GPU when one is available). Move the inputs to the
# model's device to avoid a device-mismatch RuntimeError.
inputs = inputs.to(model.device)

# Inference only: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
predicted_class = torch.argmax(logits, dim=-1)

# Convert the class id back to label
inv_label_dict = {v: k for k, v in label_dict.items()}
# .item() is valid because exactly one description was passed in.
print(f"Prediction for new description: {inv_label_dict[predicted_class.item()]}")

Wound data has been saved successfully!
