In [1]:
import evaluate
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TrainerCallback
)

# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device}")


Using cuda


## Read data

In [3]:
# Project directories, written relative to the notebook's working directory.
data_dir = "../data"  # data.xlsx (class labels indexed by image_id) is read from here
result_dir = "../results"  # llava_captions.json is read from here
models_dir = "../models"  # Trainer checkpoints and the saved model are written here
image_dir = f"{data_dir}/Images"  # presumably the raw images; not used in the cells shown here

def read_llava_captions(captions_path: str, common_str: str) -> dict:
  """Load LLaVA captions from a JSON file and strip shared boilerplate.

  The file must contain a JSON object whose keys are image file names such
  as ``"123.jpg"`` and whose values are the generated caption strings.

  Args:
    captions_path: Path of the JSON file to read.
    common_str: Substring removed from every caption (every occurrence).

  Returns:
    Dictionary mapping the integer image id (the key with ``.jpg`` removed)
    to the cleaned caption.

  Raises:
    ValueError: If a key is not an integer once ``.jpg`` has been removed.
  """
  # JSON text is UTF-8 by specification; being explicit keeps the result
  # independent of the platform's default locale encoding.
  with open(captions_path, 'r', encoding='utf-8') as f:
    raw_captions = json.load(f)

  return {
    int(file_name.replace('.jpg', '')): text.replace(common_str, '')
    for file_name, text in raw_captions.items()
  }

# Read the captions, removing the prompt text that is shared by all of them.
# (For captions generated with the prompt "Give me a short description of the
# image in one sentence?", put that question into common_str instead.)
captions = read_llava_captions(
  f"{result_dir}/llava_captions.json",
  common_str="ER:  \nWhat is shown in this image? ASSISTANT: ",
)
df_captions = pd.DataFrame.from_dict(captions, orient='index', columns=['caption'])

# Read the class labels from the spreadsheet, indexed by image id.
df_classes = pd.read_excel(f"{data_dir}/data.xlsx", index_col='image_id')

# Join captions and labels on their shared index, then name the two columns.
df = pd.merge(
  df_captions, df_classes, left_index=True, right_index=True
).set_axis(['caption', 'label'], axis=1)

# Hold out 15% of the rows; the fixed seed keeps the split repeatable.
data = Dataset.from_pandas(df).train_test_split(test_size=0.15, seed=4999)
data


DatasetDict({
    train: Dataset({
        features: ['caption', 'label', '__index_level_0__'],
        num_rows: 492
    })
    test: Dataset({
        features: ['caption', 'label', '__index_level_0__'],
        num_rows: 87
    })
})

## Text classification

In [4]:
# Hugging Face Hub checkpoint; both the tokenizer and the classifier are loaded from it.
model_id = "google-bert/bert-base-uncased"

### Tokenize data

In [5]:
# Tokenizer belonging to the checkpoint selected by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_function(examples):
  """Tokenize the ``caption`` field of a batch of examples with truncation on.

  No padding is requested here; batches are padded later by the data collator.
  """
  caption_batch = examples["caption"]
  return tokenizer(caption_batch, truncation=True)


# Collator handed to the Trainer; it pads the tokenized examples batch by batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both splits in batched mode.
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/492 [00:00<?, ? examples/s]

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

### Train model

In [6]:

# Accuracy metric loaded through the `evaluate` library; compute_metrics calls its .compute().
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are reduced
            along axis 1 with arg-max to obtain one class id per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class ids against
        the reference labels.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Class index -> human-readable class name. This is the single source of truth
# for the label set; the inverse mapping below is derived from it.
id2label = {
  0: "Nature and Landscape/Seascape",
  1: "Fauna/Flora",
  2: "Rural",
  3: "Sport",
  4: "Cultural",
  5: "Gastronomy"
}

# Class name -> class index, built by inverting id2label so the two mappings
# cannot drift apart when a class is added or renamed. Insertion order (and
# therefore the order of .keys()/.values()) follows id2label.
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained checkpoint with a sequence-classification head. The head
# size is taken from the label mapping instead of a hard-coded count, so it
# stays correct if the set of classes changes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    # Output location.
    # NOTE(review): the directory is called "distilbert" although model_id
    # points at bert-base-uncased; confirm before renaming, other cells may
    # read from this path.
    output_dir=f"{models_dir}/distilbert",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training ends.
    # NOTE(review): metric_for_best_model is not set, so "best" follows the
    # library default; confirm that this is the intended selection criterion.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

class AccuracyHistory(TrainerCallback):
    """Trainer callback that records ``eval_accuracy`` from every evaluation.

    The values are appended to ``eval_acc`` in the order the evaluations run.
    """

    def __init__(self):
        # One entry per evaluation that reported metrics.
        self.eval_acc = list()

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Store this evaluation's ``eval_accuracy``; ignore calls without metrics."""
        if metrics is None:
            return
        self.eval_acc.append(metrics["eval_accuracy"])

# Keep a handle on the callback so its recorded accuracies can be read later.
accuracy_history = AccuracyHistory()

# NOTE(review): the "test" split serves as the per-epoch evaluation set, and
# training_args asks for the best checkpoint to be reloaded at the end, so
# checkpoint selection sees the same split that is reported on afterwards.
# Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
    callbacks=[accuracy_history],
)

# Fine-tune the model.
trainer.train()

# Save the model held by the trainer after training.
# NOTE(review): the folder name says "distil_bert" while model_id is bert-base-uncased.
trainer.save_model(f"{models_dir}/best_distil_bert")

# Plot the evaluation accuracy recorded after each epoch (only evaluation
# accuracy is tracked; no training accuracy is collected).
# The callback appends one value per evaluation and evaluation runs once per
# epoch, so the i-th recorded value belongs to epoch i + 1. Plotting against
# explicit epoch numbers avoids the 0-based x axis of a bare list.
epochs = list(range(1, len(accuracy_history.eval_acc) + 1))
plt.figure(figsize=(10, 5))
plt.plot(epochs, accuracy_history.eval_acc, label='Eval Accuracy')
plt.xticks(epochs)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Eval Accuracy Evolution')
plt.legend()
plt.show()

# Run the trained model on the test split; keep the reference labels and turn
# each row of class scores into its arg-max class id.
logits, labels, _ = trainer.predict(tokenized_data["test"])
predictions = np.argmax(logits, axis=1)

# Build the confusion matrix over the full class list so that classes missing
# from this split still get a row and a column.
class_ids = list(label2id.values())
class_names = list(label2id.keys())
cm = confusion_matrix(labels, predictions, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

# Draw the matrix.
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 1.1372963190078735, 'eval_accuracy': 0.6666666666666666, 'eval_runtime': 0.7126, 'eval_samples_per_second': 122.09, 'eval_steps_per_second': 8.42, 'epoch': 1.0}
