In [1]:
import os
from   datasets import load_dataset
import datasets


dataset_name = "food101"
path_dataset = "/raid/scratch/tuchsanai/food101"


# Download the subset only when no on-disk copy exists yet, then persist it.
# `save_to_disk` is a method of the Dataset object, not of the `datasets` module.
# An existing directory at `path_dataset` always takes precedence over the split
# requested here, so delete that directory to pick up a different subset.
if not os.path.exists(path_dataset):
    dataset = load_dataset(dataset_name, split="train[:1000]")
    dataset.save_to_disk(path_dataset)

# Always continue from the on-disk copy so every run sees the same data.
dataset = datasets.load_from_disk(path_dataset)
dataset = dataset.shuffle(seed=42)
# Rename the 'label' column to 'labels'
dataset = dataset.rename_column("label", "labels")


# 80/20 train/validation split with a fixed seed for reproducibility.
train_val_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_val_dataset["train"]
val_dataset = train_val_dataset["test"]

In [2]:
# NOTE(review): the saved output below reports 5000 rows although cell 1 requests
# split="train[:1000]"; presumably an older copy already existed at `path_dataset`
# and was loaded instead — confirm.
dataset

Dataset({
    features: ['image', 'labels'],
    num_rows: 5000
})

In [3]:
train_val_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 1000
    })
})

In [4]:
train_dataset 

Dataset({
    features: ['image', 'labels'],
    num_rows: 4000
})

In [5]:
val_dataset  

Dataset({
    features: ['image', 'labels'],
    num_rows: 1000
})

In [6]:
labels = train_val_dataset["train"].features["labels"].names

# Two lookup tables between class names and their index in `labels`;
# the index is stored in its string form on both sides.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [7]:
import torch
from datasets import load_dataset
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
from torchvision import transforms


# Checkpoint shared by the image processor and the classification model
model_name = "google/vit-base-patch16-224-in21k"

image_processor = AutoImageProcessor.from_pretrained(model_name)

# The number of output classes and the name/index mappings come from the dataset labels
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)



Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torchvision.transforms import Compose, ColorJitter, ToTensor

# Colour augmentation followed by conversion to a tensor.
# A scalar `hue` must lie in [0, 0.5] for ColorJitter; the previous value of 0.7
# is rejected with a ValueError when the transform is constructed.
jitter = Compose(
    [
        ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1),
        ToTensor(),
    ]
)

In [None]:
def transforms(examples):
    """Attach a ``pixel_values`` entry with the jittered RGB version of every image.

    NOTE: this function's name shadows the ``transforms`` module imported from
    torchvision earlier in the notebook.

    Args:
        examples: batch mapping that contains an ``"image"`` list of PIL images.

    Returns:
        The same ``examples`` mapping, with ``"pixel_values"`` added in place.
    """
    jittered = []
    for picture in examples["image"]:
        jittered.append(jitter(picture.convert("RGB")))
    examples["pixel_values"] = jittered
    return examples


# Register the function as the transform of `dataset`
dataset.set_transform(transforms)

In [None]:
def train_transforms(example_batch):
    """Augment and preprocess a training batch.

    Args:
        example_batch: batch mapping with an ``"image"`` list of PIL images and a
            ``"labels"`` list of class indices (the column renamed in cell 1).

    Returns:
        The image processor output with the batch's ``"labels"`` attached.
    """
    images = [jitter(x.convert("RGB")) for x in example_batch["image"]]
    # `jitter` ends with ToTensor(), which already scales pixels to [0, 1];
    # disable the processor's own rescaling so values are not divided by 255 twice.
    inputs = image_processor(images, do_rescale=False, return_tensors="pt")
    inputs["labels"] = example_batch["labels"]
    return inputs


def val_transforms(example_batch):
    """Preprocess a validation batch without augmentation.

    Args:
        example_batch: batch mapping with an ``"image"`` list of PIL images and a
            ``"labels"`` list of class indices.

    Returns:
        The image processor output with the batch's ``"labels"`` attached.
    """
    images = [x.convert("RGB") for x in example_batch["image"]]
    inputs = image_processor(images, return_tensors="pt")
    inputs["labels"] = example_batch["labels"]
    return inputs


# `with_transform` returns new dataset objects, so the underlying train/validation
# splits keep their original format for the other cells that read them.
train_ds = train_dataset.with_transform(train_transforms)
test_ds = val_dataset.with_transform(val_transforms)

In [None]:


# Preprocess the dataset
def preprocess_function(examples):
    """Run the image processor over a batch and carry the labels along.

    Args:
        examples: batch mapping with an ``"image"`` list of PIL images and a
            ``"labels"`` list of class indices.

    Returns:
        The image processor output with ``"labels"`` set to the batch's labels.
    """
    # Force three channels so grayscale / RGBA / CMYK files cannot reach the model
    # with an unexpected channel count.
    images = [image.convert("RGB") for image in examples["image"]]
    inputs = image_processor(images, return_tensors="pt")
    inputs["labels"] = examples["labels"]
    return inputs



# Never start more worker processes than there are CPUs on this machine;
# 200 remains the upper bound on hosts that have that many cores.
processed_dataset = train_val_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=min(200, os.cpu_count() or 1),
)


In [8]:
processed_dataset 

DatasetDict({
    train: Dataset({
        features: ['image', 'labels', 'pixel_values'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['image', 'labels', 'pixel_values'],
        num_rows: 1000
    })
})

In [9]:
model 

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [10]:
image_processor 

ViTImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [11]:
import numpy as np
import evaluate


metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the module-level ``metric``.

    Args:
        eval_pred: two-item sequence ``(predictions, labels)``; the predicted
            class is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes vs. labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

# Define the data collator
def collate_fn(examples):
    """Assemble a list of examples into one batch of tensors.

    Args:
        examples: sequence of mappings, each with ``"pixel_values"`` and ``"labels"``.

    Returns:
        Dict with ``"pixel_values"`` (per-example tensors stacked along a new
        leading dimension) and ``"labels"`` (one tensor built from all labels).
    """
    image_tensors = []
    class_ids = []
    for item in examples:
        image_tensors.append(torch.tensor(item["pixel_values"]))
        class_ids.append(item["labels"])
    return {
        "pixel_values": torch.stack(image_tensors),
        "labels": torch.tensor(class_ids),
    }


# Define the training arguments
training_args = TrainingArguments(
    # Where results are written, and whether to publish them
    output_dir="./results",
    push_to_hub=False,
    # Optimisation
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging
    logging_steps=10,
    # Do not let the Trainer drop dataset columns before they reach the collator
    remove_unused_columns=False,
)

# Create the Trainer: the model, its arguments, the two preprocessed splits,
# and the batching / metric helpers defined in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=image_processor,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
)

# Train the model and keep the returned output: the metrics cell further down
# reads `train_results.metrics`, which is undefined unless it is bound here.
train_results = trainer.train()
# Leave the result as the cell's last expression so it is still displayed.
train_results

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.4492,2.30428,0.846
2,1.726,1.678733,0.903
3,1.4284,1.533349,0.909


TrainOutput(global_step=189, training_loss=2.1888368798311424, metrics={'train_runtime': 1407.7048, 'train_samples_per_second': 8.525, 'train_steps_per_second': 0.134, 'total_flos': 9.307289843712e+17, 'train_loss': 2.1888368798311424, 'epoch': 3.0})

In [None]:


# Log and persist the training metrics, then save the trainer state.
# `train_results` is expected to be the object returned by `trainer.train()`.
train_metrics = train_results.metrics
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)
trainer.save_state()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 