In [25]:
import numpy as np

# Define the shapes
x_shape = (5, 224, 224, 3)
y_shape = (5,)

# Generate random x_train data
x_train = np.random.rand(*x_shape)

# Generate random y_train labels
y_train = np.random.rand(*y_shape)

# Now, x_train has shape (5, 224, 224, 3) and y_train has shape (5, 1000)


In [4]:
from datasets import load_dataset

ds = load_dataset('beans')
ds

DatasetDict({
    train: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 1034
    })
    validation: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 133
    })
    test: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 128
    })
})

In [3]:
print(np.shape(y_train),np.shape(x_train))

(5, 1000) (5, 224, 224, 3)


In [5]:
from transformers import ViTFeatureExtractor

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)




In [7]:
def process_example(example):
    inputs = feature_extractor(example['image'], return_tensors='pt')
    inputs['labels'] = example['labels']
    return inputs


In [48]:
ds = load_dataset('beans')

def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = feature_extractor([x for x in example_batch['image']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['labels']
    return inputs


In [9]:
prepared_ds = ds.with_transform(transform)


In [49]:
import torch

def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }


In [50]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)


In [51]:
from transformers import ViTForImageClassification

labels = ds['train'].features['labels'].names

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-base-beans",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=2,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
)


In [53]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    tokenizer=feature_extractor,
)


In [54]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()


Step,Training Loss,Validation Loss,Accuracy
100,0.0418,0.042392,1.0


***** train metrics *****
  epoch                    =         2.0
  total_flos               = 149248978GF
  train_loss               =      0.2164
  train_runtime            =  0:01:14.35
  train_samples_per_second =      27.811
  train_steps_per_second   =       1.748


In [55]:
import torch
import numpy as np

# Generate random data and labels for demonstration
x_train = np.random.rand(5, 224, 224, 3)
y_train = np.random.randint(0, 10, size=(5,))


In [66]:
import torch
import numpy as np
from transformers import ViTForImageClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset

import torch
import numpy as np
from transformers import ViTForImageClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset

# Define your random data and labels
x_train = np.random.rand(5, 3, 224, 224)  # Reshape the data to (batch_size, num_channels, height, width)
y_train = np.random.randint(0, 10, size=(5,), dtype=np.int64)  # Ensure labels have the correct data type

# Define a custom dataset
class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return {
            'pixel_values': torch.tensor(self.x[idx]),
            'labels': torch.tensor(self.y[idx], dtype=torch.long)  # Ensure labels have the correct data type
        }

# Create an instance of the custom dataset
custom_dataset = CustomDataset(x_train, y_train)

# ... (rest of your code remains the same)


# Define your model
model_name_or_path = 'google/vit-base-patch16-224-in21k'
labels = list(range(10))

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
)

# Define your training arguments
training_args = TrainingArguments(
    output_dir="./vit-base-beans",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=2,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    load_best_model_at_end=True,
)

# Define a compute_metrics function if needed
from sklearn.metrics import accuracy_score
import torch

def collate_fn(batch):
    # Assuming 'batch' is a list of dictionaries with 'pixel_values' and 'labels'
    pixel_values = torch.stack([x['pixel_values'] for x in batch])
    labels = torch.tensor([x['labels'] for x in batch])
    
    return {
        'pixel_values': pixel_values,
        'labels': labels
    }

def compute_metrics(p):
    # Assuming p.label_ids contains your y_train as a list or array
    references = np.array(p.label_ids)
    predictions = np.argmax(p.predictions, axis=1)
    
    return metric.compute(predictions=predictions, references=references)

# Generate random data for validation
num_samples_val = 5
x_val = np.random.rand(num_samples_val, 224, 224, 3)  # Assuming 224x224x3 image data
y_val = np.random.randint(0, 2, size=num_samples_val) 

# Create a custom dataset for validation
val_dataset = torch.utils.data.TensorDataset(torch.tensor(x_val), torch.tensor(y_val))



# Define your Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=None,  # You don't need data_collator when using a custom DataLoader
    
    compute_metrics=compute_metrics,
    train_dataset=custom_dataset,  # Pass the custom dataset directly
)

import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")



# Train your model with the random data
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

results = trainer.evaluate()

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


***** train metrics *****
  epoch                    =        2.0
  total_flos               =   721752GF
  train_loss               =      2.086
  train_runtime            = 0:00:00.60
  train_samples_per_second =     16.661
  train_steps_per_second   =      3.332


TypeError: vars() argument must have __dict__ attribute

In [69]:
import torch
x_val_tensor = torch.tensor(x_val.transpose((0, 3, 1, 2)), dtype=torch.float32)  # Transpose to match model input format
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x_val_tensor = x_val_tensor.to(device)
model.eval()
with torch.no_grad():
    output = model(x_val_tensor)  # This assumes your model outputs logits
logits = output.logits
pred = logits.argmax(dim=1).cpu().numpy()
pred[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [70]:
y_val

array([0, 1, 0, 1, 1])