In [1]:
import pandas as pd
import numpy as np
from transformers import ViTFeatureExtractor, ViTForImageClassification
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch
from sklearn.preprocessing import LabelEncoder
import os
from PIL import Image
from tqdm import tqdm


In [2]:
csv_path = 'archive/dogs.csv'
df = pd.read_csv(csv_path)

# Select breeds and filter dataframes
selected_breeds = ['Afghan', 'Basset', 'Beagle', 'Bearded Collie', 'Bermaise', 'Border Collie', 'Boxer', 'Bulldog', 'Chihuahua', 'Corgi', 'Coyote', 'Doberman', 
                   'French Bulldog', 'German Sheperd', 'Golden Retriever', 'Great Dane', 'Great Perenees', 'Greyhound', 'Irish Spaniel', 'Komondor', 'Labradoodle', 
                   'Labrador Retriever', 'Malinois', 'Maltese', 'Newfoundland', 'Pit Bull', 'Pomeranian', 'Poodle', 'Pug', 'Rottweiler', 'Saint Bernard', 'Shiba Inu', 
                   'Shih-Tzu', 'Siberian Husky', 'Yorkie']

train_df = df[df['data set'] == 'train']
valid_df = df[df['data set'] == 'valid']
test_df = df[df['data set'] == 'test']

train_df = train_df[train_df['labels'].isin(selected_breeds)]
valid_df = valid_df[valid_df['labels'].isin(selected_breeds)]
test_df = test_df[test_df['labels'].isin(selected_breeds)]

In [3]:
from PIL import Image

# Modify the DogDataset class to load images using PIL
class DogDataset(Dataset):
    def __init__(self, dataframe, feature_extractor, transforms=None, root_path="archive/"):
        self.dataframe = dataframe
        self.feature_extractor = feature_extractor
        self.transforms = transforms
        self.root_path = root_path

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_path, self.dataframe.iloc[idx]['filepaths'])
        label = self.dataframe.iloc[idx]['labels']

        # Load image using PIL
        image = Image.open(img_path)

        if self.transforms:
            image = self.transforms(image)

        # Use ViTFeatureExtractor
        image = self.feature_extractor(images=image, return_tensors='pt')['pixel_values'].squeeze(0)

        return {'input_ids': image, 'labels': label}


In [4]:
model_name = "google/vit-base-patch16-224"
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)




In [5]:
train_dataset = DogDataset(train_df, feature_extractor)
valid_dataset = DogDataset(valid_df, feature_extractor)
test_dataset = DogDataset(test_df, feature_extractor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [6]:
model = ViTForImageClassification.from_pretrained(model_name, num_labels=len(selected_breeds), ignore_mismatched_sizes=True)
model.classifier = torch.nn.Linear(in_features=model.config.hidden_size, out_features=len(selected_breeds))


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([35]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([35, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 15

In [9]:
for epoch in range(epochs):
    model.train()
    train_loader_iter = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)

    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        labels_np = LabelEncoder().fit(selected_breeds).transform(batch['labels'])
        labels = torch.tensor(labels_np, dtype=torch.long).to(device)  # Convert to PyTorch tensor and move to device
        
        outputs = model(pixel_values=inputs, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        train_loader_iter.set_postfix(loss=loss.item()) 
    # Validation loop (optional)
    model.eval()
    with torch.no_grad():
        for batch in valid_loader:
            inputs = batch['input_ids'].to(device)
            labels_np = LabelEncoder().fit(selected_breeds).transform(batch['labels'])
            labels = torch.tensor(labels_np, dtype=torch.long).to(device)
            outputs = model(pixel_values=inputs, labels=labels)
            val_loss = outputs.loss

    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}")


Epoch 1/15:   0%|          | 0/1007 [01:57<?, ?it/s, loss=0.133] 

Epoch 1/15, Training Loss: 0.13315236568450928, Validation Loss: 0.29032397270202637


                                                                

Epoch 2/15, Training Loss: 0.01978013664484024, Validation Loss: 0.08704998344182968


Epoch 3/15:   0%|          | 0/1007 [02:12<?, ?it/s, loss=0.0176] 

Epoch 3/15, Training Loss: 0.01763521321117878, Validation Loss: 0.1261780709028244


                                                                 

Epoch 4/15, Training Loss: 0.006311475299298763, Validation Loss: 0.10606090724468231


Epoch 5/15:   0%|          | 0/1007 [02:10<?, ?it/s, loss=0.0035] 

Epoch 5/15, Training Loss: 0.0034955416340380907, Validation Loss: 0.11935557425022125


                                                                 

Epoch 6/15, Training Loss: 0.004299204330891371, Validation Loss: 0.060031596571207047


Epoch 7/15:   0%|          | 0/1007 [02:14<?, ?it/s, loss=0.00214] 

Epoch 7/15, Training Loss: 0.0021432610228657722, Validation Loss: 0.04599820449948311


                                                                  

Epoch 8/15, Training Loss: 0.0015102678444236517, Validation Loss: 0.0343182310461998


Epoch 9/15:   0%|          | 0/1007 [02:14<?, ?it/s, loss=0.00102] 

Epoch 9/15, Training Loss: 0.0010159889934584498, Validation Loss: 0.03320109471678734


                                                                  

Epoch 10/15, Training Loss: 0.0005274072755128145, Validation Loss: 0.04371563717722893


Epoch 11/15:   0%|          | 0/1007 [02:14<?, ?it/s, loss=0.00026] 

Epoch 11/15, Training Loss: 0.0002604070177767426, Validation Loss: 0.03518763557076454


                                                                   

Epoch 12/15, Training Loss: 0.00020152595243416727, Validation Loss: 0.027159444987773895


Epoch 13/15:   0%|          | 0/1007 [02:13<?, ?it/s, loss=0.000195]

Epoch 13/15, Training Loss: 0.00019539202912710607, Validation Loss: 0.02208169922232628


                                                                    

Epoch 14/15, Training Loss: 6.946623761905357e-05, Validation Loss: 0.01988394744694233


Epoch 15/15:   0%|          | 0/1007 [01:58<?, ?it/s, loss=3.85e-5] 

Epoch 15/15, Training Loss: 3.85335115424823e-05, Validation Loss: 0.0127252247184515


In [13]:
correct_predictions = 0
total_samples = 0

model.eval()
model.to(device)  # Add this line to move the model to the same device as the inputs

with torch.no_grad():
    for batch in test_loader:
        inputs = batch['input_ids'].to(device)
        labels_np = LabelEncoder().fit(selected_breeds).transform(batch['labels'])
        labels = torch.tensor(labels_np, dtype=torch.long).to(device)

        outputs = model(pixel_values=inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        correct_predictions += (predictions == labels).sum().item()
        total_samples += labels.size(0)

test_accuracy = correct_predictions / total_samples
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 98.61%


In [14]:
model.save_pretrained("fine_tuned_vit_model_35(2)")


In [16]:
from PIL import Image
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch
import torch.nn.functional as F

# Load the fine-tuned model
model_path = "fine_tuned_vit_model_35"  # Change this path to the actual path where you saved the model
model = ViTForImageClassification.from_pretrained(model_path)
model.eval()

# Load the ViT feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# Load and preprocess a new image
new_image_path = "archive\\test\\Newfoundland\\04.jpg."  # Change this path to the path of your new image
new_image = Image.open(new_image_path)
inputs = feature_extractor(images=new_image, return_tensors='pt')['pixel_values']

# Make predictions
with torch.no_grad():
    outputs = model(pixel_values=inputs)
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=1)


# Get the predicted label
predicted_label = torch.argmax(logits).item()
predicted_probability = probabilities[0, predicted_label].item()

print(f"Predicted Label Index: {selected_breeds[predicted_label]}")
print(f"Predicted Probability: {predicted_probability:.2f}")

Predicted Label Index: Newfoundland
Predicted Probability: 1.00
