In [24]:
import pandas as pd
import numpy as np
from transformers import ViTFeatureExtractor, ViTForImageClassification
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch
from sklearn.preprocessing import LabelEncoder
import os
from PIL import Image
from tqdm import tqdm


In [25]:
csv_path = 'archive/dogs.csv'
df = pd.read_csv(csv_path)

# Select breeds and filter dataframes
selected_breeds = ['Afghan', 'Basset', 'Beagle', 'Bearded Collie', 'Bermaise', 'Bloodhound', 'Border Collie', 'Boston Terrier', 
                   'Boxer', 'Bulldog', 'Chihuahua', 'Chow', 'Corgi', 'Coyote', 'Doberman', 'French Bulldog', 'German Sheperd', 
                   'Golden Retriever', 'Great Dane', 'Great Perenees', 'Greyhound', 'Irish Spaniel', 'Japanese Spaniel', 'Komondor',
                   'Labradoodle', 'Labrador Retriever', 'Malinois', 'Maltese', 'Newfoundland', 'Pekinese', 'Pit Bull', 'Pomeranian',
                   'Poodle', 'Pug', 'Rottweiler', 'Saint Bernard', 'Shiba Inu', 'Shih-Tzu', 'Siberian Husky', 'Yorkie']

train_df = df[df['data set'] == 'train']
valid_df = df[df['data set'] == 'valid']
test_df = df[df['data set'] == 'test']

train_df = train_df[train_df['labels'].isin(selected_breeds)]
valid_df = valid_df[valid_df['labels'].isin(selected_breeds)]
test_df = test_df[test_df['labels'].isin(selected_breeds)]

In [26]:
len(selected_breeds)

40

In [27]:
from PIL import Image

# Modify the DogDataset class to load images using PIL
class DogDataset(Dataset):
    def __init__(self, dataframe, feature_extractor, transforms=None, root_path="archive/"):
        self.dataframe = dataframe
        self.feature_extractor = feature_extractor
        self.transforms = transforms
        self.root_path = root_path

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_path, self.dataframe.iloc[idx]['filepaths'])
        label = self.dataframe.iloc[idx]['labels']

        # Load image using PIL
        image = Image.open(img_path)

        if self.transforms:
            image = self.transforms(image)

        # Use ViTFeatureExtractor
        image = self.feature_extractor(images=image, return_tensors='pt')['pixel_values'].squeeze(0)

        return {'input_ids': image, 'labels': label}
model_name = "google/vit-base-patch16-224"
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
train_dataset = DogDataset(train_df, feature_extractor)
valid_dataset = DogDataset(valid_df, feature_extractor)
test_dataset = DogDataset(test_df, feature_extractor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)



In [28]:
model = ViTForImageClassification.from_pretrained(model_name, num_labels=len(selected_breeds), ignore_mismatched_sizes=True)
model.classifier = torch.nn.Linear(in_features=model.config.hidden_size, out_features=len(selected_breeds))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([40]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([40, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [29]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 35

In [30]:
for epoch in range(epochs):
    model.train()
    train_loader_iter = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)

    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        labels_np = LabelEncoder().fit(selected_breeds).transform(batch['labels'])
        labels = torch.tensor(labels_np, dtype=torch.long).to(device)  # Convert to PyTorch tensor and move to device
        
        outputs = model(pixel_values=inputs, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        train_loader_iter.set_postfix(loss=loss.item()) 
    # Validation loop (optional)
    model.eval()
    with torch.no_grad():
        for batch in valid_loader:
            inputs = batch['input_ids'].to(device)
            labels_np = LabelEncoder().fit(selected_breeds).transform(batch['labels'])
            labels = torch.tensor(labels_np, dtype=torch.long).to(device)
            outputs = model(pixel_values=inputs, labels=labels)
            val_loss = outputs.loss

    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}")


Epoch 1/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=0.249] 

Epoch 1/35, Training Loss: 0.24925999343395233, Validation Loss: 0.1590338498353958


                                                                

Epoch 2/35, Training Loss: 0.015759164467453957, Validation Loss: 0.08483875542879105


Epoch 3/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=0.0135] 

Epoch 3/35, Training Loss: 0.013492795638740063, Validation Loss: 0.04303399845957756


                                                                 

Epoch 4/35, Training Loss: 0.004407690372318029, Validation Loss: 0.04036957398056984


Epoch 5/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=0.00378]

Epoch 5/35, Training Loss: 0.0037841994781047106, Validation Loss: 0.01949186623096466


                                                                  

Epoch 6/35, Training Loss: 0.0016689986223354936, Validation Loss: 0.018358057364821434


Epoch 7/35:   0%|          | 0/1148 [02:14<?, ?it/s, loss=0.00164] 

Epoch 7/35, Training Loss: 0.0016377372667193413, Validation Loss: 0.015830032527446747


                                                                  

Epoch 8/35, Training Loss: 0.0040533277206122875, Validation Loss: 0.11620575934648514


Epoch 9/35:   0%|          | 0/1148 [02:14<?, ?it/s, loss=0.00396] 

Epoch 9/35, Training Loss: 0.003963998984545469, Validation Loss: 0.004260318819433451


                                                                  

Epoch 10/35, Training Loss: 0.000977862160652876, Validation Loss: 0.00397651270031929


Epoch 11/35:   0%|          | 0/1148 [02:14<?, ?it/s, loss=0.000793]

Epoch 11/35, Training Loss: 0.0007932308944873512, Validation Loss: 0.0023873301688581705


                                                                    

Epoch 12/35, Training Loss: 0.0004160024400334805, Validation Loss: 0.0017049345187842846


Epoch 13/35:   0%|          | 0/1148 [02:14<?, ?it/s, loss=0.000116]

Epoch 13/35, Training Loss: 0.00011637906573014334, Validation Loss: 0.0011908210581168532


                                                                    

Epoch 14/35, Training Loss: 8.626317139714956e-05, Validation Loss: 0.0008139600977301598


Epoch 15/35:   0%|          | 0/1148 [02:14<?, ?it/s, loss=4.81e-5] 

Epoch 15/35, Training Loss: 4.8119487473741174e-05, Validation Loss: 0.0005219408194534481


                                                                   

Epoch 16/35, Training Loss: 4.672890645451844e-05, Validation Loss: 0.00032208123593591154


Epoch 17/35:   0%|          | 0/1148 [02:14<?, ?it/s, loss=1.2e-5]  

Epoch 17/35, Training Loss: 1.2000317838101182e-05, Validation Loss: 0.00020890652376692742


                                                                  

Epoch 18/35, Training Loss: 1.5934172552078962e-05, Validation Loss: 0.00013393988774623722


Epoch 19/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=6.2e-6] 

Epoch 19/35, Training Loss: 6.198863502504537e-06, Validation Loss: 8.503114804625511e-05


                                                                  

Epoch 20/35, Training Loss: 1.1086402082582936e-05, Validation Loss: 5.225147833698429e-05


Epoch 21/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=2.7e-6] 

Epoch 21/35, Training Loss: 2.7020726065529743e-06, Validation Loss: 3.111292971880175e-05


                                                                  

Epoch 22/35, Training Loss: 3.5762778338721546e-07, Validation Loss: 1.8556660506874323e-05


Epoch 23/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=1.19e-6]

Epoch 23/35, Training Loss: 1.1920920996999484e-06, Validation Loss: 1.0609543096506968e-05


                                                                   

Epoch 24/35, Training Loss: 1.1126193157906528e-06, Validation Loss: 6.635948466282571e-06


Epoch 25/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=1.99e-7]

Epoch 25/35, Training Loss: 1.9868211609264108e-07, Validation Loss: 4.331255240686005e-06


                                                                   

Epoch 26/35, Training Loss: 7.947285496356926e-08, Validation Loss: 3.099433115494321e-06


Epoch 27/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=7.95e-8]

Epoch 27/35, Training Loss: 7.947285496356926e-08, Validation Loss: 1.9868182334903395e-06


                                                                   

Epoch 28/35, Training Loss: 3.973642748178463e-08, Validation Loss: 1.6291911606458598e-06


Epoch 29/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=0]      

Epoch 29/35, Training Loss: 0.0, Validation Loss: 1.3907732636653236e-06


                                                             

Epoch 30/35, Training Loss: 7.94728478581419e-08, Validation Loss: 1.4305096556199715e-06


Epoch 31/35:   0%|          | 0/1148 [02:13<?, ?it/s, loss=0]      

Epoch 31/35, Training Loss: 0.0, Validation Loss: 1.6291911606458598e-06


                                                             

Epoch 32/35, Training Loss: 0.0, Validation Loss: 1.5894548823780497e-06


Epoch 33/35:   0%|          | 0/1148 [02:17<?, ?it/s, loss=0]      

Epoch 33/35, Training Loss: 0.0, Validation Loss: 2.145762891814229e-06


                                                             

Epoch 34/35, Training Loss: 0.0, Validation Loss: 2.46365311795671e-06


Epoch 35/35:   0%|          | 0/1148 [02:14<?, ?it/s, loss=0]      

Epoch 35/35, Training Loss: 0.0, Validation Loss: 3.0994317512522684e-06


In [31]:
model.save_pretrained("fine_tuned_vit_model_40(2)")


In [36]:
from PIL import Image
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch
import torch.nn.functional as F

# Load the fine-tuned model
model_path = "fine_tuned_vit_model_40"  # Change this path to the actual path where you saved the model
model = ViTForImageClassification.from_pretrained(model_path)
model.eval()

# Load the ViT feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# Load and preprocess a new image
new_image_path = "archive\\test\\Yorkie\\04.jpg."  # Change this path to the path of your new image
new_image = Image.open(new_image_path)
inputs = feature_extractor(images=new_image, return_tensors='pt')['pixel_values']

# Make predictions
with torch.no_grad():
    outputs = model(pixel_values=inputs)
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=1)


# Get the predicted label
predicted_label = torch.argmax(logits).item()
predicted_probability = probabilities[0, predicted_label].item()

print(f"Predicted Label Index: {selected_breeds[predicted_label]}")
print(f"Predicted Probability: {predicted_probability:.2f}")



Predicted Label Index: Yorkie
Predicted Probability: 1.00


In [37]:
correct_predictions = 0
total_samples = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()
model.to(device)  # Move the model to the GPU

with torch.no_grad():
    for batch in valid_loader:
        inputs = batch['input_ids'].to(device)
        labels_np = LabelEncoder().fit(selected_breeds).transform(batch['labels'])
        labels = torch.tensor(labels_np, dtype=torch.long).to(device)

        outputs = model(pixel_values=inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        correct_predictions += (predictions == labels).sum().item()
        total_samples += labels.size(0)

test_accuracy = correct_predictions / total_samples
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 98.59%
