In [1]:
import torch
import torch.nn as nn
from torchvision import *
from transformers import ViTModel, ViTForImageClassification, AutoImageProcessor
from datasets import load_dataset

2024-01-30 09:50:20.515557: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dataset = load_dataset("imagefolder", data_dir="tomato")

Resolving data files:   0%|          | 0/10001 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1000 [00:00<?, ?it/s]

In [3]:
from datasets import load_metric

metric = load_metric("accuracy")

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1000
    })
})

In [5]:
labels = dataset["train"].features["label"].names
label2id, id2label = dict(), dict()

for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label
    
id2label[2]

'Tomato___Late_blight'

## Preprocessing the data

In [6]:
model_checkpoint = 'google/vit-base-patch16-224-in21k'
batch_size = 32

In [7]:
image_processor = AutoImageProcessor.from_pretrained(model_checkpoint)
image_processor

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [8]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

normalize = Normalize(mean = image_processor.image_mean, std = image_processor.image_std)

In [9]:
if "height" in image_processor.size:
    size = (image_processor.size["height"], image_processor.size["width"])
    crop_size = size
    max_size = None
    
elif "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
    crop_size = (size, size)
    max_size = image_processor.size.get("longest_edge")

In [10]:
train_transforms = Compose(
        [
            RandomResizedCrop(crop_size),
            RandomHorizontalFlip(),
            ToTensor(),
            normalize,
        ]
    )

val_transforms = Compose(
        [
            Resize(size),
            CenterCrop(crop_size),
            ToTensor(),
            normalize,
        ]
    )

In [11]:
# Defining preprocess functions

def preprocess_train(example_batch):
    example_batch["pixel_values"] = [
        train_transforms(image.convert("RGB")) for image in example_batch["image"]
    ]
    
    return example_batch

def preprocess_val(example_batch):
    example_batch["pixel_values"] = [
        val_transforms(image.convert("RGB")) for image in example_batch["image"]
    ]
    return example_batch

In [12]:
splits = dataset["train"].train_test_split(test_size=0.1)
train_ds = splits['train']
val_ds = splits['test']

In [13]:
train_ds[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256>,
 'label': 3}

In [14]:
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

In [15]:
train_ds[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256>,
 'label': 3,
 'pixel_values': tensor([[[ 0.4275,  0.3412,  0.2078,  ..., -0.1373, -0.1373, -0.0902],
          [ 0.1451,  0.0980,  0.2235,  ..., -0.0824, -0.0745, -0.0275],
          [ 0.0039,  0.0745,  0.2392,  ..., -0.0353, -0.0196,  0.0431],
          ...,
          [-0.6000, -0.4824, -0.4824,  ...,  0.4510,  0.4431,  0.4431],
          [-0.5922, -0.5137, -0.5216,  ...,  0.4588,  0.4588,  0.4667],
          [-0.4980, -0.5216, -0.5137,  ...,  0.4824,  0.4824,  0.4980]],
 
         [[ 0.3725,  0.2863,  0.1529,  ...,  0.1216,  0.1216,  0.1686],
          [ 0.0902,  0.0431,  0.1686,  ...,  0.1686,  0.1686,  0.2157],
          [-0.0510,  0.0196,  0.1843,  ...,  0.2000,  0.2157,  0.2627],
          ...,
          [-0.5922, -0.4902, -0.5137,  ...,  0.4431,  0.4353,  0.4353],
          [-0.6235, -0.5529, -0.5765,  ...,  0.4510,  0.4510,  0.4588],
          [-0.6000, -0.6235, -0.6235,  ...,  0.4745,  0.4745,  0.4902]]

In [16]:
train_ds[0]["pixel_values"].shape

torch.Size([3, 224, 224])

In [17]:
from torchvision.models import resnet50, vgg16
cnn_base = resnet50(pretrained=True)



In [18]:
for param in cnn_base.parameters():
    param.requires_grad = False

In [19]:
cnn_base

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [20]:
for epoch in range(3):
    for example in train_ds:
        image = example['image']
        label = example['label']

        # Convert PIL image to tensor
        image_tensor = torchvision.transforms.ToTensor()(image)
        image_tensor = image_tensor.unsqueeze(0) 
        print("image_tensor", image_tensor.shape)
        optimizer.zero_grad()
        outputs = cnn_base(image_tensor)  # Pass tensor as input
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

NameError: name 'torchvision' is not defined

In [None]:
cnn_features = cnn_base.avgpool

In [None]:
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224")

In [None]:
#Connect CNN to ViT
num_classes = 10
class HybridModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn_base = cnn_base
        self.vit_model = vit_model
        self.classifier = nn.Linear(vit_model.config.hidden_size, num_classes)
    
    def forward(self,x):
        x = self.cnn_base(x)
        x = x.view(1, *x.shape) 
        x = x.flatten(start_dim = 1)
        x = self.vit_model(x)[0]
        x = self.classifier(x)
        return x

In [None]:
model = HybridModel()

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 

In [None]:
import torchvision

In [None]:
for epoch in range(3):
    for example in train_ds:
        image = example['image']
        label = example['label']

        # Convert PIL image to tensor
        image_tensor = torchvision.transforms.ToTensor()(image)
        image_tensor = image_tensor.unsqueeze(0) 
        print("image_tensor", image_tensor.shape)
        optimizer.zero_grad()
        outputs = model(image_tensor)  # Pass tensor as input
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        