## Training the Vision Transformer on a Wafer Dataset




Let's start by mounting Google Drive, changing into the dataset folder, and installing the relevant libraries.

In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative dataset and checkpoint paths used later are resolved from here.
%cd /content/drive/My\ Drive/Mixed\ Type Wafer/
# Print the working directory to confirm the change took effect.
!pwd

In [None]:
# Install Hugging Face transformers directly from the GitHub repository (quiet mode).
# NOTE(review): this is unpinned, so every run may pick up a different revision — consider pinning.
!pip install -q git+https://github.com/huggingface/transformers

In [None]:
# Install torchvision, used below for ImageFolder and the image transforms.
# NOTE(review): unpinned — confirm the installed version matches the runtime's torch build.
!pip install torchvision

## Loading the Dataset

Next, load the folder-structured dataset with torchvision's `ImageFolder`, which assigns an integer label to each class sub-folder. Every image is padded by 86 pixels on each side and converted to a tensor:

In [None]:
import numpy as np 
import matplotlib.pyplot as plt


In [None]:
import torchvision

from torchvision.transforms import ToTensor,Compose,Pad

# One preprocessing pipeline shared by all three splits: pad every border by
# 86 pixels, then convert the PIL image to a float tensor.
# NOTE(review): presumably the padding brings the wafer maps up to the 224x224
# input size of the ViT checkpoint — confirm against the raw image size.
# (An earlier revision read the splits from /content/train, /content/valid and /content/test.)
_pad_to_tensor = Compose([Pad(86), ToTensor()])

# ImageFolder derives the class labels from the sub-folder names of each split.
train_ds = torchvision.datasets.ImageFolder(
    root='Feature_Images(train_test_valid)/train',
    transform=_pad_to_tensor,
)
valid_ds = torchvision.datasets.ImageFolder(
    root='Feature_Images(train_test_valid)/valid',
    transform=_pad_to_tensor,
)
test_ds = torchvision.datasets.ImageFolder(
    root='Feature_Images(train_test_valid)/test',
    transform=_pad_to_tensor,
)



## Define the Model

Here we define the model.

The model itself uses a linear layer on top of a pre-trained `ViTModel`. We place a linear layer on top of the last hidden state of the [CLS] token, which serves as a good representation of an entire image. We also add dropout for regularization.

**Note:** The Vision Transformer pretrained model can be used as a regular PyTorch layer.

In [None]:
from transformers import ViTModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn
import torch.nn.functional as F

class ViTForImageClassification(nn.Module):
    """Pre-trained ViT encoder with dropout and a linear classification head.

    The head is applied to the last hidden state of the first token (the
    [CLS] token) of the encoder output.

    Args:
        num_labels: Number of target classes, i.e. the width of the output layer.
    """

    def __init__(self, num_labels=38):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.vit.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels=None):
        """Classify a batch of images and optionally report the loss.

        Args:
            pixel_values: Image batch forwarded unchanged to the ViT encoder.
            labels: Optional tensor of integer class indices, one per image.
                When omitted (or ``None``) no loss is computed.

        Returns:
            A ``(logits, loss)`` tuple. ``logits`` holds one row of
            ``num_labels`` scores per image. ``loss`` is ``None`` without
            labels; otherwise it is the cross-entropy as a plain Python float.
            The float is detached from the autograd graph, so it is suitable
            for logging but cannot be back-propagated; compute the loss from
            ``logits`` externally for training.
        """
        outputs = self.vit(pixel_values=pixel_values)
        # Index 0 along the sequence axis selects the first ([CLS]) token.
        cls_state = outputs.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(cls_state))

        if labels is None:
            return logits, None

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return logits, loss.item()

## Define the Model Parameters

To train this model, we will train for 6 epochs, with a batch size of 10 and a learning rate of 2e-5:

In [None]:
# Number of full passes over the training set.
EPOCHS = 6
# Images per batch, used by both the train and test DataLoaders.
BATCH_SIZE = 10
# Step size handed to the Adam optimizer.
LEARNING_RATE = 2e-5

We will use the pretrained Vision Transformer feature extractor, an Adam Optimizer, and a Cross Entropy Loss function.

In [None]:
from transformers import ViTFeatureExtractor
import torch.nn as nn
import torch
# Classifier with one output per class folder found in the training set.
model = ViTForImageClassification(num_labels=len(train_ds.classes))

# Preprocessing that matches the pre-trained checkpoint.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

# Adam over every model parameter (encoder and classification head alike).
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Loss used by the training loop.
loss_func = nn.CrossEntropyLoss()

# Prefer the GPU when one is present; moving to the CPU device is a no-op.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

## Train the Model

In [None]:
# Load the TensorBoard notebook extension and open a dashboard on the ./logs directory.
# NOTE(review): nothing visible in this notebook writes to ./logs — confirm the dashboard is still needed.
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# from tensorflow.keras.callbacks import TensorBoard

# from pytorch_lightning.callbacks import Callback


# tensorboard_callback = TensorBoard(logdir, histogram_freq=1)

In [None]:
# torch.Tensor.cpu().tensor(np.stack(feature_extractor(x)['pixel_values'], axis=0))

In [None]:
import torch.utils.data as data
from torch.autograd import Variable
import numpy as np
# from pytorch_lightning.callbacks import ModelCheckpoint
from transformers.integrations import TensorBoardCallback


print("Number of train samples: ", len(train_ds))
print("Number of test samples: ", len(test_ds))
print("Detected Classes are: ", train_ds.class_to_idx)

train_loader = data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
test_loader = data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Optional sanity check: uncomment to look at one training sample.
# train_features, train_labels = next(iter(train_loader))
# print(f"Feature batch shape: {train_features.size()}")
# print(f"Labels batch shape: {train_labels.size()}")
# plt.imshow(train_features[0].permute(1, 2, 0))
# plt.show()
# print(f"Label: {train_labels[0]}")


def _to_pixel_values(batch):
    """Run the ViT feature extractor over one DataLoader image batch.

    Args:
        batch: CPU tensor of images whose first axis indexes the images.

    Returns:
        A tensor stacking the extractor's ``pixel_values`` for every image.
    """
    # Iterating the first axis works for any batch length, including a final
    # short batch (splitting into exactly BATCH_SIZE pieces would raise there).
    images = list(batch.numpy())
    return torch.tensor(np.stack(feature_extractor(images)['pixel_values'], axis=0))


def _cycle(loader):
    """Yield batches from ``loader`` indefinitely, restarting it when exhausted."""
    while True:
        yield from loader


# A single long-lived iterator avoids re-spawning the loader's worker
# processes every time a test batch is needed.
test_batches = _cycle(test_loader)

net_accuracy = []
model.train()
for epoch in range(EPOCHS):
    epoch_accuracy = []
    for step, (x, y) in enumerate(train_loader):
        b_x = _to_pixel_values(x).to(device)
        b_y = y.to(device)

        # The model's own loss is a detached float, so request logits only
        # and compute a differentiable loss here.
        output, _ = model(b_x, None)
        loss = loss_func(output, b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 50 == 0:
            # Keep the training loss before evaluating so the printed value
            # really is the train loss of this step.
            train_loss = loss.item()

            test_x, test_y = next(test_batches)
            test_x = _to_pixel_values(test_x).to(device)
            test_y = test_y.to(device)

            # Evaluate without dropout and without building an autograd graph.
            model.eval()
            with torch.no_grad():
                test_output, _ = model(test_x, test_y)
            model.train()

            # Divide by the real batch length, which may be shorter than BATCH_SIZE.
            accuracy = (test_output.argmax(1) == test_y).sum().item() / len(test_y)
            epoch_accuracy.append(accuracy)
            net_accuracy.append(accuracy)
            print('Epoch: ', epoch, '| train loss: %.4f' % train_loss, '| test accuracy: %.2f' % accuracy)

    print(f'After Epoch : {epoch}, Epoch Accuracy: {sum(epoch_accuracy) / len(epoch_accuracy):.2f}')

print(f'After Epoch : {epoch}, Total Accuracy: {sum(net_accuracy) / len(net_accuracy):.2f}')



    # checkpoint_callback = ModelCheckpoint(
    # monitor="val_loss",
    # dirpath="my/path/",
    # filename="sample-mnist-{epoch:02d}-{val_loss:.2f}",
    # save_top_k=3,
    # mode="min",
    # )
    

Number of train samples:  26610
Number of test samples:  5700
Detected Classes are:  {'C1': 0, 'C10': 1, 'C11': 2, 'C12': 3, 'C13': 4, 'C14': 5, 'C15': 6, 'C16': 7, 'C17': 8, 'C18': 9, 'C19': 10, 'C2': 11, 'C20': 12, 'C21': 13, 'C22': 14, 'C23': 15, 'C24': 16, 'C25': 17, 'C26': 18, 'C27': 19, 'C28': 20, 'C29': 21, 'C3': 22, 'C30': 23, 'C31': 24, 'C32': 25, 'C33': 26, 'C34': 27, 'C35': 28, 'C36': 29, 'C37': 30, 'C38': 31, 'C4': 32, 'C5': 33, 'C6': 34, 'C7': 35, 'C8': 36, 'C9': 37}


  cpuset_checked))
  cpuset_checked))


Epoch:  0 | train loss: 3.6461 | test accuracy: 0.00
Epoch:  0 | train loss: 3.5586 | test accuracy: 0.10
Epoch:  0 | train loss: 3.2582 | test accuracy: 0.40
Epoch:  0 | train loss: 3.2217 | test accuracy: 0.30
Epoch:  0 | train loss: 2.7622 | test accuracy: 0.40
Epoch:  0 | train loss: 2.8103 | test accuracy: 0.40
Epoch:  0 | train loss: 2.3865 | test accuracy: 0.60
Epoch:  0 | train loss: 2.4244 | test accuracy: 0.50
Epoch:  0 | train loss: 2.0738 | test accuracy: 0.70
Epoch:  0 | train loss: 2.1545 | test accuracy: 0.60
Epoch:  0 | train loss: 1.9109 | test accuracy: 0.60
Epoch:  0 | train loss: 2.0549 | test accuracy: 0.80


In [None]:
# Re-open the TensorBoard dashboard on ./logs after training.
# NOTE(review): this repeats the earlier TensorBoard cell — confirm both are needed.
%load_ext tensorboard
%tensorboard --logdir logs

## Evaluate on a Test Image

Finally, let's evaluate the model on a single image drawn at random from the validation set:

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch.utils.data as data
from torch.autograd import Variable

EVAL_BATCH = 1
eval_loader = data.DataLoader(valid_ds, batch_size=EVAL_BATCH, shuffle=True, num_workers=4)

# Switch to inference mode so dropout does not perturb the prediction.
model.eval()

# Disable grad
with torch.no_grad():
    # shuffle=True makes the first batch a random validation sample.
    inputs, target = next(iter(eval_loader))
    print(inputs.shape)

    # Move the channel axis last; this layout is kept for plotting and is
    # also what gets handed to the feature extractor.
    originalInput = inputs[0].permute(1, 2, 0)
    inputs = torch.tensor(np.stack(feature_extractor(originalInput)['pixel_values'], axis=0))

    # Send to appropriate computing device
    inputs = inputs.to(device)
    target = target.to(device)

    # Generate prediction
    prediction, loss = model(inputs, target)

# ImageFolder's `classes` list is ordered by class index, so it maps an index
# straight back to its folder name.
predicted_class = prediction.argmax(1).item()
value_predicted = valid_ds.classes[predicted_class]
value_target = valid_ds.classes[target.item()]

# Show result
plt.imshow(originalInput)
# NOTE(review): x-limits running from 224 down to 0 draw the image mirrored
# left-to-right — confirm this is intended.
plt.xlim(224,0)
plt.ylim(224,0)
plt.title(f'Prediction: {value_predicted} - Actual target: {value_target}')
plt.show()

## Save the Entire Model

We can save the entire model as follows:

In [None]:
# Serialize the whole model object (not only its weights) into the working directory;
# the "Use your Exported Model" cell reads this same path back.
torch.save(model, './model_6_epoch.pt')

## Export Trained Model

Now that you have trained your custom Vision Transformer, you can export the trained model for inference on another device.

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# %cp /content/model.pt /content/gdrive/My\ Drive

## Use your Exported Model

In [None]:
MODEL_PATH = './model_6_epoch.pt'
# The checkpoint is a whole pickled module, not a state_dict:
# - map_location lets a GPU-saved model load on a CPU-only runtime;
# - weights_only=False is required by recent PyTorch releases, whose default
#   refuses to unpickle arbitrary objects.
# Unpickling executes code from the file, so only load checkpoints you created yourself.
model = torch.load(MODEL_PATH, map_location=device, weights_only=False)
# Inference mode (disables dropout); as the last expression this also displays the model.
model.eval()

In [None]:
import matplotlib.pyplot as plt
import torch.utils.data as data
from torch.autograd import Variable
import numpy as np
count = 0
EVAL_BATCH = 1
# Order does not affect accuracy, so iterate deterministically.
eval_loader = data.DataLoader(valid_ds, batch_size=EVAL_BATCH, shuffle=False, num_workers=4)

# Inference mode: no dropout.
model.eval()

# Disable grad
with torch.no_grad():
    # Iterate the loader itself so every validation sample is scored exactly
    # once. Calling next(iter(loader)) inside the loop would rebuild a shuffled
    # iterator each time, sampling with replacement and respawning workers.
    for i, (inputs, target) in enumerate(eval_loader):
        # Single image with the channel axis moved last, as passed to the
        # feature extractor elsewhere in this notebook.
        inputs = inputs[0].permute(1, 2, 0)
        inputs = torch.tensor(np.stack(feature_extractor(inputs)['pixel_values'], axis=0))

        # Send to appropriate computing device
        inputs = inputs.to(device)
        target = target.to(device)

        # Generate prediction
        prediction, loss = model(inputs, target)

        # Comparing class indices is equivalent to comparing class names.
        if prediction.argmax(1).item() == target.item():
            count += 1

        # Periodic progress report (running correct count, sample index).
        if i % 500 == 0 or i == len(eval_loader) - 1:
            print(count, i)

print(count/len(eval_loader))

In [None]:
import matplotlib.pyplot as plt
import torch.utils.data as data
from torch.autograd import Variable
import numpy as np
# Rebuild the single-sample validation loader and report its size next to the
# dataset size; with a batch size of 1 the two numbers should match.
EVAL_BATCH = 1
count = 0
eval_loader = data.DataLoader(
    dataset=valid_ds,
    batch_size=EVAL_BATCH,
    shuffle=True,
    num_workers=4,
)
for size in (len(eval_loader), len(valid_ds)):
    print(size)

In [None]:
# Total minus total * accuracy, i.e. the number of misclassified samples.
# NOTE(review): presumably 5704 is the validation-set size and 0.9530154277 the
# accuracy printed by the evaluation cell above — confirm.
5704-5704*.9530154277

In [None]:
# Percentage expected from uniform random guessing over 38 classes
# (presumably the chance baseline for the accuracy above).
100/38