In [1]:
import sys 
sys.path.append("/kaggle/input/einops/einops-0.7.0") 
import einops
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from PIL import Image
import os
from torchvision.datasets.folder import default_loader

In [2]:
def pair(t):
    if isinstance(t, tuple):
        return t
    return t, t

class Mish(nn.Module):
    def forward(self, x):
        return x * torch.tanh(F.softplus(x))

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            Mish(),  # Using Mish activation function as defined
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x

class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)


In [4]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    
])

dataset = datasets.ImageFolder(root='/kaggle/input/smart-watc/training', transform=transform)

batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the ViT model
model = ViT(
    image_size=224,
    patch_size=16,
    num_classes=2,  # Assuming 2 classes: defective and normal
    dim=512,
    depth=6,
    heads=8,
    mlp_dim=1024,
    pool='cls',
    channels=3,
    dim_head=64,
    dropout=0.1,
    emb_dropout=0.1
)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 15
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for images, labels in data_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Print epoch loss
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(data_loader):.4f}')
torch.save(model.state_dict(), 'vision_transformer.pth')

Epoch [1/15], Loss: 0.7554
Epoch [2/15], Loss: 0.6713
Epoch [3/15], Loss: 0.6679
Epoch [4/15], Loss: 0.6766
Epoch [5/15], Loss: 0.6518
Epoch [6/15], Loss: 0.6460
Epoch [7/15], Loss: 0.6197
Epoch [8/15], Loss: 0.6312
Epoch [9/15], Loss: 0.6073
Epoch [10/15], Loss: 0.5860
Epoch [11/15], Loss: 0.6072
Epoch [12/15], Loss: 0.5864
Epoch [13/15], Loss: 0.5551
Epoch [14/15], Loss: 0.5484
Epoch [15/15], Loss: 0.5356


In [17]:
import torch
from torchvision import transforms
from PIL import Image
import os

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_normal = "/kaggle/input/test-acc/Test/normal"
test_defect = "/kaggle/input/test-acc/Test/defective"

model = ViT(
    image_size=224,
    patch_size=16,
    num_classes=2, 
    dim=512,
    depth=6,
    heads=8,
    mlp_dim=1024,
    pool='cls',
    channels=3,
    dim_head=64,
    dropout=0.1,
    emb_dropout=0.1
)
model.load_state_dict(torch.load('vision_transformer.pth'))
model.eval()

def test_model(model, test_dir):
    correct = 0
    total = 0
    for image_file in os.listdir(test_dir):
        image_path = os.path.join(test_dir, image_file)
        image = Image.open(image_path).convert("RGB")
        input_image = transform(image).unsqueeze(0)  # Add batch dimension
        
        with torch.no_grad():
            output = model(input_image)
            probabilities = torch.softmax(output, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            
            ground_truth_label = "defective" if "defective" in image_file else "normal"
            l
            if (predicted_class == 0 and ground_truth_label == "defective") or \
               (predicted_class == 1 and ground_truth_label == "normal"):
                correct += 1
            
            total += 1
    
    accuracy = correct / total * 100
    return accuracy

accnormal = test_model(model, test_normal)
accdefect = test_model(model, test_defect)
accuracy = (accnormal+accdefect)/2
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 94.00%
