In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install transformers



In [3]:
import torch
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np
import torchvision
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize CIFAR-10 images to 224x224 to match DeiT input size
    transforms.ToTensor(),
    # Normalize using ImageNet mean and std
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Download and load CIFAR-10 training dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)

# Download and load CIFAR-10 test dataset
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 48243204.35it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [6]:
from transformers import DeiTForImageClassificationWithTeacher

baseline_model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
baseline_model = baseline_model.to(device)

config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/349M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [19]:
baseline_model.distillation_classifier = torch.nn.Linear(in_features=baseline_model.distillation_classifier.in_features, out_features=10)
baseline_model.cls_classifier = torch.nn.Linear(in_features=baseline_model.cls_classifier.in_features, out_features=10)

In [20]:
# After modifying the model's layers, ensure they are on the correct device
baseline_model.distillation_classifier.to(device)
baseline_model.cls_classifier.to(device)

# Before starting the training loop, you can add a check like this
for name, param in baseline_model.named_parameters():
    print(f"{name} is on {param.device}")

# Ensure inputs are on the correct device right before the forward pass
inputs, targets = inputs.to(device), targets.to(device)

# Then proceed with your training loop


deit.embeddings.cls_token is on cuda:0
deit.embeddings.distillation_token is on cuda:0
deit.embeddings.position_embeddings is on cuda:0
deit.embeddings.patch_embeddings.projection.weight is on cuda:0
deit.embeddings.patch_embeddings.projection.bias is on cuda:0
deit.encoder.layer.0.attention.attention.query.weight is on cuda:0
deit.encoder.layer.0.attention.attention.query.bias is on cuda:0
deit.encoder.layer.0.attention.attention.key.weight is on cuda:0
deit.encoder.layer.0.attention.attention.key.bias is on cuda:0
deit.encoder.layer.0.attention.attention.value.weight is on cuda:0
deit.encoder.layer.0.attention.attention.value.bias is on cuda:0
deit.encoder.layer.0.attention.output.dense.weight is on cuda:0
deit.encoder.layer.0.attention.output.dense.bias is on cuda:0
deit.encoder.layer.0.intermediate.dense.weight is on cuda:0
deit.encoder.layer.0.intermediate.dense.bias is on cuda:0
deit.encoder.layer.0.output.dense.weight is on cuda:0
deit.encoder.layer.0.output.dense.bias is on cud

NameError: name 'inputs' is not defined

In [21]:
from transformers import DeiTForImageClassificationWithTeacher
import torch.nn.functional as F

def train_one_epoch(epoch, model, train_loader, optimizer, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader))
    for batch_idx, (inputs, targets) in progress_bar:
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = F.cross_entropy(outputs.logits, targets)  # Corrected to use outputs.logits
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.logits.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar.set_description(f'Epoch {epoch} Loss: {running_loss/(batch_idx+1):.3f} Acc: {100.*correct/total:.3f}%')
    
    return running_loss / len(train_loader), 100.*correct / total


In [22]:
def validate(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.logits.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    acc = 100.*correct / total
    print(f'Validation Accuracy: {acc:.3f}%')
    return acc

In [23]:
optimizer = optim.Adam(baseline_model.parameters(), lr=0.001)
max_epoch = 16

In [25]:
for epoch in range(1,17):
    print(f"Epoch {epoch}: Normal Training")
    train_loss, train_acc = train_one_epoch(epoch, baseline_model, train_loader, optimizer, device)
    print(f"Training Loss: {train_loss}, Training Accuracy: {train_acc}")
    val_acc = validate(baseline_model, test_loader, device)
    print(f"Validation Accuracy: {val_acc}%")
    

Epoch 1: Normal Training


Epoch 1 Loss: 1.245 Acc: 54.814%: 100%|██████████| 1563/1563 [15:02<00:00,  1.73it/s]

Training Loss: 1.2451215557234454, Training Accuracy: 54.814





Validation Accuracy: 59.460%
Validation Accuracy: 59.46%
Epoch 2: Normal Training


Epoch 2 Loss: 1.071 Acc: 61.428%: 100%|██████████| 1563/1563 [15:03<00:00,  1.73it/s]

Training Loss: 1.0713457645549274, Training Accuracy: 61.428





Validation Accuracy: 60.750%
Validation Accuracy: 60.75%
Epoch 3: Normal Training


Epoch 3 Loss: 0.940 Acc: 66.582%: 100%|██████████| 1563/1563 [15:03<00:00,  1.73it/s]

Training Loss: 0.9398508183252942, Training Accuracy: 66.582





Validation Accuracy: 65.320%
Validation Accuracy: 65.32%
Epoch 4: Normal Training


Epoch 4 Loss: 0.840 Acc: 70.282%: 100%|██████████| 1563/1563 [15:04<00:00,  1.73it/s]

Training Loss: 0.8398302430459802, Training Accuracy: 70.282





Validation Accuracy: 69.100%
Validation Accuracy: 69.1%
Epoch 5: Normal Training


Epoch 5 Loss: 0.758 Acc: 73.138%: 100%|██████████| 1563/1563 [15:04<00:00,  1.73it/s]

Training Loss: 0.7582456930966539, Training Accuracy: 73.138





Validation Accuracy: 70.270%
Validation Accuracy: 70.27%
Epoch 6: Normal Training


Epoch 6 Loss: 0.691 Acc: 75.490%: 100%|██████████| 1563/1563 [15:04<00:00,  1.73it/s]

Training Loss: 0.6910398420392132, Training Accuracy: 75.49





Validation Accuracy: 70.560%
Validation Accuracy: 70.56%
Epoch 7: Normal Training


Epoch 7 Loss: 0.628 Acc: 77.612%: 100%|██████████| 1563/1563 [15:03<00:00,  1.73it/s]

Training Loss: 0.6278826528474908, Training Accuracy: 77.612





Validation Accuracy: 72.160%
Validation Accuracy: 72.16%
Epoch 8: Normal Training


Epoch 8 Loss: 0.570 Acc: 79.878%: 100%|██████████| 1563/1563 [15:02<00:00,  1.73it/s]

Training Loss: 0.5697052857182534, Training Accuracy: 79.878





Validation Accuracy: 75.140%
Validation Accuracy: 75.14%
Epoch 9: Normal Training


Epoch 9 Loss: 0.523 Acc: 81.264%: 100%|██████████| 1563/1563 [15:02<00:00,  1.73it/s]

Training Loss: 0.5234740216592452, Training Accuracy: 81.264





Validation Accuracy: 71.020%
Validation Accuracy: 71.02%
Epoch 10: Normal Training


Epoch 10 Loss: 0.474 Acc: 83.006%: 100%|██████████| 1563/1563 [15:02<00:00,  1.73it/s]

Training Loss: 0.47366323667654076, Training Accuracy: 83.006





Validation Accuracy: 75.040%
Validation Accuracy: 75.04%
Epoch 11: Normal Training


Epoch 11 Loss: 0.425 Acc: 84.626%: 100%|██████████| 1563/1563 [15:02<00:00,  1.73it/s]

Training Loss: 0.42463889568138413, Training Accuracy: 84.626





Validation Accuracy: 74.520%
Validation Accuracy: 74.52%
Epoch 12: Normal Training


Epoch 12 Loss: 1.174 Acc: 57.652%: 100%|██████████| 1563/1563 [15:02<00:00,  1.73it/s]

Training Loss: 1.1741690588537044, Training Accuracy: 57.652





Validation Accuracy: 34.750%
Validation Accuracy: 34.75%
Epoch 13: Normal Training


Epoch 13 Loss: 1.468 Acc: 45.360%: 100%|██████████| 1563/1563 [15:03<00:00,  1.73it/s]

Training Loss: 1.467732390034908, Training Accuracy: 45.36





Validation Accuracy: 51.070%
Validation Accuracy: 51.07%
Epoch 14: Normal Training


Epoch 14 Loss: 1.225 Acc: 55.688%: 100%|██████████| 1563/1563 [15:03<00:00,  1.73it/s]

Training Loss: 1.2248903632697887, Training Accuracy: 55.688





Validation Accuracy: 57.390%
Validation Accuracy: 57.39%
Epoch 15: Normal Training


Epoch 15 Loss: 1.082 Acc: 60.900%: 100%|██████████| 1563/1563 [15:04<00:00,  1.73it/s]

Training Loss: 1.0820157960753218, Training Accuracy: 60.9





Validation Accuracy: 64.080%
Validation Accuracy: 64.08%
Epoch 16: Normal Training


Epoch 16 Loss: 0.927 Acc: 66.808%: 100%|██████████| 1563/1563 [15:03<00:00,  1.73it/s]

Training Loss: 0.9269010096082914, Training Accuracy: 66.808





Validation Accuracy: 67.830%
Validation Accuracy: 67.83%
