<a href="https://colab.research.google.com/github/Shakilkhan24/Playground_DL/blob/main/fine_tune_normal_pytorch_trainer_api_also.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# hugging face fine tuning a model
!pip install transformers



```
# Fine-tuning a ViT image classifier on MNIST with plain PyTorch
```



In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor

# Step 1: MNIST training split, preprocessed for the vision transformer
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),                # bring images to the ViT input size
        transforms.Grayscale(num_output_channels=3),  # single channel -> three channels
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
)

# Step 2: Pretrained ViT backbone with a 10-way classification head (one per digit)
model_name = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name, num_labels=10)

# Step 3: Optimizer, loss, and target device
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss()
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Step 4: Fine-tuning loop; tracks mean batch loss and training accuracy per epoch
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass: the loss is computed here from the raw logits.
        outputs = model(images)
        loss = criterion(outputs.logits, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics for the epoch summary.
        running_loss += loss.item()
        predicted = outputs.logits.max(dim=1).indices
        total += labels.shape[0]
        correct += int((predicted == labels).sum())

    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

# Step 5: Evaluation (optional)
# No validation pass is run here; add one if a held-out split is available.

# Step 6: Persist only the fine-tuned weights (state dict)
torch.save(model.state_dict(), 'fine_tuned_vision_transformer_mnist.pth')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




```
# The training loop can be wrapped in tqdm to show a progress bar.
# It takes very little code; a minimal example follows.
```



In [7]:
from tqdm import tqdm
import time

In [12]:
# Progress bar over ten short steps. Messages go through tqdm.write() instead
# of print() so they are emitted without overlapping the progress bar.
for i in tqdm(range(10), desc="Processing"):
    tqdm.write(f'{i: } shakil')
    time.sleep(0.1)


Processing:  10%|█         | 1/10 [00:00<00:00,  9.98it/s]

 0 shakil
 1 shakil


Processing:  30%|███       | 3/10 [00:00<00:00,  9.58it/s]

 2 shakil
 3 shakil


Processing:  60%|██████    | 6/10 [00:00<00:00,  9.60it/s]

 4 shakil
 5 shakil


Processing:  80%|████████  | 8/10 [00:00<00:00,  9.62it/s]

 6 shakil
 7 shakil


Processing: 100%|██████████| 10/10 [00:01<00:00,  9.65it/s]

 8 shakil
 9 shakil


Processing: 100%|██████████| 10/10 [00:01<00:00,  9.52it/s]




```
# Same goal, but using Hugging Face's higher-level training API:
# TrainingArguments and Trainer,
# which provide callbacks, checkpointing, logging and more.
```



In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from transformers import Trainer, TrainingArguments, ViTForImageClassification, ViTFeatureExtractor

# Step 1: Load the MNIST dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize MNIST images to fit the vision transformer input size
    transforms.Grayscale(num_output_channels=3),  # Convert grayscale to RGB
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
# Per-epoch evaluation is requested below, so the Trainer needs a dataset to
# evaluate on; use the MNIST test split with the same preprocessing.
eval_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)


def collate_mnist(batch):
    """Collate (image, label) tuples into keyword arguments for the model.

    The torchvision dataset yields tuples, while the Trainer passes each batch
    to the model as keyword arguments, so the batch is returned as a dict with
    the keys ``pixel_values`` and ``labels``.

    Args:
        batch: list of ``(image_tensor, int_label)`` pairs from the dataset.

    Returns:
        dict with stacked ``pixel_values`` and a ``labels`` tensor.
    """
    images, labels = zip(*batch)
    return {
        'pixel_values': torch.stack(images),
        'labels': torch.tensor(labels),
    }


def compute_accuracy(eval_pred):
    """Return classification accuracy from the logits and labels of an evaluation pass."""
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


# Step 2: Prepare the Vision Transformer model
model_name = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name, num_labels=10)  # MNIST has 10 classes

# Step 3: Define TrainingArguments
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    output_dir='./results',
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # switch the keyword if the installed version rejects this name.
    evaluation_strategy="epoch",
)

# Step 4: Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,         # required because evaluation runs every epoch
    data_collator=collate_mnist,       # converts tuple samples into model keyword inputs
    compute_metrics=compute_accuracy,  # report accuracy alongside the evaluation loss
)

# Step 5: Fine-tuning
trainer.train()

# Step 6: Save the Fine-tuned Model
trainer.save_model('fine_tuned_vision_transformer_mnist')
