In [2]:
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
from tqdm.notebook import tqdm
from transformers import ViTForImageClassification, ViTFeatureExtractor

In [4]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
train_code_dir = "/home/bartek/Kod/PD/praca_dyplomowa/dane/resnet_dane/ready/train/code"  # Contains 'code' and 'non_code' subfolders
train_no_code_dir = "/home/bartek/Kod/PD/praca_dyplomowa/dane/resnet_dane/ready/train/no_code"  # Contains 'code' and 'non_code' subfolders
val_code_dir = "/home/bartek/Kod/PD/praca_dyplomowa/dane/resnet_dane/ready/val/code"      # Contains 'code' and 'non_code' subfolders
val_no_code_dir = "/home/bartek/Kod/PD/praca_dyplomowa/dane/resnet_dane/ready/val/no_code"      # Contains 'code' and 'non_code' subfolders

In [7]:
model_name = "google/vit-base-patch16-224"

In [8]:
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [13]:
# Load images from folders with existing structure
def load_dataset_from_folders(code_dir, no_code_dir):
    images = []
    labels = []
    
    # Load code images (label 1)
    for img_name in os.listdir(code_dir):
        if img_name.lower().endswith(('.png', '.jpg', '.jpeg')):
            images.append(os.path.join(code_dir, img_name))
            labels.append(1)
    
    # Load non-code images (label 0)
    for img_name in os.listdir(no_code_dir):
        if img_name.lower().endswith(('.png', '.jpg', '.jpeg')):
            images.append(os.path.join(no_code_dir, img_name))
            labels.append(0)
    
    return images, labels

In [14]:
# Load training and validation datasets
train_images, train_labels = load_dataset_from_folders(train_code_dir, train_no_code_dir)
val_images, val_labels = load_dataset_from_folders(val_code_dir, val_no_code_dir)

print(f"Training samples: {len(train_images)}")
print(f"Validation samples: {len(val_images)}")

Training samples: 9044
Validation samples: 2262


In [15]:
def preprocess_batch(image_paths, labels):
    images = []
    for img_path in image_paths:
        try:
            img = Image.open(img_path).convert("RGB")
            images.append(img)
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            continue
    
    # Process images using the ViT feature extractor
    inputs = feature_extractor(images=images, return_tensors="pt")
    inputs['labels'] = torch.tensor(labels[:len(images)])  # Match labels to successfully loaded images
    return inputs

In [16]:
def create_dataloader(image_paths, labels, batch_size=16, shuffle=True):
    indices = list(range(len(image_paths)))
    if shuffle:
        np.random.shuffle(indices)
    
    # Create mini-batches
    batches = []
    for i in range(0, len(indices), batch_size):
        batch_indices = indices[i:i + batch_size]
        batch_images = [image_paths[idx] for idx in batch_indices]
        batch_labels = [labels[idx] for idx in batch_indices]
        batches.append((batch_images, batch_labels))
    
    return batches

In [17]:
model = ViTForImageClassification.from_pretrained(
    model_name, 
    num_labels=2,
    ignore_mismatched_sizes=True  # This is the key fix
)
model.to(device)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [19]:
# Training function with early stopping
def train_model(model, train_dataloader, val_dataloader, epochs=5, patience=2):
    best_accuracy = 0
    patience_counter = 0
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0
        
        for batch_images, batch_labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            inputs = preprocess_batch(batch_images, batch_labels)
            
            # Skip empty batches
            if inputs['pixel_values'].size(0) == 0:
                continue
                
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs.loss
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
            # Calculate training accuracy
            _, predicted = torch.max(outputs.logits, 1)
            train_total += inputs['labels'].size(0)
            train_correct += (predicted == inputs['labels']).sum().item()
        
        avg_train_loss = train_loss / len(train_dataloader)
        train_accuracy = 100 * train_correct / train_total if train_total > 0 else 0
        
        # Validation phase
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch_images, batch_labels in tqdm(val_dataloader, desc="Validating"):
                inputs = preprocess_batch(batch_images, batch_labels)
                
                # Skip empty batches
                if inputs['pixel_values'].size(0) == 0:
                    continue
                    
                inputs = {k: v.to(device) for k, v in inputs.items()}
                
                outputs = model(**inputs)
                val_loss += outputs.loss.item()
                _, predicted = torch.max(outputs.logits, 1)
                labels = inputs['labels']
                
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        avg_val_loss = val_loss / len(val_dataloader)
        accuracy = 100 * correct / total if total > 0 else 0
        
        print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Val Loss: {avg_val_loss:.4f}, Val Acc: {accuracy:.2f}%")
        
        # Early stopping logic
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            patience_counter = 0
            # Save the best model
            torch.save(model.state_dict(), "vit_code_classifier.pth")
            print(f"Model saved with accuracy: {best_accuracy:.2f}%")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping after {epoch+1} epochs")
                break
    
    return model

In [20]:
# Create data loaders
batch_size = 16  # Reduce this if you encounter memory issues
train_dataloader = create_dataloader(train_images, train_labels, batch_size)
val_dataloader = create_dataloader(val_images, val_labels, batch_size, shuffle=False)

In [21]:
# Train the model (with increased epochs and early stopping patience)
trained_model = train_model(model, train_dataloader, val_dataloader, epochs=5, patience=2)

Epoch 1/5:   0%|          | 0/566 [00:00<?, ?it/s]

Validating:   0%|          | 0/142 [00:00<?, ?it/s]

Epoch 1 - Train Loss: 0.0240, Train Acc: 99.25%, Val Loss: 0.0251, Val Acc: 99.20%
Model saved with accuracy: 99.20%


Epoch 2/5:   0%|          | 0/566 [00:00<?, ?it/s]

Validating:   0%|          | 0/142 [00:00<?, ?it/s]

Epoch 2 - Train Loss: 0.0089, Train Acc: 99.67%, Val Loss: 0.0047, Val Acc: 99.91%
Model saved with accuracy: 99.91%


Epoch 3/5:   0%|          | 0/566 [00:00<?, ?it/s]

Validating:   0%|          | 0/142 [00:00<?, ?it/s]

Epoch 3 - Train Loss: 0.0059, Train Acc: 99.82%, Val Loss: 0.0009, Val Acc: 99.96%
Model saved with accuracy: 99.96%


Epoch 4/5:   0%|          | 0/566 [00:00<?, ?it/s]

Validating:   0%|          | 0/142 [00:00<?, ?it/s]

Epoch 4 - Train Loss: 0.0003, Train Acc: 99.99%, Val Loss: 0.0024, Val Acc: 99.91%


Epoch 5/5:   0%|          | 0/566 [00:00<?, ?it/s]

Validating:   0%|          | 0/142 [00:00<?, ?it/s]

Epoch 5 - Train Loss: 0.0000, Train Acc: 100.00%, Val Loss: 0.0029, Val Acc: 99.91%
Early stopping after 5 epochs


In [None]:
# Function for inference on a single image
def predict_image(model, image_path):
    model.eval()
    try:
        img = Image.open(image_path).convert("RGB")
        inputs = feature_extractor(images=[img], return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
            prediction = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][prediction].item()
        
        label = "Code" if prediction == 1 else "Not Code"
        return label, confidence
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return "Error", 0.0

In [26]:
model_path = "/home/bartek/Kod/PD/praca_dyplomowa/dane/modele/vit_code_classifier_full.pt"

In [None]:
# Method 2: Save the entire model
# This saves the architecture and weights together
def save_full_model(model, path):
    torch.save(model, path)
    print(f"Full model saved to {path}")

In [30]:
save_full_model(trained_model, model_path)

Full model saved to /home/bartek/Kod/PD/praca_dyplomowa/dane/modele/vit_code_classifier_full.pt
