In [3]:
#Imports

import os
import torch
import wandb
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
from datasets import Dataset
from transformers import ViTFeatureExtractor, ViTForImageClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np




In [6]:
# Configuration
TRAIN_ROOT = r'C:\Skola\D7047e\cross_out_dataset\train\images'  # contains train/, val/, test/
VAL_ROOT = r'C:\Skola\D7047e\cross_out_dataset\val\images'
TEST_ROOT = r'C:\Skola\D7047e\cross_out_dataset\test\images'
MIN_SIDE_LENGTH = 30
BATCH_SIZE = 32
NUM_EPOCHS = 5
PROJECT_NAME = "Group_19_handwritten-text-classification_ViT"
MODEL_NAME = "google/vit-base-patch16-224-in21k"
LABELS = ["CLEAN", "CROSS", "DIAGONAL", "DOUBLE_LINE", "SCRATCH", "SINGLE_LINE", "WAVE", "ZIG_ZAG"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

wandb.init(project=PROJECT_NAME)

feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL_NAME)

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.




In [8]:
def load_split(split_dir, min_side):
    images, labels = [], []
    for label_idx, label in enumerate(LABELS):
        folder = os.path.join(split_dir, label)
        for fname in tqdm(os.listdir(folder), desc=f"Loading {split_dir} - {label}"):
            if not fname.endswith(".png"):
                continue
            path = os.path.join(folder, fname)
            try:
                with Image.open(path) as img:
                    if img.width < min_side or img.height < min_side:
                        continue
                    images.append(path)
                    labels.append(label_idx)
            except:
                continue
    return Dataset.from_dict({"image_path": images, "label": labels})

train_dataset = load_split(os.path.join(TRAIN_ROOT, "train"), MIN_SIDE_LENGTH)
val_dataset = load_split(os.path.join(VAL_ROOT, "val"), MIN_SIDE_LENGTH)
test_dataset = load_split(os.path.join(TEST_ROOT, "test"), MIN_SIDE_LENGTH)


FileNotFoundError: [WinError 3] Det går inte att hitta sökvägen: 'C:\\Skola\\D7047e\\cross_out_dataset\\train\\images\\train\\CLEAN'

In [None]:
def transform(example):
    image = Image.open(example["image_path"]).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt")
    return {"pixel_values": inputs["pixel_values"][0], "label": example["label"]}

train_dataset = train_dataset.map(transform)
val_dataset = val_dataset.map(transform)
test_dataset = test_dataset.map(transform)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

model = ViTForImageClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label={i: l for i, l in enumerate(LABELS)},
    label2id={l: i for i, l in enumerate(LABELS)},
).to(device)

In [None]:
training_args = TrainingArguments(
    output_dir="./vit-handwritten",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    num_train_epochs=NUM_EPOCHS,
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)


In [None]:

trainer.train()


In [None]:
print("Evaluating on Test Set:")
trainer.evaluate(eval_dataset=test_dataset)