In [1]:
!pip install transformers datasets torch torchvision scikit-learn pillow

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import tensorflow as tf
from transformers import AutoFeatureExtractor, ViTForImageClassification, TrainingArguments, Trainer
from datasets import Dataset
from PIL import Image
import numpy as np
import os
import zipfile
import shutil
import matplotlib.pyplot as plt

In [3]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

# Path to dataset zip file
zip_file_path = '/content/drive/MyDrive/DL_Final_Project.zip'  # Update path if needed
extracted_path = '/content/flower_dataset'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

print("Dataset extracted to:", extracted_path)
print("Extracted dataset structure:", os.listdir(extracted_path))


Mounted at /content/drive
Dataset extracted to: /content/flower_dataset
Extracted dataset structure: ['Lotus', 'Tulip', 'Orchid', 'Lily', 'Sunflower']


In [4]:
from PIL import Image
import os

# Path to converted dataset
converted_dataset_path = '/content/converted_flower_dataset'

# Create the converted dataset directory
os.makedirs(converted_dataset_path, exist_ok=True)

# Convert all images to PNG
for class_name in os.listdir(extracted_path):
    class_dir = os.path.join(extracted_path, class_name)
    if os.path.isdir(class_dir):
        # Create class directory in converted dataset path
        converted_class_dir = os.path.join(converted_dataset_path, class_name)
        os.makedirs(converted_class_dir, exist_ok=True)

        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)
            try:
                with Image.open(img_path) as img:
                    # Convert image to RGB mode and save as PNG
                    img = img.convert("RGB")
                    new_img_name = os.path.splitext(img_name)[0] + ".png"
                    img.save(os.path.join(converted_class_dir, new_img_name), "PNG")
            except Exception as e:
                print(f"Error converting {img_path}: {e}")

print("All images converted to PNG format.")


All images converted to PNG format.


In [5]:
import shutil

output_base_path = '/content/split_flower_dataset'

# Create directories for train, val, and test splits
for split in ['train', 'val', 'test']:
    os.makedirs(os.path.join(output_base_path, split), exist_ok=True)

# Split dataset
for class_name in os.listdir(converted_dataset_path):
    class_dir = os.path.join(converted_dataset_path, class_name)
    if not os.path.isdir(class_dir):
        continue

    images = sorted([img for img in os.listdir(class_dir) if os.path.isfile(os.path.join(class_dir, img))])

    # Select splits
    train_images = images[:140]
    val_images = images[140:170]
    test_images = images[170:200]

    # Copy files to respective directories
    for split, split_images in zip(['train', 'val', 'test'], [train_images, val_images, test_images]):
        split_class_dir = os.path.join(output_base_path, split, class_name)
        os.makedirs(split_class_dir, exist_ok=True)
        for img in split_images:
            shutil.copy(os.path.join(class_dir, img), os.path.join(split_class_dir, img))

print("Dataset split into train, val, and test sets successfully!")


Dataset split into train, val, and test sets successfully!


In [6]:
raw_train_dataset = tf.keras.utils.image_dataset_from_directory(
    os.path.join(output_base_path, "train"),
    image_size=(224, 224),
    batch_size=32
)

class_names = raw_train_dataset.class_names
print("Classes:", class_names)

Found 700 files belonging to 5 classes.
Classes: ['Lily', 'Lotus', 'Orchid', 'Sunflower', 'Tulip']


In [7]:
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.2),
    tf.keras.layers.RandomZoom(0.2),
])

def augment_image(image, label):
    return data_augmentation(image), label

In [8]:
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]



In [9]:
def preprocess_for_hf(image, label):
    image = tf.cast(image, tf.uint8).numpy()
    processed = feature_extractor(images=image, return_tensors="pt")
    return processed["pixel_values"].squeeze(0), label

In [10]:
def preprocess_dataset(tf_dataset, augment=False):
    images, labels = [], []
    for batch_images, batch_labels in tf_dataset:
        for image, label in zip(batch_images, batch_labels):
            processed_image, label = preprocess_for_hf(image, label)
            images.append(processed_image.numpy())
            labels.append(label.numpy())
    return Dataset.from_dict({"pixel_values": images, "label": labels})


In [11]:
train_data = preprocess_dataset(raw_train_dataset)
val_data = preprocess_dataset(tf.keras.utils.image_dataset_from_directory(
    os.path.join(output_base_path, "val"),
    image_size=(224, 224),
    batch_size=32
))
test_data = preprocess_dataset(tf.keras.utils.image_dataset_from_directory(
    os.path.join(output_base_path, "test"),
    image_size=(224, 224),
    batch_size=32
))


Found 150 files belonging to 5 classes.
Found 150 files belonging to 5 classes.


In [12]:
model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(class_names),
    id2label={i: label for i, label in enumerate(class_names)},
    label2id={label: i for i, label in enumerate(class_names)}
)

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True
)




In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=None,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [16]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8852,0.746124,0.94
2,0.3559,0.361198,0.953333
3,0.1714,0.277091,0.953333
4,0.1302,0.248946,0.953333
5,0.1225,0.243486,0.953333


TrainOutput(global_step=220, training_loss=0.41487108956683766, metrics={'train_runtime': 460.7177, 'train_samples_per_second': 7.597, 'train_steps_per_second': 0.478, 'total_flos': 2.71229256281088e+17, 'train_loss': 0.41487108956683766, 'epoch': 5.0})

In [17]:
metrics = trainer.evaluate(test_data)
print("Test Metrics:", metrics)

model.save_pretrained("./fine_tuned_vit_model")
print("Model fine-tuned and saved successfully!")

Test Metrics: {'eval_loss': 0.29499492049217224, 'eval_accuracy': 0.94, 'eval_runtime': 13.5501, 'eval_samples_per_second': 11.07, 'eval_steps_per_second': 0.738, 'epoch': 5.0}
Model fine-tuned and saved successfully!


In [18]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def get_predictions(trainer, test_dataset):
    predictions = trainer.predict(test_dataset)
    logits = predictions.predictions
    predicted_labels = np.argmax(logits, axis=1)
    true_labels = predictions.label_ids
    return true_labels, predicted_labels

true_labels, predicted_labels = get_predictions(trainer, test_data)

cm = confusion_matrix(true_labels, predicted_labels, labels=range(len(class_names)))
print(cm)

[[23  6  1  0  0]
 [ 0 30  0  0  0]
 [ 0  0 30  0  0]
 [ 0  0  0 30  0]
 [ 2  0  0  0 28]]
