In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# import gdown

# url = 'https://drive.google.com/file/d/1zNFXjlsSFuUlfsB6lHd_ybe224qmHCUW/view?usp=sharing'
# output_path = '/'
# gdown.download(url, output_path, quiet=False,fuzzy=True)

In [3]:
# import zipfile
# with zipfile.ZipFile('/notebooks/Color.zip', 'r') as zip_ref:
#     zip_ref.extractall('/notebooks/Transformation')

In [4]:
class Params:
    def __init__(self):
        self.batch_size = 64
        self.name = "resnet18_color"
        self.lr = 0.1
        self.workers = 4
        self.betha_1 = 0.9
        self.betha_2 = 0.999
        self.epsilon = 1e-7
        self.weight_decay = 1e-4
        self.lr_step_size = 30
        self.lr_gamma = 0.1
        self.total_epochs = 500

    def __repr__(self):
        return str(self.__dict__)
    
    def __eq__(self, other):
        return self.__dict__ == other.__dict__

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using {device} device")

params = Params()
params, params.batch_size

Using cuda device


({'batch_size': 64, 'name': 'resnet18_color', 'lr': 0.1, 'workers': 4, 'betha_1': 0.9, 'betha_2': 0.999, 'epsilon': 1e-07, 'weight_decay': 0.0001, 'lr_step_size': 30, 'lr_gamma': 0.1, 'total_epochs': 500},
 64)

In [5]:
def show_image(image, label):
    image = image.permute(1, 2, 0)
    plt.imshow(image.squeeze())
    plt.title(f'Label: {label}')
    plt.show()

In [6]:
import os
import shutil

## to get paths and names of each image

def images_get_paths(path):
    paths = {}
    names = {}
    rel_paths = ''
    for file in os.listdir(path):
        if '.' not in file:
            for img in os.listdir(path + '/' + file):
                rel_paths = path + '/' + file + '/' + os.path.relpath(img)
                if file not in paths.keys():
                    paths[file] = [rel_paths]
                    names[file] = [os.path.relpath(img)]
                else:
                    paths[file].append(rel_paths)
                    names[file].append(os.path.relpath(img))
    return paths, names

## to organizes files from the given root path into a new structured directory.

def organize_files(path):
    new_root = os.path.join("Dataset", os.path.basename(path) + "_organized")
    os.makedirs(new_root, mode=0o777, exist_ok=True)
    files_paths, files_names = images_get_paths(path)

    for month in files_paths.keys():
        month_folder = os.path.join(new_root, month)
        os.makedirs(month_folder, mode=0o777, exist_ok=True)
        
        for name, file_path in zip(files_names[month], files_paths[month]):
            category = name.split("_")[0]
            category_folder = os.path.join(month_folder, category)
            os.makedirs(category_folder, mode=0o777, exist_ok=True)
            shutil.move(file_path, os.path.join(category_folder, name))
            print(f"Moved: {file_path} -> {category_folder}")

In [7]:
## loader for different datasets
import random
from PIL import Image
import numpy as np
    
class BGR2RGB:
    def __call__(self, img):

        img = np.array(img)
        img = img[..., ::-1]
        img = Image.fromarray(img)
        return img

def Loader_train(root_folder):
    train_transformation = transforms.Compose([
            BGR2RGB(),
            transforms.ToTensor(),
            transforms.RandomVerticalFlip(0.5),
            transforms.RandomHorizontalFlip(0.5),
            transforms.RandomRotation(degrees=(-90,90)),
            transforms.Normalize(mean=[0.485, 0.485, 0.406], std=[0.229, 0.224, 0.225])
        ])
    
    train_dataset = torchvision.datasets.ImageFolder(
        root = root_folder,
        transform = train_transformation
    )
    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=params.batch_size,
        sampler=train_sampler,
        num_workers = params.workers,
        pin_memory=True,
    )
    return train_loader

In [8]:
## Validation loader
val_transformation = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize(size=256, antialias=True),
        transforms.CenterCrop(224),
        transforms.Normalize(mean=[0.485, 0.485, 0.406], std=[0.229, 0.224, 0.225])
    ])
val_dataset = torchvision.datasets.ImageFolder(
    root='imagenet-mini/val',
    transform=val_transformation
)

val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=64,
    num_workers=params.workers,
    shuffle=False,
    pin_memory=True
)

In [9]:
from math import sqrt
def train(dataloader, model, loss_fn, optimizer, epoch, writer):
    size = len(dataloader.dataset)
    model.train()
    start0 = time.time()
    start = time.time()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        batch_size = len(X)
        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * batch_size
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}], {(current/size * 100):>4f}%")
            step = epoch * size + current
            writer.add_scalar('training loss',
                            loss,
                            step)
            new_start = time.time()
            delta = new_start - start
            start = new_start
            if batch != 0:
                print("Done in ", delta, " seconds")
                remaining_steps = size - current
                speed = 100 * batch_size / delta
                remaining_time = remaining_steps / speed
                print("Remaining time (seconds): ", remaining_time)
        optimizer.zero_grad()
    print("Entire epoch done in ", time.time() - start0, " seconds")

In [10]:
def test(dataloader, model, loss_fn, epoch, writer, train_dataloader, calc_acc5=False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct, correct_top5 = 0, 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            if calc_acc5:
                _, pred_top5 = pred.topk(5, 1, largest=True, sorted=True)
                correct_top5 += pred_top5.eq(y.view(-1, 1).expand_as(pred_top5)).sum().item()
    test_loss /= num_batches
    step = epoch * len(train_dataloader.dataset)
    if writer != None:
        writer.add_scalar('test loss',
                            test_loss,
                            step)
    correct /= size
    correct_top5 /= size
    if writer != None:
        writer.add_scalar('test accuracy',
                            100*correct,
                            step)
        if calc_acc5:
            writer.add_scalar('test accuracy5',
                            100*correct_top5,
                            step)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    if calc_acc5:
        print(f"Test Error: \n Accuracy-5: {(100*correct_top5):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct

In [11]:
model = torchvision.models.resnet18()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = params.lr, betas = (params.betha_1, params.betha_2),
                            eps = params.epsilon, weight_decay = params.weight_decay)

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=params.lr_step_size, gamma=params.lr_gamma)

In [12]:
model = model.to(device)
resume_training = True

In [13]:
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path

start_dataset_idx = 1
start_epoch = 1
early_stopping_patience = 20
no_improvement_count = 0
best_val_accuracy = 0

checkpoint_path = os.path.join("checkpoints", params.name, f"checkpoint.pth")

if resume_training and os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
    start_epoch = checkpoint["epoch"] + 1
    start_dataset_idx = checkpoint["dataset_idx"]
    best_val_accuracy = checkpoint.get("best_val_accuracy", float('inf'))
    no_improvement_count = checkpoint.get("no_improvement_count", 0)
    assert params == checkpoint["params"]

Path(os.path.join("checkpoints", params.name)).mkdir(parents=True, exist_ok=True)
writer = SummaryWriter('runs/' + params.name)

dataset_root = 'Color'
dataset_folders = [os.path.join(dataset_root, f"color_{i}_months") for i in range(0, 13)]

2024-12-22 17:43:43.972032: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-22 17:43:43.972139: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-22 17:43:43.973297: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-22 17:43:43.980129: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
for dataset_idx, dataset_folder in enumerate(dataset_folders, start=1):
    if dataset_idx < start_dataset_idx:
        continue

    print(f"Training on dataset {dataset_idx} at {dataset_folder}")
    train_loader = Loader_train(dataset_folder)

    for epoch in range(start_epoch if dataset_idx == start_dataset_idx else 1, params.total_epochs):
        train(train_loader, model, loss_fn, optimizer, epoch, writer)
        
        val_accuracy = test(val_loader, model, loss_fn, epoch, writer, train_dataloader=train_loader, calc_acc5=True)
        
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            no_improvement_count = 0
            
            checkpoint = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
            "epoch": epoch,
            "dataset_idx": dataset_idx,
            "params": params,
            "best_val_accuracy": best_val_accuracy,
            "no_improvement_count": no_improvement_count,
            }
            torch.save(checkpoint, checkpoint_path)
            print(f"Checkpoint successfully saved at {checkpoint_path}")
            print(f"New best validation accuracy: {val_accuracy:.4f}")
        else:
            no_improvement_count += 1
            print(f"No improvement for {no_improvement_count} epochs.")

        if no_improvement_count >= early_stopping_patience:
            print(f"Early stopping triggered after {epoch} epochs with no improvement.")
            break
        
    best_val_accuracy = 0
    start_epoch = 1  
    no_improvement_count = 0
    print(f"Finished training on dataset {dataset_idx}.")

writer.close()

Training on dataset 3 at Color/color_2_months
loss: 6.972977  [   64/34745], 0.184199%
loss: 6.907855  [ 6464/34745], 18.604116%
Done in  13.20580792427063  seconds
Remaining time (seconds):  58.35522717285901
loss: 7.197613  [12864/34745], 37.024032%
Done in  14.165594339370728  seconds
Remaining time (seconds):  48.430839021839205
loss: 6.934303  [19264/34745], 55.443949%
Done in  15.345356702804565  seconds
Remaining time (seconds):  37.11897923689336
loss: 6.901526  [25664/34745], 73.863865%
Done in  13.799885988235474  seconds
Remaining time (seconds):  19.58074447799474
loss: 6.943606  [32064/34745], 92.283782%
Done in  13.480324268341064  seconds
Remaining time (seconds):  5.646992088034749
Entire epoch done in  79.48419833183289  seconds
Test Error: 
 Accuracy: 0.1%, Avg loss: 7.068277 

Test Error: 
 Accuracy-5: 0.5%, Avg loss: 7.068277 

No improvement for 1 epochs.
loss: 7.076455  [   64/34745], 0.184199%
loss: 6.886782  [ 6464/34745], 18.604116%
Done in  13.52879810333252  

In [14]:
test_accuracy = test(val_loader, model, loss_fn, epoch, writer, train_dataloader=train_loader, calc_acc5=True)
print(test_accuracy)

Test Error: 
 Accuracy: 0.3%, Avg loss: 21.305120 

Test Error: 
 Accuracy-5: 0.7%, Avg loss: 21.305120 

0.002803976548559776


In [None]:
## Standard Training
# for epoch in range(start_epoch, 100):
#     train(train_loader, model, loss_fn, optimizer, epoch=epoch, writer=writer)
#     checkpoint = {
#         "model": model.state_dict(),
#         "optimizer": optimizer.state_dict(),
#         "lr_scheduler": lr_scheduler.state_dict(),
#         "epoch": epoch,
#         "params": params
#     }
#     torch.save(checkpoint, os.path.join("checkpoints", params.name, f"model_{epoch}.pth"))
#     torch.save(checkpoint, os.path.join("checkpoints", params.name, f"checkpoint.pth"))
#     lr_scheduler.step()
#     test(val_loader, model, loss_fn, epoch + 1, writer, train_dataloader=train_loader, calc_acc5=True)