# Imports

In [None]:
import os
import random
import pathlib
import pandas as pd

import json
from google.colab import userdata

import numpy as np
import torch
import torch.nn as nn
import cv2
import matplotlib.pyplot as plt
import torchvision
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms
import copy
from PIL import Image
import torch.nn as nn
from tqdm.auto import tqdm
import multiprocessing

from sklearn import metrics
import seaborn as sns

import time
import math

%matplotlib inline

# Download dataset

In [None]:
!pip install -q kaggle

In [None]:
# Write the Kaggle API token to ~/.kaggle/kaggle.json. The credentials are read
# from the Colab secret store, so nothing sensitive is stored in the notebook.
kaggle_dir = pathlib.Path.home() / '.kaggle'
kaggle_dir.mkdir(parents=True, exist_ok=True)  # unlike a bare `mkdir`, safe to re-run

api_token = {"username": userdata.get('username'),
             "key": userdata.get('key')}

kaggle_json = kaggle_dir / 'kaggle.json'
with open(kaggle_json, 'w') as file:
    json.dump(api_token, file)

# Restrict the token file to its owner (equivalent to `chmod 600`).
os.chmod(kaggle_json, 0o600)

In [None]:
# Download the "dogs-vs-cats" competition files with the Kaggle CLI.
!kaggle competitions download -c dogs-vs-cats

In [None]:
!unzip /content/dogs-vs-cats.zip

In [None]:
# Unpack the test images quietly (-q), overwriting files from earlier runs (-o).
!unzip -q -o test1.zip

In [None]:
# Unpack the training images quietly (-q), overwriting files from earlier runs (-o).
!unzip -q -o train.zip

In [None]:
# Sanity check: names inside the test folder that contain '.csv'.
[entry for entry in os.listdir('test1') if '.csv' in entry]

In [None]:
# Number of entries in the test folder (os.listdir already returns a list).
len(os.listdir('test1'))

In [None]:
# Number of entries in the training folder (os.listdir already returns a list).
len(os.listdir('train'))

In [None]:
# Load the sample submission file to inspect the expected submission format.
sample_submission = pd.read_csv('sampleSubmission.csv')

In [None]:
# Distribution of the values in the 'label' column of the sample submission.
sample_submission['label'].value_counts()

# Prepare dataset

In [None]:
def split_dataset(dataset_root, test_ratio=0.2, seed=None):
    """Split the images under ``dataset_root`` into train and held-out CSV lists.

    Every ``.jpg``/``.jpeg``/``.png`` file found recursively under
    ``dataset_root`` is assigned to the class ('dog' or 'cat') whose name occurs
    in its file name. Per class, ``int(n * test_ratio)`` files are drawn at
    random for the held-out list; the remaining files of that class go to the
    train list. Two files with the header ``label,img_name`` are written into
    ``dataset_root``: ``train.csv`` and ``val.csv``.

    Args:
        dataset_root: Directory that contains the images.
        test_ratio: Fraction of each class to hold out.
        seed: Optional seed for a private RNG that makes the split
            reproducible. With ``None`` the global ``random`` state is used.

    Returns:
        ``(num_classes, image_counter)`` where ``image_counter`` maps each
        class name to the number of images found for it, or ``None`` when
        ``dataset_root`` is not a directory.
    """
    out_train_fname = 'train.csv'
    out_test_fname = 'val.csv'
    class_names = ('dog', 'cat')
    image_suffixes = ('.jpg', '.jpeg', '.png')

    print("Running splitting dataset to Train and Test")
    print(f"test_ratio: {test_ratio}")
    print(f"dataset_root: {dataset_root}")
    print(f"out_train_fname: {out_train_fname}")
    print(f"out_test_fname: {out_test_fname}")

    cur_dir = pathlib.Path(dataset_root)

    if not cur_dir.is_dir():
        return

    # Only the bare file name is kept. Sorting makes the result independent of
    # the order in which the file system lists the directory.
    img_names = sorted(item.name for item in cur_dir.rglob("*")
                       if item.is_file()
                       and item.name.lower().endswith(image_suffixes))

    sampler = random if seed is None else random.Random(seed)

    image_counter = {}
    out_train_files = []
    out_test_files = []

    for class_name in class_names:
        instance_files = [name for name in img_names if class_name in name]
        image_counter[class_name] = len(instance_files)

        test_size = int(len(instance_files) * test_ratio)
        # A set keeps the membership test below O(1) per image.
        test_files = set(sampler.sample(instance_files, test_size))

        # Iterate over this class's files only: looping over every image here
        # would write each image under both labels and leak held-out images
        # of one class into the train list of the other.
        for img_name in instance_files:
            row = f'{class_name},{img_name}\n'
            if img_name in test_files:
                out_test_files.append(row)
            else:
                out_train_files.append(row)

    with open(cur_dir / out_train_fname, 'w') as f:
        f.write('label,img_name\n')
        f.writelines(out_train_files)
    print(f'Number of train images: {len(out_train_files)}')

    with open(cur_dir / out_test_fname, 'w') as f:
        f.write('label,img_name\n')
        f.writelines(out_test_files)
    print(f'Number of test images: {len(out_test_files)}')

    return len(image_counter), image_counter

In [None]:
# Write train.csv / val.csv into the training image folder, holding out 10% of each class.
dataset_root = '/content/train'
# split_dataset returns (number of classes, {class name: number of images}).
num_classes, image_counter = split_dataset(dataset_root, 0.1)
print(f"{num_classes=}\n{image_counter=}\n{len(image_counter)=}")

In [None]:
def create_test(dataset_root):
    """Write ``test.csv`` listing every image file found under ``dataset_root``.

    The file is created inside ``dataset_root`` and has one column,
    ``img_name``, holding the bare file name of each ``.jpg``/``.jpeg``/``.png``
    file found recursively, in sorted order.

    Args:
        dataset_root: Directory that contains the unlabelled images.

    Returns:
        ``None``. Nothing is written when ``dataset_root`` is not a directory.
    """
    out_test_fname = 'test.csv'
    image_suffixes = ('.jpg', '.jpeg', '.png')

    cur_dir = pathlib.Path(dataset_root)

    if not cur_dir.is_dir():
        return

    # Sorted so the listing does not depend on directory iteration order.
    img_names = sorted(item.name for item in cur_dir.rglob("*")
                       if item.is_file()
                       and item.name.lower().endswith(image_suffixes))

    with open(cur_dir / out_test_fname, 'w') as f:
        # The column holds file names, so it uses the same header as the
        # file-name column of train.csv / val.csv.
        f.write('img_name\n')
        f.writelines(f'{name}\n' for name in img_names)

    print(f'Number of test images: {len(img_names)}')

In [None]:
# List the unlabelled test images in test1/test.csv.
create_test('test1')

In [None]:
# Peek at the held-out list written by split_dataset.
df_val = pd.read_csv('train/val.csv')
df_val.head()

In [None]:
# Peek at the test image list written by create_test.
df_test = pd.read_csv('test1/test.csv')
df_test.head()

# Variables

In [None]:
# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One DataLoader worker per CPU core.
num_workers = multiprocessing.cpu_count()
seed = 2025
lr = 1e-4
batch_size = 32

# Apply the seed to every global RNG used below; without this the weight
# initialisation and the random augmentations differ from run to run.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Dataset

In [None]:
class ImageDataset(Dataset):
    """Image dataset driven by a CSV file stored in the image folder.

    The CSV layout is selected by the CSV file name:

    * labelled (file name does not contain ``'test'``): column 0 is the class
      label (named ``label``) and column 1 the image file name; items are
      ``(image_tensor, class_index)``.
    * unlabelled (file name contains ``'test'``): column 0 is the image file
      name; items are ``(image_tensor, img_name)``.
    """

    def __init__(self, dataset_root, csv_filename, transform):
        """Read the CSV and, for labelled data, build the label -> index mapping.

        Args:
            dataset_root: Folder that holds both the CSV file and the images.
            csv_filename: Name of the CSV file inside ``dataset_root``.
            transform: Callable applied to each RGB PIL image.
        """
        print("Reading Image Dataset...")
        self.dataset_root = dataset_root

        print("Reading dataset file paths...")
        self.csv_filename = csv_filename
        self.img_labels = pd.read_csv(f'{dataset_root}/{self.csv_filename}', delimiter=',')
        self.transform = transform

        # Decided once here and reused by the accessors below.
        self.has_labels = 'test' not in self.csv_filename
        if self.has_labels:
            # Sorted so the mapping is identical for every split and does not
            # depend on row order in the CSV (for cat/dog: cat -> 0, dog -> 1).
            self.classes = sorted(self.img_labels['label'].unique().tolist())
            self.label_to_idx = {val: idx for idx, val in enumerate(self.classes)}
        print("Image Dataset instance created!")

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.img_labels)

    def get_labels(self):
        """Return the class index of every row, or ``[]`` for unlabelled data."""
        if not self.has_labels:
            return []
        return [self.label_to_idx[label] for label in self.img_labels.iloc[:, 0]]

    def read_image(self, img_name):
        """Load ``img_name`` from the dataset folder as RGB and apply the transform."""
        img_path = f'{self.dataset_root}/{img_name}'
        # The context manager closes the underlying file even when decoding or
        # the transform raises.
        with Image.open(img_path) as image:
            return self.transform(image.convert('RGB'))

    def __getitem__(self, index):
        """Return ``(image, class_index)`` for labelled data, else ``(image, img_name)``."""
        if self.has_labels:
            label = self.img_labels.iloc[index, 0]
            img_name = self.img_labels.iloc[index, 1]
            return self.read_image(img_name), self.label_to_idx[label]
        img_name = self.img_labels.iloc[index, 0]
        return self.read_image(img_name), img_name

In [None]:
# Training pipeline: random augmentation, final tensor fed to the model after Resize(128).
train_transform = transforms.Compose([
    # An int size rescales the shorter image side to 256 (see torchvision docs).
    transforms.Resize(256),
    # NOTE(review): ColorJitter() is called with its defaults, which appears to
    # leave the image unchanged — confirm whether jitter strengths were intended.
    transforms.ColorJitter(),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    # Applied to the 224x224 crop.
    transforms.Resize(128),
    transforms.ToTensor()
])

# Evaluation pipeline: deterministic, resizes straight to 128x128.
test_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor()
])

In [None]:
# Labelled datasets read from the CSV files written by split_dataset:
# augmented images for training, deterministic preprocessing for validation.
image_datasets = {
    'train': ImageDataset(dataset_root=dataset_root,
                          csv_filename='train.csv',
                          transform=train_transform),
    'val': ImageDataset(dataset_root=dataset_root,
                        csv_filename='val.csv',
                        transform=test_transform),
}

In [None]:
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from the current torch seed.

    Intended as a DataLoader ``worker_init_fn``. ``worker_id`` is accepted
    because that hook passes it, but it is not used.
    """
    # Reduced modulo 2**32 because NumPy only accepts 32-bit seeds.
    derived_seed = torch.initial_seed() % (1 << 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Seeded generator shared by both loaders (drives shuffling and worker seeds).
g = torch.Generator()
g.manual_seed(seed)

dataloaders = {
    # Training batches are shuffled; an incomplete last batch is dropped.
    'train': DataLoader(image_datasets['train'],
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=num_workers,
                        pin_memory=True,
                        drop_last=True,
                        worker_init_fn=seed_worker,
                        generator=g),
    # Validation keeps CSV order and every sample.
    'val': DataLoader(image_datasets['val'],
                      batch_size=batch_size,
                      shuffle=False,
                      num_workers=num_workers,
                      pin_memory=True,
                      worker_init_fn=seed_worker,
                      generator=g),
}

In [None]:
# Show the first 24 images of one training batch as a single grid.
samples, labels = next(iter(dataloaders['train']))
plt.figure(figsize=(16, 24))
grid_imgs = torchvision.utils.make_grid(samples[:24])
plt.imshow(grid_imgs.permute(1, 2, 0).numpy()) # [C, H, W] -> [H, W, C], the layout imshow expects

In [None]:
# ResNet-101 initialised with the 'IMAGENET1K_V2' pretrained weights.
model = torchvision.models.resnet101(weights='IMAGENET1K_V2')

In [None]:
# Display the architecture (the cell output is the module's repr).
model

In [None]:
# Replace the classifier with a new head that ends in 2 outputs.
num_ftrs = model.fc.in_features
# NOTE(review): there is no activation between the two Linear layers, so the
# head is still a purely linear map — confirm that this is intended.
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 1000),
    nn.Linear(1000, 2)
)

In [None]:
# Move the model to the selected device and set up loss, optimizer and LR schedule.
model = model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
# All model parameters are optimised, i.e. the pretrained backbone is fine-tuned too.
optimizer = torch.optim.Adam(
    model.parameters(), lr=lr, amsgrad=True)
# Halves the learning rate at each milestone.
# NOTE(review): no scheduler.step() call is visible in this notebook, so the
# milestones never take effect — confirm whether it should be stepped per
# batch or per epoch.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[500,1000,1500], gamma=0.5)

In [None]:
def run_epoch(phase, dataloader):
    """Run one pass over ``dataloader`` and collect the loss and metrics.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer``, ``device``
    and ``num_classes``.

    Args:
        phase: ``'train'`` puts the model in training mode and updates the
            weights after every batch; any other value puts it in eval mode
            and only evaluates.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        ``(epoch_loss, epoch_acc, f05_macro, f1_macro, conf_matrix)`` where the
        loss and accuracy are averaged over all samples seen.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    is_train = phase == 'train'
    if is_train:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    running_corrects = 0

    y_test = []
    y_pred = []

    all_elems_count = 0
    cur_tqdm = tqdm(dataloader)

    for inputs, labels in cur_tqdm:

        bz = inputs.shape[0]
        all_elems_count += bz

        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Autograd bookkeeping is only needed when the weights are updated, so
        # evaluation stays cheap even when this function is called directly.
        with torch.set_grad_enabled(is_train):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        if is_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        preds = outputs.argmax(dim=1)
        batch_loss = loss.item()
        # Plain Python int, so the running total never holds a device tensor.
        corrects_cnt = (preds == labels).sum().item()

        y_test.extend(labels.cpu().tolist())
        y_pred.extend(preds.cpu().tolist())
        running_loss += batch_loss * bz
        running_corrects += corrects_cnt

        show_dict = {'Loss': f'{batch_loss:.6f}',
                     'Corrects': f'{corrects_cnt}/{bz}',
                     'Accuracy': f'{corrects_cnt * 100 / bz:.3f}%'}
        cur_tqdm.set_postfix(show_dict)

    if all_elems_count == 0:
        # Fail with a clear message instead of a division by zero below.
        raise ValueError(f"dataloader for phase '{phase}' yielded no samples")

    conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=range(num_classes))

    print("Calculating metrics...")
    f05_macro = metrics.fbeta_score(y_test, y_pred, average="macro", beta=0.5)
    f1_macro = metrics.f1_score(y_test, y_pred, average="macro")
    epoch_loss = running_loss / all_elems_count
    epoch_acc = running_corrects / all_elems_count

    return epoch_loss, epoch_acc, f05_macro, f1_macro, conf_matrix

def test_epoch(dataloader):
    """Evaluate the model on ``dataloader`` and return the ``run_epoch`` results."""
    # inference_mode noticeably speeds up the evaluation stage
    with torch.inference_mode():
        epoch_stats = run_epoch('test', dataloader)
    return epoch_stats

def train_epoch(dataloader):
    """Run one training pass over ``dataloader`` and return the ``run_epoch`` results."""
    return run_epoch('train', dataloader)

In [None]:
def vis(test_accs, confusion_mtxes, labels, figsize=(50, 80)):
    """Plot the accuracy curve and the confusion matrix of the best epoch.

    Args:
        test_accs: Per-epoch accuracies.
        confusion_mtxes: Per-epoch confusion matrices, aligned with
            ``test_accs``; the one at the index of the highest accuracy is shown.
        labels: Class labels used for both axes of the heatmap.
        figsize: Size of the whole figure (both panels).
    """
    cm = np.asarray(confusion_mtxes[np.argmax(test_accs)])

    # Percentages are relative to the column totals, i.e. to all samples that
    # were predicted as that class. A column without predictions stays at 0
    # instead of triggering a division by zero.
    cm_sum = cm.sum(axis=0, keepdims=True)
    cm_perc = np.divide(cm * 100.0, cm_sum,
                        out=np.zeros(cm.shape, dtype=float),
                        where=cm_sum != 0)

    # Empty cells stay blank, the others show their percentage.
    annot = np.array([[f'{perc:.1f}%' if count else ''
                       for count, perc in zip(count_row, perc_row)]
                      for count_row, perc_row in zip(cm, cm_perc)])

    cm_frame = pd.DataFrame(cm, index=list(labels), columns=list(labels))
    cm_frame.index.name = 'Actual'
    cm_frame.columns.name = 'Predicted'

    fig, (ax_acc, ax_cm) = plt.subplots(2, 1, figsize=figsize)

    ax_acc.plot(test_accs, 'g')
    ax_acc.set(title='Accuracy per epoch', xlabel='Epoch index', ylabel='Accuracy')
    ax_acc.grid(True)

    sns.heatmap(cm_frame, annot=annot, fmt='', cmap="Blues", square=False, ax=ax_cm)
    ax_cm.set_title('Confusion matrix of the best epoch')

    fig.tight_layout()

    plt.show()

In [None]:
log_folder = 'logs'
os.makedirs(log_folder, exist_ok=True)


def _save_history_plot(title, ylabel, curves, out_path):
    """Draw one line per ``curves`` entry (legend label -> per-epoch values), save it and show it."""
    plt.title(title)
    for curve_label, values in curves.items():
        plt.plot(range(1, len(values) + 1), values, label=curve_label)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend(loc="upper left")
    plt.savefig(out_path)
    plt.show()
    plt.close('all')


def train_model(dataloaders, num_epochs=5):
    """Train the notebook-level ``model`` and evaluate it after every epoch.

    Args:
        dataloaders: Mapping phase name -> DataLoader. ``'train'`` is required
            and is run with weight updates; every other entry is evaluated
            after it, in the order given.
        num_epochs: Number of passes over the training data.

    Returns:
        ``(losses, accuracies, f1_macros, conf_matrices)``: four dicts mapping
        each phase name to its list of per-epoch values. When no ``'test'``
        loader is given, ``'test'`` is an alias of the first evaluation phase
        (for example ``'val'``), so lookups by ``'test'`` still work.
    """
    print("Training model with params:")
    print(f"Optim: {optimizer}")
    print(f"Criterion: {criterion}")

    # Only phases that have a dataloader are run: 'train' first, then the rest.
    phases = ['train'] + [phase for phase in dataloaders if phase != 'train']
    # Evaluation phase whose curves are drawn next to the training curves.
    if 'test' in dataloaders:
        eval_phase = 'test'
    else:
        eval_phase = next((phase for phase in phases if phase != 'train'), None)

    def new_history():
        history = {phase: [] for phase in phases}
        if eval_phase is not None:
            # Same list object, so the alias always mirrors the evaluation phase.
            history.setdefault('test', history[eval_phase])
        return history

    saved_epoch_losses = new_history()
    saved_epoch_accuracies = new_history()
    saved_epoch_f1_macros = new_history()
    saved_epoch_conf_matrices = new_history()

    for epoch in range(1, num_epochs + 1):
        start_time = time.time()

        print("=" * 100)
        print(f'Epoch {epoch}/{num_epochs}')
        print('-' * 10)

        for phase in phases:
            print("--- Cur phase:", phase)

            run_phase = train_epoch if phase == 'train' else test_epoch
            epoch_loss, epoch_acc, f05_macro, f1_macro, conf_matrix = \
                run_phase(dataloaders[phase])

            saved_epoch_losses[phase].append(epoch_loss)
            saved_epoch_accuracies[phase].append(epoch_acc)
            saved_epoch_f1_macros[phase].append(f1_macro)
            saved_epoch_conf_matrices[phase].append(conf_matrix)

            print(f'{phase} loss: {epoch_loss:.6f}, '
                  f'acc: {epoch_acc:.6f}, '
                  f'f05_macro: {f05_macro:.6f}, '
                  f'f1_macro: {f1_macro:.6f}')

            print("Confusion matrix:")
            print(conf_matrix)

        model.eval()
        # A curve needs at least two points, so plotting starts at epoch 2.
        if epoch > 1:
            loss_curves = {'Train Loss': saved_epoch_losses['train']}
            acc_curves = {'Train Acc': saved_epoch_accuracies['train']}
            if eval_phase is not None:
                loss_curves['Test Loss'] = saved_epoch_losses[eval_phase]
                acc_curves['Test Acc'] = saved_epoch_accuracies[eval_phase]

            # File names carry the epoch that was just finished.
            _save_history_plot(f'Losses during training. Epoch {epoch}/{num_epochs}.',
                               criterion.__class__.__name__,
                               loss_curves,
                               f'{log_folder}/loss_graph_epoch{epoch}.png')
            _save_history_plot(f'Accuracies during training. Epoch {epoch}/{num_epochs}.',
                               'Accuracy',
                               acc_curves,
                               f'{log_folder}/acc_graph_epoch{epoch}.png')

        end_time = time.time()
        minutes, seconds = divmod(int(end_time - start_time), 60)
        print("-" * 10)
        print(f"Epoch Time: {minutes}:{seconds:02d}")

    print("*** Training Completed ***")

    return saved_epoch_losses, saved_epoch_accuracies, saved_epoch_f1_macros, saved_epoch_conf_matrices

In [None]:
num_epochs = 5
# Class names in index order, so the heatmap axes show names instead of indices.
classe_labels = image_datasets['val'].classes

losses, accuracies, f1_macros, conf_matrices = train_model(dataloaders, num_epochs)
# The held-out loader is registered as 'val', so its history is stored under that key.
vis(accuracies['val'], conf_matrices['val'], classe_labels)

In [None]:
# Save the trained weights (state dict only).
# NOTE(review): the file name says densenet121, but the model built in this
# notebook is torchvision's resnet101 — confirm the intended name.
filename_pth = 'ckpt_densenet121_catdog.pth'
torch.save(model.state_dict(), filename_pth)

# Test submission

In [None]:
def collate_fn(batch):
    """Collate ``(image_tensor, name)`` pairs into ``(stacked_images, names)``.

    The images are stacked along a new first dimension; the second items are
    returned unchanged as a plain list.
    """
    images = []
    names = []
    for image, name in batch:
        images.append(image)
        names.append(name)
    return torch.stack(images), names

# Unlabelled test images: every item is (image_tensor, file name).
testset = ImageDataset(dataset_root='test1',
                       csv_filename='test.csv',
                       transform=test_transform)
# Order is kept (no shuffling) and file names are collated into a plain list.
testloader = DataLoader(
    testset,
    batch_size=32,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=collate_fn,
)

In [None]:
model.eval()

# The submission uses 1 for dog and 0 for cat. The index the model uses for
# 'dog' is looked up in the training dataset rather than assumed.
dog_idx = image_datasets['train'].label_to_idx['dog']

fn_list = []
pred_list = []

with torch.inference_mode():
    for x, fn in tqdm(testloader, leave=False):
        x = x.to(device)

        output = model(x)

        pred = torch.argmax(output, dim=1)

        # splitext removes the extension whatever its length ('.jpg', '.jpeg', ...).
        fn_list += [os.path.splitext(n)[0] for n in fn]
        pred_list += (pred == dog_idx).long().tolist()

# NOTE(review): the file name says densenet121, but the model built in this
# notebook is torchvision's resnet101 — confirm the intended name.
submission = pd.DataFrame({"id": fn_list, "label": pred_list})
submission.to_csv('preds_densenet121.csv', index=False)

In [None]:
# Preview the predictions for the first 24 test images.
samples, _ = next(iter(testloader))
samples = samples.to(device)

fig = plt.figure(figsize=(24, 16))

model.eval()
# No gradients are needed for a preview.
with torch.no_grad():
    output = model(samples[:24])
pred = torch.argmax(output, dim=1).tolist()

# Index -> class name, taken from the training dataset so the titles always
# match the indices the model was trained with.
ad = {idx: name for name, idx in image_datasets['train'].label_to_idx.items()}

for num, sample in enumerate(samples[:24]):
    plt.subplot(4, 6, num+1)
    plt.title(ad[pred[num]])
    plt.axis('off')
    plt.imshow(sample.permute(1, 2, 0).cpu().numpy())  # [C, H, W] -> [H, W, C]

# Called after drawing so that it actually arranges the subplots.
fig.tight_layout()