In [1]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as tt

from PIL import Image
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision.models import ResNet18_Weights
from tqdm.notebook import tqdm
from torchvision import models as vision_models
import timm


# Зафиксируем сиды, чтобы обучение было воспроизводимым.
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(1001)

In [2]:
import torch

if torch.cuda.is_available():
    print("Все хорошо, установлена версия с поддержкой видеокарт")
else:
    print("Что-то не так, стоит torch с поддержкой только CPU (если у вас MacOS или так и задумано, то все нормально.")

Все хорошо, установлена версия с поддержкой видеокарт


In [3]:
from sklearn.preprocessing import LabelEncoder
data = pd.read_csv(r"D:\data science\T1_hol\dataset\annotations\train_labels.csv")
le1 = LabelEncoder()
data['image_name'] = data['image_id']
data['unified_class'] = data['class']
data['class_id'] = le1.fit_transform(data['class'])
data['class_id'] = data['class_id'].astype('int64')
images_path = r"D:\data science\T1_hol\dataset\train"

# Разобъем данные на тренировочную и отложенную (на которой мы будем проверять качество работы) части
train, val = train_test_split(data, test_size=0.1, random_state=1, stratify=data['class'])

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

# Получилось примерно 28.1к картинок для тренировки и 12.0к картинок для подсчета метрик
print(train.shape, val.shape)

(2160, 5) (240, 5)


In [4]:
train

Unnamed: 0,image_id,class,image_name,unified_class,class_id
0,image_95274.jpg,sunflower,image_95274.jpg,sunflower,3
1,image_48791.jpg,rose,image_48791.jpg,rose,2
2,image_26601.jpg,rose,image_26601.jpg,rose,2
3,image_10488.jpg,rose,image_10488.jpg,rose,2
4,image_64214.jpg,rose,image_64214.jpg,rose,2
...,...,...,...,...,...
2155,image_26027.jpg,rose,image_26027.jpg,rose,2
2156,image_21087.jpg,rose,image_21087.jpg,rose,2
2157,image_32962.jpg,rose,image_32962.jpg,rose,2
2158,image_21896.jpg,cactus,image_21896.jpg,cactus,0


In [5]:
val = val.drop(['class','image_id'],axis = 1)

In [6]:
class EfficientNet(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.model = vision_models.efficientnet_b1(vision_models.EfficientNet_B1_Weights.DEFAULT)
        self.model.classifier[1] = torch.nn.Linear(self.model.classifier[1].in_features, num_classes)

    def forward(self, batch):
        inputs, _ = batch
        return self.model(inputs)

In [26]:
import torch
import torch.nn as nn
import timm

class SwinTransformer(nn.Module):
    def __init__(self, num_classes: int, model_name: str = 'swin_base_patch4_window7_224', pretrained: bool = True):
        """
        Initializes a Swin Transformer model.

        Args:
            num_classes (int): The number of output classes.
            model_name (str): The name of the Swin Transformer model to use from timm.
                              Defaults to 'swin_base_patch4_window7_224'.
            pretrained (bool): Whether to use pretrained weights. Defaults to True.
        """
        super().__init__()

        self.model = timm.create_model(model_name, pretrained=pretrained, num_classes=num_classes)

    def forward(self, batch):
        """
        Forward pass of the model.

        Args:
            batch: A tuple containing the input tensor and labels (optional).
                   The input tensor should have shape (batch_size, channels, height, width).

        Returns:
            The output logits of the model.
        """
        if isinstance(batch, tuple):
            inputs, _ = batch
        else:
            inputs = batch
        return self.model(inputs)

# Example Usage:
num_classes = 10  # Replace with your actual number of classes
model = SwinTransformer(num_classes, model_name="swin_base_patch4_window7_224_in22k")

# Example input (batch_size, channels, height, width)
dummy_input = torch.randn(1, 3, 224, 224)

# Get the output logits
output_logits = model(dummy_input)
print(output_logits.shape)  # Expected output: (1, num_classes)

  model = create_fn(


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


torch.Size([1, 10])


In [27]:
import torch
import torch.nn as nn
from torchvision import models as vision_models

class ResNet50(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.model = vision_models.resnet50(weights=vision_models.ResNet50_Weights.DEFAULT)
        # Заменяем последний полностью связанный слой на новый с нужным количеством классов
        in_features = self.model.fc.in_features
        self.model.fc = nn.Linear(in_features, num_classes)

    def forward(self, batch):
        inputs, _ = batch
        return self.model(inputs)

# Пример использования:

In [28]:
import torch
import torch.nn as nn
import timm

class ConvNeXt(nn.Module):
    def __init__(self, num_classes: int, model_name: str = 'convnext_tiny', pretrained: bool = True):
        """
        Initializes a ConvNeXt model.

        Args:
            num_classes (int): The number of output classes.
            model_name (str): The name of the ConvNeXt model to use from timm.
                              Defaults to 'convnext_tiny'.
            pretrained (bool): Whether to use pretrained weights. Defaults to True.
        """
        super().__init__()

        self.model = timm.create_model(model_name, pretrained=pretrained, num_classes=num_classes)

    def forward(self, batch):
        """
        Forward pass of the model.

        Args:
            batch: A tuple containing the input tensor and labels (optional).
                   The input tensor should have shape (batch_size, channels, height, width).

        Returns:
            The output logits of the model.
        """
        inputs, _ = batch
        return self.model(inputs)

In [29]:
device='cuda'

In [30]:
train

Unnamed: 0,image_id,class,image_name,unified_class,class_id
0,image_95274.jpg,sunflower,image_95274.jpg,sunflower,3
1,image_48791.jpg,rose,image_48791.jpg,rose,2
2,image_26601.jpg,rose,image_26601.jpg,rose,2
3,image_10488.jpg,rose,image_10488.jpg,rose,2
4,image_64214.jpg,rose,image_64214.jpg,rose,2
...,...,...,...,...,...
2155,image_26027.jpg,rose,image_26027.jpg,rose,2
2156,image_21087.jpg,rose,image_21087.jpg,rose,2
2157,image_32962.jpg,rose,image_32962.jpg,rose,2
2158,image_21896.jpg,cactus,image_21896.jpg,cactus,0


In [31]:
class AnimalsDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, path_to_images: Path, transforms: tt.Compose) -> None:
        self.df = dataframe
        self.path_to_images = path_to_images
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # print(row)
        image = Image.open(self.path_to_images + '/' + row["image_name"]).convert('RGB')
        # print(image)
        if self.transforms is not None:
            image = self.transforms(image)
        return image, row["class_id"]

In [32]:
from torchvision.utils import make_grid
from torchvision.io import decode_image
from pathlib import Path
import torchvision.transforms.functional as F

def show(imgs):
    if not isinstance(imgs, list):
        imgs = [imgs]
    fig, axs = plt.subplots(ncols=len(imgs), squeeze=False)
    for i, img in enumerate(imgs):
        img = img.detach()
        img = F.to_pil_image(img)
        axs[0, i].imshow(np.asarray(img))
        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

In [33]:
from torchvision.io import read_image
idx = 1
img = read_image(images_path +'/'+ data.iloc[idx]["image_name"])

In [40]:
# Инициализируем агументации. В нашем примере я не использую ничего, кроме изменения разрешения изображения в квадрат 224 на 224 пикселя
# и нормализации (перевода пикселей от диапазона 0-255 к распределению с заданными средним и дисперсией).

train_transform = tt.Compose([
    # tt.RandomGrayscale(p=0.5),
    # tt.RandomHorizontalFlip(0.5),
    tt.RandomRotation((-5, 5)),
    tt.Resize((224, 224)),
    tt.ToTensor(),
    tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

val_transform = tt.Compose([
    tt.Resize((224, 224)),
    tt.ToTensor(),
    tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = AnimalsDataset(train, images_path, transforms=train_transform)
val_dataset = AnimalsDataset(val, images_path, transforms=val_transform)


# Обратите внимание на
# num_workers - во сколько отдельных потоков мы будем готовить данные (вызывать AnimalsDataset), можете поставить своё значение, посмотрев количество ядер на вашем компьютере
# batch_size - сколько картинок одновременно мы подадим на вход нашей модели.
# train_dataloader = DataLoader(train_dataset, batch_size=64, num_workers=0, shuffle=True)
# valid_dataloader = DataLoader(val_dataset, batch_size=64, num_workers=0, shuffle=False)
train_dataloader = DataLoader(train_dataset, batch_size=64,  shuffle=True)
valid_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False)

In [41]:
next(iter(train_dataloader))[1]

tensor([2, 2, 4, 0, 2, 0, 2, 4, 1, 1, 1, 4, 1, 2, 0, 3, 4, 1, 1, 3, 4, 0, 4, 1,
        0, 0, 4, 0, 2, 0, 2, 3, 1, 4, 3, 2, 2, 1, 4, 2, 2, 0, 1, 4, 2, 4, 2, 2,
        3, 1, 4, 0, 3, 4, 3, 0, 3, 1, 3, 4, 4, 1, 2, 4])

In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Для обучения выбран девайс {}".format(device))

Для обучения выбран девайс cuda


In [43]:
torch.cuda.empty_cache()

In [44]:
# Напишем код для обучения нашей нейронной сети:
# model = ResNet18(num_classes=data["unified_class"].nunique()).to(device)
model = SwinTransformer(num_classes=data["unified_class"].nunique()).to(device)
# model = Megadescriptor(num_classes=data["unified_class"].nunique()).to(device)
from torch.optim.lr_scheduler import StepLR

# Инициализируем функцию потерь (loss/criterion), а так же оптимизатор, который будет регулировать обновление весов нашей модели
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Переменные для визуализации метрик и функции потерь
train_losses = []
val_losses = []

# Для удобства оценивать качество модели будем той же метрику, что на лидерборде - F1 score
train_f1_scores = []
val_f1_scores = []

best_val_f1 = 0.0
best_model_path = 'best_model.pth'

# Определим, сколько раз мы пройдёмся по всему датасету, прежде, чем закончим обучение модели и выберем лучшую версию
num_epochs = 50

# Шаговое уменьшение (StepLR)
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # Каждые 5 эпох уменьшать lr в 10 раз

# Напишем свой train_loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    train_true = []
    train_pred = []

    for batch in tqdm(train_dataloader):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model((inputs, labels))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        # train_true.extend(labels.cpu().numpy())
        # train_pred.extend(preds.cpu().numpy())

    # train_f1 = f1_score(train_true, train_pred, average='macro')
    # train_losses.append(running_loss / len(train_dataloader))
    # train_f1_scores.append(train_f1)

    scheduler.step()
    model.eval()
    val_running_loss = 0.0
    val_true = []
    val_pred = []

    # валидационный цикл, когда мы оцениваем качество работы модели на отложенной выборке
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model((inputs, labels))
            loss = criterion(outputs, labels)

            val_running_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            val_true.extend(labels.cpu().numpy())
            val_pred.extend(preds.cpu().numpy())

    val_f1 = f1_score(val_true, val_pred, average='macro')
    val_losses.append(val_running_loss / len(valid_dataloader))
    val_f1_scores.append(val_f1)

    # если получившаяся модель лучше предыдущей, сохраним чекпоинт
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        print(f'New best model saved with F1: {best_val_f1:.4f}')


    # выведем в консоль получившиеся результаты на отдельной эпохе
    print(f'Epoch [{epoch+1}/{num_epochs}], '
        #   f'Train Loss: {train_losses[-1]:.4f}, Train F1: {train_f1:.4f}, '
          f'Val Loss: {val_losses[-1]:.4f}, Val F1: {val_f1:.4f}')

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

New best model saved with F1: 0.9626
Epoch [1/50], Val Loss: 0.1420, Val F1: 0.9626


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [2/50], Val Loss: 0.2880, Val F1: 0.9008


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

New best model saved with F1: 0.9669
Epoch [3/50], Val Loss: 0.1371, Val F1: 0.9669


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [4/50], Val Loss: 0.1462, Val F1: 0.9498


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [5/50], Val Loss: 0.1911, Val F1: 0.9424


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

New best model saved with F1: 0.9709
Epoch [6/50], Val Loss: 0.0802, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [7/50], Val Loss: 0.0711, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [8/50], Val Loss: 0.0749, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [9/50], Val Loss: 0.0824, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [10/50], Val Loss: 0.0735, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [11/50], Val Loss: 0.0735, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [12/50], Val Loss: 0.0734, Val F1: 0.9667


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [13/50], Val Loss: 0.0732, Val F1: 0.9667


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [14/50], Val Loss: 0.0736, Val F1: 0.9667


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [15/50], Val Loss: 0.0741, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [16/50], Val Loss: 0.0741, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [17/50], Val Loss: 0.0741, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [18/50], Val Loss: 0.0741, Val F1: 0.9709


  0%|          | 0/34 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [45]:
sample = pd.read_csv(r"D:\data science\T1_hol\dataset\annotations\sample_submission.csv")
sample['image_name'] = r'D:\data science\T1_hol\dataset\test/'+sample['name']
sample

Unnamed: 0,name,label,image_name
0,image_62214.jpg,,D:\data science\T1_hol\dataset\test/image_6221...
1,image_91562.jpg,,D:\data science\T1_hol\dataset\test/image_9156...
2,image_44104.jpg,,D:\data science\T1_hol\dataset\test/image_4410...
3,image_79943.jpg,,D:\data science\T1_hol\dataset\test/image_7994...
4,image_79847.jpg,,D:\data science\T1_hol\dataset\test/image_7984...
...,...,...,...
595,image_58364.jpg,,D:\data science\T1_hol\dataset\test/image_5836...
596,image_51853.jpg,,D:\data science\T1_hol\dataset\test/image_5185...
597,image_44601.jpg,,D:\data science\T1_hol\dataset\test/image_4460...
598,image_08599.jpg,,D:\data science\T1_hol\dataset\test/image_0859...


In [46]:
class InferenceDataset(Dataset):
    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.transforms = transforms

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert('RGB')
        if self.transforms is not None:
            image = self.transforms(image)
        return image, image_path


# Тут важно не ошибиться и не использовать тренировочные трансформы
infer_transform = tt.Compose([
    tt.Resize((224, 224)),
    tt.ToTensor(),
    tt.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Найдем все тестовые картинки
test_image_paths = sample.image_name.tolist()

infer_dataset = InferenceDataset(test_image_paths, transforms=infer_transform)
infer_dataloader = DataLoader(infer_dataset, batch_size=1, shuffle=False)


# Инициализируем нашу модель и загрузим в неё лучшие после эксперимента веса
# model = ResNet18(num_classes=data["unified_class"].nunique()).to(device)
# model = EfficientNet(num_classes=data["unified_class"].nunique()).to(device)

best_model_path = r'D:\data science\T1_hol\best_model.pth'
model.load_state_dict(torch.load(best_model_path))

# Не забудем перевести модель в режим предсказания, а не обучения.
model.eval()

# Для ускорения инференса будем подавать в модель картинки батчами (по несколько картинок за раз) и сохраним предсказанные метки классов.
results = []
for images, image_names in tqdm(infer_dataloader):
    images = images.to(device)

    with torch.no_grad():
        outputs = model((images, None)) #для не хагина
        preds = torch.argmax(outputs, dim=1).cpu().numpy()

        # outputs = model(images) # для хагина
        # preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        
        results.append(preds[0])


# Для удобства объединим все пары "имя файла - предсказанный класс" в датафрейм (таблицу) с колонками image_name, predicted_class
sample['predicted_class'] = results

# Вывод DataFrame
sample

  model.load_state_dict(torch.load(best_model_path))


  0%|          | 0/600 [00:00<?, ?it/s]

Unnamed: 0,name,label,image_name,predicted_class
0,image_62214.jpg,,D:\data science\T1_hol\dataset\test/image_6221...,2
1,image_91562.jpg,,D:\data science\T1_hol\dataset\test/image_9156...,2
2,image_44104.jpg,,D:\data science\T1_hol\dataset\test/image_4410...,2
3,image_79943.jpg,,D:\data science\T1_hol\dataset\test/image_7994...,2
4,image_79847.jpg,,D:\data science\T1_hol\dataset\test/image_7984...,2
...,...,...,...,...
595,image_58364.jpg,,D:\data science\T1_hol\dataset\test/image_5836...,1
596,image_51853.jpg,,D:\data science\T1_hol\dataset\test/image_5185...,1
597,image_44601.jpg,,D:\data science\T1_hol\dataset\test/image_4460...,1
598,image_08599.jpg,,D:\data science\T1_hol\dataset\test/image_0859...,1


In [49]:
sample['label'] = le1.inverse_transform(sample['predicted_class'])

KeyError: 'predicted_class'

In [48]:
sample.drop(['image_name','predicted_class'],axis=1,inplace=True)


In [50]:
sample

Unnamed: 0,name,label
0,image_62214.jpg,rose
1,image_91562.jpg,rose
2,image_44104.jpg,rose
3,image_79943.jpg,rose
4,image_79847.jpg,rose
...,...,...
595,image_58364.jpg,fern
596,image_51853.jpg,fern
597,image_44601.jpg,fern
598,image_08599.jpg,fern


In [51]:
# Сохраним результат предсказаний в файл и всё! Можно отправлять!
sample.to_csv("otv_l.csv", index=False)