In [44]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ResNetForImageClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from collections import Counter
import evaluate

In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
DEVICE

device(type='cuda')

In [37]:
# Установим необходимые параметры
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 3e-4
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Загрузим данные
train_df = pd.read_csv('train.csv')

In [6]:
train_df

Unnamed: 0,unified_class,class_id,image_name
0,Оленевые,5,3cf4207b958eade893a2f1618cf062b8.JPG
1,Кошки,2,37698901280c871f426d40afe5c373cd.JPG
2,Заяц,0,20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,Кошки,2,a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,Оленевые,5,54eb76914b84db8a0d56f98125abf588.JPG
...,...,...,...
28010,Оленевые,5,07b420b4fe265b4ed918b46435c025d7.JPG
28011,Пантеры,6,2d1c5918357bbdd729bf79085e55d35e.JPG
28012,Заяц,0,1531efa9f8687e390adf780355acd606.JPG
28013,Кабан,1,2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [7]:
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

In [8]:
# Создаем класс для загрузки данных
class AnimalDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 2])  # image_name
        image = Image.open(img_name).convert("RGB")  # Открываем изображение
        label = self.dataframe.iloc[idx, 1]  # class_id

        if self.transform:
            image = self.transform(image)

        # Возвращаем словарь вместо кортежа
        return {
            'pixel_values': image,
            'labels': label
        }

In [9]:
# Преобразования изображений
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [10]:
# Создание обучающего и валидационного датасетов
train_dataset = AnimalDataset(train_data, 'train/train/', transform=transform)
val_dataset = AnimalDataset(val_data, 'train/train/', transform=transform)

In [11]:
train_dataset[0]

{'pixel_values': tensor([[[0.3255, 0.4471, 0.4902,  ..., 0.5608, 0.5412, 0.5412],
          [0.2235, 0.3216, 0.3882,  ..., 0.5098, 0.5255, 0.5294],
          [0.1529, 0.1686, 0.1961,  ..., 0.4118, 0.4902, 0.5059],
          ...,
          [0.3137, 0.3098, 0.2824,  ..., 0.5255, 0.5451, 0.5569],
          [0.2980, 0.2902, 0.2706,  ..., 0.4510, 0.3020, 0.2039],
          [0.3098, 0.2980, 0.2863,  ..., 0.4392, 0.3098, 0.2627]],
 
         [[0.3059, 0.4275, 0.4706,  ..., 0.5608, 0.5412, 0.5412],
          [0.2078, 0.3059, 0.3725,  ..., 0.5098, 0.5255, 0.5294],
          [0.1373, 0.1529, 0.1804,  ..., 0.4118, 0.4902, 0.5059],
          ...,
          [0.3137, 0.3098, 0.2863,  ..., 0.5216, 0.5412, 0.5529],
          [0.2980, 0.2902, 0.2745,  ..., 0.4471, 0.2980, 0.1961],
          [0.3098, 0.2980, 0.2902,  ..., 0.4353, 0.3059, 0.2549]],
 
         [[0.3059, 0.4196, 0.4588,  ..., 0.5216, 0.4941, 0.4941],
          [0.2157, 0.3059, 0.3686,  ..., 0.4706, 0.4824, 0.4824],
          [0.1608, 0.172

In [12]:
val_dataset[0]

{'pixel_values': tensor([[[0.3451, 0.3765, 0.4314,  ..., 0.7765, 0.5725, 0.6863],
          [0.3255, 0.3020, 0.3294,  ..., 0.8314, 0.6902, 0.6667],
          [0.3490, 0.2745, 0.2980,  ..., 0.7922, 0.7333, 0.7333],
          ...,
          [0.8039, 0.8706, 0.8745,  ..., 0.6784, 0.7020, 0.8275],
          [0.8902, 0.7569, 0.6118,  ..., 0.6039, 0.5686, 0.6549],
          [0.6980, 0.7294, 0.5020,  ..., 0.6118, 0.5961, 0.4863]],
 
         [[0.3529, 0.3843, 0.4392,  ..., 0.7725, 0.5725, 0.6980],
          [0.3216, 0.2980, 0.3255,  ..., 0.8275, 0.6863, 0.6706],
          [0.3294, 0.2588, 0.2784,  ..., 0.7843, 0.7294, 0.7333],
          ...,
          [0.7843, 0.8549, 0.8706,  ..., 0.6863, 0.7137, 0.8431],
          [0.8706, 0.7412, 0.6157,  ..., 0.6157, 0.5804, 0.6745],
          [0.6824, 0.7137, 0.5059,  ..., 0.6235, 0.6078, 0.5059]],
 
         [[0.3804, 0.4157, 0.4667,  ..., 0.7961, 0.6000, 0.7333],
          [0.3490, 0.3294, 0.3529,  ..., 0.8510, 0.7176, 0.7059],
          [0.3569, 0.286

In [13]:
# Загрузка модели
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50", num_labels=len(train_df['class_id'].unique()), ignore_mismatched_sizes=True)
model.to(DEVICE)

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-50 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 2048]) in the checkpoint and torch.Size([10, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ResNetForImageClassification(
  (resnet): ResNetModel(
    (embedder): ResNetEmbeddings(
      (embedder): ResNetConvLayer(
        (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): ReLU()
      )
      (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    )
    (encoder): ResNetEncoder(
      (stages): ModuleList(
        (0): ResNetStage(
          (layers): Sequential(
            (0): ResNetBottleNeckLayer(
              (shortcut): ResNetShortCut(
                (convolution): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (layer): Sequential(
                (0): ResNetConvLayer(
                  (convolution): Conv2d(64

In [34]:
accuracy = evaluate.load("f1")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels, average="macro")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [42]:
# Настройка аргументов для тренировки с валидацией
training_args = TrainingArguments(
    output_dir='./results',
    logging_strategy='epoch',
    evaluation_strategy='epoch',  # Стратегия валидации на каждой эпохе
    save_strategy='epoch',
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    logging_dir='./logs',
    report_to='none',  # Отключаем wandb
    load_best_model_at_end=True,
    metric_for_best_model="f1"

)

# Создаем Trainer с валидационным набором данных
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Добавляем валидационный набор
    compute_metrics=compute_metrics
)



In [45]:
# Обучение модели
trainer.train()

  0%|          | 0/3510 [00:00<?, ?it/s]

{'loss': 0.0173, 'grad_norm': 1.672014832496643, 'learning_rate': 0.00027, 'epoch': 1.0}
{'eval_loss': 0.14438168704509735, 'eval_f1': 0.9418502789792293, 'eval_runtime': 30.5874, 'eval_samples_per_second': 183.18, 'eval_steps_per_second': 22.918, 'epoch': 1.0}
{'loss': 0.0178, 'grad_norm': 7.759483337402344, 'learning_rate': 0.00023999999999999998, 'epoch': 2.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.14729221165180206, 'eval_f1': 0.9423050578768791, 'eval_runtime': 45.7093, 'eval_samples_per_second': 122.579, 'eval_steps_per_second': 15.336, 'epoch': 2.0}
{'loss': 0.0135, 'grad_norm': 11.683399200439453, 'learning_rate': 0.00020999999999999998, 'epoch': 3.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.1498822122812271, 'eval_f1': 0.9514046771783132, 'eval_runtime': 40.8707, 'eval_samples_per_second': 137.091, 'eval_steps_per_second': 17.152, 'epoch': 3.0}
{'loss': 0.009, 'grad_norm': 5.540922164916992, 'learning_rate': 0.00017999999999999998, 'epoch': 4.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.11805693805217743, 'eval_f1': 0.9581158982527768, 'eval_runtime': 30.5199, 'eval_samples_per_second': 183.585, 'eval_steps_per_second': 22.969, 'epoch': 4.0}
{'loss': 0.0064, 'grad_norm': 1.74955415725708, 'learning_rate': 0.00015, 'epoch': 5.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.10827530920505524, 'eval_f1': 0.9583187067456451, 'eval_runtime': 30.5003, 'eval_samples_per_second': 183.703, 'eval_steps_per_second': 22.983, 'epoch': 5.0}
{'loss': 0.0051, 'grad_norm': 0.6025083661079407, 'learning_rate': 0.00011999999999999999, 'epoch': 6.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.11362635344266891, 'eval_f1': 0.959678726880249, 'eval_runtime': 30.4784, 'eval_samples_per_second': 183.835, 'eval_steps_per_second': 23.0, 'epoch': 6.0}
{'loss': 0.0041, 'grad_norm': 2.011050224304199, 'learning_rate': 8.999999999999999e-05, 'epoch': 7.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.09814654290676117, 'eval_f1': 0.9629651247012392, 'eval_runtime': 30.6943, 'eval_samples_per_second': 182.542, 'eval_steps_per_second': 22.838, 'epoch': 7.0}
{'loss': 0.0015, 'grad_norm': 1.2125581502914429, 'learning_rate': 5.9999999999999995e-05, 'epoch': 8.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.09950543195009232, 'eval_f1': 0.964353014072579, 'eval_runtime': 33.2771, 'eval_samples_per_second': 168.374, 'eval_steps_per_second': 21.066, 'epoch': 8.0}
{'loss': 0.0015, 'grad_norm': 1.9241657257080078, 'learning_rate': 2.9999999999999997e-05, 'epoch': 9.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.09561894088983536, 'eval_f1': 0.9644897167984844, 'eval_runtime': 44.5505, 'eval_samples_per_second': 125.767, 'eval_steps_per_second': 15.735, 'epoch': 9.0}
{'loss': 0.001, 'grad_norm': 0.37217843532562256, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/701 [00:00<?, ?it/s]

{'eval_loss': 0.1028129979968071, 'eval_f1': 0.9642607059202455, 'eval_runtime': 34.9168, 'eval_samples_per_second': 160.467, 'eval_steps_per_second': 20.076, 'epoch': 10.0}
{'train_runtime': 2715.6535, 'train_samples_per_second': 82.529, 'train_steps_per_second': 1.293, 'train_loss': 0.0077077179298102, 'epoch': 10.0}


TrainOutput(global_step=3510, training_loss=0.0077077179298102, metrics={'train_runtime': 2715.6535, 'train_samples_per_second': 82.529, 'train_steps_per_second': 1.293, 'total_flos': 4.762596652302828e+18, 'train_loss': 0.0077077179298102, 'epoch': 10.0})

In [46]:
# Подготовка к тестированию и созданию предсказаний
test_df = pd.read_csv('sample_submission.csv')
test_df

Unnamed: 0,image_name,predicted_class
0,cc27b9b56583a615fb8501e352402eb9.JPG,0
1,87872711fe672676fd34a97e997f9c47.JPG,0
2,424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,c5537eaa60525efd7bad4a5560607e83.JPG,0
4,e9f15b67ca49453e281b2b4f245eac13.JPG,0
...,...,...
12953,028668e733cd17ec9b9f1c7e2c657b36.JPG,0
12954,eb1f1152941fdfdd50ff9954010e622a.JPG,0
12955,bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,0
12956,2eaf9c794958a93bb9984441fd5d7f61.JPG,0


In [47]:
# Создание класса для тестового датасета
class TestAnimalDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.dataframe.iloc[idx, 0])  # image_name
        image = Image.open(img_name).convert("RGB")  # Открываем изображение

        if self.transform:
            image = self.transform(image)

        # Возвращаем только изображение, так как у нас нет меток в тестовом наборе
        return {
            'pixel_values': image,
            'image_name': self.dataframe.iloc[idx, 0]  # Сохраняем имя изображения для предсказаний
        }

In [48]:
test_dataset = TestAnimalDataset(test_df, 'test/test/', transform=transform)


In [49]:
test_dataset[0]

{'pixel_values': tensor([[[0.0078, 0.0078, 0.0039,  ..., 0.0000, 0.0000, 0.0000],
          [0.0157, 0.0157, 0.0118,  ..., 0.0000, 0.0000, 0.0000],
          [0.0235, 0.0275, 0.0235,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0275, 0.0706, 0.3059,  ..., 0.0745, 0.0863, 0.1098],
          [0.0275, 0.0196, 0.2118,  ..., 0.0784, 0.0745, 0.0980],
          [0.0235, 0.0471, 0.4902,  ..., 0.0196, 0.0157, 0.0275]],
 
         [[0.0078, 0.0078, 0.0039,  ..., 0.0000, 0.0000, 0.0000],
          [0.0157, 0.0157, 0.0118,  ..., 0.0000, 0.0000, 0.0000],
          [0.0235, 0.0275, 0.0235,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0275, 0.0706, 0.3059,  ..., 0.0745, 0.0863, 0.1098],
          [0.0275, 0.0196, 0.2118,  ..., 0.0784, 0.0745, 0.0980],
          [0.0235, 0.0471, 0.4902,  ..., 0.0196, 0.0157, 0.0275]],
 
         [[0.0078, 0.0078, 0.0039,  ..., 0.0000, 0.0000, 0.0000],
          [0.0157, 0.0157, 0.0118,  ..., 0.0000, 0.0000, 0.0000],
          [0.0235, 0.027

In [50]:
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [51]:
# Убедитесь, что модель находится в режиме оценки
model.eval()

predictions = []

# Используем with torch.no_grad() для отключения градиентов
with torch.no_grad():
    for batch in tqdm(test_loader):
        images = batch['pixel_values'].to(DEVICE)  # Получаем изображения
        outputs = model(images).logits  # Получаем выходы модели
        _, preds = torch.max(outputs, 1)  # Получаем предсказанные классы
        predictions.extend(preds.cpu().numpy())  # Сохраняем предсказания

100%|██████████| 203/203 [01:36<00:00,  2.11it/s]


In [52]:
# Создание DataFrame с результатами предсказаний
submission_df = pd.DataFrame({
    'image_name': [batch['image_name'][i] for batch in test_loader for i in tqdm(range(len(batch['image_name'])))],
    'predicted_class': predictions
})

100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<00:00, 63989.38it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<00:00, 64019.90it/s]
100%|██████████| 64/64 [00:00<00:00, 63958.89it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<?, ?it/s]
100%|██████████| 64/64 [00:00<00:00, 64004.64it

In [53]:
submission_df

Unnamed: 0,image_name,predicted_class
0,cc27b9b56583a615fb8501e352402eb9.JPG,4
1,87872711fe672676fd34a97e997f9c47.JPG,5
2,424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,c5537eaa60525efd7bad4a5560607e83.JPG,1
4,e9f15b67ca49453e281b2b4f245eac13.JPG,6
...,...,...
12953,028668e733cd17ec9b9f1c7e2c657b36.JPG,1
12954,eb1f1152941fdfdd50ff9954010e622a.JPG,4
12955,bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,5
12956,2eaf9c794958a93bb9984441fd5d7f61.JPG,6


In [55]:
Counter(submission_df['predicted_class'])

Counter({5: 4843,
         6: 2544,
         4: 1475,
         1: 905,
         7: 785,
         8: 785,
         0: 507,
         2: 421,
         9: 378,
         3: 315})

In [54]:
submission_df.to_csv('submission4.csv', index=False)

In [113]:
lst=[]
with torch.no_grad():
    for input in tqdm(test_dataset):
        outputs = model(input['pixel_values'].to('cuda').reshape(1,3,224,224)).logits.argmax(-1)
        lst.append(int(outputs.cpu()))
lst[:5]

100%|██████████| 12958/12958 [03:27<00:00, 62.46it/s]


[6, 5, 5, 5, 6]

In [116]:
submission = pd.DataFrame({
    'image_name': [batch['image_name'][i] for batch in test_loader for i in range(len(batch['image_name']))],
    'predicted_class': lst
})

In [117]:
submission

Unnamed: 0,image_name,predicted_class
0,cc27b9b56583a615fb8501e352402eb9.JPG,6
1,87872711fe672676fd34a97e997f9c47.JPG,5
2,424aa1aa8eb5bbdd07275f88077bc86c.JPG,5
3,c5537eaa60525efd7bad4a5560607e83.JPG,5
4,e9f15b67ca49453e281b2b4f245eac13.JPG,6
...,...,...
12953,028668e733cd17ec9b9f1c7e2c657b36.JPG,5
12954,eb1f1152941fdfdd50ff9954010e622a.JPG,5
12955,bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,5
12956,2eaf9c794958a93bb9984441fd5d7f61.JPG,6


In [118]:
Counter(submission['predicted_class'])

Counter({6: 3815, 5: 8211, 4: 534, 1: 356, 8: 9, 9: 18, 7: 15})