In [607]:
import copy
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import models

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from PIL import Image


In [608]:
data_path = '../input/neurowood/trainset'

# Per-channel mean / std applied by Normalize after ToTensor.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Every image is resized to 224x224, converted to a tensor and normalised.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
])

# Class labels are derived from the sub-directory names under data_path.
train_data = datasets.ImageFolder(root=data_path, transform=preprocess)

In [609]:
# Module-level VGG19 instance created with pretrained=True; later cells adapt
# its classifier and wrap it in New_VGG19.
vgg19 = models.vgg19(pretrained=True)

In [610]:
# Drop the final 1000-way output layer so the backbone emits 4096-d features.
# The guard makes the cell idempotent: re-running it no longer strips one more
# layer from the classifier each time.
_head_layers = list(vgg19.classifier.children())
if (_head_layers
        and isinstance(_head_layers[-1], nn.Linear)
        and _head_layers[-1].out_features == 1000):
    vgg19.classifier = nn.Sequential(*_head_layers[:-1])


class New_VGG19(nn.Module):
    """VGG19 feature extractor followed by a new linear classification layer.

    Args:
        num_classes: number of output logits (default 3).
        freeze_features: when True, the convolutional ``features`` part of the
            backbone is excluded from gradient updates (default False).

    Every instance owns a deep copy of the module-level ``vgg19``, so building
    a second model (e.g. to load a checkpoint) does not alias or overwrite the
    weights of one that is being trained. State-dict keys are unchanged
    (``vgg19.*`` and ``fc.*``).
    """

    def __init__(self, num_classes=3, freeze_features=False):
        super().__init__()
        # Private copy: the global backbone stays untouched by training / .to().
        self.vgg19 = copy.deepcopy(vgg19)
        if freeze_features:
            for param in self.vgg19.features.parameters():
                param.requires_grad = False
        # 4096 is the width of the trimmed classifier's last linear layer.
        self.fc = nn.Linear(4096, num_classes)

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        features = self.vgg19(x)
        return self.fc(features)

In [611]:
def subset_ind(dataset, ratio: float, seed=None):
    """Draw a random sample of distinct positions from ``dataset``.

    Args:
        dataset: any sized object; only ``len(dataset)`` is used.
        ratio: fraction of the dataset to sample, within ``[0, 1]``.
        seed: optional seed for a private NumPy generator, making the draw
            reproducible. With the default ``None`` the global ``np.random``
            state is used, exactly as before.

    Returns:
        1-D integer array holding ``int(ratio * len(dataset))`` unique indices.

    Raises:
        ValueError: if ``ratio`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f'ratio must be within [0, 1], got {ratio}')

    total = len(dataset)
    count = int(ratio * total)
    if seed is None:
        return np.random.choice(total, size=count, replace=False)
    return np.random.default_rng(seed).choice(total, size=count, replace=False)

In [612]:
val_size = 0.07

# Fix the global NumPy seed so the held-out set is the same after a kernel
# restart; otherwise a checkpoint from an earlier run would be evaluated on
# images it was trained on.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
val_inds = subset_ind(train_data, val_size)

# Set membership is O(1); testing `i not in <ndarray>` scans the whole array
# for every index.
_val_lookup = set(val_inds.tolist())
train_inds = [i for i in range(len(train_data)) if i not in _val_lookup]

train_dataset = Subset(train_data, train_inds)
val_dataset = Subset(train_data, val_inds.tolist())

# The two subsets must partition the full dataset.
assert len(train_dataset) + len(val_dataset) == len(train_data)

print(f'training size: {len(train_dataset)}\nvalidation size: {len(val_dataset)}')


In [613]:
batch_size = 16

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=2)
# Validation is read in a fixed order: shuffling gives no benefit for
# evaluation and makes any batch-dependent metric vary between passes.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=2)

In [614]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [615]:
def train(net, loss_fn, optimizer, train_loader, val_loader, n_epoch, save_path='./best_model.pt'):
    """Fit ``net`` and checkpoint the weights with the best validation macro-F1.

    Args:
        net: model to optimise; expected to already live on the module-level
            ``device``, to which every batch is moved.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar loss.
        optimizer: optimizer bound to ``net``'s parameters.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        n_epoch: maximum number of passes over ``train_loader``.
        save_path: file that receives ``net.state_dict()`` whenever the
            validation score improves (default ``'./best_model.pt'``).

    Returns:
        The same ``net`` object carrying the weights of the last epoch that
        ran; the best-scoring weights are the ones stored at ``save_path``.
        After at least one epoch the model is left in evaluation mode.
    """
    # -inf (rather than 0) guarantees a checkpoint is written even when the
    # first epochs score exactly 0.
    best_metrics = float('-inf')

    for epoch in range(n_epoch):
        print(f'Epoch {epoch + 1}')

        # Training mode: layers such as Dropout behave stochastically.
        net.train()
        for X_batch, y_batch in tqdm(train_loader):
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            loss = loss_fn(net(X_batch), y_batch)
            loss.backward()
            optimizer.step()

        # Evaluation mode: without it validation predictions are perturbed by
        # Dropout and the "best" checkpoint is chosen on a noisy score.
        net.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for x, y in val_loader:
                logits = net(x.to(device))
                y_pred.extend(logits.argmax(dim=1).cpu().tolist())
                y_true.extend(y.tolist())

        # Score the whole validation set at once: averaging macro-F1 over
        # small batches is distorted whenever a batch lacks one of the classes.
        metrics = f1_score(y_true, y_pred, average='macro') if y_true else float('nan')

        # If the score improved, save the weights to disk and update the best
        # metric. (The message says "test", but this is the validation score.)
        if metrics > best_metrics:
            print('New best model with test f1-score:', metrics)
            torch.save(net.state_dict(), save_path)
            best_metrics = metrics
        # A perfect score cannot be improved on, so stop early.
        if best_metrics == 1:
            break

    return net

In [616]:
# Collect unreachable Python objects first so any tensors they still hold are
# released, then return the now-unused cached blocks to the GPU. In the
# opposite order the cache is emptied before that memory has been freed.
gc.collect()
torch.cuda.empty_cache()

In [617]:
# Optimisation settings.
lr = 1e-5
loss_fn = nn.CrossEntropyLoss()

# Build the model, move it to the target device, and let Adam update all of
# its parameters with a small weight decay.
net = New_VGG19()
net = net.to(device)
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr, weight_decay=3e-6)

# Up to 50 epochs; train() returns the model it was given.
net = train(net, loss_fn, optimizer, train_loader, val_loader, n_epoch=50)

In [618]:

# Rebuild the architecture and restore the checkpoint written during training.
the_model = New_VGG19()
# map_location lets a checkpoint saved from the GPU load on whatever `device` is.
the_model.load_state_dict(torch.load('./best_model.pt', map_location=device))
the_model.to(device)
# Inference mode: torchvision's VGG classifier contains Dropout layers, which
# would otherwise randomly perturb every prediction.
the_model.eval()

real = []
pred = []

with torch.no_grad():
    for x, y in tqdm(val_loader):
        logits = the_model(x.to(device))
        real.extend(y.tolist())
        pred.extend(logits.argmax(dim=1).cpu().tolist())

# Per-class precision / recall / F1 on the held-out validation subset.
print(classification_report(real, pred))

In [619]:
test_path  = '../input/testdata/test'
test_data  = datasets.ImageFolder(test_path, transform=preprocess)
# shuffle=False keeps predictions in ImageFolder's sample order.
# NOTE(review): confirm that this order matches the ids assigned to the
# submission rows afterwards.
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, pin_memory=True, num_workers=2)

# Make sure Dropout is disabled for inference (harmless if already in eval mode).
the_model.eval()

# `real` collects the labels ImageFolder derives from the folder layout.
# NOTE(review): presumably these are placeholders rather than ground truth for
# the test set -- confirm before using them.
real = []
pred = []

with torch.no_grad():
    for x, y in tqdm(test_loader):
        logits = the_model(x.to(device))
        real.extend(y.tolist())
        pred.extend(logits.argmax(dim=1).cpu().tolist())

print(pred)

In [620]:

# Ids are the 1-based positions of the predictions. The id range is sized from
# `pred` instead of a hard-coded 249 so it always matches the test-set length.
a = np.array([list(range(1, len(pred) + 1)), pred]).T
submission = pd.DataFrame(a, columns=['id','class'])

In [621]:
# Predicted index -> label written to the submission (same net effect as the
# former chain 1->3, 0->1, 2->0 on its first run).
# NOTE(review): presumably this converts ImageFolder class indices into the
# competition's label ids -- confirm against the training folder names.
class_remap = {0: 1, 1: 3, 2: 0}

# Derive the column from `pred` rather than rewriting it in place, so
# re-running this cell cannot remap already-remapped labels. Values without an
# entry in the table are passed through unchanged.
submission['class'] = [class_remap.get(label, label) for label in pred]

In [622]:
# Show the first ten rows of the submission table.
submission.iloc[:10]

In [623]:
# Write the table to disk without the DataFrame index column.
submission_path = './submission.csv'
submission.to_csv(submission_path, index=False)