In [1]:
#from fastai.vision import *

In [2]:
from pathlib import Path

In [3]:
import pandas as pd

In [4]:
from torch.utils.data import Dataset, DataLoader, TensorDataset


In [5]:
import numpy as np
from PIL import Image

In [6]:
class HerbInferenceDataset(Dataset):
    """Unlabelled image dataset used for inference.

    Each item is one image read from disk, resized to ``size`` x ``size``,
    scaled by 1/255 and returned as a float tensor of shape ``(3, size, size)``.
    """

    def __init__(self, data_frame, location, size, x_name="file_name",
                 device="cuda", legacy_reshape=True):
        """

        Args:
            data_frame (Dataframe): Dataframe with one row per image
            location (str): data location directory
            size (int): size of image (images are resized to size x size)
            x_name (str): column name of file
            device (str or torch.device): device the returned tensor is moved
                to. Defaults to "cuda", which matches the previously
                hard-coded ``.cuda()`` call.
            legacy_reshape (bool): when True (default) the pixel array is
                laid out with ``reshape`` exactly as before. When False a
                proper HWC -> CHW ``transpose`` is used instead.
        """
        self.df = data_frame
        self.location = location
        self.size = (size, size)
        self.x_name = x_name
        # Stored as given and only used in __getitem__, so constructing the
        # dataset does not need the `torch` name to be bound yet.
        self.device = device
        self.legacy_reshape = legacy_reshape

    def __len__(self):
        """Number of rows (images) in the underlying dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load image number ``idx`` and return it as a float tensor."""
        item = self.df.iloc[idx]
        path = "{}/{}".format(self.location, item[self.x_name])
        with Image.open(path) as img:
            # Force three channels: grayscale / palette / RGBA / CMYK files do
            # not hold size*size*3 values and would fail in the layout step.
            pixels = np.array(img.convert("RGB").resize(self.size))
        if self.legacy_reshape:
            # NOTE(review): reshape only reinterprets the row-major (H, W, 3)
            # buffer as (3, H, W); it does not move the channel axis, so the
            # three planes are not the R/G/B channels. Kept as the default
            # because the input layout has to match whatever preprocessing the
            # loaded checkpoint was trained with -- TODO confirm against the
            # training pipeline and switch to legacy_reshape=False if possible.
            image = pixels.reshape((3, *self.size))
        else:
            image = pixels.transpose(2, 0, 1)
        image_tensor = torch.tensor(image / 255, dtype=torch.float).to(self.device)
        return image_tensor

In [7]:
# Dataloader

In [8]:
# Directory of the test split; it holds the `images` folder and `metadata.json`.
TEST_PATH = Path("/home/ubuntu/data-1/nybg2020/test/")

In [9]:
! ls $TEST_PATH

images	metadata.json


In [10]:
import json

In [11]:
# Parse the test-split metadata; bytes that are not valid UTF-8 are dropped
# (errors="ignore") instead of raising.
with (TEST_PATH / "metadata.json").open(encoding="utf8", errors="ignore") as json_file:
    data = json.loads(json_file.read())

In [12]:
data.keys()

dict_keys(['images', 'info', 'licenses'])

In [13]:
# One row per entry of the metadata's "images" list.
df = pd.DataFrame(data["images"])

In [14]:
df.iloc[0]["file_name"]

'images/104/104891.jpg'

In [15]:
df.head()

Unnamed: 0,file_name,height,id,license,width
0,images/104/104891.jpg,1000,104891,1,661
1,images/018/18029.jpg,1000,18029,1,661
2,images/035/35151.jpg,1000,35151,1,661
3,images/124/124144.jpg,1000,124144,1,682
4,images/024/24649.jpg,1000,24649,1,682


In [16]:
# Dataset over the test images, each resized to 246 x 246.
ds = HerbInferenceDataset(
    data_frame=df,
    location=TEST_PATH,
    size=246,
    x_name="file_name",
)

In [17]:
# shuffle=False keeps batches in dataframe row order.
data_loader = DataLoader(
    dataset=ds,
    batch_size=128,
    shuffle=False,
)

In [18]:
# load model

In [19]:
! ls /home/ubuntu/data-1/nybg2020/train/models/

model-32211.pth  model-64422.pth  model-96633.pth


In [20]:
# Checkpoint to load: the highest-numbered file in the models directory listing.
model_path = "/home/ubuntu/data-1/nybg2020/train/models/model-96633.pth"

In [21]:
import torchvision.models as models
import torch
import torch.nn as nn

In [22]:
# ResNet-34 without pretrained weights; the weights come from the checkpoint.
base = models.resnet34(pretrained=False)
# Replace the stock final layer with a 32094-output linear head. The input
# width is read from the layer being replaced (512 for resnet34).
head = nn.Linear(in_features=base.fc.in_features, out_features=32094)
base.fc = head


In [23]:
# Deserialize onto the CPU so loading does not depend on which GPU the
# checkpoint was saved from; the model itself is moved to the GPU afterwards.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
base.load_state_dict(torch.load(model_path, map_location="cpu"))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [24]:
# Move the network to the GPU.
base.cuda()
# Switch the network that is actually used for prediction to inference mode.
# The previous `model.eval()` referred to a name this notebook never defines,
# so it only worked with leftover kernel state and fails on a fresh run.
base.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Co

In [25]:
## Submission

In [26]:
# Softmax over dimension 1 of the model output.
smax = torch.nn.Softmax(dim=1)

In [27]:
from tqdm import tqdm

In [28]:
# Collects one array of predicted class indices per batch.
res = list()

In [29]:
# Start from an empty list so re-running this cell cannot append a second
# copy of the predictions.
res = []
# Guard: make sure the BatchNorm layers are in inference mode before predicting.
base.eval()
# Inference only, so skip autograd bookkeeping (less memory, faster).
with torch.no_grad():
    for batch in tqdm(data_loader):
        pred = smax(base(batch))
        # Index of the highest-probability class for every row of the batch.
        final = np.argmax(pred.cpu().numpy(), axis=1)
        res.append(final)


100%|██████████| 1081/1081 [34:13<00:00,  1.56s/it]


In [41]:
# Flatten the per-batch prediction arrays into one list, keeping batch order.
flat_list = []
for sublist in res:
    flat_list.extend(sublist)

In [42]:
# All predictions as a single 1-D array.
res_flat = np.asarray(flat_list)

In [44]:
res_flat.shape

(138292,)

In [40]:
res_flat.shape

(1081,)

In [45]:
# Attach predictions by position: this relies on the loader having been built
# with shuffle=False, so prediction i belongs to dataframe row i.
# Note this mutates `df` in place (the same object the dataset holds).
df["Predicted"] = res_flat

In [47]:
# Keep only the two columns that go into the submission file.
sub = df.loc[:, ["id", "Predicted"]]

In [48]:
# Output path of the submission CSV (passed to the kaggle CLI further down).
subm_file = "/home/ubuntu/data-1/sub-1"

In [49]:
# index=False: write only the dataframe's columns, without the row index.
sub.to_csv(subm_file, index=False)

In [51]:
! kaggle competitions submit herbarium-2020-fgvc7 -f $subm_file -m "first sub"

100%|███████████████████████████████████████| 1.56M/1.56M [00:02<00:00, 654kB/s]
Successfully submitted to Herbarium 2020 - FGVC7