In [1]:
# Put the parent of the current working directory on sys.path so that
# modules located there can be imported from this notebook.
import os
import sys
nb_dir = os.path.dirname(os.getcwd())
# Guard against duplicates so re-running the cell leaves sys.path unchanged.
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [2]:
from torchvision import transforms

In [3]:
from pathlib import Path

In [4]:
import pandas as pd

In [5]:
from torch.utils.data import Dataset, DataLoader, TensorDataset


In [6]:
import numpy as np
from PIL import Image

In [7]:
class HerbInferenceDataset(Dataset):
    """Label-free dataset that turns rows of a dataframe into normalised image tensors.

    Each item is read from ``location/<file name>``, resized to ``size x size``,
    scaled to [0, 1], normalised per channel and moved to the GPU.
    """

    def __init__(self, data_frame, location, size, x_name="file_name"):
        """

        Args:
            data_frame (Dataframe): Dataframe with one row per image
            location (str): data location directory
            size (int): edge length in pixels; images are resized to (size, size)
            x_name (str): column name of file
        """
        self.df = data_frame
        self.location = location
        self.size = (size, size)
        self.x_name = x_name
        self.normalise = transforms.Normalize(mean=[0.485, 0.456, 0.406],  # Taken from torchvision
                                              std=[0.229, 0.224, 0.225])

    def __len__(self):
        """Number of images, i.e. number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image of row ``idx`` (positional) and return it as a float tensor on the GPU.

        Returns:
            torch.Tensor: float tensor of shape (3, size, size)
        """
        item = self.df.iloc[idx]
        image_path = "{}/{}".format(self.location, item[self.x_name])
        with Image.open(image_path) as raw:
            # Force three channels: greyscale, RGBA or CMYK files would otherwise
            # not have 3 * size * size values and the reshape below would raise.
            resized = raw.convert("RGB").resize(self.size)
        # NOTE(review): the array is (H, W, C); reshape reinterprets the buffer as
        # (3, H, W) without moving the channel axis (np.transpose(2, 0, 1) would).
        # Left unchanged because the checkpoint has to see the same preprocessing
        # it was trained with — confirm against the training dataset before changing.
        image = np.array(resized).reshape((3, *self.size))
        scaled = torch.tensor(image/255, dtype=torch.float)
        # The consuming loop passes batches straight to the model, so the tensor
        # is placed on the GPU here.
        image_tensor = self.normalise(scaled).cuda()
        return image_tensor

In [8]:
# Dataloader

In [9]:
# Root directory of the test split (the listing below shows images/ and metadata.json in it).
# NOTE(review): machine-specific absolute path; the commented line is the path on another machine.
# TEST_PATH = Path("/home/ubuntu/data-1/nybg2020/test/")
TEST_PATH = Path("/home/shaun/personal/kaggle-data/nybg2020/test/")


In [10]:
! ls $TEST_PATH

ds1_1.npy  ds1_2.npy  ds2_1.npy  ds2_2.npy  images  metadata.json


In [11]:
import json

In [12]:
# Parse the test-set metadata. errors='ignore' drops bytes that are not valid
# UTF-8 instead of raising while decoding.
metadata_path = TEST_PATH / "metadata.json"
with metadata_path.open(encoding="utf8", errors='ignore') as json_file:
    data = json.load(json_file)

In [13]:
data.keys()

dict_keys(['images', 'info', 'licenses'])

In [14]:
# One row per entry of data["images"]; the preview below shows the columns
# file_name, height, id, license and width.
df =  pd.DataFrame.from_dict(data["images"])

In [15]:
df.iloc[0]["file_name"]

'images/104/104891.jpg'

In [16]:
df.head()

Unnamed: 0,file_name,height,id,license,width
0,images/104/104891.jpg,1000,104891,1,661
1,images/018/18029.jpg,1000,18029,1,661
2,images/035/35151.jpg,1000,35151,1,661
3,images/124/124144.jpg,1000,124144,1,682
4,images/024/24649.jpg,1000,24649,1,682


In [17]:
# Images are resized to 256x256; file_name values (e.g. 'images/104/104891.jpg')
# are resolved relative to TEST_PATH.
ds = HerbInferenceDataset(df,TEST_PATH, 256, x_name="file_name")

In [18]:
# shuffle=False keeps the batches in df row order; the predictions are later
# written back to df positionally, so this order must not change.
data_loader = DataLoader(ds, batch_size=8, shuffle=False)

In [19]:
# load model

In [20]:
! ls /home/shaun/personal/kaggle-data/nybg2020/models-1

model-114720.pth  model-50000.pth  model-81000.pth
model-29000.pth   model-61000.pth


In [21]:
# Checkpoint evaluated for this submission (one of the files listed above).
model_path = "/home/shaun/personal/kaggle-data/nybg2020/models-1/model-81000.pth"

In [22]:
import torchvision.models as models
import torch
import torch.nn as nn
import model

In [23]:
# pretrained=False: the backbone starts without pretrained weights; the trained
# weights are loaded from the checkpoint in the next cell.
# NOTE(review): p1/p2 look like dropout probabilities — confirm in model.get_model.
m = model.get_model(models.resnet50(pretrained=False), p1=0, p2=0, device="gpu") # no dropout (original note: trying to overfit)

In [24]:
# Restore the trained weights; the recorded output below reports no missing
# and no unexpected keys, i.e. the checkpoint matches the model built above.
m.load_state_dict(torch.load(model_path))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [25]:
# Put the model into evaluation mode before running inference.
m.eval()
# Trailing literal: the cell displays `1` — presumably used to avoid echoing
# the value returned by m.eval().
1

1

In [26]:
## Submission

In [27]:
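# Softmax is intentionally not applied: the class is picked with argmax over the
# raw model outputs, and softmax would not change which entry is largest.
# smax = nn.Softmax(dim=1).cuda()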

In [28]:
from tqdm import tqdm

In [29]:
# Collects one predicted class index per test image, in data_loader order.
res =  []

In [30]:
# Start from an empty list so that re-running this cell cannot append a second
# copy of the predictions to the ones already collected.
res = []
# Inference only: no_grad() skips building the autograd graph for every batch.
with torch.no_grad():
    for batch in tqdm(data_loader):
        pred = m(batch)
        # Index of the highest score in each row of the batch.
        final = np.argmax(pred.cpu().detach().numpy(), axis=1)
        res.extend(final)


100%|██████████| 17287/17287 [25:58<00:00, 11.83it/s]


In [31]:
# JSON mapping between submission labels (keys) and the class indices the model
# outputs (values); it is inverted in the next cell to translate predictions.
class_map_path =  "/home/shaun/personal/kaggle-data/nybg2020/classes-map.json"
with open(class_map_path) as f:
    class_map = json.load(f)

In [32]:
# Invert the mapping: model output index -> submission label.
# NOTE(review): assumes the values of class_map are unique; duplicates would be
# silently collapsed to the last key seen.
classes_map_key = {v: k for k,v in class_map.items()}

In [33]:
# Translate every predicted index into its submission label, keeping the order of res.
final_preds = [classes_map_key[i] for i in res]

In [34]:
# Sanity check: there must be exactly one prediction per metadata row. Compare
# against len(df) rather than a hard-coded row count so the check follows the data.
assert len(final_preds) == len(df), "number of predictions does not match number of images"
len(final_preds), len(df)

(138292, 138292)

In [35]:
# Positional assignment: final_preds[i] is attached to row i of df, which is
# valid because the loader iterated df in order (shuffle=False).
df["Predicted"] = final_preds

In [36]:
# Keep only the two columns written to the submission file.
sub = df[["id", "Predicted"]]

In [37]:
# Output path of this submission; note the file name carries no .csv extension.
subm_file = "/home/shaun/personal/kaggle-data/nybg2020/sub-12"

In [38]:
# index=False: write only the id and Predicted columns, without the dataframe index.
sub.to_csv(subm_file, index=False)

In [39]:
! kaggle competitions submit herbarium-2020-fgvc7 -f $subm_file -m "no softmax, normalise input, training run9 model-81000.pth"

100%|███████████████████████████████████████| 1.56M/1.56M [00:10<00:00, 150kB/s]
Successfully submitted to Herbarium 2020 - FGVC7