In [None]:
import pandas as pd
from efficientnet_pytorch import EfficientNet
import numpy as np
import pandas as pd
import torch
from torchvision import transforms
torch.cuda.empty_cache()

import glob

from sklearn.metrics.pairwise import cosine_similarity

import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torchvision import transforms
torch.cuda.empty_cache()
import boto3
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# AWS credentials are intentionally NOT set here. boto3 (and the s3:// reader
# used by pandas) resolve them through the standard credential chain:
# the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables,
# the shared credentials file, or an attached IAM role.
# Writing literal values into os.environ would overwrite valid credentials
# from the environment and risks committing secrets with the notebook.

# Shared S3 handle used by read_image_from_s3.
s3 = boto3.resource('s3')

def read_image_from_s3(key):
    """Download one object from the project bucket and decode it as an RGB image.

    Args:
        key: Object key inside the 'masterarbeit125255aa' bucket.

    Returns:
        The decoded image as a 3-channel numpy array in RGB channel order.

    Raises:
        ValueError: If the downloaded bytes cannot be decoded as an image.
    """
    bucket = s3.Bucket('masterarbeit125255aa')
    raw = bucket.Object(key).get()['Body'].read()
    # imdecode expects an IMREAD_* flag. The previous COLOR_BGR2RGB constant is
    # a colour-conversion code, not a read flag. IMREAD_COLOR always yields a
    # 3-channel BGR image, which the conversion below requires.
    img = cv2.imdecode(np.frombuffer(raw, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        # imdecode signals failure by returning None instead of raising.
        raise ValueError(f'could not decode image at S3 key {key!r}')
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


def read_csv_from_s3(stage):
    """Load the CSV that belongs to the given stage from S3.

    Args:
        stage: One of 'public', 'private', 'public_private' or 'index'.

    Returns:
        The CSV contents as a pandas DataFrame.

    Raises:
        ValueError: If ``stage`` is not one of the supported names.
    """
    csv_by_stage = {
        'public': 's3://masterarbeit125255aa/data/train/public_tianyi.csv',
        'private': 's3://masterarbeit125255aa/data/train/private_tianyi.csv',
        'public_private': 's3://masterarbeit125255aa/data/train/public_private.csv',
        'index': 's3://masterarbeit125255aa/data/train/index_tianyi.csv',
    }
    if stage not in csv_by_stage:
        raise ValueError(f'not supported stage{stage}')
    return pd.read_csv(csv_by_stage[stage])

class SiameseGLDV2(Dataset):
    """Dataset that serves images listed in a per-stage CSV stored on S3.

    The CSV must provide a ``landmark_id`` column and an ``anchor`` column
    holding backslash-separated paths. Components 1 and 2 of that path are used
    as the class folder and the file name of the image.
    """

    def __init__(self, stage: str):
        """Load the CSV for ``stage`` and prepare the tensor transform.

        Args:
            stage: Stage name understood by ``read_csv_from_s3``.
        """
        frame = read_csv_from_s3(stage)
        # Drop columns whose name matches "Unname" without mutating in place.
        self.df = frame.drop(columns=frame.filter(regex="Unname").columns)
        self.label_list = self.df.landmark_id.tolist()
        self.namelist = [i.split('\\')[-1] for i in self.df.anchor.tolist()]
        self.s3 = boto3.resource('s3')
        # Index images and query images live under different S3 prefixes.
        if stage == 'index':
            self.s3path = 'data/train/train_compress'
        else:
            self.s3path = 'data/test'
        self.my_transformer = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Return the number of rows (images) in the dataset."""
        # Added so that len(dataset) works, which samplers and loaders need.
        return len(self.df)

    def __getitem__(self, index):
        """Fetch one sample by positional index.

        Returns:
            A tuple ``(label, transformed_image, raw_image)``: the row's
            ``landmark_id`` value, the normalised tensor, and the image as
            returned by ``read_image_from_s3``.
        """
        row = self.df.iloc[index]
        label = row['landmark_id']
        anchor = row['anchor'].split('\\')
        anchor_class = anchor[1]
        anchor_filen = anchor[2]
        anchor_image = self.s3path + '/' + anchor_class + '/' + anchor_filen
        anchor_im = read_image_from_s3(anchor_image)
        transformed_anchor_im = self.my_transformer(anchor_im)
        return label, transformed_anchor_im, anchor_im

# Global feature extraction for the index images starts here

In [None]:
BATCH_SIZE = 500


def batch(iterable, n=BATCH_SIZE, target_device=None):
    """Yield the dataset's transformed images as stacked tensors of up to ``n`` items.

    Args:
        iterable: Dataset-like object exposing a ``df`` attribute (its length
            is the number of samples) and integer indexing that returns a
            ``(label, tensor, raw_image)`` tuple.
        n: Maximum number of images per yielded batch; must be positive.
        target_device: Device the batch is moved to. When omitted, the
            notebook-level ``device`` variable is used.

    Yields:
        One tensor per batch with the per-image tensors stacked along a new
        leading dimension.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError(f'batch size must be positive, got {n}')
    if target_device is None:
        # Resolved at call time, so the notebook may define `device` in a later cell.
        target_device = device
    total = len(iterable.df)
    for start in range(0, total, n):
        stop = min(start + n, total)
        # Only the transformed tensor (element 1) is needed for inference.
        # NOTE(review): stacking requires all images to share one shape — confirm
        # that the stored images are uniformly sized.
        tensors = [iterable[i][1] for i in range(start, stop)]
        yield torch.stack(tensors).to(target_device)

def extract_global_features(model, dataset, stage='index'):
    """Run the model over the dataset and save one .npy file of embeddings per batch.

    Files are written to ``SAVE_PATH/<stage>/NNN.npy`` where NNN is the
    zero-padded batch number. Each file holds a 2-D array with one flattened
    feature vector per image.

    Args:
        model: Object exposing ``extract_features(batch_tensor)``.
        dataset: Dataset passed on to ``batch``.
        stage: Sub-directory name under ``SAVE_PATH`` for the output files.
    """
    out_dir = os.path.join(SAVE_PATH, stage)
    # Create the target directory up front; writing would otherwise fail on a
    # fresh checkout where it does not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    with torch.inference_mode():
        for batch_idx, batch_imgs in enumerate(batch(dataset)):
            print(f'current batch is {batch_idx}')
            embed = model.extract_features(batch_imgs)
            embed = embed.cpu().detach().numpy()
            # Flatten each image's feature map into a single vector.
            embed_np = embed.reshape(embed.shape[0], -1)
            out_file = os.path.join(out_dir, str(batch_idx).zfill(3) + '.npy')
            with open(out_file, 'wb') as f:
                np.save(f, embed_np)

In [None]:
SAVE_PATH = 'out/'

# Use the GPU when present, but keep the notebook runnable on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Make sure both output folders exist before any embeddings are written.
for stage_dir in ('index', 'test'):
    os.makedirs(os.path.join(SAVE_PATH, stage_dir), exist_ok=True)

index_dataset = SiameseGLDV2('index')
test_dataset = SiameseGLDV2('public_private')

model = EfficientNet.from_pretrained("efficientnet-b0").to(device)
model.eval()  # inference only: switch the model to evaluation mode
extract_global_features(model, index_dataset, 'index')
# Query embeddings (public + private split) are stored under the 'test' folder.
extract_global_features(model, test_dataset, 'test')

## 000.npy - 199.npy

In [None]:
def map_at_k(y_true, y_denominator, y_pred):
    """Compute mean average precision over the k predictions of each query.

    Args:
        y_true: 2-D array ``(n_queries, n_labels)`` of ground-truth labels.
            Every value in a row counts as a valid label for that query,
            including any padding values the caller put there.
        y_denominator: Per-query divisor for the precision sum (array-like of
            length ``n_queries``, or a scalar).
        y_pred: 2-D array ``(n_queries, k)`` of predicted labels, best first.

    Returns:
        A tuple ``(mean_ap, ap_per_query)``; each per-query value is capped at 1.
    """
    assert isinstance(y_true, np.ndarray) and isinstance(y_pred, np.ndarray)
    assert y_true.ndim == 2 and y_pred.ndim == 2

    k = y_pred.shape[1]

    # hits[q, j] is True when prediction j of query q equals any true label of
    # q. Broadcasting compares every label with every prediction in one step.
    hits = (y_true[:, :, np.newaxis] == y_pred[:, np.newaxis, :]).any(axis=1)

    # Running hit count per rank divided by the rank gives precision@rank.
    hits_so_far = np.cumsum(hits, axis=1)
    ranks = np.arange(1, k + 1)[np.newaxis, :]
    ap_100_list = np.sum((hits_so_far / ranks) * hits, axis=1) / y_denominator
    # Cap at 1 in case the supplied denominator is smaller than the hit count.
    ap_100_list = np.minimum(ap_100_list, 1)

    return np.mean(ap_100_list), ap_100_list

def list_to_array(x):
    """Parse label strings into a zero-padded 2-D integer array.

    Args:
        x: Iterable of label strings as accepted by ``string_to_list``.

    Returns:
        Integer array of shape ``(len(x), longest_row)``. Rows with fewer
        labels are right-padded with 0. Empty input gives shape ``(0, 0)``.
    """
    rows = string_to_list(x)
    width = max((len(labels) for labels in rows), default=0)
    # Fill a pre-sized array directly instead of concatenating one DataFrame
    # per row, which is slow and raises on empty input.
    padded = np.zeros((len(rows), width), dtype=int)
    for row_no, labels in enumerate(rows):
        padded[row_no, :len(labels)] = labels
    return padded

def string_to_list(x):
    """Convert whitespace-separated label strings into lists of ints.

    Args:
        x: Iterable of entries such as ``'12 345'``. Non-string entries (for
            example a bare int) are converted with ``str`` first.

    Returns:
        A list with one list of ints per entry.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    # split() without an argument tolerates repeated, leading and trailing
    # whitespace; split(' ') would produce empty tokens that int() rejects.
    return [[int(token) for token in str(entry).split()] for entry in x]

def index_to_label(index_array, label_list):
    """Translate rows of positions into rows of labels.

    Args:
        index_array: Sequence of numpy arrays of positions into ``label_list``;
            each row is cast to int before the lookup.
        label_list: Numpy array of labels addressed by those positions.

    Returns:
        Numpy array holding, for every row, the labels at the given positions.
    """
    looked_up = [label_list[row.astype(int)] for row in index_array]
    return np.array(looked_up)

def comt_denominater(y_true, y_pred):
    """Count, per query, how many predictions match one of its true labels.

    For each row the occurrences of every true label among that row's
    predictions are summed; a label listed twice in ``y_true`` is counted
    twice. Rows without any match get 1 so the value is safe as a divisor.

    Args:
        y_true: Per-query sequences of true labels.
        y_pred: Per-query sequences of predicted labels, indexed like ``y_true``.

    Returns:
        Numpy array with one count (at least 1) per query.
    """
    totals = []
    for row_no, truths in enumerate(y_true):
        predictions = list(y_pred[row_no])
        totals.append(sum(predictions.count(label) for label in truths))
    counts = np.array(totals)
    # Replace zero counts by 1 so later division never hits zero.
    return np.where(counts == 0, 1, counts)

In [None]:
BATCH_SIZE = 500


class NN:
    """Brute-force cosine-similarity search that accumulates scores chunk by chunk.

    Typical use: for each chunk of index vectors call ``add_item`` followed by
    ``search``; afterwards ``get_top_k`` ranks all index rows seen so far for
    every query.
    """

    def __init__(self, batch_size=BATCH_SIZE, dim=512):
        """Create an empty searcher.

        Args:
            batch_size: Number of query vectors that will be passed to
                ``search``; fixes the column count of the score matrix.
            dim: Feature dimension of the initial (empty) index array.
        """
        self.array = np.empty((0, dim))
        self.id = []
        # One row per index vector seen so far, one column per query.
        self.score = np.empty((0, batch_size))

    def add_item(self, item):
        """Set the current index chunk (this replaces any previous chunk).

        A 1-D vector is promoted to a single-row matrix.
        """
        if item.ndim == 1:
            item = item[np.newaxis, :]
        self.array = item

    def search(self, search_item, top_k=100):
        """Score the current chunk against the queries and append to ``score``.

        Args:
            search_item: 2-D array of query vectors.
            top_k: Number of best rows of the current chunk kept in ``rank``.

        Returns:
            ``self.rank``: positions within the current chunk, best first,
            ranked by similarity to the FIRST query only.
        """
        self.res = cosine_similarity(self.array, search_item)
        self.score = np.concatenate((self.score, self.res), axis=0)
        self.rank = self.res[:, 0].argsort()[::-1][:top_k]
        # Returned for convenience; callers that ignore it are unaffected.
        return self.rank

    def update(self):
        """Reorder the current chunk (and ids, if any) by the last ``rank``."""
        self.array = np.asarray(self.array)[self.rank]
        # Nothing in this class fills `id`; only reorder it when it has been
        # populated, otherwise indexing the empty list raises IndexError.
        if self.id:
            self.id = [self.id[i] for i in self.rank]

    def get_top_k(self, top_k=100):
        """Return, per query column, the row positions of the ``top_k`` highest scores.

        The result has shape ``(top_k, n_queries)``; positions refer to rows of
        the accumulated ``score`` matrix.
        """
        return np.argsort(-self.score, axis=0)[:top_k, :]

In [None]:
index_dataset = SiameseGLDV2('index')
test_dataset = SiameseGLDV2('public_private')

npy_list_index = sorted(glob.glob('out/index/*.npy'))
npy_list_test = sorted(glob.glob('out/test/*.npy'))
images = []
score_list = []
# Collected across ALL query batches. This used to be reset inside the loop,
# so the final statistics only covered the last batch.
aps = []

# Labels of the index images, converted once instead of once per query batch.
index_labels = np.array(index_dataset.label_list)

for ind, npy_test_path in enumerate(tqdm(npy_list_test)):
    npy_test = np.load(npy_test_path)
    indexer = NN(dim=1280*7*7, batch_size=npy_test.shape[0])
    # Score this query batch against every index chunk; scores are accumulated
    # row-wise inside the indexer in file order.
    for npy_index_path in tqdm(npy_list_index):
        indexer.add_item(np.load(npy_index_path))
        indexer.search(npy_test)
    top_k_array = indexer.get_top_k()

    # Labels of the queries in this file. BATCH_SIZE must equal the batch size
    # used when the embeddings were written. Slicing clamps at the list end.
    y_true = list_to_array(test_dataset.label_list[ind*BATCH_SIZE:(ind+1)*BATCH_SIZE])
    y_pred = index_to_label(top_k_array.T, index_labels)
    map_100, ap_list = map_at_k(y_true=y_true,
                                y_denominator=comt_denominater(y_true, y_pred),
                                y_pred=y_pred)

    aps.extend(ap_list)
    # Best similarity per query: column-wise maximum of the score matrix.
    score_list.extend(indexer.score.max(axis=0).tolist())

In [None]:
# Index positions of the best matches for the first query of the last batch.
first_query_top_ids = top_k_array[:, 0]
print(first_query_top_ids)

# Scores sorted in descending order per query; show the top 100 of the first query.
scores_descending = -np.sort(-indexer.score, axis=0)
print(scores_descending[:100, 0])

In [None]:
Top-k index images for the first query image
[14504 47536 18644 43736 75480 14519 81534 73586 70895 72316 18623 83840
 13294 82929 21267 14500 74674 27915 81898 35572 69159 83546 63602 33261
 58871 10561 57898  8168  1175 77246 27914 75431 35857 55351  2635 38369
 32829 71297 57583 78443 49462 29161 15876 49800 43737 78856 23063 38355
 79096 12883 27126 65356 23695  9800 79094 39211  9216 34398 21268 77864
 78652 75433 73588 68733 13445 88561 56412 96758 38363 77410 24919 61989
 72317 22326 11865 18355 42789 60493 45976  1459 14739 49801 33206 71841
 52752 35784 26040 28580 16611 27259 85051 38371 61988 63137 69141 47534
 93836 80475 54062 55326]
Cosine similarity
[0.42445731 0.42254734 0.4198465  0.40426522 0.39979428 0.39052811
 0.38588095 0.38452125 0.36647618 0.36147875 0.35442877 0.35082251
 0.34764612 0.34743255 0.34553725 0.3427453  0.34068626 0.33792084
 0.33552909 0.33410934 0.33327612 0.33082718 0.3260178  0.32338801
 0.31944689 0.31682131 0.31613293 0.31523979 0.31228456 0.31199259
 0.31029612 0.30850211 0.30778471 0.30773336 0.3046695  0.30431691
 0.30350095 0.30320835 0.30281579 0.30151001 0.30040783 0.30040491
 0.29998979 0.29889715 0.29859236 0.29551837 0.29544222 0.29460022
 0.29416415 0.29407382 0.29201961 0.29159325 0.29109785 0.29054362
 0.28978673 0.28872064 0.28865123 0.28807366 0.28637445 0.28631157
 0.28621984 0.28541684 0.28477892 0.28386804 0.28284135 0.28166395
 0.28079492 0.28076956 0.28013963 0.27968758 0.27917382 0.27838394
 0.27830783 0.277899   0.27706721 0.27689159 0.27631074 0.27618903
 0.27558699 0.2741988  0.27390873 0.27379748 0.27373117 0.27301171
 0.27196723 0.27196717 0.27176923 0.27165937 0.27116722 0.27068016
 0.26954094 0.26909375 0.26893041 0.26876268 0.26841876 0.26837063
 0.26766989 0.26755553 0.2673935  0.26726878]

In [None]:
# Show only a short preview; printing the whole per-query list floods the
# cell output without adding information.
print('Best-match similarity per query (first 10 of {}): {}'.format(len(score_list), score_list[:10]))
print('The map@100 is : {}'.format(np.mean(aps)))
print('The max ap@100 is : {}'.format(np.max(aps)))
print('The min ap@100 is : {}'.format(np.min(aps)))
print('The max cos similarity is : {}'.format(max(score_list)))

In [None]:
The map@100 is : 0.22316579233875491
The max ap@100 is : 1.0
The min ap@100 is : 0.0
The max cos similarity is : 0.8267913460731506