# NetVLAD Tutorial
아래 코드는 https://github.com/Nanne/pytorch-NetVlad 와, https://github.com/Relja/netvlad 를 검토하여 작성했습니다. 

0. Header 

1. Wholedata dataset load
  - positive_within_thr : Test/Eval 용
  - nontrivial_positive : Train용
  - potential_negatives 

2. define Network  

3. First Run  
  - model.eval()+torch.no_grad()
  - store feature_vector of images (query, db)

4. TripletDataset mining
  - set triplet
     --collate_fn (query, positive, negative)

5. Train Epoch, Feature extraction, train network
   - model.train()

6. Back to #.4 . update triplet, especially feature_vector. 
   do loop until epoch is over

7. test / eval


# Header / Preprocessing
 - 동작을 위한 라이브러리
 - 데이터 경로 및 파일 읽기


In [1]:
#Torch Library
import torch
import timm
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data
from torch.utils.data import DataLoader, SubsetRandomSampler
from torch.utils.data.dataset import Subset
import torchvision.models as models

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

#Standard Library
import os
from PIL import Image
import matplotlib.pyplot as plt
from scipy.io import loadmat
import numpy as np
from collections import namedtuple
import random

# Vector library
import sklearn
from sklearn.neighbors import NearestNeighbors

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Dataset locations (relative paths).
root_dir = './data/Pittsburgh250k/'  # dataset root directory
struct_dir = os.path.join(root_dir, 'netvlad_v100_datasets/datasets/')  # directory holding the .mat struct files
structFile = 'pitts250k_train.mat'  # name of the struct file to load
queries_dir = os.path.join(root_dir, 'queries_real/')  # presumably the query image directory — confirm against the dataset layout


def parse_dbStruct(structfile, struct_dir):
    """Read a ``dbStruct`` .mat file and repackage it as a ``dbStruct`` tuple.

    Args:
        structfile: File name of the .mat file. It is also stored verbatim in
            the ``dataset`` field of the result.
        struct_dir: Directory containing ``structfile``.

    Returns:
        A ``dbStruct`` namedtuple holding the split name, image name lists,
        transposed UTM coordinate arrays, counts and distance thresholds.

    Side effects:
        Prints the number of struct fields and the first entry of each field
        (debugging aid).
    """
    mat_path = os.path.join(struct_dir, structfile)
    fields = loadmat(mat_path)['dbStruct'].item()

    # Debug output: field count, then the leading entry of every field.
    print(len(fields))
    heads = [field[0] for field in fields]
    for pos, head in enumerate(heads):
        print(f"matStruct[{pos}] :{head}")

    return dbStruct(
        whichSet=fields[0].item(),
        dataset=structfile,  # the file name doubles as the dataset name
        dbImage=[entry[0].item() for entry in fields[1]],  # database image names
        utmDb=fields[2].T,
        qImage=[entry[0].item() for entry in fields[3]],  # query image names
        utmQ=fields[4].T,
        numDb=fields[5].item(),
        numQ=fields[6].item(),
        posDistThr=fields[7].item(),
        posDistSqThr=fields[8].item(),
        nonTrivPosDistSqThr=fields[9].item(),
    )

# Record type produced by parse_dbStruct. `dataset` holds the struct file
# name; the remaining fields come from the .mat struct.
dbStruct = namedtuple(
    'dbStruct',
    [
        'whichSet',
        'dataset',
        'dbImage',
        'utmDb',
        'qImage',
        'utmQ',
        'numDb',
        'numQ',
        'posDistThr',
        'posDistSqThr',
        'nonTrivPosDistSqThr',
    ],
)

# Smoke test: parse the struct file and show the stored dataset name.
test = parse_dbStruct('pitts250k_train.mat', struct_dir)
print(test.dataset)

10
matStruct[0] :train
matStruct[1] :[array(['000/000021_pitch1_yaw1.jpg'], dtype='<U26')]
matStruct[2] :[585323.61300252 585323.61300252 585323.61300252 ... 584463.92173637
 584463.92173637 584463.92173637]
matStruct[3] :[array(['001/001381_pitch1_yaw1.jpg'], dtype='<U26')]
matStruct[4] :[585089.36032141 585089.36032141 585089.36032141 ... 584861.3359102
 584861.3359102  584861.3359102 ]
matStruct[5] :[91464]
matStruct[6] :[7824]
matStruct[7] :[25]
matStruct[8] :[625]
matStruct[9] :[100]
pitts250k_train.mat


In [13]:
#DataLoader 를 instantiation할때 사용할 transform 선언
def input_transform():
    """Build the image preprocessing pipeline used when loading images.

    Returns:
        A ``transforms.Compose`` that converts an image to a tensor and then
        normalizes each of the three channels with fixed mean/std values.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    steps = [
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)


In [None]:
# 이 데이터셋 클래스 하나로, triplet을 구성하기 위한 초기 분류도 함께 수행한다. 
class WholeDatset(data.Dataset):
    """Dataset over every database (and optionally query) image of one split.

    Besides serving images by index, the constructor derives, from the UTM
    coordinates in the struct file, the per-query index lists used to build
    triplets:

    * ``pos_within_Thr``: db indices within ``posDistThr`` of each query
      (close ones included).
    * ``nontrivial_pos``: the subset of ``pos_within_Thr`` whose squared
      distance is greater than ``nonTrivPosDistSqThr``.
    * ``potential_negatives``: every db index not in ``pos_within_Thr``.
    * ``queries``: indices of queries with at least one nontrivial positive.

    ``__getitem__`` and ``__len__`` address ``self.images``: database images
    first, followed by the query images unless ``onlyDB`` is set.

    Args:
        rootPath: Directory the database image names are joined onto.
        stPath: Directory containing the struct (.mat) file.
        qPath: Directory the query image names are joined onto.
        structFile: File name of the struct (.mat) file.
        transform: Optional callable applied to each loaded image.
        onlyDB: When true, ``self.images`` contains database images only.
    """

    def __init__(self, rootPath, stPath, qPath, structFile, transform=None, onlyDB=False, ):
        super().__init__()
        self.rootPath = rootPath
        self.stPath = stPath
        self.qPath = qPath
        self.structFile = structFile
        self.input_transform = transform
        self.onlyDB = onlyDB

        # Parse the struct file describing this split.
        self.dbStruct = parse_dbStruct(self.structFile, self.stPath)
        db = self.dbStruct

        # Image list served by __getitem__: db images, then query images.
        self.images = [os.path.join(self.rootPath, dbIm) for dbIm in db.dbImage]
        if not onlyDB:
            # Query names are resolved against the query directory; joining
            # them onto rootPath would ignore the qPath argument entirely.
            self.images += [os.path.join(self.qPath, qIm) for qIm in db.qImage]

        self.whichSet = db.whichSet  # split name stored in the struct file
        self.dataset = db.dataset    # struct file name, used as dataset name

        self.positives = None   # not populated here
        self.distances = None   # not populated here

        print("utmDb shape:", db.utmDb.shape, "utmQ shape:", db.utmQ.shape)

        # Radius search over the db coordinates (default metric: Euclidean).
        knn = NearestNeighbors(n_jobs=8)
        knn.fit(db.utmDb)
        pos_dist, pos_idx = knn.radius_neighbors(
            db.utmQ,
            radius=db.posDistThr,
            return_distance=True,
        )

        # Every db index inside the positive radius, close ones included.
        self.pos_within_Thr = pos_idx

        # Keep only positives farther than the nontrivial threshold.
        # NOTE(review): the pytorch-NetVlad reference appears to define the
        # nontrivial positives as the db images *within*
        # sqrt(nonTrivPosDistSqThr) of the query — confirm which definition
        # is intended before training on these lists.
        self.nontrivial_pos = [
            idx[dist ** 2 > db.nonTrivPosDistSqThr]
            for dist, idx in zip(pos_dist, pos_idx)
        ]

        # Queries without any nontrivial positive cannot form a triplet.
        counts = np.array([len(x) for x in self.nontrivial_pos])
        self.queries = np.where(counts > 0)[0]

        # Negative candidates: all db indices minus those inside the radius.
        self.numDbIndex = np.arange(db.numDb)
        self.potential_negatives = [
            np.setdiff1d(self.numDbIndex, pos, assume_unique=True)
            for pos in self.pos_within_Thr
        ]

        # Debug summary.
        print(f"  numDb={self.dbStruct.numDb}, numQ={self.dbStruct.numQ}")
        print(f"  posDistThr={self.dbStruct.posDistThr} (m)")
        print(f"  nonTrivPosDistSqThr={self.dbStruct.nonTrivPosDistSqThr} (m^2)")
        print(f"  valid queries (nontrivial exists): {len(self.queries)}")

        # Show up to five random valid queries; capping the sample size keeps
        # small splits from raising inside np.random.choice.
        n_samples = min(5, len(self.queries))
        if n_samples:
            for i in np.random.choice(self.queries, n_samples, replace=False):
                print(f"Query : {i}, Nontrivial Pos : {len(self.nontrivial_pos[i])}, Negative : {len(self.potential_negatives[i])}")

    def __len__(self):
        """Return the number of images addressable through ``__getitem__``."""
        # Must match the sequence __getitem__ indexes; returning the number
        # of valid queries would make a DataLoader stop after the first few
        # database images.
        return len(self.images)

    def __getitem__(self, index):
        """Load image ``index`` and return ``(image, index)``.

        The image is passed through ``input_transform`` when one was given.
        """
        img = Image.open(self.images[index])

        if self.input_transform:
            img = self.input_transform(img)
        return img, index

    def getPositive(self):
        """Return, per query, the db indices within ``posDistThr``."""
        return self.pos_within_Thr

    def getNegative(self):
        """Return, per query, the db indices outside ``posDistThr``."""
        return self.potential_negatives

    def getNontrivialPositive(self):
        """Return, per query, the positives beyond the nontrivial threshold."""
        return self.nontrivial_pos

    def getValidQueries(self):
        """Return the indices of queries with at least one nontrivial positive."""
        return self.queries


In [35]:
# Instantiate the dataset for the training struct file; construction parses
# the struct file and prints its debug summary.
ds = WholeDatset(
    rootPath=root_dir,
    stPath=struct_dir,
    qPath=queries_dir,
    transform=input_transform(),
    structFile='pitts250k_train.mat',
)


10
matStruct[0] :train
matStruct[1] :[array(['000/000021_pitch1_yaw1.jpg'], dtype='<U26')]
matStruct[2] :[585323.61300252 585323.61300252 585323.61300252 ... 584463.92173637
 584463.92173637 584463.92173637]
matStruct[3] :[array(['001/001381_pitch1_yaw1.jpg'], dtype='<U26')]
matStruct[4] :[585089.36032141 585089.36032141 585089.36032141 ... 584861.3359102
 584861.3359102  584861.3359102 ]
matStruct[5] :[91464]
matStruct[6] :[7824]
matStruct[7] :[25]
matStruct[8] :[625]
matStruct[9] :[100]
utmDb shape: (91464, 2) utmQ shape: (7824, 2)
  numDb=91464, numQ=7824
  posDistThr=25 (m)
  nonTrivPosDistSqThr=100 (m^2)
  valid queries (nontrivial exists): 7824
Query : 3753, Nontrivial Pos : 96, Negative : 91320
Query : 3933, Nontrivial Pos : 48, Negative : 91368
Query : 6125, Nontrivial Pos : 72, Negative : 91368
Query : 4512, Nontrivial Pos : 48, Negative : 91368
Query : 3584, Nontrivial Pos : 48, Negative : 91368


# 2. Define Network

VGG16-Conv5 까지만 설정하고, 
이후 VLAD Layer를 연결한다. 

Network를 선언해야 feature 를 추출하고 triplet 데이터셋을 구성할 수 있다. 
Triplet 구성까지 테스트를 위해 VGG-Conv5-pooling 까지만 테스트로 구현해본다. 

참고자료 : https://stydy-sturdy.tistory.com/11  
참고자료 : https://www.digitalocean.com/community/tutorials/vgg-from-scratch-pytorch  
VGG16 논문자료 : https://arxiv.org/pdf/1409.1556  

![](2026-02-25-13-52-58.png)

![](2026-02-25-13-53-50.png)

![](2026-02-25-14-07-56.png)

In [38]:
# Sanity check: pull one batch from the dataset through a DataLoader.
batch_size = 64
feature_dataloader = DataLoader(ds,batch_size=batch_size)
print(len(feature_dataloader))  # number of batches
imgs, idx = next(iter(feature_dataloader))
print(type(imgs[0]), len(imgs))  # torch.Tensor, 64 (the transform already produced tensors)
print(type(idx[0]), len(idx))    # torch.Tensor, 64 (indices are collated into a tensor)
print(f"Shape of X: {imgs.shape}")
print(f"Shape of y : {idx.shape}")


1552
<class 'torch.Tensor'> 64
<class 'torch.Tensor'> 64
Shape of X: torch.Size([64, 3, 480, 640])
Shape of y : torch.Size([64])


In [None]:
# Pick the compute device: 'cuda' when CUDA is available, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)



cuda


In [None]:
class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU())
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(), 
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer5 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU())
        self.layer6 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU())
        self.layer7 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer8 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())
        self.layer9 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())
        self.layer10 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.layer11 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())
        self.layer12 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU())
        self.layer13 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2))
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(7*7*512, 4096),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())
        self.fc2= nn.Sequential(
            nn.Linear(4096, num_classes))