In [1]:
# Importing the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import torchvision
import torch
from sklearn.cluster import KMeans

plt.rcParams['figure.figsize'] = 15, 10

## Load SIFT dataset

In [2]:
def mmap_fvecs(fname):
    x = np.memmap(fname, dtype='int32', mode='r')
    d = x[0]
    return x.view('float32').reshape(-1, d + 1)[:, 1:]

def mmap_bvecs(fname):
    x = np.memmap(fname, dtype='uint8', mode='r')
    d = x[:4].view('int32')[0]
    return x.reshape(-1, d + 4)[:, 4:]

def ivecs_read(fname):
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    # Wenqi: Format of ground truth (for 10000 query vectors):
    #   1000(topK), [1000 ids]
    #   1000(topK), [1000 ids]
    #        ...     ...
    #   1000(topK), [1000 ids]
    # 10000 rows in total, 10000 * 1001 elements, 10000 * 1001 * 4 bytes
    return a.reshape(-1, d + 1)[:, 1:].copy()

def fvecs_read(fname):
    return ivecs_read(fname).view('float32')

In [3]:
dbname = 'SIFT1M'

if dbname.startswith('SIFT'):
    # SIFT1M to SIFT1000M
    dbsize = int(dbname[4:-1])
    xb = mmap_bvecs('/mnt/scratch/wenqi/Faiss_experiments/bigann/bigann_base.bvecs')
    xq = mmap_bvecs('/mnt/scratch/wenqi/Faiss_experiments/bigann/bigann_query.bvecs')
    gt = ivecs_read('/mnt/scratch/wenqi/Faiss_experiments/bigann/gnd/idx_%dM.ivecs' % dbsize)

    N_VEC = int(dbsize * 1000 * 1000)

    # trim xb to correct size
    xb = xb[:dbsize * 1000 * 1000]

    # Wenqi: load xq to main memory and reshape
    xq = xq.astype('float32').copy()
#     xq = np.array(xq, dtype=np.float32)
    xb = xb.astype('float32').copy()
    gt = np.array(gt, dtype=np.int32)

    print("Vector shapes:")
    print("Base vector xb: ", xb.shape)
    print("Query vector xq: ", xq.shape)
    print("Ground truth gt: ", gt.shape)
else:
    print('unknown dataset', dbname, file=sys.stderr)
    sys.exit(1)

dim = xb.shape[1] # should be 128
nq = xq.shape[0]

# Normalize all to 0~1
xb = xb / 256
xq = xq / 256

Vector shapes:
Base vector xb:  (1000000, 128)
Query vector xq:  (10000, 128)
Ground truth gt:  (10000, 1000)


In [4]:
# Using the first 10K vectors for training, just as the graph & IVF experiments
n_train = int(1e4)
xt = xb[:n_train]
print(xt.shape)

(10000, 128)


In [5]:
xb[0]

array([0.        , 0.        , 0.        , 0.00390625, 0.03125   ,
       0.02734375, 0.01171875, 0.0078125 , 0.01953125, 0.        ,
       0.        , 0.01171875, 0.01953125, 0.02734375, 0.04296875,
       0.12109375, 0.05078125, 0.        , 0.        , 0.        ,
       0.        , 0.11328125, 0.4140625 , 0.41796875, 0.05078125,
       0.        , 0.        , 0.        , 0.00390625, 0.23828125,
       0.2734375 , 0.1640625 , 0.        , 0.        , 0.        ,
       0.        , 0.00390625, 0.08984375, 0.109375  , 0.0625    ,
       0.24609375, 0.015625  , 0.        , 0.        , 0.        ,
       0.0234375 , 0.32421875, 0.31640625, 0.45703125, 0.3359375 ,
       0.09765625, 0.05859375, 0.06640625, 0.1953125 , 0.328125  ,
       0.45703125, 0.12109375, 0.08984375, 0.0703125 , 0.13671875,
       0.37890625, 0.45703125, 0.19140625, 0.09375   , 0.265625  ,
       0.10546875, 0.        , 0.        , 0.        , 0.015625  ,
       0.11328125, 0.27734375, 0.31640625, 0.18359375, 0.05078

## Model Declaration

Maybe I need to get rid of the sigmoid function, because there are a lot of 0s and 255s, and using sigmoid to get those values are very hard

In [6]:
# Creating a DeepAutoencoder class

# We partition the data to 32 shards (bottleneck dimension)
# The input data is 128-dimension
class DeepAutoencoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 32),
#             torch.nn.Sigmoid()
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(32, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 128),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Instantiating the model and hyperparameters
model = DeepAutoencoder()
# loss_func = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [7]:
# def loss_func(reconstructed_feat, input_feat, hidden_feat, centroid_vectors, partition_ids):
#     """
    
#     """
#     full_dim = reconstructed_feat.shape[1]
#     hidden_dim = hidden_feat.shape[1]
#     hidden_feat_factor = full_dim / hidden_dim
    
#     batch_size = reconstructed_feat.shape[0]
# #     assert batch_size == input_feat.shape[0] and batch_size == hidden_feat.shape[0] and \
# #         batch_size == partition_ids.shape[0]]
# #     assert hidden_dim == centroid_vectors.shape[1]

#     min_square_error = (reconstructed_feat - input_feat)**2
#     cluster_error = 0
#     for i in range(hidden_feat.shape[0]):
#         cluster_error += torch.sum((hidden_feat[i] - centroid_vectors[partition_ids[i]]) ** 2)
#     ave_cluster_error = hidden_feat_factor * cluster_error / batch_size / hidden_dim
#     loss = torch.mean(min_square_error) + ave_cluster_error
    
#     return loss

In [8]:
def loss_func(reconstructed_feat, input_feat, hidden_feat, centroid_vectors):
    """
    centroid vectors: same shape as hidden_feat
    """
    
    num_vec = reconstructed_feat.shape[0]
    full_dim = reconstructed_feat.shape[1]
    hidden_dim = hidden_feat.shape[1]
    num_partition = hidden_dim
    hidden_feat_factor = full_dim / hidden_dim
    
    reconstruct_dist = (reconstructed_feat - input_feat)**2
    min_square_error = torch.sum(reconstruct_dist) / num_vec
    
    cluster_error = 0
    for i in range(num_vec):
        dist_mat = (hidden_feature[i].repeat(num_partition, 1) - centroid_vectors) ** 2
        dist_to_each_cluster = torch.sum(dist_mat, dim=1)
        min_cluster_dist = torch.min(dist_to_each_cluster)
        cluster_error += min_cluster_dist
#         print('dist_mat', dist_mat)
#         print('dist_to_each_cluster', dist_to_each_cluster)
#         print('min_cluster_dist', min_cluster_dist)
#         print('cluster_error', cluster_error)
    
#     cluster_error = 
    ave_cluster_error = hidden_feat_factor * cluster_error / num_vec
    
    print('min_square_error', min_square_error)
    print('ave_cluster_error', ave_cluster_error)
    
    loss = min_square_error + ave_cluster_error
    
    return loss

In [9]:
model.parameters

<bound method Module.parameters of DeepAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=32, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=128, bias=True)
    (5): Sigmoid()
  )
)>

## Train the model

Model 1: no sigmoid in the middle, with sigmoid in the end; 3 layers
Init loss (sigmoid) = 0.06 (given batch size = 100, ~600)
loss after 100 epoch = 0.005 (~10% init)
loss after 1100 epoch = 0.00236 (~4% init)

Model 2: no sigmoid in the middle, no sigmoid in the end; 4 layers
Init loss = 492
loss after 100 epoch = 221 (~50% init)
loss after 600 epoch = 161
loss after 1100 epoch = 153 (switching batch size to 1000): 

Model 3: no sigmoid in the middle, with sigmoid in the end; 4 layers
Init loss (sigmoid) = 8
loss after 100 epoch = 0.500 (~10% init)
loss after 1100 epoch = 0.260 (~3% init)

Model 3: with sigmoid in the middle, with sigmoid in the end; 4 layers
Init loss (sigmoid) = 5.4
loss after 100 epoch = 0.57 (~10% init)
loss after 1000 epoch = 0.29 (~3% init)


Model 3: no sigmoid in the middle, no sigmoid in the end; 5 layers
Init loss (sigmoid) = 1.0
loss after 100 epoch = xx (~10% init)
loss after 1000 epoch = xx (~3% init)

In [10]:
def update_centroid_vectors(hidden_feature, centroid_vectors):
    """
    Input: 
      - hidden_feature (torch tensor): (num_vectors, num_partitions)
      - center_vectors: (num_partitions, num_partitions)
    
    Output (updated):
      - center_vectors (torch tensor): (num_partitions, num_partitions) 
          first dim: different centroid IDs
          second dim: an entire centroid vector
    """
    num_vectors = hidden_feature.shape[0]
    num_partition = hidden_feature.shape[1]
    
    min_cluster_ID_list = [] # len = num vectors 
    for i in range(num_vectors):
        dist_mat = (hidden_feature[i].repeat(32, 1) - centroid_vectors) ** 2
        dist_to_each_cluster = torch.sum(dist_mat, dim=1)
        min_cluster_ID = torch.argmin(dist_to_each_cluster)
        min_cluster_ID_list.append(int(min_cluster_ID))
    print('min_cluster_ID_list (first 10)', len(min_cluster_ID_list), min_cluster_ID_list[:10])
    
    num_vectors_per_partition = np.zeros(num_partition)
    new_centroid_vectors_sum = torch.zeros(num_partitions, num_partitions) 
    for vec_id in range(num_vectors):
        partition_ID = min_cluster_ID_list[vec_id]
        num_vectors_per_partition[partition_ID] += 1
        new_centroid_vectors_sum[partition_ID] += hidden_feature[vec_id]
        
#     print('new_centroid_vectors_sum', new_centroid_vectors_sum)
    new_centroid_vectors = torch.zeros(num_partitions, num_partitions) 
    for partition_id in range(num_partition):
        new_centroid_vectors[partition_id] = \
            new_centroid_vectors_sum[partition_id] / num_vectors_per_partition[partition_id] 
    
    return new_centroid_vectors

In [11]:
# def get_centroid_vectors(hidden_feature, partition_id_list):
#     """
#     Input: 
#       - hidden_feature (torch tensor): (num_vectors, num_partitions)
#       - ID list (array): size = vector number; vec ID -> partition ID
    
#     Output (updated):
#       - centroids info (torch tensor): (num_partitions, num_partitions) 
#           first dim: different centroid IDs
#           second dim: an entire centroid vector
#     """
#     num_vectors = len(partition_id_list)
#     num_partition = hidden_feature.shape[1]
    
#     num_vectors_per_partition = np.zeros(num_partition)
#     new_centroid_vectors_sum = torch.zeros(num_partitions, num_partitions) 
#     for vec_id in range(num_vectors):
#         partition_ID = partition_id_list[vec_id]
#         num_vectors_per_partition[partition_ID] += 1
#         new_centroid_vectors_sum[partition_ID] += hidden_feature[vec_id]
        
#     new_centroid_vectors = torch.zeros(num_partitions, num_partitions) 
#     for partition_id in range(num_partition):
#         new_centroid_vectors[partition_id] = \
#             new_centroid_vectors_sum[partition_id] / num_vectors_per_partition[partition_id] 
    
#     return new_centroid_vectors

# def update_centroids_info(hidden_feature, partition_id_list, centroid_vectors):
#     """
#     Input: 
#       - hidden_feature (torch tensor): (num_vectors, num_partitions)
#       - ID list (array): size = vector number; vec ID -> partition ID
#       - centroids info (torch tensor): (num_partitions, num_partitions) 
#           first dim: different centroid IDs
#           second dim: an entire centroid vector
    
#     Output (updated):
#       - ID list (array): vec ID -> partition ID
#       - centroids info (torch tensor): (num_partitions, num_partitions) 
#           first dim: different centroid IDs
#           second dim: an entire centroid vector
#     """
    
#     num_vectors = len(partition_id_list)
#     num_partition = centroid_vectors.shape[0]
    
#     num_vectors_per_partition = np.zeros(num_partition)
#     new_centroid_vectors_sum = torch.zeros(num_partitions, num_partitions) 
#     for vec_id in range(num_vectors):
#         partition_ID = partition_id_list[vec_id]
#         num_vectors_per_partition[partition_ID] += 1
#         new_centroid_vectors_sum[partition_ID] += hidden_feature[vec_id]
        
#     new_centroid_vectors = torch.zeros(num_partitions, num_partitions) 
#     for partition_id in range(num_partition):
#         new_centroid_vectors[partition_id] = \
#             new_centroid_vectors_sum[partition_id] / num_vectors_per_partition[partition_id]
        
#     new_L2_dist = torch.zeros((num_partition, num_vectors))
#     for paritition_id in range(num_partition):
#         # L2 distance to the i th centroid, output size = num_vec
#         new_L2_dist[paritition_id] = torch.sum((hidden_feature - centroid_vectors[paritition_id]) ** 2, dim=1)
#     new_partition_id_list = torch.argmin(new_L2_dist, dim = 0)
    
#     return new_partition_id_list, new_centroid_vectors

In [12]:
num_epochs = 10 # Wenqi: original iteration = 100
batch_size = 100
num_train_vectors = int(1e4)
num_partitions = 32

# List that will store the training loss
train_loss = []
validation_loss = []

# Dictionary that will store the
# different images and outputs for
# various epochs
outputs = {}
# x_val = torch.FloatTensor(xb[num_train_vectors: 2 * num_train_vectors])
xt_tensor = torch.FloatTensor(xt)

hidden_feature = model.encoder(xt_tensor)
# Initiation: each vector starts with a random partition ID; 
#   each partition has a center vector
# xt_closest_partition_id = np.random.randint(0, high=num_partitions, size=num_train_vectors)
# centroid_vectors = get_centroid_vectors(hidden_feature, xt_closest_partition_id)


kmeans = KMeans(n_clusters=32)
kmeans.fit(hidden_feature.detach().numpy())
centroid_vectors = torch.tensor(kmeans.cluster_centers_)

# print('partition ID list', xt_closest_partition_id)
print('centroid vectors', centroid_vectors)

# Check
print('centroid_vectors', centroid_vectors)
for i in range(centroid_vectors.shape[0]):
    for j in range(centroid_vectors.shape[1]):
        if torch.isnan(centroid_vectors[i][j]):
            print('Error: centroid_vectors[{}][{}] is NaN!'.format(i, j))

centroid vectors tensor([[-0.0028,  0.1540,  0.0474,  ..., -0.0945, -0.0572, -0.0862],
        [-0.0059,  0.1389,  0.0396,  ..., -0.1163, -0.0445, -0.0839],
        [-0.0137,  0.1601,  0.0562,  ..., -0.1031, -0.0604, -0.0754],
        ...,
        [ 0.0040,  0.1552,  0.0719,  ..., -0.1166, -0.0390, -0.0862],
        [-0.0011,  0.1667,  0.0589,  ..., -0.0964, -0.0484, -0.0779],
        [-0.0080,  0.1607,  0.0551,  ..., -0.1108, -0.0573, -0.0815]])
centroid_vectors tensor([[-0.0028,  0.1540,  0.0474,  ..., -0.0945, -0.0572, -0.0862],
        [-0.0059,  0.1389,  0.0396,  ..., -0.1163, -0.0445, -0.0839],
        [-0.0137,  0.1601,  0.0562,  ..., -0.1031, -0.0604, -0.0754],
        ...,
        [ 0.0040,  0.1552,  0.0719,  ..., -0.1166, -0.0390, -0.0862],
        [-0.0011,  0.1667,  0.0589,  ..., -0.0964, -0.0484, -0.0779],
        [-0.0080,  0.1607,  0.0551,  ..., -0.1108, -0.0573, -0.0815]])


In [13]:
centroid_vectors = update_centroid_vectors(hidden_feature, centroid_vectors)
print('centroid_vectors', centroid_vectors)
for i in range(centroid_vectors.shape[0]):
    for j in range(centroid_vectors.shape[1]):
        if torch.isnan(centroid_vectors[i][j]):
            print('Error: centroid_vectors[{}][{}] is NaN!'.format(i, j))

min_cluster_ID_list (first 10) 10000 [30, 7, 30, 18, 30, 9, 7, 30, 17, 29]
centroid_vectors tensor([[-0.0028,  0.1540,  0.0474,  ..., -0.0945, -0.0572, -0.0862],
        [-0.0059,  0.1389,  0.0396,  ..., -0.1163, -0.0445, -0.0839],
        [-0.0137,  0.1601,  0.0562,  ..., -0.1031, -0.0604, -0.0754],
        ...,
        [ 0.0040,  0.1552,  0.0719,  ..., -0.1166, -0.0390, -0.0862],
        [-0.0011,  0.1667,  0.0589,  ..., -0.0964, -0.0484, -0.0779],
        [-0.0080,  0.1607,  0.0551,  ..., -0.1108, -0.0573, -0.0815]],
       grad_fn=<CopySlices>)


In [14]:
sift_in_batch = xt_tensor[0 * batch_size: (0 + 1) * batch_size]
print((sift_in_batch ** 2).shape)
print(torch.mean(sift_in_batch), torch.mean(sift_in_batch).shape)

torch.Size([100, 128])
tensor(0.1074) torch.Size([])


Initial loss: 22

After 10 iteration: 2.3 (batch size = 100, training sample = 100K)

In [15]:
# Training loop starts
for epoch in range(num_epochs):

    # Initializing variable for storing
    # loss
    running_loss = 0
    
    # Iterating over the training dataset
    batch_num_per_epoch = int(np.ceil(n_train / batch_size))
#     print(batch_num_per_epoch)

    hidden_feature = model.encoder(xt_tensor)
    for i_batch in range(batch_num_per_epoch):

        start_id = i_batch * batch_size
        end_id = (i_batch + 1) * batch_size
        
        # WENQI: NOTE That the input is mapped to 0~1 -> such that we use sigmod to get the output value
        sift_in_batch = xt_tensor[start_id: end_id]
#         print('in', sift_in_batch.shape, sift_in_batch[0])

        # Generating output
        out_batch = model(sift_in_batch)
        hidden_feature_batch = model.encoder(sift_in_batch)
        hidden_feature[start_id: end_id] = hidden_feature_batch
#         print('out', out.shape, out[0])

        # Calculating loss
        loss = loss_func(out_batch, sift_in_batch, hidden_feature_batch, centroid_vectors)

        # Updating weights according
        # to the calculated loss
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        # Incrementing loss
        running_loss += loss.item()
        
        # update centroids after a mini-batch
        kmeans.fit(hidden_feature.detach().numpy())
        centroid_vectors = torch.tensor(kmeans.cluster_centers_)

    # Averaging out loss over entire batch
    running_loss /= batch_size
    train_loss.append(running_loss)
    
#     out_val = model(x_val)
#     out_hidden = 
#     loss_val = loss_func(out_val, x_val)
#     validation_loss.append(loss_val)
    
    print("epoch:", epoch, "train loss:", running_loss)
    print(centroid_vectors)
#     print("epoch:", epoch, "train loss:", running_loss, "val loss: ", loss_val.item())

    # Storing useful images and
    # reconstructed outputs for the last batch
#     outputs[epoch+1] = {'in': sift_in_batch, 'out': out}
    
    # update centroids after an epoch
#     xt_closest_partition_id, centroid_vectors = update_centroids_info(
#         hidden_feature, xt_closest_partition_id, centroid_vectors)
#     kmeans.fit(hidden_feature.detach().numpy())
#     centroid_vectors = torch.tensor(kmeans.cluster_centers_)
    


# Plotting the training loss
plt.plot(range(1,num_epochs+1),train_loss)
# plt.plot(range(1,num_epochs+1),validation_loss)
plt.xlabel("Number of epochs")
plt.ylabel("Training Loss")
plt.show()

min_square_error tensor(22.5145, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0070, grad_fn=<DivBackward0>)
min_square_error tensor(22.4619, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0067, grad_fn=<DivBackward0>)
min_square_error tensor(22.4636, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0069, grad_fn=<DivBackward0>)
min_square_error tensor(22.4664, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0069, grad_fn=<DivBackward0>)
min_square_error tensor(21.9538, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0070, grad_fn=<DivBackward0>)
min_square_error tensor(21.9538, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0071, grad_fn=<DivBackward0>)
min_square_error tensor(22.0210, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0070, grad_fn=<DivBackward0>)
min_square_error tensor(21.8290, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0070, grad_fn=<DivBackward0>)
min_square_error tensor(21.9610, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0070

min_square_error tensor(2.4703, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0147, grad_fn=<DivBackward0>)
min_square_error tensor(2.4067, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0144, grad_fn=<DivBackward0>)
min_square_error tensor(2.3945, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0144, grad_fn=<DivBackward0>)
min_square_error tensor(2.2773, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0147, grad_fn=<DivBackward0>)
min_square_error tensor(2.3209, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0144, grad_fn=<DivBackward0>)
min_square_error tensor(2.4736, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0147, grad_fn=<DivBackward0>)
min_square_error tensor(2.5633, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0147, grad_fn=<DivBackward0>)
min_square_error tensor(2.3040, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0149, grad_fn=<DivBackward0>)
min_square_error tensor(2.4008, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.0149, grad_fn

min_square_error tensor(2.3821, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2147, grad_fn=<DivBackward0>)
min_square_error tensor(2.2925, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2275, grad_fn=<DivBackward0>)
min_square_error tensor(2.3687, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2168, grad_fn=<DivBackward0>)
min_square_error tensor(2.2501, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2277, grad_fn=<DivBackward0>)
min_square_error tensor(2.3431, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2216, grad_fn=<DivBackward0>)
min_square_error tensor(2.1741, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2313, grad_fn=<DivBackward0>)
min_square_error tensor(2.3547, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2172, grad_fn=<DivBackward0>)
min_square_error tensor(2.3506, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2396, grad_fn=<DivBackward0>)
min_square_error tensor(2.3032, grad_fn=<DivBackward0>)
ave_cluster_error tensor(0.2388, grad_fn

min_square_error tensor(2.6133, grad_fn=<DivBackward0>)
ave_cluster_error tensor(10.8502, grad_fn=<DivBackward0>)
min_square_error tensor(2.4342, grad_fn=<DivBackward0>)
ave_cluster_error tensor(10.7777, grad_fn=<DivBackward0>)
min_square_error tensor(2.5046, grad_fn=<DivBackward0>)
ave_cluster_error tensor(11.1223, grad_fn=<DivBackward0>)
min_square_error tensor(2.4381, grad_fn=<DivBackward0>)
ave_cluster_error tensor(9.3074, grad_fn=<DivBackward0>)
min_square_error tensor(2.4122, grad_fn=<DivBackward0>)
ave_cluster_error tensor(12.3266, grad_fn=<DivBackward0>)
min_square_error tensor(2.4521, grad_fn=<DivBackward0>)
ave_cluster_error tensor(10.0413, grad_fn=<DivBackward0>)
min_square_error tensor(2.5452, grad_fn=<DivBackward0>)
ave_cluster_error tensor(11.6350, grad_fn=<DivBackward0>)
min_square_error tensor(2.3856, grad_fn=<DivBackward0>)
ave_cluster_error tensor(15.5933, grad_fn=<DivBackward0>)
min_square_error tensor(2.4438, grad_fn=<DivBackward0>)
ave_cluster_error tensor(15.6608,


KeyboardInterrupt



In [None]:
centroid_vectors

In [None]:
hidden_feature

In [None]:
# Training loop starts ===》 WITHOUT BATCH LOOP ===> HUGE BATCH

for epoch in range(num_epochs):

    # Initializing variable for storing
    # loss
    running_loss = 0
        
    # WENQI: NOTE That the input is mapped to 0~1 -> such that we use sigmod to get the output value
    sift_in_batch = xt_tensor
#         print('in', sift_in_batch.shape, sift_in_batch[0])

    # Generating output
    out_batch = model(sift_in_batch)
    hidden_feature = model.encoder(sift_in_batch)

    # Calculating loss
    print('out_batch', out_batch.shape, out_batch)
    print('sift_in_batch', sift_in_batch.shape, sift_in_batch)
    print('hidden_feature', hidden_feature.shape, hidden_feature)
    print('centroid_vectors', centroid_vectors.shape, centroid_vectors)
    
    loss = loss_func(out_batch, xt_tensor, hidden_feature, centroid_vectors)
    
    if torch.isnan(loss):
        print('NaN loss, break...')
        break
    # Updating weights according
    # to the calculated loss
    optimizer.zero_grad()
    loss.backward(retain_graph=True)
    clip_gradient = 1
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_gradient) # clip gradieent
    optimizer.step()

    # Incrementing loss
    running_loss += loss.item()
        
    train_loss.append(running_loss)
    
#     out_val = model(x_val)
#     out_hidden = 
#     loss_val = loss_func(out_val, x_val)
#     validation_loss.append(loss_val)
    
    print("epoch:", epoch, "train loss:", running_loss)
#     print("epoch:", epoch, "train loss:", running_loss, "val loss: ", loss_val.item())

    # Storing useful images and
    # reconstructed outputs for the last batch
    outputs[epoch+1] = {'in': sift_in_batch, 'out': out_batch}
    
    # update centroids after an epoch
#     xt_closest_partition_id, centroid_vectors = update_centroids_info(
#         hidden_feature, xt_closest_partition_id, centroid_vectors)
    kmeans.fit(hidden_feature.detach().numpy())
    centroid_vectors = torch.tensor(kmeans.cluster_centers_)
    


# Plotting the training loss
plt.plot(range(1,num_epochs+1),train_loss)
# plt.plot(range(1,num_epochs+1),validation_loss)
plt.xlabel("Number of epochs")
plt.ylabel("Training Loss")
plt.show()

In [None]:
# Inspecting the result of a single vector

i_batch = 0

# Loading image(s) and
# reshaping it into a 1-d vector
sift_in_batch = torch.FloatTensor(xt[i_batch * batch_size: (i_batch + 1) * batch_size])
print('in', sift_in_batch.shape, sift_in_batch[0])

# Generating output
out = model(sift_in_batch)
print('out', out.shape, out[0])
    
for i in range(128):
    print('in: ', sift_in_batch[0][i], 'out: ', out[0][i])

In [None]:
# sift_in_batch / 256

## K means on hidden feature

In [16]:
from sklearn.cluster import KMeans

In [17]:
hidden_features_10K = model.encoder(torch.FloatTensor(xt)).detach().numpy()
print(hidden_features_10K.shape)
print(hidden_features_10K[0])

(10000, 32)
[  75.355736   -70.56735    -60.421646   -16.449442   -29.355614
  -85.70103    -42.211414   -48.778187    35.060528    20.643835
   50.602898   -40.27656   -104.86069     82.05809    -13.047523
  -48.313778   -27.030766    -3.285881   -36.619286   -16.039133
  -59.87265    -10.0474615   95.26721     27.586655   -78.94651
  -30.944077   -92.002266   -17.261911   -45.702194   -36.359085
  -30.051886    59.703403 ]


In [18]:
kmeans = KMeans(n_clusters=32)
kmeans.fit(hidden_features_10K)
y_kmeans = kmeans.predict(hidden_features_10K)

In [19]:
print(y_kmeans)

[10 21 16 ...  2 19 16]


In [20]:
hidden_feature_1M = model.encoder(torch.FloatTensor(xb)).detach().numpy()

partition_IDs = kmeans.predict(hidden_feature_1M)
print(partition_IDs)
print(partition_IDs.shape)

[10 21 16 ... 14  6 20]
(1000000,)


In [21]:
# Create a mapping: partition ID -> {list of vector IDs}

num_partition = 32

partition_id_vec_id_list_1M = dict()
for i in range(num_partition):
    partition_id_vec_id_list_1M[i] = []


for i in range(int(1e6)):
    partition_ID = int(partition_IDs[i])
    partition_id_vec_id_list_1M[partition_ID].append(i)
    
for i in range(num_partition):
    print('items in partition ', i, len(partition_id_vec_id_list_1M[i]), 'average =', int(1e6/num_partition))


items in partition  0 49766 average = 31250
items in partition  1 25322 average = 31250
items in partition  2 38218 average = 31250
items in partition  3 45423 average = 31250
items in partition  4 14486 average = 31250
items in partition  5 33688 average = 31250
items in partition  6 46448 average = 31250
items in partition  7 12505 average = 31250
items in partition  8 10576 average = 31250
items in partition  9 41493 average = 31250
items in partition  10 24963 average = 31250
items in partition  11 49621 average = 31250
items in partition  12 39266 average = 31250
items in partition  13 29643 average = 31250
items in partition  14 17298 average = 31250
items in partition  15 47384 average = 31250
items in partition  16 32738 average = 31250
items in partition  17 4790 average = 31250
items in partition  18 33463 average = 31250
items in partition  19 49141 average = 31250
items in partition  20 12246 average = 31250
items in partition  21 17563 average = 31250
items in partition  2

In [22]:
import heapq

def scan_partition(query_vec, partition_id_list, vector_set):
    """
    query_vec = (128, )
    partition_id_list = (N_num_vec, )
    vector_set = 1M dataset (1M, 128)
    """
    min_dist = 1e10
    min_dist_ID = None
    for vec_id in partition_id_list:
        dataset_vec = vector_set[vec_id]
        dist = np.linalg.norm(query_vec - dataset_vec)
        if dist <= min_dist:
            min_dist = dist
            min_dist_ID = vec_id
            
    return min_dist_ID

In [23]:
nearest_neighbors = []

N = 100
#### Wenqi: here had a bug: previously xb, now xq
query_hidden_feature = model.encoder(torch.FloatTensor(xq)).detach().numpy()
print(query_hidden_feature.shape)
query_partition = kmeans.predict(query_hidden_feature)

for i in range(N):
    partition_id = int(query_partition[i])
    nearest_neighbor_ID = scan_partition(xq[i], partition_id_vec_id_list_1M[partition_id], xb)
    nearest_neighbors.append(nearest_neighbor_ID)
    print(i, nearest_neighbor_ID)

(10000, 32)
0 133171
1 790327
2 502074
3 814387
4 921963
5 175043
6 843295
7 848955
8 76782
9 413852
10 878295
11 742512
12 696749
13 462513
14 839470
15 372248
16 423980
17 862268
18 602078
19 947435
20 68023
21 334619
22 712156
23 804213
24 272283
25 297655
26 387383
27 221028
28 95867
29 460882
30 125853
31 397619
32 16108
33 61454
34 656767
35 538502
36 942225
37 252712
38 452205
39 176702
40 709856
41 79741
42 137064
43 845200
44 77190
45 469831
46 302575
47 521108
48 468406
49 489446
50 849174
51 802781
52 365999
53 441473
54 226422
55 120806
56 480173
57 628861
58 408532
59 951575
60 859542
61 833563
62 437031
63 331243
64 121265
65 715285
66 827936
67 850184
68 60810
69 847531
70 761548
71 452789
72 5106
73 556801
74 178558
75 518522
76 712139
77 781746
78 680004
79 95340
80 48350
81 380662
82 16235
83 254697
84 594097
85 53667
86 419880
87 635731
88 51333
89 407749
90 640347
91 972183
92 190435
93 978575
94 147900
95 171630
96 242602
97 763985
98 842667
99 767376


In [24]:
print(gt[:100, :1])

[[504814]
 [588616]
 [552515]
 [335355]
 [482427]
 [508403]
 [167240]
 [327960]
 [834657]
 [592948]
 [878295]
 [771023]
 [215771]
 [717949]
 [368047]
 [776345]
 [373550]
 [862239]
 [602078]
 [ 84644]
 [ 68023]
 [671173]
 [ 47363]
 [258880]
 [698614]
 [838692]
 [922290]
 [221028]
 [962851]
 [785288]
 [425493]
 [407192]
 [229032]
 [909787]
 [303455]
 [825435]
 [602485]
 [812777]
 [341091]
 [856200]
 [982373]
 [499763]
 [915089]
 [ 43368]
 [640127]
 [858815]
 [436180]
 [283703]
 [ 28097]
 [426503]
 [849174]
 [467756]
 [803688]
 [ 74183]
 [819365]
 [882827]
 [535502]
 [779586]
 [702690]
 [ 12747]
 [534495]
 [895845]
 [795958]
 [ 45494]
 [943477]
 [809566]
 [  3367]
 [850184]
 [185761]
 [434096]
 [116383]
 [753756]
 [222336]
 [787309]
 [436415]
 [629700]
 [470987]
 [717995]
 [680004]
 [694111]
 [659382]
 [380662]
 [555466]
 [ 34810]
 [632431]
 [853398]
 [299980]
 [138747]
 [698591]
 [158669]
 [ 32380]
 [490742]
 [190435]
 [617987]
 [292659]
 [330662]
 [898436]
 [763985]
 [532460]
 [ 72670]]

First 100 queries: recall@1 =  0.69
First 1000 queries: recall@1 =  0.636

In [25]:
correct_count = 0
for i in range(N):
    if nearest_neighbors[i] == gt[i][0]:
        correct_count += 1
        
print(correct_count, 'recall@1 = ', correct_count / N)

10 recall@1 =  0.1


## Wenqi: naive auto-encoder is basically a dimensionality reduction technique

It cannot improve the k-means quality, because the clustering with the full dimensions should perform better.