# Implementation of the paper "Fine-grained generalized zero-shot learning via dense attribute-based attention"

In [1]:
import os,sys
import torch
import torchvision
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.models.resnet as models
from PIL import Image
import h5py
import numpy as np
import scipy.io as sio
import pickle
import pdb
import matplotlib.pyplot as plt
import pandas as pd
import gensim.downloader as api
import torch.optim as optim
import importlib


# CUB dataset
#images = 17188

#classes = 200, 150 seen classes and 50 unseen classes

Each class has 200 number of attributes that represent the class infromation

In [3]:
img_dir = 'C:/Sushree/Jio_Institute/Dataset/'
print(img_dir)

file_paths = 'C:/Sushree/Jio_Institute/Dataset/data/xlsa17/data/CUB/res101.mat'
print(file_paths)

#resNet101.mat includes the following fields:
#-features: columns correspond to image instances
#-labels: label number of a class is its row number in allclasses.txt
#-image_files: image sources  

C:/Sushree/Jio_Institute/Dataset/
C:/Sushree/Jio_Institute/Dataset/data/xlsa17/data/CUB/res101.mat


# Let's visualize the data

In [None]:
def visualize_data_distribution(file_paths):    
    matcontent = sio.loadmat(file_paths)
    print(matcontent)

    image_files = np.squeeze(matcontent['image_files'])
    #print(image_files)

    labels = np.squeeze(matcontent['labels'])
    print(labels)
    print(labels.size)  # 11788 for CUB

    class_names = []
    for idx in range(len(image_files)):
        image_file = image_files[idx][0]
        class_name = image_file.split('/')[5:][3]
        class_names.append(class_name)

    print(len(class_names))   
    #print(class_names)
    
    num_bins = 200 # # for CUB
    
    plt.figure(figsize=(35,6))
    
    plt.title("Data Distribution: CUB")
    plt.xlabel("Categories")
    plt.ylabel("Number of Classes")
    
    plt.xticks(rotation = 90)
    plt.grid(color = 'red', linestyle = '--', linewidth = 0.3)
    plt.hist(class_names, num_bins, align="mid")

visualize_data_distribution(file_paths)    

# Let's extract deep features (consider pre-trained ResNet 101 with no fine-tuning)

In [4]:
class CustomedDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, img_dir , file_paths, transform=None):
        self.matcontent = sio.loadmat(file_paths)
        self.image_files = np.squeeze(self.matcontent['image_files'])
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        image_file = self.image_files[idx][0]
        image_file = os.path.join(self.img_dir, '/'.join(image_file.split('/')[5:]))
        image = Image.open(image_file)
        
        if image.mode == 'L':
            image=image.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image

In [5]:
input_size = 224
data_transforms = transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
    
CUBDataset = CustomedDataset(img_dir , file_paths, data_transforms)


In [6]:
model_name = "resnet"

# Batch size for training (change depending on how much memory you have)
batch_size = 32

model_ref = models.resnet101(pretrained=True)
model_ref.eval()

model_f = nn.Sequential(*list(model_ref.children())[:-2])
model_f.eval()


for param in model_f.parameters():
    param.requires_grad = False
    
print(model_f)
        
from torchsummary import summary
summary(model_f, (3, 224, 224))    

dataset_loader = torch.utils.data.DataLoader(CUBDataset, batch_size=batch_size, shuffle=False, num_workers=0)



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [7]:
all_features = []
for i_batch, imgs in enumerate(dataset_loader):
    print(i_batch)
    #pdb.set_trace()
    #imgs = imgs.to(device)
    features = model_f(imgs)
    all_features.append(features.numpy())
    
all_features = np.concatenate(all_features, axis=0)
print(all_features, all_features.shape)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

# Let's extract semantic attributes of each category (consider pre-trained word2vec model with no fine-tuning)

In [8]:
print('Load pretrain w2v model')

model_name = 'word2vec-google-news-300'#best model
model = api.load(model_name)

dim_w2v = 300

#%%
replace_word = [('spatulate','broad'),('upperparts','upper parts'),('grey','gray')] # for CUB


path = 'C:/Sushree/Jio_Institute/Dataset/CUB_200_2011/CUB_200_2011/attributes/attributes.txt'
df=pd.read_csv(path,sep=' ',header = None, names = ['idx','des'])
des = df['des'].values

#%% filter
new_des = [' '.join(i.split('_')) for i in des]
new_des = [' '.join(i.split('-')) for i in new_des]
new_des = [' '.join(i.split('::')) for i in new_des]
new_des = [i.split('(')[0] for i in new_des]
new_des = [i[4:] for i in new_des]

#%% replace out of dictionary (OOD) words
for pair in replace_word:
    for idx,s in enumerate(des):
        des[idx] = s.replace(pair[0],pair[1])
print('Done replacing OOD words')

df['new_des'] = des
df.to_csv('C:/Sushree/Jio_Institute/Dataset/CUB_200_2011/CUB_200_2011/attributes/new_des.csv')
print('Done preprocessing attribute des')

Load pretrain w2v model


AttributeError: 'numpy.float64' object has no attribute 'split'

In [None]:
counter_err = 0

all_w2v = []
for s in des:
    print(s)
    words = s.split(' ')
    if words[-1] == '':     #remove empty element
        words = words[:-1]
    w2v = np.zeros(dim_w2v)
    for w in words:
        try:
            w2v += model[w]
        except Exception as e:
            print(e)
            counter_err += 1
    all_w2v.append(w2v[np.newaxis,:])
    
print('counter_err ',counter_err)

#%%
all_w2v=np.concatenate(all_w2v,axis=0)
#pdb.set_trace()
#%%

with open('C:/Sushree/Jio_Institute/Dataset/Animals_with_Attributes2/w2v/AWA2_attribute.pkl','wb') as f:
    pickle.dump(all_w2v,f)  

# Read the attributes and save as "w2v_att"

In [None]:
attribute_path = 'C:/Sushree/Jio_Institute/Dataset/Animals_with_Attributes2/w2v/AWA2_attribute.pkl'

with open(attribute_path,'rb') as f:
    w2v_att = pickle.load(f)
assert w2v_att.shape == (85,300) # for AWA2
print('save w2v_att')

print(w2v_att, w2v_att.shape)

# Let's gather additional information (training, validation, and test indexes)

In [None]:
#%% get remaining metadata
matcontent = AWA2Dataset.matcontent
labels = matcontent['labels'].astype(int).squeeze() - 1

split_path = 'C:/Sushree/Jio_Institute/Dataset/data/xlsa17/data/AWA2/att_splits.mat'
print(split_path)
    
#att_splits.mat includes the following fields:
#-att: columns correpond to class attribute vectors normalized to have unit l2 norm, following the classes order in allclasses.txt 
#-original_att: the original class attribute vectors without normalization
#-trainval_loc: instances indexes of train+val set features (for only seen classes) in resNet101.mat
#-test_seen_loc: instances indexes of test set features for seen classes
#-test_unseen_loc: instances indexes of test set features for unseen classes    


In [None]:
def get_index_details(split_path):
    matcontent = sio.loadmat(split_path)
    print(matcontent)
    
    trainval_loc = matcontent['trainval_loc'].squeeze() - 1
    print(trainval_loc, len(trainval_loc))

    test_seen_loc = matcontent['test_seen_loc'].squeeze() - 1
    print(test_seen_loc, len(test_seen_loc))

    test_unseen_loc = matcontent['test_unseen_loc'].squeeze() - 1
    print(test_unseen_loc, len(test_unseen_loc))
    
    att = matcontent['att'].T
    print(att, att.shape)
    
    original_att = matcontent['original_att'].T
    print(original_att, original_att.shape)
    return trainval_loc, test_seen_loc, test_unseen_loc, att, original_att
    
trainval_loc, test_seen_loc, test_unseen_loc, att, original_att = get_index_details(split_path)    

# Save the feature map that includes ResNet50 features, labels, training and test (seen and unseen) data indexes, semantic attributes, and 

In [None]:

save_path = 'C:/Sushree/Jio_Institute/Dataset/Animals_with_Attributes2/feature_map_ResNet_101_AWA2.hdf5'

f = h5py.File(save_path, "w")
f.create_dataset('feature_map', data=all_features,compression="gzip")
f.create_dataset('labels', data=labels,compression="gzip")
f.create_dataset('trainval_loc', data=trainval_loc,compression="gzip")
#    f.create_dataset('train_loc', data=train_loc,compression="gzip")
#    f.create_dataset('val_unseen_loc', data=val_unseen_loc,compression="gzip")
f.create_dataset('test_seen_loc', data=test_seen_loc,compression="gzip")
f.create_dataset('test_unseen_loc', data=test_unseen_loc,compression="gzip")
f.create_dataset('att', data=att,compression="gzip")
f.create_dataset('original_att', data=original_att,compression="gzip")
f.create_dataset('w2v_att', data=w2v_att,compression="gzip")
f.close()

In [None]:
hf = h5py.File('C:/Sushree/Jio_Institute/Dataset/Animals_with_Attributes2/feature_map_ResNet_101_AWA2.hdf5', 'r')
features = np.array(hf.get('feature_map'))
print(features.shape)

In [None]:
print(features)

In [None]:
att = np.array(hf.get('att'))
print(att)


In [None]:
f.close()

# Train the DAZLE model for AWA2

In [None]:
import os,sys
import torch
import torchvision
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.models.resnet as models
from PIL import Image
import h5py
import numpy as np
import scipy.io as sio
import pickle
import pdb
import matplotlib.pyplot as plt
import pandas as pd
import gensim.downloader as api
import torch.optim as optim
import importlib


In [None]:
from DAZLE import DAZLE
from AWA2DataLoader import AWA2DataLoader
from helper_func import eval_zs_gzsl,visualize_attention#,get_attribute_attention_stats

In [None]:
data_path = 'C:/Sushree/Jio_Institute/Dataset/'
feature_path = 'C:/Sushree/Jio_Institute/Dataset/Animals_with_Attributes2/'
dataloader = AWA2DataLoader(data_path, feature_path, device = None)


In [None]:
def get_lr(optimizer):
    lr = []
    for param_group in optimizer.param_groups:
        lr.append(param_group['lr'])
    return lr

In [None]:
seed = 214
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)

batch_size = 32
nepoches = 100
niters = dataloader.ntrain * nepoches//batch_size
dim_f = 2048
dim_v = 300
init_w2v_att = dataloader.w2v_att # load the attribute features
att = dataloader.att
att[att<0] = 0
normalize_att = dataloader.normalize_att


trainable_w2v = True
lambda_ = 0.1
bias = 0
prob_prune = 0
uniform_att_1 = False
uniform_att_2 = False

seenclass = dataloader.seenclasses #load seen and unseen data
unseenclass = dataloader.unseenclasses
desired_mass = 1
report_interval = niters//nepoches

device = None

model = DAZLE(dim_f,dim_v,init_w2v_att,att,normalize_att,
            seenclass,unseenclass,
            lambda_,
            trainable_w2v,normalize_V=True,normalize_F=True,is_conservative=True,
            uniform_att_1=uniform_att_1,uniform_att_2=uniform_att_2,
            prob_prune=prob_prune,desired_mass=desired_mass, is_conv=False,
            is_bias=True)
model.to(device)

setup = {'pmp':{'init_lambda':0.1,'final_lambda':0.1,'phase':0.8},
         'desired_mass':{'init_lambda':-1,'final_lambda':-1,'phase':0.8}}
print(setup)

params_to_update = []
params_names = []
for name,param in model.named_parameters():
    if param.requires_grad == True:
        params_to_update.append(param)
        params_names.append(name)
        print("\t",name)
#%%
lr = 0.0001
weight_decay = 0.0001
momentum = 0.
#%%
lr_seperator = 1
lr_factor = 1
print('default lr {} {}x lr {}'.format(params_names[:lr_seperator],lr_factor,params_names[lr_seperator:]))
optimizer  = optim.RMSprop( params_to_update ,lr=lr,weight_decay=weight_decay, momentum=momentum)
print('-'*30)
print('learing rate {}'.format(lr))
print('trainable V {}'.format(trainable_w2v))
print('lambda_ {}'.format(lambda_))
print('optimized seen only')
print('optimizer: RMSProp with momentum = {} and weight_decay = {}'.format(momentum,weight_decay))
print('-'*30)

In [None]:
best_performance = [0,0,0,0]
for i in range(0,niters):
    model.train()
    optimizer.zero_grad()
    
    batch_label, batch_feature, batch_att = dataloader.next_batch(batch_size)
    out_package = model(batch_feature)
    
    in_package = out_package
    in_package['batch_label'] = batch_label
    
    out_package=model.compute_loss(in_package)
    loss,loss_CE,loss_cal = out_package['loss'],out_package['loss_CE'],out_package['loss_cal']
    
    loss.backward()
    optimizer.step()
    if i%report_interval==0:
        print('-'*30)
        acc_seen, acc_novel, H, acc_zs = eval_zs_gzsl(dataloader,model,device,bias_seen=-bias,bias_unseen=bias)
        
        if H > best_performance[2]:
            best_performance = [acc_seen, acc_novel, H, acc_zs]
        stats_package = {'iter':i, 'loss':loss.item(), 'loss_CE':loss_CE.item(),
                         'loss_cal': loss_cal.item(),
                         'acc_seen':best_performance[0], 'acc_novel':best_performance[1], 'H':best_performance[2], 'acc_zs':best_performance[3]}
        
        print(stats_package)