<a href="https://colab.research.google.com/github/OUCTheoryGroup/colab_demo/blob/master/DSCMR_CVPR2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DSCMR 深度监督跨模态检索 

Liangli Zhen, Peng Hu, Xu Wang, Dezhong Peng, Deep Supervised Cross-modal Retrieval. *CVPR 2029*

In [0]:
! mkdir pascal
! wget -P pascal https://raw.githubusercontent.com/penghu-cs/DSCMR/master/data/pascal/test_img.mat
! wget -P pascal https://raw.githubusercontent.com/penghu-cs/DSCMR/master/data/pascal/test_img_lab.mat
! wget -P pascal https://raw.githubusercontent.com/penghu-cs/DSCMR/master/data/pascal/test_txt.mat
! wget -P pascal https://raw.githubusercontent.com/penghu-cs/DSCMR/master/data/pascal/train_img.mat
! wget -P pascal https://raw.githubusercontent.com/penghu-cs/DSCMR/master/data/pascal/train_img_lab.mat
! wget -P pascal https://raw.githubusercontent.com/penghu-cs/DSCMR/master/data/pascal/train_txt.mat

In [0]:
import torch
from datetime import datetime
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset
from scipy.io import loadmat, savemat
from torch.utils.data import DataLoader
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import time
import copy
import scipy.spatial

In [0]:
class ImgNN(nn.Module):
    """Network to learn image representations"""
    def __init__(self, input_dim=4096, output_dim=1024):
        super(ImgNN, self).__init__()
        self.denseL1 = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        out = F.relu(self.denseL1(x))
        return out

class TextNN(nn.Module):
    """Network to learn text representations"""
    def __init__(self, input_dim=1024, output_dim=1024):
        super(TextNN, self).__init__()
        self.denseL1 = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        out = F.relu(self.denseL1(x))
        return out

In [0]:
class IDCM_NN(nn.Module):
    """Network to learn text representations"""
    def __init__(self, img_input_dim=4096, img_output_dim=2048,
                 text_input_dim=1024, text_output_dim=2048, minus_one_dim=1024, output_dim=10):
        super(IDCM_NN, self).__init__()
        self.img_net = ImgNN(img_input_dim, img_output_dim)
        self.text_net = TextNN(text_input_dim, text_output_dim)
        self.linearLayer = nn.Linear(img_output_dim, minus_one_dim)
        self.linearLayer2 = nn.Linear(minus_one_dim, output_dim)

    def forward(self, img, text):
        view1_feature = self.img_net(img)
        view2_feature = self.text_net(text)
        view1_feature = self.linearLayer(view1_feature)
        view2_feature = self.linearLayer(view2_feature)

        view1_predict = self.linearLayer2(view1_feature)
        view2_predict = self.linearLayer2(view2_feature)
        return view1_feature, view2_feature, view1_predict, view2_predict

In [0]:
class CustomDataSet(Dataset):
    def __init__(
            self,
            images,
            texts,
            labels):
        self.images = images
        self.texts = texts
        self.labels = labels

    def __getitem__(self, index):
        img = self.images[index]
        text = self.texts[index]
        label = self.labels[index]
        return img, text, label

    def __len__(self):
        count = len(self.images)
        assert len(
            self.images) == len(self.labels)
        return count

def ind2vec(ind, N=None):
    ind = np.asarray(ind)
    if N is None:
        N = ind.max() + 1
    return np.arange(N) == np.repeat(ind, N, axis=1)

def get_loader(path, batch_size):
    img_train = loadmat(path+"train_img.mat")['train_img']
    img_test = loadmat(path + "test_img.mat")['test_img']
    text_train = loadmat(path+"train_txt.mat")['train_txt']
    text_test = loadmat(path + "test_txt.mat")['test_txt']
    label_train = loadmat(path+"train_img_lab.mat")['train_img_lab']
    label_test = loadmat(path + "test_img_lab.mat")['test_img_lab']

    label_train = ind2vec(label_train).astype(int)
    label_test = ind2vec(label_test).astype(int)

    imgs = {'train': img_train, 'test': img_test}
    texts = {'train': text_train, 'test': text_test}
    labels = {'train': label_train, 'test': label_test}
    dataset = {x: CustomDataSet(images=imgs[x], texts=texts[x], labels=labels[x])
               for x in ['train', 'test']}

    shuffle = {'train': False, 'test': False}

    dataloader = {x: DataLoader(dataset[x], batch_size=batch_size,
                                shuffle=shuffle[x], num_workers=0) for x in ['train', 'test']}

    img_dim = img_train.shape[1]
    text_dim = text_train.shape[1]
    num_class = label_train.shape[1]

    input_data_par = {}
    input_data_par['img_test'] = img_test
    input_data_par['text_test'] = text_test
    input_data_par['label_test'] = label_test
    input_data_par['img_train'] = img_train
    input_data_par['text_train'] = text_train
    input_data_par['label_train'] = label_train
    input_data_par['img_dim'] = img_dim
    input_data_par['text_dim'] = text_dim
    input_data_par['num_class'] = num_class
    return dataloader, input_data_par

In [0]:
def train_model(model, data_loaders, optimizer, alpha, beta, device="cpu", num_epochs=500):
    since = time.time()
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    test_img_acc_history = []
    test_txt_acc_history = []
    epoch_loss_history =[]

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 20)

        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                # Set model to training mode
                model.train()
            else:
                # Set model to evaluate mode
                model.eval()

            running_loss = 0.0
            running_corrects_img = 0
            running_corrects_txt = 0
            # Iterate over data.
            for imgs, txts, labels in data_loaders[phase]:
                # imgs = imgs.to(device)
                # txts = txts.to(device)
                # labels = labels.to(device)
                if torch.sum(imgs != imgs)>1 or torch.sum(txts != txts)>1:
                    print("Data contains Nan.")

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    if torch.cuda.is_available():
                        imgs = imgs.cuda()
                        txts = txts.cuda()
                        labels = labels.cuda()


                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # Forward
                    view1_feature, view2_feature, view1_predict, view2_predict = model(imgs, txts)

                    loss = calc_loss(view1_feature, view2_feature, view1_predict,
                                     view2_predict, labels, labels, alpha, beta)

                    img_preds = view1_predict
                    txt_preds = view2_predict

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item()
                running_corrects_img += torch.sum(torch.argmax(img_preds, dim=1) == torch.argmax(labels, dim=1))
                running_corrects_txt += torch.sum(torch.argmax(txt_preds, dim=1) == torch.argmax(labels, dim=1))

            epoch_loss = running_loss / len(data_loaders[phase].dataset)
            # epoch_img_acc = running_corrects_img.double() / len(data_loaders[phase].dataset)
            # epoch_txt_acc = running_corrects_txt.double() / len(data_loaders[phase].dataset)
            t_imgs, t_txts, t_labels = [], [], []
            with torch.no_grad():
                for imgs, txts, labels in data_loaders['test']:
                    if torch.cuda.is_available():
                            imgs = imgs.cuda()
                            txts = txts.cuda()
                            labels = labels.cuda()
                    t_view1_feature, t_view2_feature, _, _ = model(imgs, txts)
                    t_imgs.append(t_view1_feature.cpu().numpy())
                    t_txts.append(t_view2_feature.cpu().numpy())
                    t_labels.append(labels.cpu().numpy())
            t_imgs = np.concatenate(t_imgs)
            t_txts = np.concatenate(t_txts)
            t_labels = np.concatenate(t_labels).argmax(1)
            img2text = fx_calc_map_label(t_imgs, t_txts, t_labels)
            txt2img = fx_calc_map_label(t_txts, t_imgs, t_labels)

            print('{} Loss: {:.4f} Img2Txt: {:.4f}  Txt2Img: {:.4f}'.format(phase, epoch_loss, img2text, txt2img))

            # deep copy the model
            if phase == 'test' and (img2text + txt2img) / 2. > best_acc:
                best_acc = (img2text + txt2img) / 2.
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'test':
                test_img_acc_history.append(img2text)
                test_txt_acc_history.append(txt2img)
                epoch_loss_history.append(epoch_loss)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best average ACC: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, test_img_acc_history, test_txt_acc_history, epoch_loss_history


In [0]:
def calc_loss(view1_feature, view2_feature, view1_predict, view2_predict, labels_1, labels_2, alpha, beta):
    term1 = ((view1_predict-labels_1.float())**2).sum(1).sqrt().mean() + ((view2_predict-labels_2.float())**2).sum(1).sqrt().mean()

    cos = lambda x, y: x.mm(y.t()) / ((x ** 2).sum(1, keepdim=True).sqrt().mm((y ** 2).sum(1, keepdim=True).sqrt().t())).clamp(min=1e-6) / 2.
    theta11 = cos(view1_feature, view1_feature)
    theta12 = cos(view1_feature, view2_feature)
    theta22 = cos(view2_feature, view2_feature)
    Sim11 = calc_label_sim(labels_1, labels_1).float()
    Sim12 = calc_label_sim(labels_1, labels_2).float()
    Sim22 = calc_label_sim(labels_2, labels_2).float()
    term21 = ((1+torch.exp(theta11)).log() - Sim11 * theta11).mean()
    term22 = ((1+torch.exp(theta12)).log() - Sim12 * theta12).mean()
    term23 = ((1 + torch.exp(theta22)).log() - Sim22 * theta22).mean()
    term2 = term21 + term22 + term23

    term3 = ((view1_feature - view2_feature)**2).sum(1).sqrt().mean()

    im_loss = term1 + alpha * term2 + beta * term3
    return im_loss

def calc_label_sim(label_1, label_2):
    Sim = label_1.float().mm(label_2.float().t())
    return Sim

In [0]:
def fx_calc_map_label(image, text, label, k = 0, dist_method='COS'):
  if dist_method == 'L2':
    dist = scipy.spatial.distance.cdist(image, text, 'euclidean')
  elif dist_method == 'COS':
    dist = scipy.spatial.distance.cdist(image, text, 'cosine')
  ord = dist.argsort()
  numcases = dist.shape[0]
  if k == 0:
    k = numcases
  res = []
  for i in range(numcases):
    order = ord[i]
    p = 0.0
    r = 0.0
    for j in range(k):
      if label[i] == label[order[j]]:
        r += 1
        p += (r / (j + 1))
    if r > 0:
      res += [p / r]
    else:
      res += [0]
  return np.mean(res)

In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# data parameters
DATA_DIR = 'pascal/'
alpha = 1e-3
beta = 1e-1
MAX_EPOCH = 200
batch_size = 100
# batch_size = 512
lr = 1e-4
betas = (0.5, 0.999)
weight_decay = 0

print('...Data loading is beginning...')

data_loader, input_data_par = get_loader(DATA_DIR, batch_size)

print('...Data loading is completed...')

model_ft = IDCM_NN(img_input_dim=input_data_par['img_dim'], text_input_dim=input_data_par['text_dim'], output_dim=input_data_par['num_class']).to(device)
params_to_update = list(model_ft.parameters())

# Observe that all parameters are being optimized
optimizer = optim.Adam(params_to_update, lr=lr, betas=betas)

print('...Training is beginning...')
# Train and evaluate
model_ft, img_acc_hist, txt_acc_hist, loss_hist = train_model(model_ft, data_loader, optimizer, alpha, beta, MAX_EPOCH)
print('...Training is completed...')

print('...Evaluation on testing data...')
view1_feature, view2_feature, view1_predict, view2_predict = model_ft(torch.tensor(input_data_par['img_test']).to(device), torch.tensor(input_data_par['text_test']).to(device))
label = torch.argmax(torch.tensor(input_data_par['label_test']), dim=1)
view1_feature = view1_feature.detach().cpu().numpy()
view2_feature = view2_feature.detach().cpu().numpy()
view1_predict = view1_predict.detach().cpu().numpy()
view2_predict = view2_predict.detach().cpu().numpy()
img_to_txt = fx_calc_map_label(view1_feature, view2_feature, label)
print('...Image to Text MAP = {}'.format(img_to_txt))

txt_to_img = fx_calc_map_label(view2_feature, view1_feature, label)
print('...Text to Image MAP = {}'.format(txt_to_img))

print('...Average MAP = {}'.format(((img_to_txt + txt_to_img) / 2.)))


...Data loading is beginning...
...Data loading is completed...
...Training is beginning...
Epoch 1/500
--------------------
train Loss: 0.0267 Img2Txt: 0.2081  Txt2Img: 0.2477
test Loss: 0.0229 Img2Txt: 0.2081  Txt2Img: 0.2477

Epoch 2/500
--------------------
train Loss: 0.0228 Img2Txt: 0.4067  Txt2Img: 0.4179
test Loss: 0.0204 Img2Txt: 0.4067  Txt2Img: 0.4179

Epoch 3/500
--------------------
train Loss: 0.0207 Img2Txt: 0.5217  Txt2Img: 0.5267
test Loss: 0.0187 Img2Txt: 0.5217  Txt2Img: 0.5267

Epoch 4/500
--------------------
train Loss: 0.0187 Img2Txt: 0.5594  Txt2Img: 0.5960
test Loss: 0.0174 Img2Txt: 0.5594  Txt2Img: 0.5960

Epoch 5/500
--------------------
train Loss: 0.0170 Img2Txt: 0.5938  Txt2Img: 0.6371
test Loss: 0.0165 Img2Txt: 0.5938  Txt2Img: 0.6371

Epoch 6/500
--------------------
train Loss: 0.0156 Img2Txt: 0.6224  Txt2Img: 0.6591
test Loss: 0.0158 Img2Txt: 0.6224  Txt2Img: 0.6591

Epoch 7/500
--------------------
train Loss: 0.0146 Img2Txt: 0.6271  Txt2Img: 0.6685
t