# CAM: Class Activation Map


<center>
<img src="https://i.imgur.com/ZbuAN3A.png" width="400">
</center>

In this work, we revisit the **global average pooling** layer proposed, and shed light on how it explicitly enables the convolutional neural network (CNN) to have remarkable localization ability despite being trained on image level labels. 




**Global Average Pooling for localizable deep representation**

While this technique was previously proposed
as a means for regularizing training, researchers find that it actually builds a generic localizable deep representation that
exposes the implicit attention of CNNs on an image.

<center/>
<img src="https://you359.github.io/images/contents/cam_gap.png" width=400>
</center>



**How GAP represent localizable representation?**

Let's compare [flatten] + [fully connected layerS] and [global average pooling] +  [one fully connected layer].

1. [flatten] + [fully connected layerS]

    <center/>
    <img src="https://www.researchgate.net/profile/Budiman_Minasny/publication/334783857/figure/fig4/AS:786596169269249@1564550549811/Illustration-of-flatten-layer-that-is-connecting-the-pooling-layers-to-the-fully.png" width=300>
    </center>


2. [global average pooling] +  [one fully connected layer]

    It can contain localizable representation.
    <center/>
    <img src="https://you359.github.io/images/contents/cam_gap.png" width=300>
    </center>
    <center/>
    <img src="https://jsideas.net/assets/materials/20180104/S_c.png" width=300>
    </center>


Here is a **[paper link.](http://cnnlocalization.csail.mit.edu/Zhou_Learning_Deep_Features_CVPR_2016_paper.pdf)**





# Import module


In [0]:
# import module
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.autograd import Variable

import torch.nn.functional as F
from skimage.transform import resize
import matplotlib.pyplot as plt
%matplotlib inline  
import random

# Hyperparameters



In [0]:
batch_size = 32
learning_rate = 0.001
num_epoch = 1

# Data load: MNIST


In [0]:
mnist_train = dset.MNIST("./", train=True, 
                         transform=transforms.Compose([
                            transforms.RandomCrop(22),
                            transforms.Resize(226),
                            transforms.ToTensor(),
                         ]), 
                         target_transform=None, 
                         download=True)

mnist_test = dset.MNIST("./", train=False,
                        transform=transforms.Compose([
                            transforms.RandomCrop(22),
                            transforms.Resize(226),
                            transforms.ToTensor(),
                        ]),
                        target_transform=None, 
                        download=True)

train_loader = torch.utils.data.DataLoader(mnist_train,batch_size=batch_size, shuffle=True,num_workers=2,drop_last=True)
test_loader = torch.utils.data.DataLoader(mnist_test,batch_size=batch_size, shuffle=True,num_workers=2,drop_last=True)

# Model: Resnet34

Resnet use global average pooling. Please check this image.


<center/>
<img src="https://i.stack.imgur.com/ElFiI.png" width=250>
</center>


In [0]:
import torchvision.models as models
resnet34 = models.resnet34(pretrained=True)
resnet34.fc = nn.Linear(resnet34.fc.in_features, 10)

## Train


In [0]:
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet34.parameters(), lr=learning_rate)
resnet34.cuda()
for i in range(num_epoch):
    resnet34.train()
    for j,[image,label] in enumerate(train_loader):
        x = Variable(image)
        x = torch.cat([x, x, x], dim=1).type(torch.FloatTensor).cuda()
        y_= Variable(label).cuda()
        optimizer.zero_grad()
        output = resnet34.forward(x)
        loss = loss_func(output,y_)
        loss.backward()
        optimizer.step()
        
    top_1_count = torch.FloatTensor([0])
    total = torch.FloatTensor([0])
    resnet34.eval() 
    for image,label in test_loader:
        x = Variable(image)
        x = torch.cat([x, x, x], dim=1).type(torch.FloatTensor).cuda()
        y_= Variable(label).cuda()
        output = resnet34.forward(x)
        values,idx = output.max(dim=1)
        top_1_count += torch.sum(y_==idx).float().cpu().data

        total += label.size(0)

    print("Test Data Accuracy: {}%".format(100*(top_1_count/total).numpy()))
    if (top_1_count/total).numpy() > 0.98:
        break

Test Data Accuracy: [98.97836]%


# Class Activation Map module

Class activation map is computed by weight sum of feature-maps. 

It means that we compute a
weighted sum of the feature maps of **the last convolutional**
layer to obtain our class activation maps.

$$
W_1^c \cdot F_1^c + W_2^c \cdot F_2^c + \cdots + W_N^c \cdot F_N^c = CAM_c
$$


- $W_k$: importance of F_k, weight value for predicting the input image as class $c$.
- $F_k$: Feature map k-th
- $c$: class index

In [0]:
from torchvision import transforms
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np
import cv2, torch

# generate class activation mapping for the top1 prediction
def returnCAM(feature_conv, weight_softmax, class_idx):
    # generate the class activation maps upsample to 256x256
    bz, nc, h, w = feature_conv.shape
    output_cam = []
    cam = weight_softmax[class_idx].dot(feature_conv.reshape((nc, h*w))) # weight_softmax: shape 1 * 512 featureConv: shape 512 * h * w
    cam = np.reshape(cam, (w, h))
    cam = cam - np.min(cam)
    cam_img = cam / np.max(cam)
    cam_img = np.uint8(255 * cam_img)
    return cam_img

def get_cam(net, features_blobs, img, classes):
    params = list(net.parameters())
    weight_softmax = np.squeeze(params[-2].data.cpu().numpy())
    img_pil = torch.cat([img, img, img], dim=0)
    img_pil = transforms.ToPILImage()(img_pil.type(torch.FloatTensor))
   
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
    preprocess = transforms.Compose([
        transforms.Resize((226, 226)),
        transforms.ToTensor(),
        normalize
    ])
    
    img_tensor = preprocess(img_pil)
    img_variable = Variable(img_tensor.unsqueeze(0)).cuda()
    logit = net(img_variable)
    h_x = F.softmax(logit, dim=1).data.squeeze()
    probs, idx = h_x.sort(0, True)

    # output: the prediction
    for i in range(0, 10):
        line = '{:.3f} -> {}'.format(probs[i], classes[idx[i].item()])
        print(line)

    CAMs = returnCAM(features_blobs[0], weight_softmax, [idx[0].item()])

    # render the CAM and output
    print('output CAM.jpg for the top1 prediction: %s' % classes[idx[0].item()])
    _, height, width = img.shape
    img = img.detach().cpu().numpy()
    img = np.squeeze(img)
    
    CAM = cv2.resize(CAMs, (width, height))
    result = img * 255 * 0.5 

    plt.imshow(result, cmap="gray")
    plt.imshow(CAM, alpha=0.5, cmap='hot')
    plt.show()


In [0]:
# hook the feature extractor
features_blobs = []

def hook_feature(module, input, output):
    features_blobs.append(output.data.cpu().numpy())

resnet34.layer4[-1].conv2.register_forward_hook(hook_feature)

<torch.utils.hooks.RemovableHandle at 0x7f1799cd3320>

In [0]:
classes = {0: '1', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8:'8', 9:'9'}
features_blobs = []
resnet34.cuda()
for image, label in mnist_train:
    get_cam(resnet34, features_blobs, image[0].unsqueeze(0), classes)
    

## Reproduce!

In [0]:
class CAM:
    """Class Activation Map for resnet34"""

    def __init__(self, model: torch.nn.module, weight: np.ndarray):
        self.model = model
        self.params = list(self.model.parameters())
        self.weight_softmax = np.squeeze(self.params[-2].data.cpu().numpy())
        self.features_blobs = []

        # last conv
        self.model.layer4[-1].conv2.register_forward_hook(self._hook_forward())

    def _hook_forward(self):
        def hook_feature(module, input, output):
            self.features_blobs.append(output.data.cpu().numpy())
        return hook_feature


    def get_cam(self, img:np.ndarray, target:int=None):
        _, height, width = img.shape
        input_tensor = self._make_tensor(img)
        logit = self.model(output)
        softmax, idx = logit.sort(0, True)
        CAM = self.calculate_cam(self.features_blobs[-1], self.weight_softmax,idx[0].item())
        return CAM

    def calculate_cam(self,feature_map, weight, class_idx):
        bz, nc, h, w = feature_map.shape
        output_cam = []
        cam = weight_softmax[class_idx].dot(feature_conv.reshape((nc, h*w))) # weight_softmax: shape 1 * 512 featureConv: shape 512 * h * w
        cam = np.reshape(cam, (w, h))
        cam = cam - np.min(cam)
        cam = cam / np.max(cam)
        cam = np.uint8(255 * cam)
        return cam

     def _make_tensor(self, img:np.ndarray):
        img_pil = torch.cat([img, img, img], dim=0)
        img_pil = transforms.ToPILImage()(img_pil.type(torch.FloatTensor))
    
        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
        preprocess = transforms.Compose([
            transforms.Resize((226, 226)),
            transforms.ToTensor(),
            normalize
        ])
        
        img_tensor = preprocess(img_pil)
        img_variable = Variable(img_tensor.unsqueeze(0)).cuda()

        h_x = F.softmax(logit, dim=1).data.squeeze()
        probs, idx = h_x.sort(0, True)
        return img_variable
    
     
