### Summary of the notebook:
In this notebook, we provide an implementation for generating explanations of an autonomous vehicle's actions using a visual question answering (VQA) approach. Briefly, we fine-tune the pretrained VGG-19 architecture on video data collected from a DDPG-based autonomous driving agent in the CARLA simulator. We then elementwise multiply the obtained image features with the question encoding produced by an LSTM. The resulting vector is passed through a fully connected layer and a softmax is applied. In this way, we select the 5 explanations with the highest probability scores out of a vocabulary of 1000 possible explanatory answers, and the explanation with the highest score becomes the answer to the question asked about the action performed in that scene. For instance, below we show an image frame in which an autonomous car is going straight. If we ask "Why is going straight decided?", the softmax produces the top 5 explanations, and the one with the highest score (i.e., "Because road is clear.") becomes the answer to this question about the car's action in that scene.

In [93]:
import torch
import torch.nn as nn
import torchvision.models as models
import numpy as np
import matplotlib.pyplot as plt


#### The 4096-dim activations from the last hidden layer of VGG-19 are projected by a linear layer and L2-normalized to form the image embedding

In [94]:
class ImgEncoder(nn.Module):
    """Image encoder: VGG-19 without its output layer, followed by a linear projection.

    The VGG-19 backbone runs under ``torch.no_grad()``, so only the projection
    layer ``fc`` receives gradients. The projected feature is divided by its
    per-sample L2 norm.
    """

    # Lower bound on the L2 norm used as divisor, so an all-zero feature yields
    # zeros instead of NaN.
    _NORM_EPS = 1e-12

    def __init__(self, embed_size):
        """Build the backbone and the projection layer.

        Args:
            embed_size: Dimensionality of the returned image embedding.
        """
        super(ImgEncoder, self).__init__()
        # pretrained=True asks torchvision for its pretrained VGG-19 weights.
        model = models.vgg19(pretrained=True)
        in_features = model.classifier[-1].in_features  # input width of the layer removed below
        model.classifier = nn.Sequential(*list(model.classifier.children())[:-1])  # remove output layer

        self.model = model  # vgg19 without output layer
        self.fc = nn.Linear(in_features, embed_size)

    def forward(self, image):
        """Return the L2-normalised embedding of ``image``.

        Args:
            image: Batch of images accepted by the VGG-19 backbone.

        Returns:
            Tensor of shape ``[batch, embed_size]`` whose rows have unit L2 norm
            (or are all zeros when the projected feature is all zeros).
        """
        with torch.no_grad():
            img_feature = self.model(image)
        img_feature = self.fc(img_feature)
        # The norm is detached: it scales the feature but contributes no gradient.
        l2_norm = img_feature.norm(p=2, dim=1, keepdim=True).detach()
        # Clamp the divisor so a zero-norm row does not produce NaN.
        img_feature = img_feature.div(l2_norm.clamp_min(self._NORM_EPS))

        return img_feature

#### LSTM question encoder: the final hidden and cell states (1024-dim per layer) are concatenated and projected to the question embedding

In [95]:
class QstEncoder(nn.Module):
    """Question encoder: embedding -> tanh -> LSTM -> concat(h, c) -> tanh -> linear."""

    def __init__(self, vocab_size, featd, hidden_size, num_layers, out_size):
        """Create the embedding table, the LSTM and the output projection.

        Args:
            vocab_size: Number of rows in the word-embedding table.
            featd: Word-embedding dimensionality (also the LSTM input size).
            hidden_size: Size of the LSTM hidden and cell states.
            num_layers: Number of stacked LSTM layers.
            out_size: Dimensionality of the returned question feature.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=featd)
        self.tanh = nn.Tanh()
        self.lstm = nn.LSTM(input_size=featd, hidden_size=hidden_size, num_layers=num_layers)
        # Every layer contributes one hidden and one cell vector to the flattened state.
        self.fc = nn.Linear(in_features=2 * num_layers * hidden_size, out_features=out_size)

    def forward(self, question):
        """Encode a batch of token-index sequences.

        Args:
            question: Integer tensor, expected as ``[batch, seq_len]``.

        Returns:
            Tensor of shape ``[batch, out_size]``.
        """
        embedded = self.tanh(self.embedding(question))  # [batch, seq_len, featd]
        # The LSTM was built without batch_first, so it wants the sequence axis first.
        embedded = torch.transpose(embedded, 0, 1)
        _, (h_n, c_n) = self.lstm(embedded)

        states = torch.cat([h_n, c_n], dim=2)  # [num_layers, batch, 2*hidden_size]
        states = torch.transpose(states, 0, 1)  # [batch, num_layers, 2*hidden_size]
        batch = states.size(0)
        flat_states = states.reshape(batch, -1)  # [batch, 2*num_layers*hidden_size]

        return self.fc(self.tanh(flat_states))



#### Combining the image and question embeddings by elementwise multiplication, passing the result through fully connected layers, and applying softmax

In [96]:
class VqaModel(nn.Module):
    """VQA head: fuses image and question embeddings and scores the answer classes.

    The two embeddings are multiplied elementwise, passed through tanh and
    dropout, then through two linear layers. ``forward`` returns both the raw
    logits and their softmax.
    """

    def __init__(self,vocab_size, feat_dim, hidden_size, num_layers, out_size ):
        """Build the encoders and the classification layers.

        Args:
            vocab_size: Size of the question-embedding table; it is also used as
                the width of both linear layers, i.e. the number of output scores.
            feat_dim: Word-embedding dimensionality of the question encoder.
            hidden_size: LSTM hidden size of the question encoder.
            num_layers: Number of LSTM layers of the question encoder.
            out_size: Dimensionality shared by the image and question embeddings.
        """
        super(VqaModel, self).__init__()
        self.img_enc = ImgEncoder(out_size)
        self.qst_enc = QstEncoder(vocab_size, feat_dim, hidden_size, num_layers, out_size)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(out_size, vocab_size)
        self.out = nn.Linear(vocab_size, vocab_size)
        # Explicit dim: the logits are [batch_size, vocab_size], so normalise over
        # the class axis. Omitting `dim` is deprecated and relies on an implicit choice.
        self.outsoft = nn.Softmax(dim=1)

    def forward(self, img, qst):
        """Score every output class for each (image, question) pair.

        Args:
            img: Image batch passed to the image encoder.
            qst: Question token-index batch passed to the question encoder.

        Returns:
            Tuple ``(logits, probs)``, both ``[batch_size, vocab_size]``;
            ``probs`` is the softmax of ``logits`` over the class axis.
        """
        img_feat = self.img_enc(img)
        qst_feat = self.qst_enc(qst)
        combined_feat = torch.mul(img_feat, qst_feat)  # elementwise fusion
        combined_feat = self.tanh(combined_feat)
        combined_feat = self.dropout(combined_feat)
        combined_feat = self.fc1(combined_feat)
        combined_feat = self.dropout(combined_feat)
        logits = self.out(combined_feat)  # [batch_size, vocab_size], unnormalised scores
        softmaxout = self.outsoft(logits)
        return logits, softmaxout

In [97]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [98]:
# Model hyper-parameters.
embed_size = 1204  # NOTE(review): not passed to the model below; possibly a typo for 1024 - confirm
word_embed_size = 300
num_layers = 2
hidden_size = 512
qst_vocab_size = 17856
ans_vocab_size = 1000

# Build the network from the sizes above, then place it on the selected device.
qamodel = VqaModel(
    vocab_size=qst_vocab_size,
    feat_dim=word_embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    out_size=ans_vocab_size,
)
qamodel = qamodel.to(device)

#### Loading the fine-tuned driving model

In [99]:
# map_location remaps the stored tensors onto `device`, so a checkpoint saved on
# a GPU also loads when this notebook falls back to the CPU.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
qamodel.load_state_dict(torch.load('modelsv2/best_model_drive.pt', map_location=device))

<All keys matched successfully>

In [100]:
import numpy as np
import sys

# Make np.load default to allow_pickle=True.
# SECURITY: unpickling executes arbitrary code; only np.load files you trust.
#
# The unpatched loader is recovered from the wrapper itself, so re-running this
# cell never wraps the wrapper (the previous lambda looked up `np_load_old` at
# call time and recursed forever after a second run of the cell).
np_load_old = getattr(np.load, "_unpatched_load", np.load)


def _np_load_allow_pickle(*a, **k):
    """Call the original np.load with allow_pickle=True unless the caller set it."""
    k.setdefault("allow_pickle", True)
    return np_load_old(*a, **k)


_np_load_allow_pickle._unpatched_load = np_load_old
np.load = _np_load_allow_pickle


In [101]:
# Inference setup: nn.Module.to moves the parameters in place and returns the
# same module, so no rebinding is needed.
qamodel.to(device)
# Switch to evaluation mode; as the last expression it also displays the module repr.
qamodel.eval()

VqaModel(
  (img_enc): ImgEncoder(
    (model): VGG(
      (features): Sequential(
        (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU(inplace=True)
        (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (6): ReLU(inplace=True)
        (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (8): ReLU(inplace=True)
        (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (11): ReLU(inplace=True)
        (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (13): ReLU(inplace=True)
        (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1),

In [102]:
def load_str_list(fname, encoding="utf-8"):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Args:
        fname: Path of the text file to read.
        encoding: Text encoding used to decode the file. Fixed to UTF-8 by
            default so the result does not depend on the platform locale.

    Returns:
        List with one string per line of the file, in file order. Blank lines
        are kept as empty strings.
    """
    with open(fname, encoding=encoding) as f:
        return [line.strip() for line in f]


# Question and answer vocabularies: one entry per line, list position == index.
qst_vocab = load_str_list("./COCO-2015/datasets/vocab_questions.txt")
ans_vocab = load_str_list("./COCO-2015/datasets/vocab_answers.txt")

# Reverse lookup token -> index; a token repeated in the file keeps its last position.
word_to_index_dict = dict(zip(qst_vocab, range(len(qst_vocab))))
# Index of the out-of-vocabulary token, or None when the vocabulary has no '<unk>'.
unknown_to_index = word_to_index_dict.get('<unk>')
vocab_size = len(qst_vocab)

In [103]:
def word_to_index(w):
    """Map a question token to its vocabulary index.

    Looks ``w`` up in ``word_to_index_dict``; an unknown token maps to
    ``unknown_to_index`` when that is set.

    Raises:
        ValueError: If ``w`` is unknown and the vocabulary has no '<unk>' entry.
    """
    try:
        return word_to_index_dict[w]
    except KeyError:
        pass  # unknown token: fall through to the '<unk>' handling below

    if unknown_to_index is None:
        raise ValueError('word %s not in dictionary (while dictionary does not contain <unk>)' % w)
    return unknown_to_index

#### Testing on a sample image to see the most probable explanations (i.e., answers) to a question about the action taken in the corresponding driving scene

In [104]:
import numpy as np
import cv2

max_qst_length=30

# --- Encode the question as a fixed-length vector of vocabulary indices ---
question = 'why is going straight decided?'
# NOTE(review): a plain whitespace split leaves the '?' attached to 'decided?';
# presumably this matches how the training questions were tokenized - confirm.
# Truncate so a long question cannot overflow the fixed-length buffer below.
q_list = list(question.split(" "))[:max_qst_length]

idx = 'valid'
qst2idc = np.array([word_to_index('<pad>')] * max_qst_length)  # padded with '<pad>' from the question vocabulary
qst2idc[:len(q_list)] = [word_to_index(w) for w in q_list]

question = torch.from_numpy(qst2idc).long()
question = question.to(device)
question = question.unsqueeze(dim=0)  # add the batch dimension

# --- Load the driving-scene frame ---
image_path = "./Selected_segments_frames_test_data/img_test.png"
image = cv2.imread(image_path)
if image is None:
    # cv2.imread signals an unreadable or missing file by returning None, not by raising.
    raise FileNotFoundError("could not read image: %s" % image_path)
image = cv2.resize(image, (640, 480))
image = torch.from_numpy(image).float()
image = image.to(device)
image = image.unsqueeze(dim=0)  # add the batch dimension
# NOTE(review): view() reinterprets the existing buffer as (1, 3, 640, 480) without
# moving any axis; a channels-last -> channels-first conversion would normally be
# permute(0, 3, 1, 2). Kept unchanged because the preprocessing the checkpoint was
# trained with is not visible here - confirm before changing.
image = image.view(1,3,640,480)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    output, probs = qamodel(image, question)




In [105]:
# Take the top 5 from the softmax of the untouched logits rather than from `probs`
# itself: rebinding `probs` to its own top-k made a second run of this cell return
# the positions 0..4 instead of vocabulary indices.
probs, indices = torch.topk(torch.softmax(output, dim=1), k=5, dim=1)

#### The model picks the correct answer with a probability score of ~0.97 on the sample image.

In [106]:
# Drop the batch dimension so each tensor is a flat vector of 5 entries.
probs = probs.squeeze()
indices = indices.squeeze()

print("Top 5 predictions with the probability scores:")
for rank in range(5):
    answer = ans_vocab[indices[rank].item()]  # answer text for the predicted class index
    score = probs[rank].item()
    print(f"'{answer}' - {score:.4f}")

Top 5 predictions with the probability scores:
'Because road is clear.' - 0.9726
'Because the road is bending to the left.' - 0.0176
'<unk>' - 0.0082
'Because the road is bending to the right.' - 0.0011
'can' - 0.0001
