## **clevrDialog_dataset.py**


In [10]:
#for cleaning the CPU ram
import gc
gc.collect()

%reset -f

In [11]:
# path to notebook folder, use os.path.join to concat 
import os


In [12]:
#Here we set variables for number of iterations and validation
TOTAL_ITER = 5000
VALID_EVE =1000

In [13]:
#for cleaning the GPU ram
import torch 
torch.cuda.empty_cache()

In [14]:
import h5py
import json
import os
import numpy as np
import torch
from torch.utils.data import Dataset
import argparse#***

In [15]:
def invertDict(_dict):
    return {v: k for k, v in _dict.items()}

In [16]:
class ClevrDialogDataset(Dataset):
    def __init__(self, dataPath, vocabPath, split, indStart=0, indEnd=-1):
        super(ClevrDialogDataset, self).__init__()
        self.data = h5py.File(dataPath, "r")
        with open(vocabPath, "r") as f:
            self.vocab = json.load(f)
        self.vocab["idx_text_to_token"] = invertDict(self.vocab["text_token_to_idx"])
        self.vocab["idx_prog_to_token"] = invertDict(self.vocab["prog_token_to_idx"])
        self.vocab["idx_prog_to_token"] = invertDict(self.vocab["prog_token_to_idx"])
        self.lenVocabText = len(self.vocab["text_token_to_idx"])
        self.lenVocabProg = len(self.vocab["prog_token_to_idx"])

        self.split = split
        self.indStart = indStart
        self.indEnd = indEnd
        self.maxSamples = indEnd - indStart
        self.maxLenProg = 6

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, index):
        raise NotImplementedError

In [17]:
class ClevrDialogQuestionDataset(ClevrDialogDataset):
    def __init__(self, dataPath, vocabPath, split, name, train=True, indStart=0, indEnd=-1):
        super(ClevrDialogQuestionDataset, self).__init__(dataPath, vocabPath, split, indStart=indStart, indEnd=indEnd)
        self.questions = torch.LongTensor(np.asarray(self.data["questions"], dtype=np.int64)[indStart: indEnd])
        self.quesProgs = torch.LongTensor(np.asarray(self.data["questionProgs"], dtype=np.int64)[indStart: indEnd])
        self.questionRounds = torch.LongTensor(np.asarray(self.data["questionRounds"], dtype=np.int64)[indStart: indEnd])
        self.questionImgIdx = torch.LongTensor(np.asarray(self.data["questionImgIdx"], dtype=np.int64)[indStart: indEnd])
        self.histories = torch.LongTensor(np.asarray(self.data["histories"], dtype=np.int64)[indStart: indEnd])
        self.historiesProgs = torch.LongTensor(np.asarray(self.data["historiesProg"], dtype=np.int64)[indStart: indEnd])

        self.answers = torch.LongTensor(np.asarray(self.data["answers"], dtype=np.int64)[indStart: indEnd])
        self.name = name
        self.train = train

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        assert idx < len(self)
        question = self.questions[idx]
        questionPrg = self.quesProgs[idx]
        questionImgIdx = self.questionImgIdx[idx]
        questionRound = self.questionRounds[idx]

        history = self.histories[idx]
        historiesProg = self.historiesProgs[idx]

        answer = self.answers[idx]
        if self.train:
            return question, history, questionPrg, questionRound, answer
        else:
            return question, questionPrg, questionImgIdx, questionRound, history, historiesProg, answer

## **Clevr_statistics**

In [18]:
COLORS = ["blue", "brown", "cyan", "gray", "green", "purple", "red", "yellow"]
MATERIALS = ["rubber", "metal"]
SHAPES = ["cube", "cylinder", "sphere"]
SIZES = ["large", "small"]

ATTRIBUTES_ALL = COLORS + MATERIALS + SHAPES + SIZES

ANSWER_CANDIDATES = {
    # Count questions
    "count-all": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
    "count-other": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
    "count-all-group": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
    "count-attribute": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
    "count-attribure-group": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
    "count-obj-rel-imm": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
    "count-obj-rel-imm2": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
    "count-obj-rel-early": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
    "count-obj-exclude-imm": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],
    "count-obj-exclude-early": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"],

    # Existence questions
    "exist-other": ["yes", "no"],
    "exist-attribute": ["yes", "no"],
    "exist-attribute-group": ["yes", "no"],
    "exist-obj-rel-imm": ["yes", "no"],
    "exist-obj-rel-imm2": ["yes", "no"],
    "exist-obj-rel-early": ["yes", "no"],
    "exist-obj-exclude-imm": ["yes", "no"],
    "exist-obj-exclude-early": ["yes", "no"],

    # Seek questions
    "seek-attr-imm": ATTRIBUTES_ALL,
    "seek-attr-imm2": ATTRIBUTES_ALL,
    "seek-attr-early": ATTRIBUTES_ALL,
    "seek-attr-sim-early": ATTRIBUTES_ALL,
    "seek-attr-rel-imm": ATTRIBUTES_ALL,
    "seek-attr-rel-early": ATTRIBUTES_ALL,
}


## **Utils_m** just a function

In [19]:
import json
import numpy as np


def merge_captions_question_programs(path_cap, path_ques, caption_first=True):
    with open(path_cap, "r"):
        c_progs = path_cap.readlines()
    with open(path_ques, "r"):
        q_progs = path_ques.readlines()

    all_merged_progs = []
    i = 0
    while i < len(q_progs):
        cap_idx = i % 11 if caption_first else i % 10
        start_idx_p = i + 1 if caption_first else i
        end_idx_p = start_idx_p + 12 if caption_first else  start_idx_p + 11
        temp = c_progs[cap_idx] + q_progs[start_idx_p, end_idx_p]
        all_merged_progs.append(temp)
        i = end_idx_p


def load_clevr_scenes(scenes_json):
    with open(scenes_json) as f:
        scenes_raw = json.load(f)
    if type(scenes_raw) == dict:
        scenes_raw = scenes_raw["scenes"]

    scenes = []
    for s in scenes_raw:
        table = []
        for i, o in enumerate(s['objects']):
            item = {}
            item['id'] = '%d-%d' % (s['image_index'], i)
            if '3d_coords' in o:
                item['position'] = [np.dot(o['3d_coords'], s['directions']['right']),
                                    np.dot(o['3d_coords'], s['directions']['front']),
                                    o['3d_coords'][2]]
            else:
                item['position'] = o['position']
            item['color'] = o['color']
            item['material'] = o['material']
            item['shape'] = o['shape']
            item['size'] = o['size']
            table.append(item)
        scenes.append(table)
    return scenes


def load_minecraft_scenes(scenes_json):
    with open(scenes_json) as f:
        scenes_raw = json.load(f)
    if type(scenes_raw) == dict:
        scenes_raw = scenes_raw["scenes"]

    scenes = []
    for s in scenes_raw:
        table = []
        for i, o in enumerate(s['objects']):
            item = {}
            item['id'] = '%d-%d' % (s['image_index'], i)
            if '3d_coords' in o:
                item['position'] = [np.dot(o['3d_coords'], s['directions']['right']),
                                    np.dot(o['3d_coords'], s['directions']['front']),
                                    o['3d_coords'][2]]
            else:
                item['position'] = o['position']
            item['nature'] = o['nature']
            item['class'] = o['class']
            item['direction'] = "facing_"
            if o['direction'] == "front":
                item['direction'] += "forward"
            elif o['direction'] == "back":
                item['direction'] += "backward"
            elif o['direction'] == "right":
                item['direction'] += "right"
            elif o['direction'] == "left":
                item['direction'] += "left"
            table.append(item)
        scenes.append(table)
    return scenes

## **Symbolic_executor**

In [20]:
import numpy as np
from copy import deepcopy



class SymbolicExecutorClevr(object):
    """Symbolic executor for clevr-dialog
    """
    def __init__(self, scenesPath):
        super(SymbolicExecutorClevr, self).__init__()
        self.functions = {}
        self.registerFunctions()
        self.uniqueObjFlag = False
        self.colors = COLORS
        self.materials = MATERIALS
        self.shapes = SHAPES
        self.sizes = SIZES
        self.answer_candidates = ANSWER_CANDIDATES#***
        self.attribute_all = ATTRIBUTES_ALL#***
        self.scenes = load_clevr_scenes(scenesPath)

    def reset(self, sceneIdx):
        """Resets the scene

        Args:
            sceneIdx: The index of the new scene
        """
        self.scene = self.scenes[sceneIdx]
        for _obj in self.scene:
            _obj["identifier"] = None
        # store previous objects in a list to better answer
        # xxx-imm, xxx-imm2, xxx-group and xxx-early questions.
        self.objs = []
        self.groups = []
        self.visited = []
        self.currentObj = None
        self.currentGrp = []
        self.uniqueObjFlag = False

    def registerFunctions(self):
        """Registers the available functions of the executor.
        """
        # Captions - extreme location
        self.functions["extreme-right"] = self.extremeRight
        self.functions["extreme-left"] = self.extremeLeft
        self.functions["extreme-behind"] = self.extremeBehind
        self.functions["extreme-front"] = self.extremeFront
        self.functions["extreme-center"] = self.extremeCenter

        # Captions - multiple objects
        self.functions["count-att"] = self.countAttributeCaption

        # Captions - object relations
        self.functions["obj-relation"] = self.objRelation

        # Captions - unique object
        self.functions["unique-obj"] = self.uniqueObject

        # Questions - Count
        self.functions["count-all"] = self.countAll
        self.functions["count-other"] = self.countOther
        self.functions["count-all-group"] = self.countAllGroup
        self.functions["count-attribute"] = self.countAttribute
        self.functions["count-attribute-group"] = self.countAttributeGroup
        self.functions["count-obj-rel-imm"] = self.countObjRelImm
        self.functions["count-obj-rel-imm2"] = self.countObjRelImm2
        self.functions["count-obj-rel-early"] = self.countObjRelEarly
        self.functions["count-obj-exclude-imm"] = self.countObjExcludeImm
        self.functions["count-obj-exclude-early"] = self.countObjExcludeEarly

        # Questions - Exist
        self.functions["exist-other"] = self.existOther
        self.functions["exist-attribute"] = self.existAttribute
        self.functions["exist-attribute-group"] = self.existAttributeGroup
        self.functions["exist-obj-rel-imm"] = self.existObjRelImm
        self.functions["exist-obj-rel-imm2"] = self.existObjRelImm
        self.functions["exist-obj-rel-early"] = self.existObjRelEarly
        self.functions["exist-obj-exclude-imm"] = self.existObjExcludeImm
        self.functions["exist-obj-exclude-early"] = self.existObjExcludeEarly

        # Questions - Seek
        self.functions["seek-attr-imm"] = self.seekAttrImm
        self.functions["seek-attr-imm2"] = self.seekAttrImm
        self.functions["seek-attr-early"] = self.seekAttributeEarly
        self.functions["seek-attr-rel-imm"] = self.seekAttributeRelImm
        self.functions["seek-attr-rel-early"] = self.seekAttributeRelEarly


    ########################################################
    #                   Helper functions                   #
    ########################################################
    def getAttributeType(self, attribute):
        assert attribute in self.attribute_all, "The attribute {} is unkown".format(
            attribute)
        if attribute in self.colors:
            return "color"
        elif attribute in self.materials:
            return "material"
        elif attribute in self.shapes:
            return "shape"
        elif attribute in self.sizes:
            return "size"

    def execute(self, functionLabel, functionArgs):
        assert functionLabel in self.functions, "{} is not a valid function".format(
            functionLabel)
        function = self.functions[functionLabel]
        answer = function(*functionArgs)
        return answer

    def updateCurrentObj(self, obj):
        self.currentObj = obj
        objsCopy = deepcopy(self.objs)
        for i, _obj in enumerate(objsCopy):
            if _obj["id"] == obj["id"]:
                del self.objs[i]
        # Current obj is always kept at the end of the visited objs
        self.objs.append(obj)

    def updateVisited(self, obj):
        if len(self.visited) == 0:
            self.visited.append(obj)
        else:
            newObjFlag = True
            for _obj in self.visited:
                if _obj["id"] == obj["id"]:
                    newObjFlag = False
                    break
            if newObjFlag:
                self.visited.append(obj)

    def getOther(self):
        others = []
        if len(self.visited) < len(self.scene):
            for _obj in self.scene:
                notExisting = True
                for __obj in self.visited:
                    if __obj["id"] == _obj["id"]:
                        notExisting = False
                        break
                if notExisting:
                    others.append(_obj)
        return others

    def updateIdentifier(self, obj, attribute):
        if obj["identifier"] is None:
            obj["identifier"] = attribute
        else:
            identifiers = obj["identifier"].split("-")
            if attribute not in identifiers:
                identifiers.append(attribute)
                obj["identifier"] = "-".join(identifiers)


    ########################################################
    #                   Caption programs                   #
    ########################################################

    def extremeRight(self, *attributes):
        attributes = list(attributes)
        attributeTypes = list(
            map(lambda att: self.getAttributeType(att), attributes))

        leftToRight = deepcopy(self.scene)
        leftToRight.sort(key=lambda o: o["position"][0])
        extremeRightObj = leftToRight[-1]
        for attributeType, attribute in zip(attributeTypes, attributes):
            assert extremeRightObj[attributeType] == attribute
            self.updateIdentifier(extremeRightObj, attribute)

        self.updateCurrentObj(extremeRightObj)
        self.updateVisited(extremeRightObj)
        del leftToRight

    def extremeLeft(self, *attributes):
        attributes = list(attributes)
        attributeTypes = list(
            map(lambda att: self.getAttributeType(att), attributes))

        leftToRight = deepcopy(self.scene)
       
        leftToRight.sort(key=lambda o: o["position"][0])
        extremeLeftObj = leftToRight[0]
        
        for attributeType, attribute in zip(attributeTypes, attributes):
            assert extremeLeftObj[attributeType] == attribute
            self.updateIdentifier(extremeLeftObj, attribute)

        self.updateCurrentObj(extremeLeftObj)
        self.updateVisited(extremeLeftObj)
        del leftToRight

    def extremeFront(self, *attributes):
        attributes = list(attributes)
        attributeTypes = list(
            map(lambda att: self.getAttributeType(att), attributes))

        backToFront = deepcopy(self.scene)
        backToFront.sort(key=lambda o: o["position"][1])
        extremeFrontObj = backToFront[-1]
        for attributeType, attribute in zip(attributeTypes, attributes):
        

            assert extremeFrontObj[attributeType] == attribute
            self.updateIdentifier(extremeFrontObj, attribute)

        self.updateCurrentObj(extremeFrontObj)
        self.updateVisited(extremeFrontObj)
        del backToFront

    def extremeBehind(self, *attributes):
        attributes = list(attributes)
        attributeTypes = list(
            map(lambda att: self.getAttributeType(att), attributes))

        backToFront = deepcopy(self.scene)
        backToFront.sort(key=lambda o: o["position"][1])
        extremeBehindObj = backToFront[0]
        for attributeType, attribute in zip(attributeTypes, attributes):
            assert extremeBehindObj[attributeType] == attribute
            self.updateIdentifier(extremeBehindObj, attribute)

        self.updateCurrentObj(extremeBehindObj)
        self.updateVisited(extremeBehindObj)
        del backToFront

    def extremeCenter(self, *attributes):
        attributes = list(attributes)
        attributeTypes = list(
            map(lambda att: self.getAttributeType(att), attributes))
        numObjs = len(self.scene)

        frontToBack = deepcopy(self.scene)
        frontToBack.sort(key=lambda o: o["position"][1], reverse=True)

        rightToLeft = deepcopy(self.scene)
        rightToLeft.sort(key=lambda o: o["position"][0], reverse=True)

        prelimenaryCandidates = []

        for i, objFrontToBack in enumerate(frontToBack):
            numObjsInFront = i
            numObjsBehind = len(rightToLeft) - i - 1
            if numObjsInFront <= numObjs / 2 and numObjsBehind <= numObjs / 2:
                prelimenaryCandidates.append(objFrontToBack)
        foundCenter = False
        for _obj in prelimenaryCandidates:
            for i, objRightToLeft in enumerate(rightToLeft):
                if _obj["id"] == objRightToLeft["id"]:
                    numObjsToTheRight = i
                    numObjsToTheLeft = len(frontToBack) - i - 1
                    if numObjsToTheRight <= numObjs / 2 and numObjsToTheLeft <= numObjs / 2:
                        foundCenter = True
                        for attributeType, attribute in zip(attributeTypes, attributes):
                            if _obj[attributeType] != attribute:
                                foundCenter = False
                                break
                        break
            if foundCenter:
                break
        for attributeType, attribute in zip(attributeTypes, attributes):
            self.updateIdentifier(_obj, attribute)
        self.updateCurrentObj(_obj)
        self.updateVisited(_obj)
        del rightToLeft, frontToBack

    def countAttributeCaption(self, attribute):
        attributeType = self.getAttributeType(attribute)
        objs = []
        for _obj in self.scene:
            if _obj[attributeType] == attribute:
                objs.append(deepcopy(_obj))
        for _obj in objs:
            self.updateIdentifier(_obj, attribute)
        # update the current group
        self.currentGrp = objs

        # update the visited objects list
        for _obj in objs:
            self.updateVisited(_obj)

    def getAnchorAttribute(self, attribute_1, attribute_2, scene):
        # The anchor object is unique. If we filter the object list
        # based on the attribute anchor, we must find only one object.
        filterAttribute_1 = self.filterAttribute(scene, attribute_1)
        if len(filterAttribute_1) == 1:
            return attribute_1
        else:
            return attribute_2

    def objRelation(self, attribute, attributeAnchor, relation):
        assert relation in ["left", "right", "front", "behind"]
        # find the anchor object
        if attributeAnchor != self.getAnchorAttribute(attribute, attributeAnchor, self.scene):
            temp = deepcopy(attribute)
            attribute = deepcopy(attributeAnchor)
            attributeAnchor = temp
            if relation == "left":
                relation = "right"
            elif relation == "right":
                relation = "left"
            elif relation == "behind":
                relation = "front"
            elif relation == "front":
                relation = "behind"

        # Order the objects in the scene w.r.t. the relation
        sceneCopy = deepcopy(self.scene)

        if relation in ["left", "right"]:
            sceneCopy.sort(key=lambda o: o["position"][0])
        else:
            sceneCopy.sort(key=lambda o: o["position"][1])

        # get the anchor object
        attributeTypeAnchor = self.getAttributeType(attributeAnchor)
        for i, _obj in enumerate(sceneCopy):
            if _obj[attributeTypeAnchor] == attributeAnchor:
                break
        # save the anchor object before the main object
        anchorObj = _obj
        self.updateIdentifier(anchorObj, attributeAnchor)
        self.updateCurrentObj(anchorObj)
        self.updateVisited(anchorObj)

        if relation in ["left", "behind"]:
            sceneCopy = list(reversed(sceneCopy[:i]))
        else:
            sceneCopy = sceneCopy[i+1:]

        attributeType = self.getAttributeType(attribute)
        # get the main object
        for _obj in sceneCopy:
            # and not equalDicts(_obj, anchorObj):
            if _obj[attributeType] == attribute:
                break
        self.updateIdentifier(_obj, attribute)
        self.updateCurrentObj(_obj)
        self.updateVisited(_obj)
        del sceneCopy

    def uniqueObject(self, *attributes):
        attributes = list(attributes)
        attributeTypes = list(
            map(lambda att: self.getAttributeType(att), attributes))

        for _obj in self.scene:
            found = True
            for attributeType, attribute in zip(attributeTypes, attributes):
                if _obj[attributeType] != attribute:
                    found = False
                    break

            if found:
                break
        for att in attributes:
            self.updateIdentifier(_obj, att)

        self.updateCurrentObj(_obj)
        self.updateVisited(_obj)

    ######################################## Question Programs ########################################
    def filterOutObj(self, scene, obj):
        sceneCopy = deepcopy(scene)
        for i, _obj in enumerate(scene):
            if obj["id"] == _obj["id"]:
                break
        del sceneCopy[i]
        return sceneCopy

    def filterAttribute(self, scene, attribute):
        attributeType = self.getAttributeType(attribute)
        filtered = []
        if len(scene) == 0:
            return filtered

        for _obj in scene:
            if _obj[attributeType] == attribute:
                filtered.append(_obj)
        return filtered

    def excludeAttribute(self, scene, obj, attributeType):
        filtered = []
        if len(scene) == 0:
            return filtered
        for _obj in scene:
            if _obj["id"] != obj["id"] and obj[attributeType] == _obj[attributeType]:
                filtered.append(_obj)

        # Update the visited objects list
        if len(filtered) > 0:
            for _obj in filtered:
                self.updateVisited(_obj)
        return filtered

    def filterLeft(self, scene, obj):
        filtered = []
        if len(scene) == 0:
            return filtered

        for _obj in self.scene:
            # if the x-coordinate of _obj is smaller than the x-coordinate of slef.currentObj,
            # then _obj is located to the left of self.currentObj
            if _obj["position"][0] < obj["position"][0] and _obj["id"] != obj["id"]:
                filtered.append(_obj)
        return filtered

    def filterRight(self, scene, obj):
        filtered = []
        for _obj in self.scene:
            # if the x-coordinate of _obj is bigger than the x-coordinate of slef.currentObj,
            # then _obj is located to the right of self.currentObj
            if _obj["position"][0] > obj["position"][0] and _obj["id"] != obj["id"]:
                filtered.append(_obj)
        return filtered

    def filterFront(self, scene, obj):
        filtered = []
        if len(scene) == 0:
            return filtered

        for _obj in self.scene:
            # if the y-coordinate of _obj is smaller than the y-coordinate of slef.currentObj,
            # then _obj is located in front of self.currentObj
            if _obj["position"][1] > obj["position"][1] and _obj["id"] != obj["id"]:
                filtered.append(_obj)
        return filtered

    def filterBehind(self, scene, obj):
        # assert type(scene) == list, "Excpected type list got {} instead".format(type(scene))
        filtered = []
        if len(scene) == 0:
            return filtered

        for _obj in scene:
            # if the y-coordinate of _obj is bigger than the y-coordinate of slef.currentObj,
            # then _obj is located behind self.currentObj
            if _obj["position"][1] < obj["position"][1] and _obj["id"] != obj["id"]:
                filtered.append(_obj)
        return filtered

    def filterPosition(self, scene, obj, pos):
        # assert type(scene) == list, "Excpected type list got {} instead".format(type(scene))
        assert pos in ["left", "right", "front", "behind"]
        if pos == "left":
            filtered = self.filterLeft(scene, obj)
        elif pos == "right":
            filtered = self.filterRight(scene, obj)
        elif pos == "front":
            filtered = self.filterFront(scene, obj)
        elif pos == "behind":
            filtered = self.filterBehind(scene, obj)

        return filtered

    ###########################################################################
    #                           Counting questions                            #
    ###########################################################################
    def countAll(self):
        self.currentGrp = deepcopy(self.scene)
        self.groups.append(deepcopy(self.scene))
        return len(self.scene)

    def countOther(self):
        others = self.getOther()
        if len(others) > 0:
            self.currentGrp = others
            self.groups.append(others)
        if len(others) == 1:
            obj = others[0]
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj = _obj
                    break
            self.updateCurrentObj(obj)

            self.updateVisited(obj)
        return len(others)

    def countAllGroup(self):
        return len(self.currentGrp)

    def countAttribute(self, attribute, updateCurrentObj=True):
        filtered = self.filterAttribute(self.scene, attribute)
        if len(filtered) == 0:
            return 0
        # Update the visited objects list
        for _obj in filtered:
            self.updateVisited(_obj)
        if len(filtered) == 1:
            obj = filtered[0]
            new = True
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj = _obj
                    new = False
                    break
            self.updateIdentifier(obj, attribute)
            self.updateVisited(obj)
            if updateCurrentObj:
                self.updateCurrentObj(obj)
            else:
                if new:
                    self.objs.append(obj)

        self.groups.append(filtered)
        self.currentGrp = filtered
        return len(filtered)

    def countAttributeGroup(self, attribute, updateCurrentObj=True):
        filtered = self.filterAttribute(self.currentGrp, attribute)
        if len(filtered) == 0:
            return 0
        # Update the visited objects list
        for _obj in filtered:
            self.updateVisited(_obj)
        if len(filtered) == 1:
            obj = filtered[0]
            new = True
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj = _obj
                    new = False
                    break
            self.updateIdentifier(obj, attribute)
            self.updateVisited(obj)

            if updateCurrentObj:
                self.updateCurrentObj(obj)
            else:
                if new:
                    self.objs.append(obj)

        self.groups.append(filtered)
        self.currentGrp = filtered
        return len(filtered)

    def countObjRelImm(self, pos, updateCurrentObj=True):
        filtered = self.filterPosition(self.scene, self.currentObj, pos)
        if len(filtered) == 0:
            return 0
        # Update the visited objects list
        for _obj in filtered:
            self.updateVisited(_obj)

        self.currentGrp = filtered
        self.groups.append(filtered)

        if len(filtered) == 1:
            obj = filtered[0]
            new = True
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj = _obj
                    new = False
                    break
            if updateCurrentObj:
                self.updateCurrentObj(obj)
                self.uniqueObjFlag = True
            else:
                if new:
                    self.objs.append(obj)
        return len(filtered)

    def countObjRelImm2(self, pos):
        if self.uniqueObjFlag:
            # del self.objs[-1]
            self.updateCurrentObj(self.objs[-2])
            self.uniqueObjFlag = False
        return self.countObjRelImm(pos)

    def countObjRelEarly(self, pos, earlyObjAttribute, updateCurrentObj=True):
        for objEarly in reversed(self.objs):
            if objEarly["identifier"] is not None:
                identifiers = objEarly["identifier"].split("-")
                if earlyObjAttribute in identifiers:
                    break
            else:
                continue
        filtered = self.filterPosition(self.scene, objEarly, pos)
        if len(filtered) == 0:
            return 0
        # Update the visited objects list
        for _obj in filtered:
            self.updateVisited(_obj)

        if len(filtered) == 1:
            obj = filtered[0]
            new = True
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj = _obj
                    new = False
                    break
            if updateCurrentObj:
                self.updateCurrentObj(obj)
            else:
                if new:
                    self.objs.append(obj)
        else:
            self.updateCurrentObj(objEarly)

        self.currentGrp = filtered
        self.groups.append(filtered)
        return len(filtered)

    def countObjExcludeImm(self, attributeType, updateCurrentObj=True):
        filtered = self.excludeAttribute(
            self.scene, self.currentObj, attributeType)
        if len(filtered) == 0:
            return 0

        if len(filtered) == 1:
            obj = filtered[0]
            new = True
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj = _obj
                    new = False
                    break
            if updateCurrentObj:
                self.updateCurrentObj(obj)
            else:
                if new:
                    self.objs.append(obj)

        self.currentGrp = filtered
        self.groups.append(filtered)
        return len(filtered)

    def countObjExcludeEarly(self, attributeType, earlyObjAttribute, updateCurrentObj=True):
        for objEarly in reversed(self.objs):
            if objEarly["identifier"] is not None:
                identifiers = objEarly["identifier"].split("-")
                if earlyObjAttribute in identifiers:
                    break
            else:
                continue

        filtered = self.excludeAttribute(self.scene, objEarly, attributeType)
        if len(filtered) == 0:
            return 0

        if len(filtered) == 1:
            obj = filtered[0]
            new = True
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj = _obj
                    new = False
                    break
            if updateCurrentObj:
                self.updateCurrentObj(obj)
            else:
                if new:
                    self.objs.append(obj)
        else:
            self.updateCurrentObj(objEarly)
        self.currentGrp = filtered
        self.groups.append(filtered)
        return len(filtered)

    ###########################################################################
    #                           Existence questions                           #
    ###########################################################################

    def existOther(self):
        others = self.getOther()
        numOther = len(others)
        if numOther > 0:
            self.currentGrp = others
            self.groups.append(others)
            for _obj in others:
                self.updateVisited(_obj)
        return "yes" if numOther > 0 else "no"

    def existAttribute(self, attribute):
        filtered = self.filterAttribute(self.scene, attribute)
        numAttribute = len(filtered)
        if numAttribute == 0:
            return "no"

        # Update the visited objects list
        for _obj in filtered:
            self.updateVisited(_obj)
        if len(filtered) == 1:
            obj = filtered[0]
            new = True
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    self.updateIdentifier(_obj, attribute)
                    new = False
                    break
            if new:
                self.updateIdentifier(obj, attribute)
                self.objs.append(obj)
                # self.updateCurrentObj(obj)

        self.currentGrp = filtered
        self.groups.append(filtered)
        return "yes"

    def existAttributeGroup(self, attribute):
        numAttributeGrp = self.countAttributeGroup(
            attribute, updateCurrentObj=False)
        return "yes" if numAttributeGrp > 0 else "no"

    def existObjRelImm(self, pos):
        numObjs = self.countObjRelImm(pos, updateCurrentObj=False)
        return "yes" if numObjs > 0 else "no"

    def existObjRelEarly(self, pos, earlyObjAttribute):
        numObjs = self.countObjRelEarly(
            pos, earlyObjAttribute, updateCurrentObj=False)
        return "yes" if numObjs > 0 else "no"

    def existObjExcludeImm(self, attributeType):
        numObjs = self.countObjExcludeImm(
            attributeType, updateCurrentObj=False)
        return "yes" if numObjs > 0 else "no"

    def existObjExcludeEarly(self, attributeType, earlyObjAttribute):
        for objEarly in reversed(self.objs):
            if objEarly["identifier"] is not None:
                identifiers = objEarly["identifier"].split("-")
                if earlyObjAttribute in identifiers:
                    break
            else:
                continue

        filtered = self.excludeAttribute(self.scene, objEarly, attributeType)
        numObjs = len(filtered)
        if numObjs == 0:
            return "no"
        self.currentGrp = filtered
        self.groups.append(filtered)
        return "yes"

    ###########################################################################
    #                             Seek questions                              #
    ###########################################################################

    def seekAttrImm(self, attributeType):
        assert attributeType in self.currentObj, "Attributre <{}> is not valid"
        self.updateIdentifier(self.currentObj, self.currentObj[attributeType])
        return self.currentObj[attributeType]

    def seekAttributeEarly(self, attributeType, earlyObjAttribute):
        for objEarly in reversed(self.objs):
            if objEarly["identifier"] is not None:
                identifiers = objEarly["identifier"].split("-")
                if earlyObjAttribute in identifiers:
                    break
            else:
                continue
        self.updateIdentifier(objEarly, objEarly[attributeType])
        self.updateCurrentObj(objEarly)
        self.updateVisited(objEarly)
        return objEarly[attributeType]

    def seekAttributeRelImm(self, attributeType, pos):
        filtered = self.filterPosition(self.scene, self.currentObj, pos)
        if len(filtered) == 0:
            return "none"
        else:
            # Get the closest object to slef.obj
            if pos == "left":
                filtered.sort(key=lambda x: x["position"][0])
                obj = filtered[-1]
            elif pos == "right":
                filtered.sort(key=lambda x: x["position"][0])
                obj = filtered[0]
            elif pos == "front":
                filtered.sort(key=lambda x: x["position"][1])
                obj = filtered[0]
            elif pos == "behind":
                filtered.sort(key=lambda x: x["position"][1])
              
                obj = filtered[-1]

            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj["identifier"] = _obj["identifier"]
                    break
            self.updateIdentifier(obj, obj[attributeType])
            self.updateCurrentObj(obj)
            self.updateVisited(obj)
            return obj[attributeType]

    def seekAttributeRelEarly(self, attributeType, pos, earlyObjAttribute):
        for objEarly in reversed(self.objs):
            if objEarly["identifier"] is not None:
                identifiers = objEarly["identifier"].split("-")
                if earlyObjAttribute in identifiers:
                    break
            else:
                continue

        filtered = self.filterPosition(self.scene, objEarly, pos)
        if len(filtered) == 0:
            return "none"
        else:
            # Get the closest object to slef.obj
            if pos == "left":
                filtered.sort(key=lambda x: x["position"][0])
                obj = filtered[-1]
            elif pos == "right":
                filtered.sort(key=lambda x: x["position"][0])
                obj = filtered[0]
            elif pos == "front":
                filtered.sort(key=lambda x: x["position"][1])
                obj = filtered[0]
            elif pos == "behind":
                filtered.sort(key=lambda x: x["position"][1])
                obj = filtered[-1]
            for _obj in self.objs:
                if _obj["id"] == obj["id"]:
                    obj["identifier"] = _obj["identifier"]
                    break
            self.updateIdentifier(obj, obj[attributeType])
            self.updateCurrentObj(obj)
            self.updateVisited(obj)
            return obj[attributeType]



In [21]:
import argparse
import os
import torch
#import utils_m

### **Option_caption_parser**

In [22]:
#!touch "/kaggle/working/" res.txt

In [23]:
class OptionsC():#changed optiopn class as Option_c to differentiate it with the one belong to question
    def __init__(self):
        self.parser = argparse.ArgumentParser()
        self.initialized = False

    def initialize(self):
        self.parser.add_argument(
            '--mode',
            default="train",
            # required=True,
            type=str,
            choices=['train', 'test'],
            help='The mode of the experiment')

        self.parser.add_argument(
            '--run_dir',
            default="kaggle/working",
            # required=True,
            type=str,
            help='The experiment directory')

        self.parser.add_argument(
            '--load_checkpoint_path',
            default='None',
            type=str,
            help='The path the the pretrained CaptionNet')

        self.parser.add_argument(
            '--res_path',
            default="kaggle/working/res.txt",#***
            # required=True,
            type=str,
            help='Path where to log the predicted caption programs')

        self.parser.add_argument(
            '--gpu_ids',
            default='0',
            type=str,
            help='Id of the gpu to be used')

        self.parser.add_argument(
            '--seed',
            default=42,
            type=int,
            help='The seed used in training')

        self.parser.add_argument(
            '--dataPathTr',
            default="/kaggle/input/caption_small/tr_cap_s.h5", 
            # required=True,
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed training data')

        self.parser.add_argument(
            '--dataPathVal',
            default="/kaggle/input/caption_small/val_cap_s.h5",
            # required=True,
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed validation data')

        self.parser.add_argument(
            '--dataPathTest',
            # required=True,
            default="/kaggle/input/caption_small/test_cap_s.h5",
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed test data')

        self.parser.add_argument(
            '--vocabPath',
            default="/kaggle/input/caption/vocab_output_caption.json",

            # required=True,
            type=str,
            help='Path to the generated vocabulary')

        self.parser.add_argument(
            '--batch_size',
            default=64,
            type=int,
            help='Batch size')

        self.parser.add_argument(
            '--num_workers',
            default=0,
            type=int,
            help='Number of workers for loading')

        self.parser.add_argument(
            '--num_iters',
            #default=5000,
            default=TOTAL_ITER,
            type=int,
            help='Total number of iterations')

        self.parser.add_argument(
            '--display_every',
            default=5,
            type=int,
            help='Display training information every N iterations')

        self.parser.add_argument(
            '--debug_every',
            default=100,
            type=int,
            help='Display debug message every N iterations')

        self.parser.add_argument(
            '--validate_every',
            default=VALID_EVE,
            type=int,
            help='Validate every N iterations')

        self.parser.add_argument(
            '--shuffle_data',
            default=1,
            type=int,
            help='Activate to shuffle the training data')

        self.parser.add_argument(
            '--optim',
            default='adam',
            type=str,
            help='The name of the optimizer to be used')

        self.parser.add_argument(
            '--lr',
            default=1e-3,
            type=float,
            help='Base learning rate')

        self.parser.add_argument(
            '--betas',
            default='0.9, 0.98',
            type=str,
            help='Adam optimizer\'s betas')

        self.parser.add_argument(
            '--eps',
            default='1e-9',
            type=float,
            help='Adam optimizer\'s epsilon')

        self.parser.add_argument(
            '--lr_decay_marks',
            default='50000, 55000',
            type=str,
            help='Learing rate decay marks')

        self.parser.add_argument(
            '--lr_decay_factor',
            default=0.5,
            type=float,
            help='Learning rate decay factor')

        self.parser.add_argument(
            '--weight_decay',
            default=1e-6,
            type=float,
            help='Weight decay')

        self.parser.add_argument(
            '--embedDim',
            default=300,
            type=int,
            help='Embedding dimension')

        self.parser.add_argument(
            '--hiddenDim',
            default=512,
            type=int,
            help='LSTM hidden dimension')

        self.parser.add_argument(
            '--numLayers',
            default=2,
            type=int,
            help='Number of hidden LSTM layers')

        self.parser.add_argument(
            '--dropout',
            default=0.1,
            type=float,
            help='Dropout value')

        self.parser.add_argument(
            '--multiHead',
            default=8,
            type=int,
            help='Number of attention heads')

        self.parser.add_argument(
            '--hiddenSizeHead',
            default=64,
            type=int,
            help='Dimension of each attention head')

        self.parser.add_argument(
            '--FeedForwardSize',
            default=2048,
            type=int,
            help='Dimension of the feed forward layer')

        self.parser.add_argument(
            '--FlatMLPSize',
            default=512,
            type=int,
            help='MLP flatten size')

        self.parser.add_argument(
            '--FlatGlimpses',
            default=1,
            type=int,
            help='Number of flatten glimpses')

        self.parser.add_argument(
            '--FlatOutSize',
            default=512,
            type=int,
            help='Final attention reduction dimension')

        self.parser.add_argument(
            '--layers',
            default=6,
            type=int,
            help='Number of self attention layers')

        self.parser.add_argument(
            '--bidirectional',
            default=1,
            type=int,
            help='Activate to use bidirectional LSTMs')

        self.initialized = True

    def parse(self):
        # initialize parser
        if not self.initialized:
            self.initialize()
       # self.opts = self.parser.parse_args()
        self.opts, unknown = self.parser.parse_known_args()#this is added by me to fix the error of command line arguments.

        # parse gpu id list
        str_gpu_ids = self.opts.gpu_ids.split(',')
        self.opts.gpu_ids = []
        for str_id in str_gpu_ids:
            if str_id.isdigit() and int(str_id) >= 0:
                self.opts.gpu_ids.append(int(str_id))
        if len(self.opts.gpu_ids) > 0 and torch.cuda.is_available():
            print('\n[INFO] Using {} CUDA device(s) ...'.format(len(self.opts.gpu_ids)))
        else:
            print('\n[INFO] Using cpu ...')
            self.opts.gpu_ids = []

        # parse the optimizer's betas and lr decay marks
        self.opts.betas = [float(beta) for beta in self.opts.betas.split(',')]
        lr_decay_marks = [int(m) for m in self.opts.lr_decay_marks.split(',')]
        for i in range(1, len(lr_decay_marks)):
            assert lr_decay_marks[i] > lr_decay_marks[i-1]
        self.opts.lr_decay_marks = lr_decay_marks

        # print and save options
        args = vars(self.opts)
        print('\n ' + 30*'-' + 'Opts' + 30*'-')
        for k, v in args.items():
            print('%s: %s' % (str(k), str(v)))

        if not os.path.isdir(self.opts.run_dir):
            os.makedirs(self.opts.run_dir)
        filename = 'opts_c.txt'
        file_path = os.path.join(self.opts.run_dir, filename)
        with open(file_path, 'wt') as fout:
            fout.write('| options\n')
            for k, v in sorted(args.items()):
                fout.write('%s: %s\n' % (str(k), str(v)))
        return self.opts

## **models.py**

In [24]:
import torch
import math
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [25]:
class FC(nn.Module):
    def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
        super(FC, self).__init__()
        self.dropout_r = dropout_r
        self.use_relu = use_relu

        self.linear = nn.Linear(in_size, out_size)

        if use_relu:
            self.relu = nn.ReLU(inplace=True)

        if dropout_r > 0:
            self.dropout = nn.Dropout(dropout_r)

    def forward(self, x):
        x = self.linear(x)

        if self.use_relu:
            x = self.relu(x)

        if self.dropout_r > 0:
            x = self.dropout(x)

        return x


In [26]:
class MLP(nn.Module):
    def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
        super(MLP, self).__init__()

        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
        self.linear = nn.Linear(mid_size, out_size)

    def forward(self, x):
        return self.linear(self.fc(x))

In [27]:
class LayerNorm(nn.Module):
    def __init__(self, size, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.eps = eps

        self.a_2 = nn.Parameter(torch.ones(size))
        self.b_2 = nn.Parameter(torch.zeros(size))

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)

        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [28]:
class MHAtt(nn.Module):
    def __init__(self, opts):
        super(MHAtt, self).__init__()
        self.opts = opts

        self.linear_v = nn.Linear(opts.hiddenDim, opts.hiddenDim)
        self.linear_k = nn.Linear(opts.hiddenDim, opts.hiddenDim)
        self.linear_q = nn.Linear(opts.hiddenDim, opts.hiddenDim)
        self.linear_merge = nn.Linear(opts.hiddenDim, opts.hiddenDim)

        self.dropout = nn.Dropout(opts.dropout)

    def forward(self, v, k, q, mask):
        n_batches = q.size(0)

        v = self.linear_v(v).view(
            n_batches,
            -1,
            self.opts.multiHead,
            self.opts.hiddenSizeHead
        ).transpose(1, 2)

        k = self.linear_k(k).view(
            n_batches,
            -1,
            self.opts.multiHead,
            self.opts.hiddenSizeHead
        ).transpose(1, 2)

        q = self.linear_q(q).view(
            n_batches,
            -1,
            self.opts.multiHead,
            self.opts.hiddenSizeHead
        ).transpose(1, 2)

        atted = self.att(v, k, q, mask)
        atted = atted.transpose(1, 2).contiguous().view(
            n_batches,
            -1,
            self.opts.hiddenDim
        )

        atted = self.linear_merge(atted)

        return atted

    def att(self, value, key, query, mask):
        d_k = query.size(-1)

        scores = torch.matmul(
            query, key.transpose(-2, -1)
        ) / math.sqrt(d_k)

        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)

        att_map = F.softmax(scores, dim=-1)
        att_map = self.dropout(att_map)

        return torch.matmul(att_map, value)

In [29]:
class FFN(nn.Module):
    def __init__(self, opts):
        super(FFN, self).__init__()

        self.mlp = MLP(
            in_size=opts.hiddenDim,
            mid_size=opts.FeedForwardSize,
            out_size=opts.hiddenDim,
            dropout_r=opts.dropout,
            use_relu=True
        )

    def forward(self, x):
        return self.mlp(x)


In [30]:
class SA(nn.Module):
    def __init__(self, opts):
        super(SA, self).__init__()
        self.mhatt = MHAtt(opts)
        self.ffn = FFN(opts)

        self.dropout1 = nn.Dropout(opts.dropout)
        self.norm1 = LayerNorm(opts.hiddenDim)

        self.dropout2 = nn.Dropout(opts.dropout)
        self.norm2 = LayerNorm(opts.hiddenDim)

    def forward(self, x, x_mask):
        x = self.norm1(x + self.dropout1(
            self.mhatt(x, x, x, x_mask)
        ))

        x = self.norm2(x + self.dropout2(
            self.ffn(x)
        ))

        return x

In [31]:
class AttFlat(nn.Module):
    def __init__(self, opts):
        super(AttFlat, self).__init__()
        self.opts = opts

        self.mlp = MLP(
            in_size=opts.hiddenDim,
            mid_size=opts.FlatMLPSize,
            out_size=opts.FlatGlimpses,
            dropout_r=opts.dropout,
            use_relu=True
        )
        # FLAT_GLIMPSES = 1
        self.linear_merge = nn.Linear(
            opts.hiddenDim * opts.FlatGlimpses,
            opts.FlatOutSize
        )

    def forward(self, x, x_mask):
        att = self.mlp(x)
        att = att.masked_fill(
            x_mask.squeeze(1).squeeze(1).unsqueeze(2),
            -1e9
        )
        att = F.softmax(att, dim=1)

        att_list = []
        for i in range(self.opts.FlatGlimpses):
            att_list.append(
                torch.sum(att[:, :, i: i + 1] * x, dim=1)
            )

        x_atted = torch.cat(att_list, dim=1)
        x_atted = self.linear_merge(x_atted)

        return x_atted

In [32]:
class QuestEncoder_1(nn.Module):
    """
        Concat encoder
    """
    def __init__(self, opts, textVocabSize):
        super(QuestEncoder_1, self).__init__()
        bidirectional = opts.bidirectional > 0

        self.embedding = nn.Embedding(textVocabSize, opts.embedDim)
        self.lstmQ = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=opts.hiddenDim,
            num_layers=opts.numLayers,
            bidirectional=bidirectional,
            batch_first=True
        )

        self.lstmH = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=opts.hiddenDim,
            num_layers=opts.numLayers,
            bidirectional=bidirectional,
            batch_first=True)

        if bidirectional:
            opts.hiddenDim *= 2
            opts.hiddenSizeHead *= 2
            opts.FlatOutSize *= 2
        self.attQues = nn.ModuleList([SA(opts) for _ in range(opts.layers)])
        self.attHist = nn.ModuleList([SA(opts) for _ in range(opts.layers)])

        self.attFlatQuest = AttFlat(opts)
        self.fc = nn.Linear(2 * opts.hiddenDim, opts.hiddenDim)

    def forward(self, quest, hist):
        questMask = self.make_mask(quest.unsqueeze(2))
        histMask = self.make_mask(hist.unsqueeze(2))

        # quest = F.tanh(self.embedding(quest))
        quest = self.embedding(quest)

        quest, (_, _) = self.lstmQ(quest)
        questO = quest.detach().clone()

        hist = self.embedding(hist)
        hist, (_, _) = self.lstmH(hist)

        for attQ, attH in zip(self.attQues, self.attHist):
            quest = attQ(quest, questMask)
            hist = attH(hist, histMask)
        # (batchSize, 512)
        quest = self.attFlatQuest(quest, questMask)

        # hist: (batchSize, length, 512)
        attWeights = torch.sum(torch.mul(hist, quest.unsqueeze(1)), -1)
        attWeights = torch.softmax(attWeights, -1)
        hist = torch.sum(torch.mul(hist, attWeights.unsqueeze(2)), 1)
        encOut = self.fc(torch.cat([quest, hist], -1))

        return encOut, questO

    # Masking
    def make_mask(self, feature):
        return (torch.sum(
            torch.abs(feature),
            dim=-1
        ) == 0).unsqueeze(1).unsqueeze(2)

In [33]:
class QuestEncoder_2(nn.Module):
    """
        Stack encoder
    """
    def __init__(self, opts, textVocabSize):
        super(QuestEncoder_2, self).__init__()
        bidirectional = opts.bidirectional > 0
        self.embedding = nn.Embedding(textVocabSize, opts.embedDim)
        self.lstmQ = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=opts.hiddenDim,
            num_layers=opts.numLayers,
            batch_first=True,
            bidirectional=bidirectional,
        )

        self.lstmH = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=opts.hiddenDim,
            num_layers=opts.numLayers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        if bidirectional:
            opts.hiddenDim *= 2

        self.fc = nn.Linear(2 * opts.hiddenDim, opts.hiddenDim)

    def forward(self, quest, hist):

        quest = F.tanh(self.embedding(quest))
        quest, (questH, _) = self.lstmQ(quest)

        # concatenate the last hidden states from the forward and backward pass
        # of the bidirectional lstm
        lastHiddenForward = questH[1:2, :, :].squeeze(0)
        lastHiddenBackward = questH[3:4, :, :].squeeze(0)

        # questH: (batchSize, 512)
        questH = torch.cat([lastHiddenForward, lastHiddenBackward], -1)

        questO = quest.detach().clone()

        hist = F.tanh(self.embedding(hist))
        numRounds = hist.size(1)
        histFeat = []
        for i in range(numRounds):
            round_i = hist[:, i, :, :]
            _, (round_i_h, _) = self.lstmH(round_i)

            #Same as before
            lastHiddenForward = round_i_h[1:2, :, :].squeeze(0)
            lastHiddenBackward = round_i_h[3:4, :, :].squeeze(0)
            histFeat.append(torch.cat([lastHiddenForward, lastHiddenBackward], -1))

        # hist: (batchSize, rounds, 512)
        histFeat = torch.stack(histFeat, 1)
        attWeights = torch.sum(torch.mul(histFeat, questH.unsqueeze(1)), -1)
        attWeights = torch.softmax(attWeights, -1)
        histFeat = torch.sum(torch.mul(histFeat, attWeights.unsqueeze(2)), 1)
        encOut = self.fc(torch.cat([questH, histFeat], -1))
        return encOut, questO

In [34]:
#*** ->for qiansu: copy and paste the whole class
class Decoder(nn.Module):
    def __init__(self, opts, progVocabSize, maxLen, startID=1, endID=2):
        super(Decoder, self).__init__()
        self.numLayers = opts.numLayers
        self.bidirectional = opts.bidirectional > 0
        self.maxLen = maxLen
        self.startID = startID
        self.endID = endID

        self.embedding = nn.Embedding(progVocabSize, opts.embedDim)
        self.lstmProg = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=2*opts.hiddenDim if self.bidirectional else opts.hiddenDim,
            num_layers=opts.numLayers,
            batch_first=True,
            #bidirectional=self.bidirectional,#???????
        )
        hiddenDim = opts.hiddenDim
        if self.bidirectional:
            hiddenDim *= 2

        self.fcAtt = nn.Linear(2*hiddenDim, hiddenDim)
        self.fcOut = nn.Linear(hiddenDim, progVocabSize)

    def initPrgHidden(self, encOut):
        hidden = [encOut for _ in range(self.numLayers)]
        hidden = torch.stack(hidden, 0).contiguous()
        return hidden, hidden

    def forwardStep(self, prog, progH, questO):
        #**********************************************our error relates to this prog cause in our case it is not acting as tensor anymore.
        batchSize = prog.size(0)
        inputDim = questO.size(1)
        prog = self.embedding(prog)
        outProg, progH = self.lstmProg(prog, progH)

        att = torch.bmm(outProg, questO.transpose(1, 2))
        att = F.softmax(att.view(-1, inputDim), 1).view(batchSize, -1, inputDim)
        context = torch.bmm(att, questO)
        # (batchSize, progLength, hiddenDim)
        out = F.tanh(self.fcAtt(torch.cat([outProg, context], dim=-1)))

        # (batchSize, progLength, progVocabSize)
        out = self.fcOut(out)
        predSoftmax = F.log_softmax(out, 2)
        return predSoftmax, progH

    def forward(self, prog, encOut, questO):
        progH = self.initPrgHidden(encOut)
        predSoftmax, progH = self.forwardStep(prog, progH, questO)

        return predSoftmax, progH

    def sample(self, encOut, questO):
        batchSize = encOut.size(0)
        cudaFlag = encOut.is_cuda
        progH = self.initPrgHidden(encOut)
        # prog = progCopy[:, 0:3]
        prog = torch.LongTensor(batchSize, 1).fill_(self.startID)
        # prog = torch.cat((progStart, progEnd), -1)
        if cudaFlag:
            prog = prog.cuda()
        outputLogProbs = []
        outputTokens = []
     

        def decode(i, output):
            tokens = output.topk(1, dim=-1)[1].view(batchSize, -1)
            #print("This is inside of the decode local function and this is the tockens=", tokens)
            return tokens

        for i in range(self.maxLen):
            predSoftmax, progH = self.forwardStep(prog, progH, questO)
            prog = decode(i, predSoftmax)
            prog_flat = list(chain(*prog))
            flat_list = [item.item() for item in prog_flat]

        #****************************************my modification
            outputTokens.append(flat_list)#new
       #print("lets check what is inside outputTocken", outputTokens)    
       # print("-----------------------------------------")
        return outputTokens, outputLogProbs


In [35]:
class SeqToSeqC(nn.Module):
    def __init__(self, encoder, decoder):
        super(SeqToSeqC, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, cap, prog):
        encOut, capO = self.encoder(cap)
        predSoftmax, progHC = self.decoder(prog, encOut, capO)
        return predSoftmax, progHC
   
    def sample(self, cap):
        with torch.no_grad():
            encOut, capO = self.encoder(cap)
        outputTokens, outputLogProbs = self.decoder.sample(encOut, capO)
        #if not outputTokens:
          #  print("***")
        # Handle the case where outputTokens is empty, for example, return a placeholder tensor
           # return torch.tensor([])
        #***************************************************************** 
        #outputTokens = torch.stack(outputTokens, 0).transpose(0, 1)
        #outputTokens = torch.stack(outputTokens, dim=0).transpose(0, 1)
        outputTokens_t = [[row[i] for row in outputTokens] for i in range(len(outputTokens[0]))]#***transpose 
        return outputTokens_t


In [36]:
class SeqToSeqQ(nn.Module):
    def __init__(self, encoder, decoder):
        super(SeqToSeqQ, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, quest, hist, prog):
        encOut, questO = self.encoder(quest, hist)
        predSoftmax, progHC = self.decoder(prog, encOut, questO)
        return predSoftmax, progHC

    def sample(self, quest, hist):
        with torch.no_grad():
            encOut, questO = self.encoder(quest, hist)
            outputTokens, outputLogProbs = self.decoder.sample(encOut, questO)
      
        outputTokens_t = [[row[i] for row in outputTokens] for i in range(len(outputTokens[0]))]#***transpose 
        return outputTokens_t

## **optim.py**

In [37]:
import torch
import torch.optim as Optim
from itertools import chain #***

In [38]:
class WarmupOptimizer(object):
    def __init__(self, lr_base, optimizer, data_size, batch_size):
        self.optimizer = optimizer
        self._step = 0
        self.lr_base = lr_base
        self._rate = 0
        self.data_size = data_size
        self.batch_size = batch_size

    def step(self):
        self._step += 1

        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate

        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

    def rate(self, step=None):
        if step is None:
            step = self._step

        if step <= int(self.data_size / self.batch_size * 1):
            r = self.lr_base * 1/2.
        else:
            r = self.lr_base

        return r


def get_optim(opts, model, data_size, lr_base=None):
    if lr_base is None:
        lr_base = opts.lr

    if opts.optim == 'adam':
        optim = Optim.Adam(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=0,
                betas=opts.betas,
                eps=opts.eps,

            )
    elif opts.optim == 'rmsprop':
        optim = Optim.RMSprop(
                filter(lambda p: p.requires_grad, model.parameters()),
                lr=0,
                eps=opts.eps,
                weight_decay=opts.weight_decay
            )
    else:
        raise ValueError('{} optimizer is not supported'.fromat(opts.optim))
    return WarmupOptimizer(
        lr_base,
        optim,
        data_size,
        opts.batch_size
    )

def adjust_lr(optim, decay_r):
    optim.lr_base *= decay_r


## **option_question_parser.py**

In [39]:
import argparse
import os
#import utils_m
import torch

In [40]:
class OptionsQ():#changed optiopn class as Option_q to differentiate it with the one belong to caption
    def __init__(self):
        self.parser = argparse.ArgumentParser()
        self.initialized = False
        

    def initialize(self):
        self.parser.add_argument(
            '--mode',
            default='train',#***
            type=str,
            #choices=['train', 'test_with_gt', 'test_with_pred'],
            help='The mode of the experiment')

        self.parser.add_argument(
            '--run_dir',
            #required=True,
            default= '/kaggle/working',
            type=str,
            help='The experiment directory')
        #***
        self.parser.add_argument(
            '--useCuda',
            default=1,
            type=int,
            help='To be able to use cuda')

        self.parser.add_argument(
            '--text_log_dir',
            #required=True,
            default="/kaggle/working/res.txt",
            type=str,
            help='File to save the logged text')

        self.parser.add_argument(
            '--questionNetPath',
            #default='',
            default = '',
            type=str,
            help='Path to the pretrained QuestionNet that will be used for testing.')

        self.parser.add_argument(
            '--captionNetPath',
            default = '',
            type=str,
            help='Path to the pretrained CaptionNet that will be used for testing.')

        self.parser.add_argument(
            '--dialogLen',
            default=10,
            type=int,
            help='Length of the dialogs to be used for testing. We used 10, 15, and 20 in our experiments.')

        self.parser.add_argument(
            '--last_n_rounds',
            default=10,
            type=int,
            help='Number of the last rounds to consider in the history. We used 1, 2, 3, 4, and 10 in our experiments. ')

        self.parser.add_argument(
            '--encoderType',
            #required=True,
            default=1,
            type=int,
            choices=[1, 2],
            help='Type of the encoder: 1 --> Concat, 2 --> Stack')

        self.parser.add_argument(
            '--load_checkpoint_path',
            default='None',
            type=str,
            help='Path to a QestionNet checkpoint path to resume training')

        self.parser.add_argument(
            '--gpu_ids',
            default='0',
            type=str,
            help='Id of the gpu to be used')

        self.parser.add_argument(
            '--seed',
            default=42,
            type=int,
            help='The seed used in training')

        self.parser.add_argument(
            '--dataPathTr',
            #required=True,
            default= "/kaggle/input/Small_Tr_Val_Test_Final/train_concat_half.h5",
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed training data')

        self.parser.add_argument(
            '--dataPathVal',
            #required=True,
            default="/kaggle/input/Small_Tr_Val_Test_Final/cap_val_half.h5",
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed validation data')

        self.parser.add_argument(
            '--dataPathTest',
            #required=True,
            default = "/kaggle/input/Small_Tr_Val_Test_Final/test_concat_1000.h5",
            type=str,
            help='Path to the h5 file of the Clevr-Dialog preprocessed test data')

        self.parser.add_argument(
            '--scenesPath',
            #required=True,
            default="/kaggle/input/data/CLEVR_train_scenes.json",
            
            type=str,
            help='Path to the derendered clevr-dialog scenes')


        self.parser.add_argument(
            '--vocabTestPath',
            #required=True,
            default = "/kaggle/input/test_concat/vocab_output.json",
            type=str,
            help='Path to the test vocabulary')


        

        self.parser.add_argument(
            '--vocabPath',
            #required=True,
            default =  "/kaggle/input/train_concat/vocab_output.json",
            type=str,
            help='Path to the generated vocabulary')

        self.parser.add_argument(
            '--batch_size',
            #default=64,
            default=32,
            type=int,
            help='Batch size')

        self.parser.add_argument(
            '--countFirstFailueRound',
            default=0,
            type=int,
            help='If activated, we count the first failure round')

        self.parser.add_argument(
            '--maxSamples',
            default=-1,
            type=int,
            help='Maximum number of training samples')

        self.parser.add_argument(
            '--num_workers',
            default=0,
            type=int,
            help='Number of workers for loading')

        self.parser.add_argument(
            '--num_iters',
            default=TOTAL_ITER,
            type=int,
            help='Total number of iterations')

        self.parser.add_argument(
            '--display_every',
            default=5,
            type=int,
            help='Display training information every N iterations')

        self.parser.add_argument(
            '--validate_every',
            default=VALID_EVE,
            type=int,
            help='Validate every N iterations')

        self.parser.add_argument(
            '--shuffle_data',
            default=1,
            type=int,
            help='Activate to shuffle the training data')

        self.parser.add_argument(
            '--optim',
            default='adam',
            type=str,
            help='The name of the optimizer to be used')

        self.parser.add_argument(
            '--lr',
            default=1e-3,
            type=float,
            help='Base learning rate')

        self.parser.add_argument(
            '--betas',
            default='0.9, 0.98',
            type=str,
            help='Adam optimizer\'s betas')

        self.parser.add_argument(
            '--eps',
            default='1e-9',
            type=float,
            help='Adam optimizer\'s epsilon')

        self.parser.add_argument(
            '--lr_decay_marks',
            default='50000, 55000',
            type=str,
            help='Learing rate decay marks')

        self.parser.add_argument(
            '--lr_decay_factor',
            default=0.5,
            type=float,
            help='Learning rate decay factor')

        self.parser.add_argument(
            '--weight_decay',
            default=1e-6,
            type=float,
            help='Weight decay')

        self.parser.add_argument(
            '--embedDim',
            default=300,
            type=int,
            help='Embedding dimension')

        self.parser.add_argument(
            '--hiddenDim',
            default=512,
            type=int,
            help='LSTM hidden dimension')

        self.parser.add_argument(
            '--numLayers',
            default=2,
            type=int,
            help='Number of hidden LSTM layers')

        self.parser.add_argument(
            '--dropout',
            default=0.1,
            type=float,
            help='Dropout value')

        self.parser.add_argument(
            '--multiHead',
            default=8,
            type=int,
            help='Number of attention heads')

        self.parser.add_argument(
            '--hiddenSizeHead',
            default=64,
            type=int,
            help='Dimension of each attention head')

        self.parser.add_argument(
            '--FeedForwardSize',
            default=2048,
            type=int,
            help='Dimension of the feed forward layer')

        self.parser.add_argument(
            '--FlatMLPSize',
            default=512,
            type=int,
            help='MLP flatten size')

        self.parser.add_argument(
            '--FlatGlimpses',
            default=1,
            type=int,
            help='Number of flatten glimpses')

        self.parser.add_argument(
            '--FlatOutSize',
            default=512,
            type=int,
            help='Final attention reduction dimension')

        self.parser.add_argument(
            '--layers',
            default=6,
            type=int,
            help='Number of self attention layers')

        self.parser.add_argument(
            '--bidirectional',
            default=1,
            type=int,
            help='Activate to use bidirectional LSTMs')

        self.initialized = True

    def parse(self):
        # initialize parser
        if not self.initialized:
            self.initialize()
        #self.opts = self.parser.parse_args()#***
        self.opts, unknown = self.parser.parse_known_args()#this is added by me to fix the error of command line arguments.
        # parse gpu id list
        str_gpu_ids = self.opts.gpu_ids.split(',')
        self.opts.gpu_ids = []
        for str_id in str_gpu_ids:
            if str_id.isdigit() and int(str_id) >= 0:
                self.opts.gpu_ids.append(int(str_id))
        if len(self.opts.gpu_ids) > 0 and torch.cuda.is_available():
            print('\n[INFO] Using {} CUDA device(s) ...'.format(
                len(self.opts.gpu_ids)))
        else:
            print('\n[INFO] Using cpu ...')
            self.opts.gpu_ids = []

        # parse the optimizer's betas and lr decay marks
        self.opts.betas = [float(beta) for beta in self.opts.betas.split(',')]
        lr_decay_marks = [int(m) for m in self.opts.lr_decay_marks.split(',')]
        for i in range(1, len(lr_decay_marks)):
            assert lr_decay_marks[i] > lr_decay_marks[i-1]
        self.opts.lr_decay_marks = lr_decay_marks

        # print and save options
        args = vars(self.opts)
        print('\n ' + 30*'-' + 'Opts' + 30*'-')
        for k, v in args.items():
            print('%s: %s' % (str(k), str(v)))

        if not os.path.isdir(self.opts.run_dir):
            os.makedirs(self.opts.run_dir)
        filename = 'opts.txt'
        file_path = os.path.join(self.opts.run_dir, filename)
        with open(file_path, 'wt') as fout:
            fout.write('| options\n')
            for k, v in sorted(args.items()):
                fout.write('%s: %s\n' % (str(k), str(v)))
        return self.opts


## **Train_question_parser**

In [41]:
import os
import sys
import json, torch, pickle, copy, time
import numpy as np
import argparse
import torch.nn as nn
import torch.utils.data as Data
from tensorboardX import SummaryWriter
from copy import deepcopy
#from clevrDialog_dataset import ClevrDialogQuestionDataset
import pickle
from tqdm import tqdm



In [42]:
class CaptionEncoder(nn.Module):
    def __init__(self, opts, textVocabSize):
        super(CaptionEncoder, self).__init__()
        self.embedding = nn.Embedding(textVocabSize, opts.embedDim)
        bidirectional = opts.bidirectional > 0
        self.lstmC = nn.LSTM(
            input_size=opts.embedDim,
            hidden_size=opts.hiddenDim,
            num_layers=opts.numLayers,
            batch_first=True,
            bidirectional=bidirectional
        )
        if bidirectional:
            opts.hiddenDim *= 2
            opts.hiddenSizeHead *= 2
            opts.FlatOutSize *= 2

        self.attCap = nn.ModuleList([SA(opts) for _ in range(opts.layers)])
        self.attFlatCap = AttFlat(opts)
        self.fc = nn.Linear(opts.hiddenDim, opts.hiddenDim)

    def forward(self, cap, hist=None):
        capMask = self.make_mask(cap.unsqueeze(2))
        cap = self.embedding(cap)
        cap, (_, _) = self.lstmC(cap)
        capO = cap.detach().clone()

        for attC in self.attCap:
            cap = attC(cap, capMask)
        # (batchSize, 512)
        cap = self.attFlatCap(cap, capMask)
        encOut = self.fc(cap)
        return encOut, capO
    
    # Masking
    def make_mask(self, feature):
        return (torch.sum(
            torch.abs(feature),
            dim=-1
        ) == 0).unsqueeze(1).unsqueeze(2)

In [43]:
class Execution:
    def __init__(self, optsQ, optsC):
        self.opts = deepcopy(optsQ)
        if self.opts.useCuda > 0 and torch.cuda.is_available():
            self.device = torch.device("cuda:0")
            print("[INFO] Using GPU {} ...".format(torch.cuda.get_device_name(0)))
        else:
            print("[INFO] Using CPU ...")
            self.device = torch.device("cpu")

        self.loss_fn = torch.nn.NLLLoss().to(self.device)

        print("[INFO] Loading dataset ...")

        self.datasetTr = ClevrDialogQuestionDataset(
            self.opts.dataPathTr, self.opts.vocabPath, "train", "All tr data")

        self.datasetVal = ClevrDialogQuestionDataset(
            self.opts.dataPathVal, self.opts.vocabPath, "val", "All val data", train=False)

        self.datasetTest = ClevrDialogQuestionDataset(
            self.opts.dataPathTest, self.opts.vocabTestPath, "test", "All val data", train=False)
        


     
        self.QuestionNet = constructQuestionNet(
            self.opts,
            self.datasetTr.lenVocabText,
            self.datasetTr.lenVocabProg,
            self.datasetTr.maxLenProg,
            #self.datasetTest.lenVocabText,#*** to solve mismatch problems
            #self.datasetTest.lenVocabProg,#***
            #self.datasetTest.maxLenProg#***
            )

        if os.path.isfile(self.opts.captionNetPath):
            self.CaptionNet = constructCaptionNet(
                optsC,
                self.datasetTr.lenVocabText,
                self.datasetTr.lenVocabProg,
                self.datasetTr.maxLenProg,
                #self.datasetTest.lenVocabText,#*** 
                #self.datasetTest.lenVocabProg,#***
                #self.datasetTest.maxLenProg#***
                )
            print('Loading CaptionNet from {}'.format(self.opts.captionNetPath))
            state_dict = torch.load(self.opts.captionNetPath)['state_dict']
            self.CaptionNet.load_state_dict(state_dict)
            self.CaptionNet.to(self.device)
            total_params_cap = sum(p.numel() for p in self.CaptionNet.parameters() if p.requires_grad)
            print("The caption encoder has {} trainable parameters".format(total_params_cap))

        self.QuestionNet.to(self.device)
        if os.path.isfile(self.opts.questionNetPath):
            print('Loading QuestionNet from {}'.format(optsQ.questionNetPath))
            state_dict = torch.load(self.opts.questionNetPath)['state_dict']
            self.QuestionNet.load_state_dict(state_dict)
        total_params_quest = sum(p.numel() for p in self.QuestionNet.parameters() if p.requires_grad)
        print("The question encoder has {} trainable parameters".format(total_params_quest))

        if "minecraft" in self.opts.scenesPath:
            self.symbolicExecutor = SymbolicExecutorMinecraft(self.opts.scenesPath)
        else:
            self.symbolicExecutor = SymbolicExecutorClevr(self.opts.scenesPath)

        tb_path = os.path.join(self.opts.run_dir, "tb_logdir")
        if not os.path.isdir(tb_path):
            os.makedirs(tb_path)

        self.ckpt_path = os.path.join(self.opts.run_dir, "ckpt_dir")
        if not os.path.isdir(self.ckpt_path):
            os.makedirs(self.ckpt_path)
        if not os.path.isdir(self.opts.text_log_dir):
            os.makedirs(self.opts.text_log_dir)

        self.writer = SummaryWriter(tb_path)
        self.iter_val = 0
#***
        #if os.path.isfile(self.opts.dependenciesPath):
            #with open(self.opts.dependenciesPath, "rb") as f:
                #self.dependencies = pickle.load(f)

    def train(self):
        self.QuestionNet.train()

        # Define the multi-gpu training if needed
        if len(self.opts.gpu_ids) > 1:
            self.QuestionNet = nn.DataParallel(self.QuestionNet, device_ids=self.opts.gpu_ids)

        # Load checkpoint if resume training
        if os.path.isfile(self.opts.load_checkpoint_path):
            print("[INFO] Resume trainig from ckpt {} ...".format(
                self.opts.load_checkpoint_path
            ))

            # Load the network parameters
            ckpt = torch.load(self.opts.load_checkpoint_path)
            print("[INFO] Checkpoint successfully loaded ...")
            self.QuestionNet.load_state_dict(ckpt['state_dict'])

            # Load the optimizer paramters
            optim = get_optim(self.opts, self.QuestionNet, len(self.datasetTr))  # , ckpt['optim'], lr_base=ckpt['lr_base'])
            # optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
            optim.optimizer.load_state_dict(ckpt['optimizer'])
            _iter = 0  #  ckpt['last_iter']
            epoch = 0  # ckpt['last_epoch']

        else:
            optim = get_optim(self.opts, self.QuestionNet, len(self.datasetTr))
            _iter = 0
            epoch = 0

        trainTime = 0
        bestValAcc = float("-inf")
        bestCkp = 0
        # Training loop
        while _iter < self.opts.num_iters:

            # Learning Rate Decay
            if _iter in self.opts.lr_decay_marks:
                adjust_lr(optim, self.opts.lr_decay_factor)

            # Define multi-thread dataloader
            dataloader = Data.DataLoader(
                self.datasetTr,
                batch_size=self.opts.batch_size,
                shuffle=self.opts.shuffle_data,
                num_workers=self.opts.num_workers,
            )

            # Iteration
            time_start = 0
            time_end = 0
            for batch_iter, (quest, hist, prog, questionRound, _) in enumerate(dataloader):
                time_start = time.time()
                if _iter >= self.opts.num_iters:
                    break
                quest = quest.to(self.device)
                if self.opts.last_n_rounds < 10:
                    last_n_rounds_batch = []
                    for i, r in enumerate(questionRound.tolist()):
                        startIdx = max(r - self.opts.last_n_rounds, 0)
                        endIdx = max(r, self.opts.last_n_rounds)
                        if hist.dim() == 3:
                            assert endIdx - startIdx == self.opts.last_n_rounds
                            histBatch = hist[i, :, :]
                            last_n_rounds_batch.append(histBatch[startIdx:endIdx, :])
                        elif hist.dim() == 2:
                            startIdx *= 20
                            endIdx *= 20
                            histBatch = hist[i, :]
                            temp = histBatch[startIdx:endIdx].cpu()
                            if r > self.opts.last_n_rounds:
                                last_n_rounds_batch.append(torch.cat([torch.tensor([1]), temp, torch.tensor([2])], 0))
                            else:
                                last_n_rounds_batch.append(torch.cat([temp, torch.tensor([2, 0])], 0))
                    hist = torch.stack(last_n_rounds_batch, dim=0)
                hist = hist.to(self.device)
                prog = prog.to(self.device)
                progTarget = prog.clone()
                optim.zero_grad()

                predSoftmax, _ = self.QuestionNet(quest, hist, prog[:, :-1])
                loss = self.loss_fn(
                    # predSoftmax[:, :-1, :].contiguous().view(-1, predSoftmax.size(2)),
                    predSoftmax.contiguous().view(-1, predSoftmax.size(2)),
                    progTarget[:, 1:].contiguous().view(-1))
                loss.backward()

                if _iter % self.opts.validate_every == 0 and _iter > 0:
                    valAcc = self.val()
                    if valAcc > bestValAcc:
                        bestValAcc = valAcc
                        bestCkp = _iter
                        print("\n[INFO] Checkpointing model @ iter {} with val accuracy {}\n".format(_iter, valAcc))
                        state = {
                            'state_dict': self.QuestionNet.state_dict(),
                            'optimizer': optim.optimizer.state_dict(),
                            'lr_base': optim.lr_base,
                            'optim': optim.lr_base,
                            'last_iter': _iter,
                            'last_epoch': epoch,
                        }
                        # checkpointing
                        torch.save(
                            state,
                            os.path.join(self.ckpt_path, 'ckpt_iter' + str(_iter) + '.pkl')
                        )

                # logging
                self.writer.add_scalar(
                    'train/loss',
                    loss.cpu().data.numpy(),
                    global_step=_iter)

                self.writer.add_scalar(
                    'train/lr',
                    optim._rate,
                    global_step=_iter)
                if _iter % self.opts.display_every == 0:
                    time_end = time.time()
                    trainTime += time_end-time_start

                    print("\r[CLEVR-Dialog - %s (%d | %d)][epoch %2d][iter %4d/%4d][runtime %4f] loss: %.4f, lr: %.2e" % (
                        self.datasetTr.name,
                        batch_iter,
                        len(dataloader),
                        epoch,
                        _iter,
                        self.opts.num_iters,
                        trainTime,
                        loss.cpu().data.numpy(),
                        optim._rate,
                    ), end='          ')

                optim.step()
                _iter += 1

            epoch += 1
        print("[INFO] Avg. epoch time: {} s".format(trainTime / epoch))
        print("[INFO] Best model achieved val acc. {} @ iter {}".format(bestValAcc, bestCkp))

    def val(self):
        self.QuestionNet.eval()

        total_correct = 0
        total = 0

        if len(self.opts.gpu_ids) > 1:
            self.QuestionNet = nn.DataParallel(self.QuestionNet, device_ids=self.opts.gpu_ids)
        self.QuestionNet = self.QuestionNet.eval()
        dataloader = Data.DataLoader(
            self.datasetVal,
            batch_size=self.opts.batch_size,
            shuffle=True,
            num_workers=self.opts.num_workers,
            pin_memory=False
        )
        _iterCur = 0
        _totalCur = len(dataloader)

        for step, (question, questionPrg, questionImgIdx, questionRounds, history, historiesProg, answer) in enumerate(dataloader):
            # print("\rEvaluation: [step %4d/%4d]" % (
            print("\rEvaluation: [step %4d/%4d]" % (
                step,
                int(len(dataloader)),
            ), end='          ')

            question = question.to(self.device)
            if history.dim() == 3:
                caption = history.detach()
                caption = caption[:, 0, :]
                caption = caption[:, :16].to(self.device)
            elif history.dim() == 2:
                caption = history.detach()
                caption = caption[:, :16].to(self.device)
            if self.opts.last_n_rounds is not None:
                last_n_rounds_batch = []
                for i, r in enumerate(questionRounds.tolist()):
                    startIdx = max(r - self.opts.last_n_rounds, 0)
                    endIdx = max(r, self.opts.last_n_rounds)
                    if history.dim() == 3:
                        assert endIdx - startIdx == self.opts.last_n_rounds
                        histBatch = history[i, :, :]
                        last_n_rounds_batch.append(histBatch[startIdx:endIdx, :])
                    elif history.dim() == 2:
                        startIdx *= 20
                        endIdx *= 20
                        histBatch = history[i, :]
                        temp = histBatch[startIdx:endIdx]
                        if r > self.opts.last_n_rounds:
                            last_n_rounds_batch.append(torch.cat([torch.tensor([1]), temp, torch.tensor([2])], 0))
                        else:
                            last_n_rounds_batch.append(torch.cat([temp, torch.tensor([2, 0])], 0))
                history = torch.stack(last_n_rounds_batch, dim=0)
            history = history.to(self.device)
            questionPrg = questionPrg.to(self.device)
            questProgsToksPred = self.QuestionNet.sample(question, history)
            questProgsPred = decodeProg(questProgsToksPred, self.datasetVal.vocab["idx_prog_to_token"])

            targetProgs = decodeProg(questionPrg, self.datasetVal.vocab["idx_prog_to_token"], target=True)
            #print("this is the value for targetProgs:", targetProgs)
            #print("-----------------------------------------------------")
            correct = [1 if pred == gt else 0 for (pred, gt) in zip(questProgsPred, targetProgs)]
            #print("lets see what is inside of the correct:", correct)
            #print("-----------------------------------------------------")
            correct = sum(correct)
            total_correct += correct
            total += len(targetProgs)
            self.QuestionNet.train()

        return 100.0 * (total_correct / total)



    def getPrediction(self, questProgPred, capProgPred, historyProg, imgIndex):
        self.symbolicExecutor.reset(imgIndex)
        # if round one, execute the predicted caption program first then answer the question
        if len(historyProg) == 1:
            captionFuncLabel = capProgPred[0]
            captionFuncArgs = capProgPred[1:]

            questionFuncLabel = questProgPred[0]
            questionFuncArgs = questProgPred[1:]

            try:
                _ = self.symbolicExecutor.execute(captionFuncLabel, captionFuncArgs)
            except Exception as e:
                #print("Error is in caption This is the first round",e)
               # print(captionFuncLabel, captionFuncArgs)
                return "Error"

                

            try:
                predAnswer = self.symbolicExecutor.execute(questionFuncLabel, questionFuncArgs)
            except Exception as e:
               # print ("Error is in question. This is the first round ",e)
               # print("The question questionFuncLabel ",questionFuncLabel)
               # print("The questionFunc Args ",questionFuncArgs)
               # print(self.symbolicExecutor.execute(questionFuncLabel,questionFuncArgs))

                return "Error"

        # If it is not the first round, we have to execute the program history and
        # then answer the question.
        else:
            questionFuncLabel = questProgPred[0]
            questionFuncArgs = questProgPred[1:]
            for prg in historyProg:
                # prg = prg.split(" ")
                FuncLabel = prg[0]
                FuncArgs = prg[1:]
                try:
                    _ = self.symbolicExecutor.execute(FuncLabel, FuncArgs)
                except:
                    #print("Error executing in history program:", )
                    return "Error"

            try:
                predAnswer = self.symbolicExecutor.execute(questionFuncLabel, questionFuncArgs)
            except Exception as e:
                #print("Error executing in current program:", e)  #

                return "Error"
        return str(predAnswer)

    def run(self, run_mode, epoch=None):
        self.set_seed(self.opts.seed)
        if run_mode == 'train':
            self.train()
    
       
        else:
            exit(-1)

    def set_seed(self, seed):
        """Sets the seed for reproducibility.
        Args:
            seed (int): The seed used
        """
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        print('[INFO] Seed set to {}...'.format(seed))


def constructQuestionNet(opts, lenVocabText, lenVocabProg, maxLenProg):
    decoder = Decoder(opts, lenVocabProg, maxLenProg)
    if opts.encoderType == 1:
        encoder = QuestEncoder_1(opts, lenVocabText)
    elif opts.encoderType == 2:
        encoder = QuestEncoder_2(opts, lenVocabText)

    net = SeqToSeqQ(encoder, decoder)
    return net


def constructCaptionNet(opts, lenVocabText, lenVocabProg, maxLenProg):
    decoder = Decoder(opts, lenVocabProg, maxLenProg)
    encoder = CaptionEncoder(opts, lenVocabText)
    net = SeqToSeqC(encoder, decoder)
    return net


def getProgHistories(progHistToks, prgIdxToToken):
    progHist = []
    temp = []
    for tok in progHistToks:
        if tok not in [0, 1, 2]:
            temp.append(prgIdxToToken[tok])
            # del progHistToks[i]
        if tok == 2:
            # del progHistToks[i]
            # progHist.append(" ".join(temp))
            progHist.append(temp)
            temp = []
    return progHist


def getHistoriesFromStack(histToks, textIdxToToken):
    histories = "\n"
    temp = []
    for i, roundToks in enumerate(histToks):
        for tok in roundToks:
            if tok not in [0, 1, 2]:
                temp.append(textIdxToToken[tok])
                # del progHistToks[i]
            if tok == 2:
                # del progHistToks[i]
                if i == 0:
                    histories += " ".join(temp) + ".\n"
                else:
                    histories += " ".join(temp[:-1]) + "? | {}.\n".format(temp[-1])
                # histories.append(temp)
                temp = []
                break
    return histories


def getHistoriesFromConcat(histToks, textIdxToToken):
    histories = []
    temp = []
    for tok in histToks:
        if tok not in [0, 1, 2]:
            temp.append(textIdxToToken[tok])
            # del progHistToks[i]
        if tok == 2:
            # del progHistToks[i]
            histories.append(" ".join(temp[:-1]) + "? | {}".format(temp[-1]))
            # histories.append(temp)
            temp = []
    return histories


def decodeProg(tokens, prgIdxToToken, target=False):
    #tokensBatch = tokens.tolist()
    if (target == True):#***
        tokensBatch = tokens.tolist()
    else:#***
        tokensBatch = tokens
    progsBatch = []
    for tokens in tokensBatch:
        prog = []
        for tok in tokens:
            if tok == 2:  # <END> has index 2
                break
            prog.append(prgIdxToToken.get(tok))
        if target:
            prog = prog[1:]
        # progsBatch.append(" ".join(prog))
        progsBatch.append(prog)
    return progsBatch


def printPred(predSoftmax, gts, prgIdxToToken):
    assert predSoftmax.size(0) == gts.size(0)
    tokens = predSoftmax.topk(1)[1].squeeze(-1)
    tokens = tokens.tolist()
    gts = gts.tolist()
    message = "\n ------------------------ \n"
    for token, gt in zip(tokens, gts):
        message += "Prediction: "
        for tok in token:
            message += prgIdxToToken.get(tok) + " "
        message += "\n Target   : "
        for tok in gt:
            message += prgIdxToToken.get(tok) + " "
        message += "\n ------------------------ \n"
    return message


def get_per_round_acc(preds, gts, penalties):
    res = {}
    for img_preds, img_gt, img_pen in zip(preds, gts, penalties):
        img_preds = list(img_preds)
        img_gt = list(img_gt)
        img_pen = list(img_pen)
        print(img_pen)
        for i, (pred, gt, pen) in enumerate(zip(img_preds, img_gt, img_pen)):
            _round = str(i + 1)
            if _round not in res:
                res[_round] = {
                    "correct": 0,
                    "all": 0
                }
            res[_round]["all"] += 1
            if pred == gt:
                pen = 0.1
                res[_round]["correct"] += 0.5**pen

    textOut = "\n --------------- Per round Acc --------------- \n"
    for k in res:
        textOut += "{}: {} %\n".format(k, 100.0 * (res[k]["correct"]/res[k]["all"]))
    return textOut


def get_per_question_type_acc(preds, gts, qtypes, penalties):
    res1 = {}
    res2 = {}

    for img_preds, img_gt, img_qtypes, img_pen in zip(preds, gts, qtypes, penalties):
        # img_preds = list(img_preds)
        # img_gt = list(img_gt)
        img_pen = list(img_pen)
        for pred, gt, temp, pen in zip(img_preds, img_gt, img_qtypes, img_pen):
            if temp not in res1:
                res1[temp] = {
                    "correct": 0,
                    "all": 0
                }
            temp_cat = temp.split("-")[0]
            if temp_cat not in res2:
                res2[temp_cat] = {
                    "correct": 0,
                    "all": 0
                }
            res1[temp]["all"] += 1
            res2[temp_cat]["all"] += 1

            if pred == gt:
                pen = 0.1 
                res1[temp]["correct"] += 0.5**pen
                res2[temp_cat]["correct"] += 0.5**pen

    textOut = "\n --------------- Per question Type Acc --------------- \n"
    for k in res1:
        textOut += "{}: {} %\n".format(k, 100.0 * (res1[k]["correct"]/res1[k]["all"]))

    textOut += "\n --------------- Per question Category Acc --------------- \n"
    for k in res2:
        textOut += "{}: {} %\n".format(k, 100.0 * (res2[k]["correct"]/res2[k]["all"]))
    return textOut


def decode(tokens, prgIdxToToken, target=False):
    if type(tokens) != list:
        tokens = tokens.tolist()

    progsBatch = []
    for token in tokens:
        prog = []
        for tok in token:
            if tok == 2:  # <END> has index 2
                break
            prog.append(prgIdxToToken.get(tok))
        if target:
            prog = prog[1:]
        # progsBatch.append(" ".join(prog))
        progsBatch.append(prog)
    return progsBatch

#if __name__ == "__main__":#***

In [44]:
optsC = OptionsC().parse()#***


[INFO] Using 1 CUDA device(s) ...

 ------------------------------Opts------------------------------
mode: train
run_dir: kaggle/working
load_checkpoint_path: None
res_path: kaggle/working/res.txt
gpu_ids: [0]
seed: 42
dataPathTr: /kaggle/input/caption_small/tr_cap_s.h5
dataPathVal: /kaggle/input/caption_small/val_cap_s.h5
dataPathTest: /kaggle/input/caption_small/test_cap_s.h5
vocabPath: /kaggle/input/caption/vocab_output_caption.json
batch_size: 64
num_workers: 0
num_iters: 5000
display_every: 5
debug_every: 100
validate_every: 1000
shuffle_data: 1
optim: adam
lr: 0.001
betas: [0.9, 0.98]
eps: 1e-09
lr_decay_marks: [50000, 55000]
lr_decay_factor: 0.5
weight_decay: 1e-06
embedDim: 300
hiddenDim: 512
numLayers: 2
dropout: 0.1
multiHead: 8
hiddenSizeHead: 64
FeedForwardSize: 2048
FlatMLPSize: 512
FlatGlimpses: 1
FlatOutSize: 512
layers: 6
bidirectional: 1


In [45]:
optsQ = OptionsQ().parse()#***


[INFO] Using 1 CUDA device(s) ...

 ------------------------------Opts------------------------------
mode: train
run_dir: /kaggle/working
useCuda: 1
text_log_dir: /kaggle/working/res.txt
questionNetPath: 
captionNetPath: 
dialogLen: 10
last_n_rounds: 10
encoderType: 1
load_checkpoint_path: None
gpu_ids: [0]
seed: 42
dataPathTr: /kaggle/input/Small_Tr_Val_Test_Final/train_concat_half.h5
dataPathVal: /kaggle/input/Small_Tr_Val_Test_Final/cap_val_half.h5
dataPathTest: /kaggle/input/Small_Tr_Val_Test_Final/test_concat_1000.h5
scenesPath: /kaggle/input/data/CLEVR_train_scenes.json
vocabTestPath: /kaggle/input/test_concat/vocab_output.json
vocabPath: /kaggle/input/train_concat/vocab_output.json
batch_size: 32
countFirstFailueRound: 0
maxSamples: -1
num_workers: 0
num_iters: 5000
display_every: 5
validate_every: 1000
shuffle_data: 1
optim: adam
lr: 0.001
betas: [0.9, 0.98]
eps: 1e-09
lr_decay_marks: [50000, 55000]
lr_decay_factor: 0.5
weight_decay: 1e-06
embedDim: 300
hiddenDim: 512
numLayer

In [None]:
exe = Execution(optsQ, optsC)#***
exe.run('train')

[INFO] Using GPU Tesla T4 ...
[INFO] Loading dataset ...
The question encoder has 139784134 trainable parameters
[INFO] Seed set to 42...
Evaluation: [step   39/  40]          49)][epoch  0][iter  995/5000][runtime 170.128428] loss: 0.1096, lr: 5.00e-04          
[INFO] Checkpointing model @ iter 1000 with val accuracy 41.47317854283427

Evaluation: [step   39/  40]          649)][epoch  0][iter 1995/5000][runtime 353.707362] loss: 0.0037, lr: 5.00e-04          
[INFO] Checkpointing model @ iter 2000 with val accuracy 49.639711769415534

Evaluation: [step   39/  40]          649)][epoch  0][iter 2995/5000][runtime 537.297892] loss: 0.0000, lr: 5.00e-04          
[INFO] Checkpointing model @ iter 3000 with val accuracy 51.72137710168134

[CLEVR-Dialog - All tr data (4995 | 54649)][epoch  0][iter 4995/5000][runtime 901.553704] loss: 0.0000, lr: 5.00e-04          