In [47]:
#image.py
def convert_view_tree_file(view_tree_path, config_json):
    """Load a view-tree JSON file and rasterize it with ``convert_view_tree``.

    Args:
        view_tree_path: Path of the JSON file holding the view tree.
        config_json: Configuration mapping, forwarded unchanged to
            ``convert_view_tree``.

    Returns:
        Whatever ``convert_view_tree`` returns for the parsed tree.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which would break on non-ASCII view text on some systems.
    with open(view_tree_path, "r", encoding="utf-8") as view_tree_file:
        view_tree = json.load(view_tree_file)
    return convert_view_tree(view_tree, config_json)

def convert_semantic_view_tree_file(view_tree_path, config_json):
    """Load a semantic view-tree JSON file and convert it to labelled boxes.

    Args:
        view_tree_path: Path of the JSON file holding the semantic view tree.
        config_json: Configuration mapping, forwarded unchanged to
            ``convert_semantic_view_tree``.

    Returns:
        Whatever ``convert_semantic_view_tree`` returns for the parsed tree.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which would break on non-ASCII view text on some systems.
    with open(view_tree_path, "r", encoding="utf-8") as view_tree_file:
        view_tree = json.load(view_tree_file)
    return convert_semantic_view_tree(view_tree, config_json)

def convert_semantic_view_tree(view_tree, config_json):
    """Turn a semantic view tree into a list of labelled, normalised boxes.

    Every node carrying a ``"componentLabel"`` whose bounds lie inside
    ``origin_dim`` and have positive area yields one entry
    ``[label_id, x_center, y_center, width, height]``.  The label id is the
    position of the label inside ``config_json["semantic_labels"]``; sizes are
    divided by ``origin_dim`` and the centre is shifted by the view offset
    divided by ``downscale_dim``.

    Args:
        view_tree: Root node of the semantic view tree, or ``None``.
        config_json: Mapping providing ``origin_dim``, ``downscale_dim`` and
            ``semantic_labels`` (plus whatever the helper functions read).

    Returns:
        The list of boxes, or ``None`` when ``view_tree`` is ``None`` or is
        rejected by ``is_view_hierarchy_valid``.
    """
    origin_dim = config_json["origin_dim"]
    downscale_dim = config_json["downscale_dim"]
    label_to_id = {
        label: index
        for index, label in enumerate(config_json["semantic_labels"])
    }

    if view_tree is None:
        return None
    if not is_view_hierarchy_valid(view_tree, config_json, semantic_ui=True):
        return None

    view_offset = compute_view_offset(view_tree, config_json, semantic_ui=True)
    boxes = []

    def collect_box(node):
        # Only labelled components produce a box.
        if "componentLabel" not in node:
            return
        rect = node["bounds"]
        left, top, right, bottom = rect[0], rect[1], rect[2], rect[3]
        inside_screen = (left >= 0 and top >= 0 and
                         right <= origin_dim[0] and bottom <= origin_dim[1])
        has_area = left < right and top < bottom
        if not (inside_screen and has_area):
            return
        x_center = (left + right) / origin_dim[0] / 2 + view_offset[0] / downscale_dim[0]
        y_center = (top + bottom) / origin_dim[1] / 2 + view_offset[1] / downscale_dim[1]
        box_width = (right - left) / origin_dim[0]
        box_height = (bottom - top) / origin_dim[1]
        label_id = label_to_id[node["componentLabel"]]
        boxes.append([label_id, x_center, y_center, box_width, box_height])

    traverse_view_tree(view_tree, collect_box, semantic_ui=True)
    return boxes

def convert_view_tree(view_tree, config_json):
    """Rasterize the leaf views of a view tree into a multi-channel canvas.

    The canvas is indexed ``[x, y, channel]``.  Each leaf view (a node without
    children) is filled with 1.0 in the text channel when it has a non-``None``
    ``"text"`` entry and in the image channel otherwise; a band of
    ``boundary_width`` pixels around the rectangle is then reset to 0.0 in the
    same channel.

    All slice bounds are clamped to the canvas.  Without clamping, a negative
    start index would be interpreted by NumPy as "counted from the end", which
    silently skipped the border of views near the left/top edge and drew
    off-canvas views at the opposite edge.

    Args:
        view_tree: Parsed view tree (with ``["activity"]["root"]``), or
            ``None``.
        config_json: Mapping providing ``origin_dim``, ``downscale_dim``,
            ``text_dim``, ``image_dim``, ``total_dims`` and ``boundary_width``
            (plus whatever the helper functions read).

    Returns:
        A float32 array of shape
        ``(downscale_dim[0], downscale_dim[1], total_dims)``; all zeros when
        ``view_tree`` is ``None`` or rejected by ``is_view_hierarchy_valid``.
    """
    origin_dim = config_json["origin_dim"]
    downscale_dim = config_json["downscale_dim"]
    downscale_ratio = downscale_dim[0] / origin_dim[0]
    text_dim = config_json["text_dim"]
    image_dim = config_json["image_dim"]
    total_dims = config_json["total_dims"]
    bw = config_json["boundary_width"]
    width, height = downscale_dim[0], downscale_dim[1]

    image_data = np.zeros((width, height, total_dims), dtype=np.float32)

    if view_tree is None:
        return image_data
    if not is_view_hierarchy_valid(view_tree, config_json):
        return image_data

    view_offset = compute_view_offset(view_tree, config_json)

    def clip_x(value):
        return min(max(value, 0), width)

    def clip_y(value):
        return min(max(value, 0), height)

    def view_call_back(view):
        # Only leaf views are drawn.
        if view.get("children"):
            return
        bounds = view["bounds"]
        x_min = int(bounds[0] * downscale_ratio) + view_offset[0]
        y_min = int(bounds[1] * downscale_ratio) + view_offset[1]
        x_max = int(bounds[2] * downscale_ratio) + view_offset[0]
        y_max = int(bounds[3] * downscale_ratio) + view_offset[1]
        if x_min >= x_max or y_min >= y_max:
            return
        draw_dim = text_dim if view.get("text") is not None else image_dim

        # Inner rectangle and outer (rectangle + border) extents, both
        # clamped so that no slice index is negative or past the canvas.
        x0, x1 = clip_x(x_min), clip_x(x_max)
        y0, y1 = clip_y(y_min), clip_y(y_max)
        ox0, ox1 = clip_x(x_min - bw), clip_x(x_max + bw)
        oy0, oy1 = clip_y(y_min - bw), clip_y(y_max + bw)

        image_data[x0:x1, y0:y1, draw_dim] = 1.0
        # Clear the four border bands so adjacent views stay separated.
        image_data[ox0:x0, oy0:oy1, draw_dim] = 0.0
        image_data[x1:ox1, oy0:oy1, draw_dim] = 0.0
        image_data[ox0:ox1, oy0:y0, draw_dim] = 0.0
        image_data[ox0:ox1, y1:oy1, draw_dim] = 0.0

    traverse_view_tree(view_tree["activity"]["root"], view_call_back)

    return image_data

def visualize_view_tree(image_data, config_json):
    """Show the text and image channels of a rasterized view tree.

    The two channels are transposed into a ``(downscale_dim[1],
    downscale_dim[0], 3)`` float32 image, whose shape is printed, and the
    image is displayed with matplotlib.

    Args:
        image_data: Array indexed ``[x, y, channel]``.
        config_json: Mapping providing ``downscale_dim``, ``text_dim`` and
            ``image_dim``.
    """
    downscale_dim = config_json["downscale_dim"]
    text_dim = config_json["text_dim"]
    image_dim = config_json["image_dim"]

    canvas = np.zeros([downscale_dim[1], downscale_dim[0], 3], dtype=np.float32)
    print(canvas.shape)

    # Copy each channel, swapping the x/y axes for display.
    for channel in (text_dim, image_dim):
        canvas[:, :, channel] = image_data[:, :, channel].T

    plt.imshow(canvas, interpolation='nearest')
    plt.show()

In [48]:
#text_input.py
from utils import traverse_view_tree, get_text_view_signature, is_text_view, \
                  is_view_hierarchy_valid, compute_view_offset

def add_text_inputs(view_tree_paths, image_array,
                    heatmap_array, interact_array, config_json):
    """Insert synthetic "text input" samples wherever an edit field changes.

    The view trees are scanned in order and, for every text view (as decided
    by ``is_text_view``), the text seen in each frame is recorded under a key
    built from the activity name and ``get_text_view_signature``.  Whenever
    the text of a field differs from the previously recorded one (the initial
    reference is the empty string), a new sample is inserted immediately
    before the frame in which the change is observed.  The inserted sample
    reuses that frame's path and a copy of its image, carries an interaction
    of type ``interact_input_text`` with the new text, and gets a heatmap
    looked up from ``touch_input.GAUSS_MAP`` around the field's centre.

    ``touch_input.GAUSS_MAP`` must already be populated when at least one text
    change is found; it is indexed directly.

    Args:
        view_tree_paths: Paths of the view-tree JSON files, one per frame.
        image_array: Per-frame images, same length as ``view_tree_paths``.
        heatmap_array: Per-frame heatmaps, same length.
        interact_array: Per-frame interaction records, same length.
        config_json: Mapping providing ``total_dims``, ``downscale_dim``,
            ``origin_dim``, ``interact_dim`` and ``interact_input_text``
            (plus whatever the helper functions read).

    Returns:
        A 4-tuple of new lists ``(view_tree_paths, image_array,
        heatmap_array, interact_array)`` with the synthetic samples inserted.

    Raises:
        AssertionError: If the four input sequences differ in length.
    """
    total_dims = config_json["total_dims"]
    downscale_dim = config_json["downscale_dim"]
    origin_dim = config_json["origin_dim"]
    downscale_ratio = downscale_dim[0] / origin_dim[0]
    interact_dim = config_json["interact_dim"]
    interact_input_text = config_json["interact_input_text"]

    assert(len(view_tree_paths) == len(image_array) == len(heatmap_array) == len(interact_array))

    text_history = {} # activity + view_id -> {"pos": [x, y], "texts": [[idx, text], ...]}

    for i, view_tree_path in enumerate(view_tree_paths):
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(view_tree_path, "r", encoding="utf-8") as view_tree_file:
            view_tree = json.load(view_tree_file)

        if view_tree is None:
            continue
        if not is_view_hierarchy_valid(view_tree, config_json):
            continue
        activity_name = view_tree["activity_name"]
        if activity_name is None:
            continue

        view_offset = compute_view_offset(view_tree, config_json)

        # Per-frame values are bound as defaults so the callback does not
        # depend on the loop variables still holding this frame's values.
        def view_call_back(view, frame_idx=i, activity_name=activity_name,
                           view_offset=view_offset):
            if not is_text_view(view):
                return
            bounds = view["bounds"]

            x_min = max(0, int(bounds[0] * downscale_ratio) + view_offset[0])
            y_min = max(0, int(bounds[1] * downscale_ratio) + view_offset[1])
            x_max = min(downscale_dim[0], int(bounds[2] * downscale_ratio) + view_offset[0])
            y_max = min(downscale_dim[1], int(bounds[3] * downscale_ratio) + view_offset[1])

            if x_min > x_max or y_min > y_max:
                return

            text_view_id = activity_name + ":" + get_text_view_signature(view)
            if text_view_id not in text_history:
                # The position is taken from the first frame the field is seen in.
                text_history[text_view_id] = {
                    "pos": [min(int((x_min + x_max) / 2), downscale_dim[0] - 1),
                            min(int((y_min + y_max) / 2), downscale_dim[1] - 1)],
                    "texts": []
                }
            text_history[text_view_id]["texts"].append([frame_idx, view["text"]])

        traverse_view_tree(view_tree["activity"]["root"], view_call_back)

    # identify text changes: frame index -> [{"pos": ..., "text": ...}, ...]
    text_changes = {}
    for record in text_history.values():
        pos = record["pos"]
        texts = [[0, ""]] + record["texts"]
        for previous, current in zip(texts, texts[1:]):
            if previous[1] != current[1]:
                text_changes.setdefault(current[0], []).append(
                    {"pos": pos, "text": current[1]})

    new_view_tree_paths = []
    new_image_array = []
    new_heatmap_array = []
    new_interact_array = []

    last_idx = 0
    for idx in sorted(text_changes):
        # Copy the untouched frames that precede the change.
        new_view_tree_paths += view_tree_paths[last_idx:idx]
        new_image_array += image_array[last_idx:idx]
        new_heatmap_array += heatmap_array[last_idx:idx]
        new_interact_array += interact_array[last_idx:idx]

        # from left to right, up to down
        sorted_inputs = sorted(text_changes[idx], key=lambda x: x["pos"])
        for text_input in sorted_inputs:
            new_view_tree_paths.append(view_tree_paths[idx])
            new_image_array.append(np.copy(image_array[idx]))
            new_interact_array.append({
                "interact_type": interact_input_text,
                "text": text_input["text"]
            })
            text_heatmap = np.zeros((downscale_dim[0], downscale_dim[1], total_dims), dtype=np.float32)
            # GAUSS_MAP is indexed by absolute pixel offset from the centre;
            # gather the whole plane in one indexing operation.
            offset_x = np.abs(np.arange(downscale_dim[0]) - text_input["pos"][0])
            offset_y = np.abs(np.arange(downscale_dim[1]) - text_input["pos"][1])
            text_heatmap[:, :, interact_dim] = touch_input.GAUSS_MAP[np.ix_(offset_x, offset_y)]
            new_heatmap_array.append(text_heatmap)
        # Frame idx itself is emitted with the next slice.
        last_idx = idx

    new_view_tree_paths += view_tree_paths[last_idx:]
    new_image_array += image_array[last_idx:]
    new_heatmap_array += heatmap_array[last_idx:]
    new_interact_array += interact_array[last_idx:]

    return new_view_tree_paths, new_image_array, new_heatmap_array, new_interact_array


In [49]:
#touch_input.py

# Module-level cache of Gaussian density values, indexed by absolute (x, y)
# pixel offset from a centre point.  Stays None until convert_gestures()
# builds it on first use.
GAUSS_MAP = None

def gesture_classify(gesture, config_json):
    """Map a gesture (sequence of points) to one of the interaction codes.

    The decision uses the number of points and the displacement between the
    first and last point, scaled by ``downscale_dim``:

    * at most ``long_touch_threshold`` points -> ``interact_touch``;
    * truncated displacement length at most ``swipe_threshold``
      -> ``interact_long_touch``;
    * otherwise a swipe.  The horizontal axis wins when its absolute
      displacement is strictly larger.  With ``delta = first - last``:
      negative ``delta_x`` -> ``interact_swipe_right``, else
      ``interact_swipe_left``; positive ``delta_y`` -> ``interact_swipe_down``,
      else ``interact_swipe_up``.

    Args:
        gesture: Non-empty sequence of ``(x, y)`` points.
        config_json: Mapping providing ``downscale_dim``, the two thresholds
            and the ``interact_*`` codes.

    Returns:
        The matching ``interact_*`` value from ``config_json``.
    """
    downscale_dim = config_json["downscale_dim"]

    assert(len(gesture) > 0)
    if len(gesture) <= config_json["long_touch_threshold"]:
        return config_json["interact_touch"]

    first_point, last_point = gesture[0], gesture[-1]
    delta_x = (first_point[0] - last_point[0]) * downscale_dim[0]
    delta_y = (first_point[1] - last_point[1]) * downscale_dim[1]

    travelled = int(np.sqrt(delta_x ** 2 + delta_y ** 2))
    if travelled <= config_json["swipe_threshold"]:
        return config_json["interact_long_touch"]

    # horizon first
    if abs(delta_x) > abs(delta_y):
        swipe_key = "interact_swipe_right" if delta_x < 0 else "interact_swipe_left"
    else:
        swipe_key = "interact_swipe_down" if delta_y > 0 else "interact_swipe_up"
    return config_json[swipe_key]

def convert_gestures(gestures, config_json):
    """Convert gestures into interaction heatmaps and interaction records.

    For every gesture a float32 heatmap of shape
    ``(downscale_dim[0], downscale_dim[1], total_dims)`` is produced.  For a
    non-empty gesture the ``interact_dim`` channel is filled from the cached
    Gaussian table, centred on the gesture's first point, and the gesture is
    classified with ``gesture_classify``.  An empty gesture yields an all-zero
    heatmap and ``None`` as its record.

    The module-level ``GAUSS_MAP`` table is built on first use and rebuilt
    when its shape no longer matches ``downscale_dim``.  It is not rebuilt
    when only ``gauss_delta`` changes.

    Args:
        gestures: Iterable of gestures; each is a sequence of ``(x, y)``
            points (presumably normalised to [0, 1], since they are
            multiplied by ``downscale_dim`` -- confirm against the caller).
        config_json: Mapping providing ``downscale_dim``, ``interact_dim``,
            ``total_dims`` and ``gauss_delta`` (plus what ``gesture_classify``
            reads).

    Returns:
        ``(interact_heatmap_array, gesture_array)``: two lists with one entry
        per input gesture.
    """
    downscale_dim = config_json["downscale_dim"]
    interact_dim = config_json["interact_dim"]
    total_dims = config_json["total_dims"]
    gauss_delta = config_json["gauss_delta"]
    width, height = downscale_dim[0], downscale_dim[1]

    # generate GAUSS_MAP cache if not yet (or if the canvas size changed,
    # in which case the old table would be indexed out of range or be wrong)
    global GAUSS_MAP
    if GAUSS_MAP is None or GAUSS_MAP.shape != (width, height):
        var = multivariate_normal(mean=[0, 0], cov=[[gauss_delta, 0], [0, gauss_delta]])
        # Evaluate the density on the whole integer grid in one call; the
        # last axis holds the (x, y) components.
        grid = np.stack(np.meshgrid(np.arange(width), np.arange(height),
                                    indexing="ij"), axis=-1)
        # reshape() undoes SciPy's squeezing of length-1 dimensions.
        GAUSS_MAP = np.asarray(var.pdf(grid), dtype=np.float32).reshape(width, height)

    # image num, x, y, channels (TEXT/IMAGE)
    interact_heatmap_array = []
    gesture_array = []

    for gesture in gestures:
        interact_heatmap = np.zeros((width, height, total_dims), dtype=np.float32)
        interact_heatmap_array.append(interact_heatmap)

        if not len(gesture):
            gesture_array.append(None)
            continue

        gesture_kind = gesture_classify(gesture, config_json)
        gesture_array.append({
            "interact_type": gesture_kind,
        })

        # First point of the gesture, clamped onto the canvas.
        pos_x = min(max(int(gesture[0][0] * width), 0), width - 1)
        pos_y = min(max(int(gesture[0][1] * height), 0), height - 1)
        # GAUSS_MAP is indexed by absolute pixel offset from the centre;
        # gather the whole plane in one indexing operation.
        offset_x = np.abs(np.arange(width) - pos_x)
        offset_y = np.abs(np.arange(height) - pos_y)
        interact_heatmap[:, :, interact_dim] = GAUSS_MAP[np.ix_(offset_x, offset_y)]

    return interact_heatmap_array, gesture_array

def visualize_gesture(interact_heatmap, config_json):
    """Display the interaction channel of a heatmap, scaled to its maximum.

    The sum of the whole heatmap is printed, the ``interact_dim`` channel is
    transposed into a ``(downscale_dim[1], downscale_dim[0], 3)`` float32
    image, and the image is shown with matplotlib.

    Args:
        interact_heatmap: Array indexed ``[x, y, channel]``.
        config_json: Mapping providing ``downscale_dim`` and ``interact_dim``.
    """
    downscale_dim = config_json["downscale_dim"]
    interact_dim = config_json["interact_dim"]

    image_full = np.zeros([downscale_dim[1], downscale_dim[0], 3], dtype=np.float32)

    print(np.sum(interact_heatmap))
    image_full[:, :, interact_dim] = interact_heatmap[:, :, interact_dim].T
    max_val = np.max(image_full[:, :, interact_dim])
    # An all-zero heatmap (e.g. an empty gesture) must not be divided by
    # zero; that would turn the whole image into NaN.
    if max_val > 0:
        image_full /= max_val

    plt.imshow(image_full, interpolation='nearest' )
    plt.show()


In [50]:
#utils.py
def traverse_view_tree(view_tree, call_back, semantic_ui=False):
    """Visit a view tree in pre-order, calling ``call_back`` on each node.

    A ``None`` node is skipped.  Unless ``semantic_ui`` is true, a node that
    fails ``is_view_valid`` is skipped together with its whole subtree.

    Args:
        view_tree: Current node (a mapping, optionally with ``"children"``),
            or ``None``.
        call_back: Callable invoked with each visited node.
        semantic_ui: When true, the ``is_view_valid`` check is bypassed.
    """
    if view_tree is None:
        return
    if not semantic_ui and not is_view_valid(view_tree):
        return

    call_back(view_tree)

    if "children" not in view_tree:
        return
    for child_node in view_tree["children"]:
        traverse_view_tree(child_node, call_back, semantic_ui)

def is_view_hierarchy_valid(view_tree, config_json, semantic_ui=False):
    """Reject view hierarchies whose root is a wide, over-width rectangle.

    The hierarchy is invalid when the root's right bound exceeds both its
    bottom bound and ``origin_dim[0]``.

    Args:
        view_tree: The root node itself when ``semantic_ui`` is true,
            otherwise a mapping with the root under ``["activity"]["root"]``.
        config_json: Mapping providing ``origin_dim``.
        semantic_ui: Selects where the root node is found.

    Returns:
        ``False`` for the rejected case described above, ``True`` otherwise.
    """
    origin_dim = config_json["origin_dim"]
    root_node = view_tree if semantic_ui else view_tree["activity"]["root"]
    root_bounds = root_node["bounds"]
    right, bottom = root_bounds[2], root_bounds[3]
    # skip full-screen horizon ones
    is_horizontal_fullscreen = right > bottom and right > origin_dim[0]
    return not is_horizontal_fullscreen

def compute_view_offset(view_tree, config_json, semantic_ui=False):
    """Compute the pixel offset that re-centres a small window on the canvas.

    A non-zero offset is returned only when all of the following hold:

    * the root view's class starts with
      ``com.android.internal.policy.PhoneWindow``;
    * ``view_tree`` has no ``"activity_name"`` entry, or that entry is the
      package installer's ``GrantPermissionsActivity``;
    * the root is narrower than ``origin_dim[0]`` and shorter than
      ``origin_dim[1]`` minus the status and navigation bar heights.

    In that case the offset moves the root's centre to the centre of the
    area between the two bars, expressed in downscaled pixels.

    Args:
        view_tree: The root node itself when ``semantic_ui`` is true,
            otherwise a mapping with the root under ``["activity"]["root"]``.
        config_json: Mapping providing ``downscale_dim``, ``origin_dim``,
            ``status_bar_height`` and ``navigation_bar_height``.
        semantic_ui: Selects where the root node is found.

    Returns:
        A new two-element list ``[x_offset, y_offset]`` of ints; ``[0, 0]``
        when no shift applies.
    """
    root_view = view_tree if semantic_ui else view_tree["activity"]["root"]
    root_bounds = root_view["bounds"]

    downscale_dim = config_json["downscale_dim"]
    origin_dim = config_json["origin_dim"]
    status_bar_height = config_json["status_bar_height"]
    navigation_bar_height = config_json["navigation_bar_height"]
    downscale_ratio = downscale_dim[0] / origin_dim[0]

    # heuristically identify non-full-screen window like permission window
    if not root_view["class"].startswith("com.android.internal.policy.PhoneWindow"):
        return [0, 0]

    # view_tree from DroidBot may not contain activity_name
    if "activity_name" in view_tree and \
            view_tree["activity_name"] != "com.android.packageinstaller/com.android.packageinstaller.permission.ui.GrantPermissionsActivity":
        return [0, 0]

    root_width = root_bounds[2] - root_bounds[0]
    root_height = root_bounds[3] - root_bounds[1]
    usable_height = origin_dim[1] - status_bar_height - navigation_bar_height
    if not (root_width < origin_dim[0] and root_height < usable_height):
        return [0, 0]

    center_x = (root_bounds[0] + root_bounds[2]) / 2
    center_y = (root_bounds[1] + root_bounds[3]) / 2
    target_x = origin_dim[0] / 2
    target_y = (origin_dim[1] + status_bar_height - navigation_bar_height) / 2
    return [int((target_x - center_x) * downscale_ratio),
            int((target_y - center_y) * downscale_ratio)]

def is_view_valid(view):
    """Decide whether a view node should be visited during traversal.

    A view is valid when it is visible to the user, has both ``"bounds"`` and
    ``"rel-bounds"``, both rectangles have positive width and height, and the
    relative rectangle is at least as large as the absolute one in each
    dimension.

    Args:
        view: Mapping describing one view; ``"visible-to-user"`` is required.

    Returns:
        ``True`` when all conditions hold, ``False`` otherwise.
    """
    if not view["visible-to-user"]:
        return False

    if not ("bounds" in view and "rel-bounds" in view):
        return False

    abs_rect = view["bounds"]
    rel_rect = view["rel-bounds"]

    # Both rectangles must have strictly positive extent.
    abs_is_empty = abs_rect[0] >= abs_rect[2] or abs_rect[1] >= abs_rect[3]
    rel_is_empty = rel_rect[0] >= rel_rect[2] or rel_rect[1] >= rel_rect[3]
    if abs_is_empty or rel_is_empty:
        return False

    abs_width = abs_rect[2] - abs_rect[0]
    abs_height = abs_rect[3] - abs_rect[1]
    rel_width = rel_rect[2] - rel_rect[0]
    rel_height = rel_rect[3] - rel_rect[1]
    # The relative rectangle may not be smaller than the absolute one.
    return not (rel_width < abs_width or rel_height < abs_height)

def is_text_view(view):
    """Tell whether a view is an editable text field.

    A view qualifies when it has a ``"text"`` entry and ``"edittext"``
    occurs (case-insensitively) in one of its ancestor names or in its own
    class name.

    Args:
        view: Mapping describing one view, with ``"ancestors"`` and
            ``"class"`` entries.

    Returns:
        ``True`` for an editable text field, ``False`` otherwise.
    """
    if "text" not in view:
        return False
    if any("edittext" in ancestor.lower() for ancestor in view["ancestors"]):
        return True
    own_class = view["class"].lower()
    return "edittext" in own_class

def is_valid_data(image, interact, config_json):
    """Check that a sample has both UI content and an interaction signal.

    Args:
        image: Array indexed ``[x, y, channel]``.
        interact: Interaction record; ``None`` makes the sample invalid.
        config_json: Mapping providing ``text_dim``, ``image_dim`` and
            ``interact_dim``.

    Returns:
        ``False`` when ``interact`` is ``None``, when the text and image
        channels of ``image`` sum to zero together, or when the interaction
        channel of ``image`` sums to zero; ``True`` otherwise.
    """
    if interact is None:
        return False

    text_dim = config_json["text_dim"]
    image_dim = config_json["image_dim"]
    interact_dim = config_json["interact_dim"]

    ui_total = np.sum(image[:, :, text_dim]) + np.sum(image[:, :, image_dim])
    if ui_total == 0:
        return False

    interact_total = np.sum(image[:, :, interact_dim])
    if interact_total == 0:
        return False
    return True

def get_text_view_signature(view):
    """Build a string key for a text view from its stable attributes.

    The signature is the concatenation of the tags ``[class]``,
    ``[resource_id]``, ``[text_hint]`` and ``[pointer]``, each followed by
    the view's value for that attribute when it is present.

    Args:
        view: Mapping describing one view; the attribute values that are
            present must be strings.

    Returns:
        The signature string.
    """
    tagged_keys = (
        ("[class]", "class"),
        ("[resource_id]", "resource_id"),
        ("[text_hint]", "text_hint"),
        ("[pointer]", "pointer"),
    )
    pieces = []
    for tag, key in tagged_keys:
        pieces.append(tag)
        if key in view:
            pieces.append(view[key])
    return "".join(pieces)

def visualize_data(data, label=""):
    """Display a multi-channel array as an image, one channel per colour.

    Each channel of ``data`` is transposed into a three-channel float32 image
    and divided by its own maximum when that maximum is positive.  The image
    is shown with matplotlib, with ``label`` as the x-axis label.

    Args:
        data: Array indexed ``[x, y, channel]``.
        label: Text shown under the plot.
    """
    canvas = np.zeros([data.shape[1], data.shape[0], 3], dtype=np.float32)

    for channel in range(data.shape[2]):
        canvas[:, :, channel] = data[:, :, channel].T
        peak = np.max(canvas[:, :, channel])
        # Leave an all-zero channel untouched to avoid dividing by zero.
        if peak > 0:
            canvas[:, :, channel] /= peak

    plt.imshow(canvas, interpolation="nearest")
    plt.xlabel(label)
    plt.show()
