# **Anchors**

In [96]:
pip install anchor-exp

9647.92s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [97]:
from __future__ import print_function
import numpy as np
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
%load_ext autoreload
%autoreload 2
from anchor import utils
from anchor import anchor_tabular
import pandas as pd
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**Define the datasets**

In [98]:
training_folder = 'datasets/dataset5000.csv'
validation_folder = 'datasets/dataset500.csv'


training_df = pd.read_csv(training_folder)
validation_df = pd.read_csv(validation_folder)

# take only the values that are true for all the requirements and drop only the requirements columns
all_true_training_dataset = training_df[(training_df['req_0'] == 1) & (training_df['req_1'] == 1) & (training_df['req_2'] == 1) & (training_df['req_3'] == 1)]
all_true_training_dataset = all_true_training_dataset.drop(columns=['req_0','req_1', 'req_2', 'req_3'])

req_names = ['req_0', 'req_1', 'req_2', 'req_3']
feature_names = ['cruise speed','image resolution','illuminance','controls responsiveness','power','smoke intensity','obstacle size','obstacle distance','firm obstacle']

**Train the model and the anchors**

In [99]:
models = [] #will contain the models (one per requirement)
datasets = [] #will contain the datasets as needed by the anchor library

explanations = np.zeros((len(req_names), all_true_training_dataset.shape[0]), dtype=object) #will contain the explanations (objects)
exp_txt = [] #will contain the textual explanations its structure is a matrix (list of lists) where each row corresponds to a requirement 
             #and each column corresponds to the explanation for the corresponding row in all_true_training_dataset

feature_to_use = [i for i in range(9)] #contains the range of features to use

for i in range(len(req_names)):
    
    #we load the dataset in anchors
    datasets.append(\
        utils.load_csv_dataset(\
            training_folder, 9+i,\
            features_to_use=feature_to_use,\
            categorical_features=None))
    
    #initialize and train the model
    models.append(\
        sklearn.ensemble.GradientBoostingClassifier(random_state=1234))
    models[i].fit(datasets[i].train, datasets[i].labels_train)

    #initialize the explainer
    explainer = anchor_tabular.AnchorTabularExplainer(
        datasets[i].class_names, #it maps the 0 and 1 in the dataset's requirements to the class names
        datasets[i].feature_names,
        datasets[i].train,
        datasets[i].categorical_names)
    
    #explain only points satisfying all the requirements
    names = []
    
    for j in range(all_true_training_dataset.shape[0]):
        exp = explainer.explain_instance(all_true_training_dataset.iloc[j].values.reshape(1, -1), models[i].predict, threshold=0.95)
        explanations[i,j] = exp
        names.append(exp.names())
        
    
    exp_txt.append(names)
    
    print(exp_txt[i])




[["b'firm obstacle' = 1.0", "b'image resolution' > 49.80", "b'cruise speed' <= 75.49", "b'smoke intensity' <= 23.42", "b'illuminance' > 26.13", "b'power' > 25.00", "b'obstacle distance' <= 74.78"], ["b'firm obstacle' = 1.0", "b'illuminance' > 50.87", "b'image resolution' > 25.40", "b'cruise speed' <= 75.49", "b'smoke intensity' <= 73.67", "b'power' > 25.00", "b'obstacle size' > 50.25"], ["b'image resolution' <= 25.40", "b'smoke intensity' > 73.67"], ["b'firm obstacle' = 1.0", "b'image resolution' > 49.80", "b'illuminance' > 26.13", "b'cruise speed' <= 75.49", "b'smoke intensity' <= 48.87", "b'power' > 51.00", "b'obstacle size' > 26.74"], ["b'firm obstacle' = 1.0", "b'image resolution' > 75.24", "b'smoke intensity' <= 48.87", "b'illuminance' > 75.91", "b'cruise speed' <= 75.49", "b'obstacle size' > 50.25", "b'obstacle distance' <= 49.94"], ["b'firm obstacle' = 1.0", "b'image resolution' > 49.80", "b'illuminance' > 50.87", "50.17 < b'cruise speed' <= 75.49", "b'obstacle size' > 74.61", "

**Let us make some checks...**

In [100]:
np.count_nonzero(explanations[0] == explanations[1]) #count the number of times the explanations are the same for the two requirements

0

In [101]:
len(exp_txt), len(exp_txt[0])

(4, 205)

**Define useful data-wrangling functions**

function separating the name of the feature from the ranges

In [102]:
def get_anchor(a):
    quoted_part = a.split("'")[1]
    rest = a.replace(f"'{quoted_part}'", '').replace("b", '').strip()

    return quoted_part, rest

function creating the intervals

In [103]:
import re
from math import inf

def parse_range(expr: str):
    expr = expr.strip().replace(" ", "")
    
    patterns = [
        (r"^=(\-?\d+(\.\d+)?)$", 'equals'),
        (r"^(>=|>)\s*(-?\d+(\.\d+)?)$", 'lower'),
        (r"^(<=|<)\s*(-?\d+(\.\d+)?)$", 'upper'),
        (r"^(-?\d+(\.\d+)?)(<=|<){1,2}(<=|<)(-?\d+(\.\d+)?)$", 'between'),
        (r"^(-?\d+(\.\d+)?)(>=|>){1,2}(>=|>)(-?\d+(\.\d+)?)$", 'reverse_between'),
    ]
    
    for pattern, kind in patterns:
        match = re.match(pattern, expr)
        if match:
            if kind == 'equals':
                num = float(match.group(1))
                return (num, num, True, True)
            elif kind == 'lower':
                op, num = match.group(1), float(match.group(2))
                return (
                    num,
                    inf,
                    op == '>=',
                    False
                )
            elif kind == 'upper':
                op, num = match.group(1), float(match.group(2))
                return (
                    -inf,
                    num,
                    False,
                    op == '<='
                )
            elif kind == 'between':
                low = float(match.group(1))
                op1 = match.group(3)
                op2 = match.group(4)
                high = float(match.group(5))
                return (
                    low,
                    high,
                    op1 == '<=',
                    op2 == '<='
                )
            elif kind == 'reverse_between':
                high = float(match.group(1))
                op1 = match.group(3)
                op2 = match.group(4)
                low = float(match.group(5))
                return (
                    low,
                    high,
                    op2 == '>=',
                    op1 == '>='
                )

    raise ValueError(f"Unrecognized format: {expr}")

function that returns the interseption among two intervals 

In [104]:
from typing import Optional, Tuple

def intersect(
    a: Tuple[float, float, bool, bool],
    b: Tuple[float, float, bool, bool]
) -> Optional[Tuple[float, float, bool, bool]]:
    
    a_low, a_high, a_li, a_ui = a
    b_low, b_high, b_li, b_ui = b

    # Compute max of lower bounds
    if a_low > b_low:
        low, li = a_low, a_li
    elif a_low < b_low:
        low, li = b_low, b_li
    else:
        low = a_low
        li = a_li and b_li

    # Compute min of upper bounds
    if a_high < b_high:
        high, ui = a_high, a_ui
    elif a_high > b_high:
        high, ui = b_high, b_ui
    else:
        high = a_high
        ui = a_ui and b_ui

    # Check for empty intersection
    if low > high:
        return None
    if low == high and not (li and ui):
        return None

    return (low, high, li, ui)

function that return the truth value of a num (val) being inside a given interval

In [105]:
def inside(val, interval):
    low, high, li, ui = interval
    if li and ui:
        return low <= val <= high
    elif li and not ui:
        return low <= val < high
    elif not li and ui:
        return low < val <= high
    else:
        return low < val < high

**Wrangle the data to cope better with them**

Transform exp_txt in exp_dict a list of 4 dictionaries per element (one per requirement) in which are listed each feature with the respective constraints (as a range data structure)
The range data structure is a 4-element tuple (float, float, boolean, boolean) where (a,b,x,y) num $\in$ (a,b) and x and y are true if the extremes are included, otherwise they are false

In [106]:
exp_dict =[]
for i in range(len(exp_txt[0])):
    exp_dict.append([{}, {}, {}, {}])
    for j in range(len(req_names)):
        for k in range(len(exp_txt[j][i])):
            quoted, rest = get_anchor(exp_txt[j][i][k])
            exp_dict[i][j][quoted] = parse_range(rest)

**Intersect all the obtained explanations over the different requirements**

In [129]:
exp_intersected = []

for i in range(len(exp_dict)):
    exp_intersected.append(exp_dict[i][3])
    for j in range(len(req_names)-1):
        for k in feature_names:
            if k in exp_dict[i][j]:
                if k in exp_intersected[i]:
                    inter = intersect(exp_dict[i][j][k], exp_intersected[i][k])
                    if inter is not None:
                        exp_intersected[i][k] = inter
                    else:
                        raise ValueError(f"Unrecognized format: {exp_dict[i][j][k]} and {exp_dict[i][j+1][k]}")
                else:
                    exp_intersected[i][k] = exp_dict[i][j][k]

# Validation

Define the function classifing w.r.t. the anchors

In [130]:
def classify_w_anchor(input, thresholds):
    inside_points = []

    featureNames = ['cruise speed','image resolution','illuminance','controls responsiveness','power',
     'smoke intensity','obstacle size','obstacle distance','firm obstacle']
    
    for i in range(input.shape[0]):
        for j in range(len(thresholds)):
            flag = True
            for k in featureNames:
                if k in thresholds[j]:
                    if not (inside(input.iloc[i][k], thresholds[j][k])):
                        flag = False
                        break
            if flag:
                inside_points.append(input.iloc[i])
                break
            else:
                flag = True
    return inside_points

Verify if the function works properly by submitting the whole 5k dataset, we should be obtaining only the previously obtained samples for which all the req. are satisfied

In [131]:
sat = []
for i in range(all_true_training_dataset.shape[0]):
    for j in range(len(exp_intersected)):
        flag = True
        for k in feature_names:
            if k not in exp_intersected[j]:
                continue
            else:
                if not (inside(all_true_training_dataset.iloc[i][k], exp_intersected[j][k])):
                    print(all_true_training_dataset.iloc[i])
                    flag = False
                    break
        
        if flag:
            sat.append(all_true_training_dataset.iloc[i])
            break
        else:
            flag = True



cruise speed               66.8818
image resolution           34.0519
illuminance                74.7470
controls responsiveness    24.4433
power                      72.0000
smoke intensity            69.8451
obstacle size              73.7317
obstacle distance          28.6129
firm obstacle               1.0000
Name: 64, dtype: float64
cruise speed               24.2402
image resolution           24.1728
illuminance                84.6850
controls responsiveness    40.6126
power                      49.0000
smoke intensity            90.4350
obstacle size              57.8777
obstacle distance          56.5959
firm obstacle               1.0000
Name: 94, dtype: float64
cruise speed               24.2402
image resolution           24.1728
illuminance                84.6850
controls responsiveness    40.6126
power                      49.0000
smoke intensity            90.4350
obstacle size              57.8777
obstacle distance          56.5959
firm obstacle               1.0000
Name:

In [160]:
targets = np.zeros((all_true_training_dataset.shape[0], len(models)))
for i in range(all_true_training_dataset.shape[0]):
    for j in range(len(models)):
        targets[i,j] = models[j].predict(all_true_training_dataset.iloc[i].values.reshape(1, -1))

all_yes_t = np.all(targets==1, axis = 1)

In [154]:
all_yes_t.shape

(205,)

In [161]:
np.count_nonzero(all_yes_t==1), targets.shape, len(sat)

(19, (205, 4), 205)

Validate the function on a test set (500 samples)

In [134]:
n_samples_val = validation_df.shape[0]

In [135]:
y_pred = np.zeros((n_samples_val, len(models)))
y_real = np.zeros((n_samples_val, len(models)))
y_anch = np.zeros((n_samples_val, len(models)))


for i in range(n_samples_val):

    y_real[i] = validation_df.iloc[i].values[9:]

    values = validation_df.iloc[i].values[:9]
    for j in range(len(models)):
        y_pred[i,j] = models[j].predict(values.reshape(1, -1))
        y_anch[i,j] = 0
        flag = True
        for k in feature_names:
            if k in exp_intersected[j]:
                if not inside(validation_df.iloc[i][k], exp_intersected[j][k]):
                    flag = False
                    break
        if flag:
            y_anch[i,j] = 1

In [136]:
y_real_all = np.all(targets==1, axis = 1)
y_pred_all = np.all(y_pred==1, axis = 1)
y_anch_all = np.all(y_anch==1, axis = 1)

In [137]:
y_real_all[y_real_all == True], y_pred_all[y_pred_all == True], y_anch_all[y_anch_all == True].shape

(array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]),
 array([ True]),
 (0,))

In [138]:
r_p = np.zeros(n_samples_val)
r_a = np.zeros(n_samples_val)
p_a = np.zeros(n_samples_val)

for i, req in enumerate(req_names):
    for j in range(n_samples_val):
        r_p[j] = y_pred[j,i] and y_real[j,i]
        r_a[j] = y_anch[j,i] and y_real[j,i]
        p_a[j] = y_anch[j,i] and y_pred[j,i]

r_p = np.sum(r_p)
r_a = np.sum(r_a)
p_a = np.sum(p_a)
r_p, r_a, p_a, [np.count_nonzero(y_real[:,i]) for i in range(4)]


(93.0, 0.0, 0.0, [178, 54, 88, 121])