# Anchors

In [1]:
pip install anchor-exp

Collecting anchor-exp
  Downloading anchor_exp-0.0.2.0.tar.gz (427 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting spacy (from anchor-exp)
  Downloading spacy-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy->anchor-exp)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy->anchor-exp)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy->anchor-exp)
  Downloading murmurhash-1.0.12-cp311-cp311-macosx_10_9_x86_64.whl.metadata (2.1 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy->anchor-exp)
  Downloading cymem-2.0.11-cp311-cp311-macosx_10_9_x86_64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy->anchor-exp)
  Downloading preshed-3.0.9-cp311-cp311-m

In [2]:
from __future__ import print_function
import numpy as np
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
%load_ext autoreload
%autoreload 2
from anchor import utils
from anchor import anchor_tabular
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
dataset_folder = 'datasets/dataset500.csv'
df = pd.read_csv(dataset_folder)

all_true_dataset = df[(df['req_0'] == 1) & (df['req_1'] == 1) & (df['req_2'] == 1) & (df['req_3'] == 1)]
all_true_dataset = all_true_dataset.drop(columns=['req_0','req_1', 'req_2', 'req_3'])
print(all_true_dataset.shape)

(12, 9)


In [4]:
df0 = df.drop(columns=['req_1', 'req_2', 'req_3'])
df0.to_csv('datasets/dataset500req0.csv', index=False)

df1 = df.drop(columns=['req_0', 'req_2', 'req_3'])
df1.to_csv('datasets/dataset500req1.csv', index=False)

df2 = df.drop(columns=['req_0', 'req_1', 'req_3'])
df2.to_csv('datasets/dataset500req2.csv', index=False)

df3 = df.drop(columns=['req_0', 'req_1', 'req_2'])
df3.to_csv('datasets/dataset500req3.csv', index=False)

In [5]:
datasets = []
dataframes = []
for i in range(4):
    dataset_folder = 'datasets/dataset500req' + str(i) + '.csv'
    datasets.append(utils.load_csv_dataset(dataset_folder, 9))

    c = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5)
    c.fit(datasets[i].train, datasets[i].labels_train)
    print('Train', sklearn.metrics.accuracy_score(datasets[i].labels_train, c.predict(datasets[i].train)))
    print('Test', sklearn.metrics.accuracy_score(datasets[i].labels_test, c.predict(datasets[i].test)))

    explainer = anchor_tabular.AnchorTabularExplainer(
        datasets[i].class_names, #it maps the 0 and 1 in the dataset's requirements to the class names
        datasets[i].feature_names,
        datasets[i].train,
        datasets[i].categorical_names)
    
    names = []
    for j in range(all_true_dataset.shape[0]):
        
        #do a csv with a prediction for each sample
        #print(all_true_dataset.iloc[i].values.reshape(1, -1)[0])
        with open('datasets/anchorsReq' +str(i) +'.csv', 'a') as f:
            f.write('Prediction: %s\n' % (explainer.class_names[c.predict(all_true_dataset.iloc[j].values.reshape(1, -1))[0]]))
            exp = explainer.explain_instance(all_true_dataset.iloc[j].values.reshape(1, -1), c.predict, threshold=0.95)
            f.write('Anchor: %s\n' % (' AND '.join(exp.names())))
            #print('Anchor: %s' % (' AND '.join(exp.names())))
            f.write('Precision: %.2f\n' % exp.precision())
            f.write('Coverage: %.2f\n' % exp.coverage())
            f.write('\n')
            names.append(exp.names())
    
    dataframes.append(names)
    
    print(dataframes[i])




Train 1.0
Test 0.98
[["b'firm obstacle' = 1.0", "b'power' > 23.00", "b'cruise speed' <= 71.28", "b'image resolution' > 23.99", "b'smoke intensity' <= 49.24", "b'obstacle distance' <= 43.80"], ["b'firm obstacle' = 1.0", "b'image resolution' > 75.39", "b'cruise speed' <= 44.79", "b'power' > 23.00", "b'illuminance' > 51.56", "b'smoke intensity' <= 49.24"], ["b'firm obstacle' = 1.0", "b'power' > 23.00", "b'cruise speed' <= 44.79", "b'smoke intensity' <= 49.24", "b'image resolution' > 23.99"], ["b'firm obstacle' = 1.0", "b'cruise speed' <= 22.33", "b'power' > 23.00", "b'smoke intensity' <= 74.25", "b'image resolution' > 23.99"], ["b'firm obstacle' = 1.0", "b'image resolution' > 75.39", "b'cruise speed' <= 44.79", "b'power' > 23.00", "b'smoke intensity' <= 49.24", "b'illuminance' > 27.51"], ["b'firm obstacle' = 1.0", "b'smoke intensity' <= 23.26", "b'image resolution' > 51.33", "27.51 < b'illuminance' <= 75.57"], ["b'firm obstacle' = 1.0", "b'image resolution' > 75.39", "b'cruise speed' <= 4

It prints the rules that allow us to keep the sample's prediction, in this case False for requirement 2, and the precision and coverage with which these rules hold

The coverage tells us how much of the dataset we have explained with this rule.

In [6]:
lista = []
for i in range(df.shape[0]):
    dat = df.iloc[i]
    if((dat['firm obstacle'] == 1 and dat['power']>23.00 and dat['cruise speed']<=71.28 and dat['smoke intensity']<=49.24 and dat['image resolution']>23.99 and dat['obstacle distance']<=43.80)):
        lista.append(dat)
        print(dat['firm obstacle'], dat['power'], dat['cruise speed'], dat['smoke intensity'], dat['image resolution'], dat['obstacle distance'], dat['req_0'], dat['req_1'], dat['req_2'], dat['req_3'])

print(len(lista)-12)

1.0 69.0 22.7995 32.0775 47.5676 18.952 True False False True
1.0 72.0 18.0329 9.0105 88.1242 14.3371 True False True True
1.0 88.0 47.0146 11.4562 55.1652 28.0682 True False True True
1.0 72.0 61.9286 35.0041 80.1206 34.0424 True False False False
1.0 38.0 50.9071 45.0738 33.397 15.681 True True True True
1.0 30.0 35.8364 46.5315 49.6464 7.3371 True True True True
1.0 46.0 29.8661 45.8727 50.6223 17.2824 True False True True
1.0 97.0 12.8875 0.9936 49.3559 13.2518 True True True True
1.0 54.0 0.8053 6.5258 82.9626 12.8178 False False False False
1.0 38.0 0.5755 22.1285 61.4105 37.1102 True True False True
1.0 77.0 25.9089 31.7312 93.6499 22.3561 True True True True
1.0 31.0 43.2555 21.008 25.6881 34.9979 True False True True
1.0 80.0 38.3862 26.5053 75.4232 11.7324 True False False True
1.0 89.0 27.4022 15.124 76.5957 16.5379 True False False False
1.0 56.0 27.8734 13.3137 48.469 18.1208 True False False False
1.0 51.0 69.8011 15.0281 81.2153 15.2147 True False True True
1.0 89.0 67.3

In [8]:
def get_anchor(a):
    quoted_part = a.split("'")[1]
    rest = a.replace(f"'{quoted_part}'", '').replace("b", '').strip()

    return quoted_part, rest

In [10]:
dictionaries =[]
for i in range(len(dataframes[0])):
    dictionaries.append([{}, {}, {}, {}])
    for j in range(4):
        for k in range(len(dataframes[j][i])):
            quoted, rest = get_anchor(dataframes[j][i][k])
            dictionaries[i][j][quoted] = rest

In [11]:
featureNames = ['cruise speed','image resolution','illuminance','controls responsiveness','power',
     'smoke intensity','obstacle size','obstacle distance','firm obstacle']

In [38]:
import re
from math import inf

def parse_range(expr: str):
    expr = expr.strip().replace(" ", "")
    
    patterns = [
        (r"^=(\-?\d+(\.\d+)?)$", 'equals'),
        (r"^(>=|>)\s*(-?\d+(\.\d+)?)$", 'lower'),
        (r"^(<=|<)\s*(-?\d+(\.\d+)?)$", 'upper'),
        (r"^(-?\d+(\.\d+)?)(<=|<){1,2}(<=|<)(-?\d+(\.\d+)?)$", 'between'),
        (r"^(-?\d+(\.\d+)?)(>=|>){1,2}(>=|>)(-?\d+(\.\d+)?)$", 'reverse_between'),
    ]
    
    for pattern, kind in patterns:
        match = re.match(pattern, expr)
        if match:
            if kind == 'equals':
                num = float(match.group(1))
                return (num, num, True, True)
            elif kind == 'lower':
                op, num = match.group(1), float(match.group(2))
                return (
                    num,
                    inf,
                    op == '>=',
                    False
                )
            elif kind == 'upper':
                op, num = match.group(1), float(match.group(2))
                return (
                    -inf,
                    num,
                    False,
                    op == '<='
                )
            elif kind == 'between':
                low = float(match.group(1))
                op1 = match.group(3)
                op2 = match.group(4)
                high = float(match.group(5))
                return (
                    low,
                    high,
                    op1 == '<=',
                    op2 == '<='
                )
            elif kind == 'reverse_between':
                high = float(match.group(1))
                op1 = match.group(3)
                op2 = match.group(4)
                low = float(match.group(5))
                return (
                    low,
                    high,
                    op2 == '>=',
                    op1 == '>='
                )

    raise ValueError(f"Unrecognized format: {expr}")

In [40]:
dictionaries =[]
for i in range(len(dataframes[0])):
    dictionaries.append([{}, {}, {}, {}])
    for j in range(4):
        for k in range(len(dataframes[j][i])):
            quoted, rest = get_anchor(dataframes[j][i][k])
            dictionaries[i][j][quoted] = parse_range(rest)

In [43]:
from typing import Optional, Tuple

def intersect(
    a: Tuple[float, float, bool, bool],
    b: Tuple[float, float, bool, bool]
) -> Optional[Tuple[float, float, bool, bool]]:
    
    a_low, a_high, a_li, a_ui = a
    b_low, b_high, b_li, b_ui = b

    # Compute max of lower bounds
    if a_low > b_low:
        low, li = a_low, a_li
    elif a_low < b_low:
        low, li = b_low, b_li
    else:
        low = a_low
        li = a_li and b_li

    # Compute min of upper bounds
    if a_high < b_high:
        high, ui = a_high, a_ui
    elif a_high > b_high:
        high, ui = b_high, b_ui
    else:
        high = a_high
        ui = a_ui and b_ui

    # Check for empty intersection
    if low > high:
        return None
    if low == high and not (li and ui):
        return None

    return (low, high, li, ui)

In [48]:
print(intersect((1, 5, True, False), (3, 7, True, True)))
# ➞ (3, 5, True, False)

print(intersect((1, 2, True, False), (2, 3, True, True)))
# ➞ None

print(intersect((0, 10, True, True), (10, 20, False, True)))
# ➞ (10, 10, True, False) ➞ None

print(intersect((3, 3, True, True), (3, 3, True, True)))
# ➞ (3, 3, True, True)

print(intersect((3, 5, True, True), (-2, 4, False, False)))
#➞ (3, 4, True, False)

(3, 5, True, False)
None
None
(3, 3, True, True)
(3, 4, True, False)


In [51]:
intersected = []

for i in range(len(dataframes[0])):

    intersected.append({})

    for j in range(3):
        for k in featureNames:
            if k not in dictionaries[i][j] or k not in dictionaries[i][j+1]:
                continue
            else:
                print(k)
                print(dictionaries[i][j][k])
                print(dictionaries[i][j+1][k])
                inter = intersect(dictionaries[i][j][k], dictionaries[i][j+1][k])
                if inter is not None:
                    intersected[i][k] = inter
                    print(inter)
                else:
                    raise ValueError(f"Unrecognized format: {dictionaries[i][j][k]} and {dictionaries[i][j+1][k]}")

cruise speed
(-inf, 71.28, False, True)
(44.79, inf, False, False)
(44.79, 71.28, False, True)
cruise speed
(44.79, inf, False, False)
(-inf, 71.28, False, True)
(44.79, 71.28, False, True)
cruise speed
(-inf, 71.28, False, True)
(22.33, 71.28, False, True)
(22.33, 71.28, False, True)
image resolution
(23.99, inf, False, False)
(23.99, inf, False, False)
(23.99, inf, False, False)
illuminance
(27.51, inf, False, False)
(27.51, inf, False, False)
(27.51, inf, False, False)
controls responsiveness
(74.46, inf, False, False)
(74.46, inf, False, False)
(74.46, inf, False, False)
power
(23.0, 50.0, False, True)
(-inf, 77.25, False, True)
(23.0, 50.0, False, True)
smoke intensity
(-inf, 49.24, False, True)
(23.26, 74.25, False, True)
(23.26, 49.24, False, True)
obstacle size
(27.78, 71.67, False, True)
(-inf, 71.67, False, True)
(27.78, 71.67, False, True)
firm obstacle
(1.0, 1.0, True, True)
(1.0, 1.0, True, True)
(1.0, 1.0, True, True)
cruise speed
(-inf, 44.79, False, True)
(-inf, 44.79, 

In [54]:
all_true_dataset, type(all_true_dataset), all_true_dataset.shape, all_true_dataset.columns

(     cruise speed  image resolution  illuminance  controls responsiveness  \
 67        50.9071           33.3970      42.4174                  83.5993   
 68        42.5871           92.9370      57.2953                  49.8743   
 70        35.8364           49.6464      81.5615                  88.6837   
 76        12.8875           49.3559      58.1259                  62.4755   
 96        25.9089           93.6499      28.8524                  94.5842   
 108       36.9259           53.8403      47.8659                  32.2035   
 195       41.5747           85.1220      61.4060                  41.9737   
 222       37.8936           41.8950      68.6273                  93.5441   
 258       60.5370           33.5818      83.8188                  44.1427   
 328       37.1301           53.6653      89.3982                  51.2406   
 341       12.9419           30.3257      88.7688                  20.1834   
 393       37.3821           71.3693      94.4444               

In [55]:
def inside(val, interval):
    low, high, li, ui = interval
    if li and ui:
        return low <= val <= high
    elif li and not ui:
        return low <= val < high
    elif not li and ui:
        return low < val <= high
    else:
        return low < val < high

In [59]:
sat = []
for i in range(all_true_dataset.shape[0]):
    for j in range(len(intersected)):
        flag = True
        for k in featureNames:
            if k not in intersected[j]:
                continue
            else:
                if not (inside(all_true_dataset.iloc[i][k], intersected[j][k])):
                    print(all_true_dataset.iloc[i])
                    flag = False
                    break
        
        if flag:
            sat.append(all_true_dataset.iloc[i])
            break
        else:
            flag = True

cruise speed               42.5871
image resolution           92.9370
illuminance                57.2953
controls responsiveness    49.8743
power                      66.0000
smoke intensity            31.1562
obstacle size              91.6935
obstacle distance          73.1713
firm obstacle               1.0000
Name: 68, dtype: float64
cruise speed               12.8875
image resolution           49.3559
illuminance                58.1259
controls responsiveness    62.4755
power                      97.0000
smoke intensity             0.9936
obstacle size              52.4364
obstacle distance          13.2518
firm obstacle               1.0000
Name: 76, dtype: float64
cruise speed               12.8875
image resolution           49.3559
illuminance                58.1259
controls responsiveness    62.4755
power                      97.0000
smoke intensity             0.9936
obstacle size              52.4364
obstacle distance          13.2518
firm obstacle               1.0000
Name:

In [60]:
print(len(sat))

12


In [None]:
# Get test examples where the anchora pplies
#fit_anchor = np.where(np.all(new_dataset.test[:, exp.features()] == new_dataset.test[idx][exp.features()], axis=1))[0]
#print('Anchor test precision: %.2f' % (np.mean(c.predict(new_dataset.test[fit_anchor]) == c.predict(new_dataset.test[idx].reshape(1, -1)))))
#print('Anchor test coverage: %.2f' % (fit_anchor.shape[0] / float(new_dataset.test.shape[0])))