# **Anchors on one requirement**

In [2]:
from __future__ import print_function
import numpy as np
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
from sklearn.metrics import accuracy_score
%load_ext autoreload
%autoreload 2
from anchor import utils
from anchor import anchor_tabular
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

**Define useful data-wrangling functions**

function separating the name of the feature from the ranges

In [25]:
def get_anchor(a):
    quoted_part = a.split("'")[1]
    rest = a.replace(f"'{quoted_part}'", '').replace("b", '').strip()

    return quoted_part, rest

function creating the intervals

In [26]:
import re
from math import inf

def parse_range(expr: str):
    expr = expr.strip().replace(" ", "")
    
    patterns = [
        (r"^=(\-?\d+(\.\d+)?)$", 'equals'),
        (r"^(>=|>)\s*(-?\d+(\.\d+)?)$", 'lower'),
        (r"^(<=|<)\s*(-?\d+(\.\d+)?)$", 'upper'),
        (r"^(-?\d+(\.\d+)?)(<=|<){1,2}(<=|<)(-?\d+(\.\d+)?)$", 'between'),
        (r"^(-?\d+(\.\d+)?)(>=|>){1,2}(>=|>)(-?\d+(\.\d+)?)$", 'reverse_between'),
    ]
    
    for pattern, kind in patterns:
        match = re.match(pattern, expr)
        if match:
            if kind == 'equals':
                num = float(match.group(1))
                return (num, num, True, True)
            elif kind == 'lower':
                op, num = match.group(1), float(match.group(2))
                return (
                    num,
                    inf,
                    op == '>=',
                    False
                )
            elif kind == 'upper':
                op, num = match.group(1), float(match.group(2))
                return (
                    -inf,
                    num,
                    False,
                    op == '<='
                )
            elif kind == 'between':
                low = float(match.group(1))
                op1 = match.group(3)
                op2 = match.group(4)
                high = float(match.group(5))
                return (
                    low,
                    high,
                    op1 == '<=',
                    op2 == '<='
                )
            elif kind == 'reverse_between':
                high = float(match.group(1))
                op1 = match.group(3)
                op2 = match.group(4)
                low = float(match.group(5))
                return (
                    low,
                    high,
                    op2 == '>=',
                    op1 == '>='
                )

    raise ValueError(f"Unrecognized format: {expr}")

function that return the truth value of a num (val) being inside a given interval

In [27]:
def inside(val, interval):
    low, high, li, ui = interval
    if li and ui:
        return low <= val <= high
    elif li and not ui:
        return low <= val < high
    elif not li and ui:
        return low < val <= high
    else:
        return low < val < high

In [28]:
def classify_w_anchor(input, thresholds, feature_names):
    out = np.zeros((input.shape[0], input.shape[1]), dtype=object)
    
    for i in range(input.shape[0]):
        for j in range(len(thresholds)):
            flag = True
            for k in feature_names:
                if k in thresholds[j]:
                    if not (inside(input.iloc[i][k], thresholds[j][k])):
                        flag = False
                        break
            if flag:
                out[i][j] = input.iloc[i]
                break
            else:
                flag = True
    return out

**DF Preparation**

In [3]:
#meta parameters
train_percentage = 80
val_percentage = 20

req_names = ['req_0', 'req_1', 'req_2', 'req_3']
req_number = len(req_names)
feature_names = ['cruise speed','image resolution','illuminance','controls responsiveness','power','smoke intensity','obstacle size','obstacle distance','firm obstacle']
feature_number = len(feature_names)

training_folder = '../datasets/dataset5000.csv'

# Load the dataset
df = pd.read_csv(training_folder)
n_samples = df.shape[0]
print("Number of samples: ", n_samples)

#Split 80 20 the training dataset in training anda validation to have more similar data
indices = np.arange(0,n_samples)
np.random.seed(1234)
indices = np.random.permutation(indices)

training_indices = indices[0:int(n_samples*train_percentage/100)]
validation_indices = indices[int(n_samples*train_percentage/100):]

training_df = df.iloc[training_indices]
validation_df = df.iloc[validation_indices]
print('Training dataset size: ', training_df.shape)
print('Validation dataset size: ', validation_df.shape)

#select the samples that have all the requirements satisfied
all_true_training = training_df[
    (training_df['req_0'] == 1) &
    (training_df['req_1'] == 1) &
    (training_df['req_2'] == 1) &
    (training_df['req_3'] == 1)
].drop(columns=req_names)

all_true_validation = validation_df[
    (validation_df['req_0'] == 1) &
    (validation_df['req_1'] == 1) &
    (validation_df['req_2'] == 1) &
    (validation_df['req_3'] == 1)
].drop(columns=req_names)

print('Training samples with all requirements satisfied: ', all_true_training.shape)
print('Validation samples with all requirements satisfied: ', all_true_validation.shape)

#select the samples that have at one specific requirement satisfied
req_true_training = {}
for r in req_names:
    req_true_training[r] = training_df[training_df[r] == 1].drop(columns=req_names)
    print('Training samples with {} satisfied: '.format(r), req_true_training[r].shape)

req_true_validation = {}
for r in req_names:
    req_true_validation[r] = validation_df[validation_df[r] == 1].drop(columns=req_names)
    print('Validation samples with {} satisfied: '.format(r), req_true_validation[r].shape)

#create a csv with the new training data and save it
training_df.to_csv('../datasets/training_dataset.csv', index=False)
validation_df.to_csv('../datasets/validation_dataset.csv', index=False)

Number of samples:  5000
Training dataset size:  (4000, 13)
Validation dataset size:  (1000, 13)
Training samples with all requirements satisfied:  (156, 9)
Validation samples with all requirements satisfied:  (49, 9)
Training samples with req_0 satisfied:  (1382, 9)
Training samples with req_1 satisfied:  (723, 9)
Training samples with req_2 satisfied:  (908, 9)
Training samples with req_3 satisfied:  (1041, 9)
Validation samples with req_0 satisfied:  (342, 9)
Validation samples with req_1 satisfied:  (172, 9)
Validation samples with req_2 satisfied:  (235, 9)
Validation samples with req_3 satisfied:  (261, 9)


In [4]:
datasets = [] #will contain the datasets as needed by the anchor library
feature_to_use = [i for i in range(feature_number)] #contains the range of features to use
true_from_anchors_df = {}

for i,r in enumerate(req_names):
    #we load the dataset in anchors
    datasets.append(\
        utils.load_csv_dataset(\
            training_folder, feature_number+i,\
            features_to_use=feature_to_use,\
            categorical_features=None))
    
    true_from_anchors_df[r] = np.nonzero(datasets[i].labels_train)[0]
    print('Training samples with {} satisfied: '.format(r), true_from_anchors_df[r].shape)


Training samples with req_0 satisfied:  (1365,)
Training samples with req_1 satisfied:  (725,)
Training samples with req_2 satisfied:  (903,)
Training samples with req_3 satisfied:  (1029,)


In [5]:
training_folder = '../datasets/training_dataset.csv'
validation_folder = '../datasets/validation_dataset.csv'

**Learning Phase**

In [6]:

models = [] #will contain the models (one per requirement)

explainer = []

# explanations = np.zeros((req_number, all_true_training.shape[0]), dtype=object) #will contain the explanations (objects)
# exp_txt = [] #will contain the textual explanations its structure is a matrix (list of lists) where each row corresponds to a requirement 
#              #and each column corresponds to the explanation for the corresponding row in all_true_training_dataset


for i in range(req_number):
    print(i)
    #initialize and train the model
    #if i == 1:
    #    models.append(\
    #    HistGradientBoostingClassifier(class_weight='balanced',random_state=1234))
    #    models[i].fit(datasets[i].train, datasets[i].labels_train)
            #models.append(\
        #    MLPClassifier(random_state=1234))
        #models[i].fit(datasets[i].train, datasets[i].labels_train)

    #else:
    #    models.append(\
    #        sklearn.ensemble.GradientBoostingClassifier(random_state=1234))
    #    models[i].fit(datasets[i].train, datasets[i].labels_train)

    models.append(\
            sklearn.ensemble.GradientBoostingClassifier(random_state=1234))
    models[i].fit(datasets[i].train, datasets[i].labels_train)
    
    #initialize the explainer
    explainer.append(anchor_tabular.AnchorTabularExplainer(
        datasets[i].class_names, #it maps the 0 and 1 in the dataset's requirements to the class names
        datasets[i].feature_names,
        datasets[i].train,
        datasets[i].categorical_names))
        
    # #explain only points satisfying all the requirements
    # names = []
    
    # for j in range():
    #     exp = explainer.explain_instance(all_true_training.iloc[j].values.reshape(1, -1), models[i].predict, threshold=0.95) #0.95
    #     explanations[i,j] = exp
    #     names.append(exp.names())        
    
    # exp_txt.append(names)
    
    # print(exp_txt[i])

0
1
2
3


In [7]:
for i in range(req_number):
    print(f"Model {i+1} training accuracy: {accuracy_score(datasets[i].labels_train, models[i].predict(datasets[i].train)):.4f}")

Model 1 training accuracy: 0.9390
Model 2 training accuracy: 0.9035
Model 3 training accuracy: 0.9437
Model 4 training accuracy: 0.9293


In [None]:
training_df_out = []
positively_classified = {} #contains the INDICES (w.r.t. datasets[req_i_num]) of the samples classified positively by the model. 
                           #Note: TEHSE MIGHT BE SLIGHTLY DIFFERENT FROM THOSE TRUE IN THE Dataset depending on the accuracy of the model

for i, req in enumerate(req_names):
    print(f"___________Requirement {i+1}: {req}___________")
    output = models[i].predict(datasets[i].train)
    
    #obtain the indices of the samples that have the requirement satisfied
    indices = np.where(output == 1)[0]

    print(f"Number of samples with {req} classified as satisfied: {len(indices)}")
    print(f"Number of samples with {req} truly satisfied: {len(true_from_anchors_df[req])}")
    
    #calulate false positives
    f_p = indices.shape[0] - np.intersect1d(indices, true_from_anchors_df[req]).shape[0]
    print(f"Number of false positives: {f_p}")
    #calculate the missclassified real positive
    m_r_p = true_from_anchors_df[req].shape[0] - np.intersect1d(indices, true_from_anchors_df[req]).shape[0]
    print(f"Number of missclassified real positives: {m_r_p}")

    positively_classified[req] = indices
    print("\n")

___________Requirement 1: req_0___________
Number of samples with req_0 classified as satisfied: 1303
Number of samples with req_0 truly satisfied: 1365
Number of false positives: 91
Number of missclassified real positives: 153


___________Requirement 2: req_1___________
Number of samples with req_1 classified as satisfied: 537
Number of samples with req_1 truly satisfied: 725
Number of false positives: 99
Number of missclassified real positives: 287


___________Requirement 3: req_2___________
Number of samples with req_2 classified as satisfied: 752
Number of samples with req_2 truly satisfied: 903
Number of false positives: 37
Number of missclassified real positives: 188


___________Requirement 4: req_3___________
Number of samples with req_3 classified as satisfied: 820
Number of samples with req_3 truly satisfied: 1029
Number of false positives: 37
Number of missclassified real positives: 246




**Explain the model**

In [16]:
exp = explainer[0].explain_instance(datasets[0].train[positively_classified[req_names[0]][0]], models[0].predict, threshold=0.95)

In [24]:
array = np.zeros_like(positively_classified[req_names[0]])
print(array.shape)

(1303,)


In [37]:
explanations = [[] for req in range(req_number)]

for i, req in enumerate(req_names):
    for j, p_sample in enumerate(positively_classified[req]):
        #prepare the data structure
        explanations[i].append({})
        #get the sample
        sample = datasets[0].train[p_sample]
        #explain the sample
        exp = explainer[i].explain_instance(sample, models[i].predict, threshold=0.95)
        #get the textual explanation
        exp = exp.names()
        #transform the textuql explanations in an interval
        for boundings in exp:
            quoted, rest = get_anchor(boundings)
            explanations[i][j][quoted] =rest

KeyboardInterrupt: 

Let's verify that the data structure is correctly built

In [None]:
print(explanations.len() == req_number)

for i, r in enumerate(req_names):
    print(f"req{i}, {len(explanations[i])}")
    print(len(explanations[i]) == positively_classified[r].shape, positively_classified[r].shape)


**Wrangle the data to cope better with them**

Transform exp_txt in exp_dict a list of 4 dictionaries per element (one per requirement) in which are listed each feature with the respective constraints (as a range data structure)
The range data structure is a 4-element tuple (float, float, boolean, boolean) where (a,b,x,y) num $\in$ (a,b) and x and y are true if the extremes are included, otherwise they are false

In [None]:
exp_dict =[]
for i in range(len(exp_txt[0])):
    exp_dict.append([{}, {}, {}, {}])
    for j in range(len(req_names)):
        for k in range(len(exp_txt[j][i])):
            quoted, rest = get_anchor(exp_txt[j][i][k])
            exp_dict[i][j][quoted] = parse_range(rest)

IndexError: list index out of range

**Assess the results**

metaparameters

In [None]:
val_samples_num = v.shape[0]

In [None]:
for i in range()