# Testing - Imitation
In this code, we aim to demonstrate how accurately the RDR model created by the proxy replicates the performance of the original model. By evaluating the accuracy, we can assess whether the explanations provided by the RDR model effectively represent the behavior of the original model.

In [1]:
import sys
sys.path.append('../src/')
from rdr import RDR

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Import Model Libraries

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

## 1. Load dataset

### 1.1. Load the dataset and store it in a variable named **df**

In [4]:
df = pd.read_csv('../data/star_classification.csv')
display(df.head())
display(df.info())

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   obj_ID       100000 non-null  float64
 1   alpha        100000 non-null  float64
 2   delta        100000 non-null  float64
 3   u            100000 non-null  float64
 4   g            100000 non-null  float64
 5   r            100000 non-null  float64
 6   i            100000 non-null  float64
 7   z            100000 non-null  float64
 8   run_ID       100000 non-null  int64  
 9   rerun_ID     100000 non-null  int64  
 10  cam_col      100000 non-null  int64  
 11  field_ID     100000 non-null  int64  
 12  spec_obj_ID  100000 non-null  float64
 13  class        100000 non-null  object 
 14  redshift     100000 non-null  float64
 15  plate        100000 non-null  int64  
 16  MJD          100000 non-null  int64  
 17  fiber_ID     100000 non-null  int64  
dtypes: float64(10), int64(7),

None

### 1.2. Define and train Label Encoder

In [5]:
le = LabelEncoder()
le.fit(df['class'])

## 2. Testing RDR model's imitation accuracy
At this phase, the RDR model will be created and compared against the original models. The conventional models to be used consists of:

**White box Model**
- Decision Tree Classifier

**Black box Model**
- Random Forest
- Support Vector Machine
- XGBoost
- MLP
- TabNet

Furthermore, testing will be carried out on data samples which differs from 25%, 50%, 75%, and 100% (its original size).

### 2.2. Testing with sample data size : 50%

In [6]:
def remove_outliers(df, threshold=2.5):
    z_scores = np.abs(stats.zscore(df.select_dtypes(include='number')))
    outliers = (z_scores > threshold).any(axis=1)
    return df[~outliers]

# Sample size : 50%
df50 = df.sample(frac=0.5, random_state=42)

# remove outlier
df50 = remove_outliers(df50)

# define features and label
label = df50['class']
features = df50.drop(
            columns=['obj_ID', 'run_ID', 'rerun_ID', 'field_ID', 'spec_obj_ID','fiber_ID','class'],
            axis=1)

# split data for training and testing model
# train : test = 80 : 20
X50_train, X50_test, y50_train, y50_test = train_test_split(features,
                                                            label, 
                                                            test_size=0.2,
                                                            random_state=42)

#### 2.2.1. RDR against Decision Tree Classifier

In [7]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf50 = DecisionTreeClassifier()
dtf50.fit(X50_train, y50_train)

In [8]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf50.predict(features)

In [9]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

dtf_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dtf50_pred = dtf50.predict(X50_test)
dtf_rdr50_pred = dtf_rdr50.predict(X50_test)

#### 2.2.2. RDR against Random Forest Classifier

In [10]:
# CREATE AND TRAIN RANDOM FOREST CLASSIFIER
rf50 = RandomForestClassifier()
rf50.fit(X50_train, y50_train)

In [11]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf50.predict(features)

In [12]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

rf_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
rf50_pred = rf50.predict(X50_test)
rf_rdr50_pred = rf_rdr50.predict(X50_test)

#### 2.2.3. RDR against Support Vector Machine

In [13]:
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf','linear' 'poly', 'sigmoid']}

svm = SVC()

grid = GridSearchCV(svm,param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid.fit(X50_train, y50_train)

print("Best parameters found: ", grid.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))

80 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Vieri\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Vieri\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\Vieri\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Vieri\AppData\Local\Programs\Python\Python312\Lib\

Best parameters found:  {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Best cross-validation accuracy: 0.83


In [14]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm50 = SVC(C= 100, gamma= 0.001, kernel= 'rbf')
svm50.fit(X50_train, y50_train)

In [15]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm50.predict(features)

In [16]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

svm_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
svm50_pred = svm50.predict(X50_test)
svm_rdr50_pred = svm_rdr50.predict(X50_test)

#### 2.2.4. RDR against XGBoost Classifier

In [17]:
# CREATE AND TRAIN XGBOOST CLASSIFIER
train_dmatrix = xgb.DMatrix(data=X50_train, label=le.transform(y50_train))
xgboost = xgb.train({}, train_dmatrix)

In [18]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
dmatrix = xgb.DMatrix(
            data=features,
            label=le.transform(label))

xgb50_pred = xgboost.predict(dmatrix)
prediction_dataset = le.inverse_transform(np.round(xgb50_pred).astype(int))

In [19]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

xgb_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dmatrix = xgb.DMatrix(
            data=X50_test,
            label=le.transform(y50_test))

xgb50_pred = xgboost.predict(dmatrix)
xgb50_pred = le.inverse_transform(np.round(xgb50_pred).astype(int))

xgb_rdr50_pred = xgb_rdr50.predict(X50_test)

#### 2.2.5. RDR against Multi-layer Perceptron (MLP) Classifier

In [None]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.1, 0.01, 0.001],
    'batch_size': [32, 64],
    'max_iter': [300, 500],
    'alpha': [0.0001, 0.001],
    'activation': ['relu', 'tanh']
}

mlp = MLPClassifier()

grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X50_train, y50_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

Best parameters found:  {'activation': 'relu', 'alpha': 0.001, 'batch_size': 32, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 500}
Best cross-validation accuracy: 0.85


In [20]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp50 = MLPClassifier(
    max_iter=500,
    batch_size=32,
    hidden_layer_sizes=(100,),
    learning_rate='constant',
    learning_rate_init=0.001,
    alpha=0.0001,
    activation='relu'
)

mlp50.fit(X50_train, y50_train)

In [21]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp50.predict(features)

In [22]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

mlp_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp50_pred = mlp50.predict(X50_test)
mlp_rdr50_pred = mlp_rdr50.predict(X50_test)

#### 2.2.6. RDR against TabNet Classifier

In [28]:
# CREATE AND TRAIN TabNet Classifier
tabnet50 = TabNetClassifier()
tabnet50.fit(X50_train.values, y50_train, max_epochs=100)



epoch 0  | loss: 0.732   |  0:00:02s
epoch 1  | loss: 0.27217 |  0:00:04s
epoch 2  | loss: 0.17609 |  0:00:06s
epoch 3  | loss: 0.16249 |  0:00:09s
epoch 4  | loss: 0.14462 |  0:00:11s
epoch 5  | loss: 0.13044 |  0:00:13s
epoch 6  | loss: 0.13424 |  0:00:15s
epoch 7  | loss: 0.13048 |  0:00:17s
epoch 8  | loss: 0.12276 |  0:00:19s
epoch 9  | loss: 0.11853 |  0:00:21s
epoch 10 | loss: 0.11631 |  0:00:23s
epoch 11 | loss: 0.11901 |  0:00:25s
epoch 12 | loss: 0.11607 |  0:00:27s
epoch 13 | loss: 0.11157 |  0:00:29s
epoch 14 | loss: 0.11149 |  0:00:31s
epoch 15 | loss: 0.10901 |  0:00:34s
epoch 16 | loss: 0.11316 |  0:00:36s
epoch 17 | loss: 0.11295 |  0:00:38s
epoch 18 | loss: 0.10909 |  0:00:40s
epoch 19 | loss: 0.1084  |  0:00:42s
epoch 20 | loss: 0.10569 |  0:00:44s
epoch 21 | loss: 0.1083  |  0:00:46s
epoch 22 | loss: 0.11237 |  0:00:48s
epoch 23 | loss: 0.10907 |  0:00:50s
epoch 24 | loss: 0.10866 |  0:00:52s
epoch 25 | loss: 0.10798 |  0:00:54s
epoch 26 | loss: 0.10602 |  0:00:56s
e

In [29]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet50.predict(features.values)

In [30]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

tabnet_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet50_pred = tabnet50.predict(X50_test.values)
tabnet_rdr50_pred = tabnet_rdr50.predict(X50_test)

## 5. Accuracy Comparison between RDR model and conventional models
To streamline the process, the accuracy comparison for each sample size will be presented by invoking the `display_accuracy` function.

In [26]:
def display_accuracy(
        y_test,
        dtf_pred, dtf_rdr_pred,
        rf_pred, rf_rdr_pred,
        xgb_pred, xgb_rdr_pred,
        svm_pred, svm_rdr_pred,
        mlp_pred, mlp_rdr_pred,
        tabnet_pred, tabnet_rdr_pred
    ) -> None:

    accuracy_table = []

    accuracy_table.append([
        "Decision Tree",
        metrics.accuracy_score(y_test,dtf_pred),
        metrics.accuracy_score(y_test, dtf_rdr_pred),
        metrics.accuracy_score(dtf_pred, dtf_rdr_pred)
    ])

    accuracy_table.append([
        "Random Forest",
        metrics.accuracy_score(y_test,rf_pred),
        metrics.accuracy_score(y_test, rf_rdr_pred),
        metrics.accuracy_score(rf_pred, rf_rdr_pred)
    ])

    accuracy_table.append([
        "XGBoost",
        metrics.accuracy_score(y_test,xgb_pred),
        metrics.accuracy_score(y_test, xgb_rdr_pred),
        metrics.accuracy_score(xgb_pred, xgb_rdr_pred)
    ])

    accuracy_table.append([
        "Support Vector Machine",
        metrics.accuracy_score(y_test,svm_pred),
        metrics.accuracy_score(y_test, svm_rdr_pred),
        metrics.accuracy_score(svm_pred, svm_rdr_pred)
    ])

    accuracy_table.append([
        "Multi-layer Perceptron",
        metrics.accuracy_score(y_test,mlp_pred),
        metrics.accuracy_score(y_test, mlp_rdr_pred),
        metrics.accuracy_score(mlp_pred, mlp_rdr_pred)
    ])

    accuracy_table.append([
        "TabNet",
        metrics.accuracy_score(y_test,tabnet_pred),
        metrics.accuracy_score(y_test, tabnet_rdr_pred),
        metrics.accuracy_score(tabnet_pred, tabnet_rdr_pred)
    ])

    accuracy_table = pd.DataFrame(accuracy_table, columns=["Model", "Accuracy", "RDR Accuracy", "Difference"])
    pd.set_option('display.precision', 15)

    display(accuracy_table)

    return

In [27]:
print("Accuracy comparison for 50% sample size:")
display_accuracy(
    y50_test,
    dtf50_pred, dtf_rdr50_pred,
    rf50_pred, rf_rdr50_pred,
    xgb50_pred, xgb_rdr50_pred,
    svm50_pred, svm_rdr50_pred,
    mlp50_pred, mlp_rdr50_pred,
    tabnet50_pred, tabnet_rdr50_pred
)

Accuracy comparison for 50% sample size:


Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.964390402779286,0.882857453045272,0.90131364672674
1,Random Forest,0.976441211594832,0.887960047768972,0.903919227011182
2,XGBoost,0.966670285528173,0.851807621322332,0.871783736836391
3,Support Vector Machine,0.853761806535664,0.810661165997177,0.865486917815655
4,Multi-layer Perceptron,0.894582564325263,0.821083487134947,0.881880360438606
5,TabNet,0.766583432852025,0.728368255346868,0.944414287265226
