# Testing - Imitation
In this code, we aim to demonstrate how accurately the RDR model created by the proxy replicates the performance of the original model. By evaluating the accuracy, we can assess whether the explanations provided by the RDR model effectively represent the behavior of the original model.

In [1]:
import sys
sys.path.append('../src/')
from rdr import RDR

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Import Model Libraries

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

## 1. Load dataset

### 1.1. Load the dataset and store it in a variable named **df**

In [4]:
df = pd.read_csv('../data/star_classification.csv')
display(df.head())
display(df.info())

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   obj_ID       100000 non-null  float64
 1   alpha        100000 non-null  float64
 2   delta        100000 non-null  float64
 3   u            100000 non-null  float64
 4   g            100000 non-null  float64
 5   r            100000 non-null  float64
 6   i            100000 non-null  float64
 7   z            100000 non-null  float64
 8   run_ID       100000 non-null  int64  
 9   rerun_ID     100000 non-null  int64  
 10  cam_col      100000 non-null  int64  
 11  field_ID     100000 non-null  int64  
 12  spec_obj_ID  100000 non-null  float64
 13  class        100000 non-null  object 
 14  redshift     100000 non-null  float64
 15  plate        100000 non-null  int64  
 16  MJD          100000 non-null  int64  
 17  fiber_ID     100000 non-null  int64  
dtypes: float64(10), int64(7),

None

### 1.2. Define and train Label Encoder

In [5]:
le = LabelEncoder()
le.fit(df['class'])

## 2. Testing RDR model's imitation accuracy
At this phase, the RDR model will be created and compared against the original models. The conventional models to be used consists of:

**White box Model**
- Decision Tree Classifier

**Black box Model**
- Random Forest
- Support Vector Machine
- XGBoost
- MLP
- TabNet

Furthermore, testing will be carried out on data samples which differs from 25%, 50%, 75%, and 100% (its original size).

### 2.3. Testing with sample data size : 75%

In [6]:
def remove_outliers(df, threshold=2.5):
    z_scores = np.abs(stats.zscore(df.select_dtypes(include='number')))
    outliers = (z_scores > threshold).any(axis=1)
    return df[~outliers]

# Sample size : 75%
df75 = df.sample(frac=0.75, random_state=42)

# remove outlier
df75 = remove_outliers(df75)

# define features and label
label = df75['class']
features = df75.drop(
            columns=['obj_ID', 'run_ID', 'rerun_ID', 'field_ID', 'spec_obj_ID','fiber_ID','class'],
            axis=1)

# split data for training and testing model
# train : test = 80 : 20
X75_train, X75_test, y75_train, y75_test = train_test_split(features,
                                                            label, 
                                                            test_size=0.2,
                                                            random_state=42)

#### 2.3.1. RDR against Decision Tree Classifier

In [7]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf75 = DecisionTreeClassifier()
dtf75.fit(X75_train, y75_train)

In [8]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf75.predict(features)

In [9]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

dtf_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dtf75_pred = dtf75.predict(X75_test)
dtf_rdr75_pred = dtf_rdr75.predict(X75_test)

#### 2.3.2. RDR against Random Forest Classifier

In [10]:
# CREATE AND TRAIN RANDOM FOREST CLASSIFIER
rf75 = RandomForestClassifier()
rf75.fit(X75_train, y75_train)

In [11]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf75.predict(features)

In [12]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

rf_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
rf75_pred = rf75.predict(X75_test)
rf_rdr75_pred = rf_rdr75.predict(X75_test)

#### 2.3.3. RDR against Support Vector Machine

In [None]:
param_grid = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf','linear' 'poly', 'sigmoid']}

svm = SVC()

grid = GridSearchCV(svm,param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid.fit(X75_train, y75_train)

print("Best parameters found: ", grid.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))

In [13]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm75 = SVC(C= 100, gamma= 0.001, kernel= 'rbf')
svm75.fit(X75_train, y75_train)

In [14]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm75.predict(features)

In [15]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

svm_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
svm75_pred = svm75.predict(X75_test)
svm_rdr75_pred = svm_rdr75.predict(X75_test)

#### 2.3.4. RDR against XGBoost Classifier

In [16]:
# CREATE AND TRAIN XGBOOST CLASSIFIER
train_dmatrix = xgb.DMatrix(data=X75_train, label=le.transform(y75_train))
xgboost = xgb.train({}, train_dmatrix)

In [17]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
dmatrix = xgb.DMatrix(
            data=features,
            label=le.transform(label))

xgb75_pred = xgboost.predict(dmatrix)
prediction_dataset = le.inverse_transform(np.round(xgb75_pred).astype(int))

In [18]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

xgb_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dmatrix = xgb.DMatrix(
            data=X75_test,
            label=le.transform(y75_test))

xgb75_pred = xgboost.predict(dmatrix)
xgb75_pred = le.inverse_transform(np.round(xgb75_pred).astype(int))

xgb_rdr75_pred = xgb_rdr75.predict(X75_test)

#### 2.3.5. RDR against Multi-layer Perceptron (MLP) Classifier

In [19]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.1, 0.01, 0.001],
    'batch_size': [32, 64],
    'max_iter': [300, 500],
    'alpha': [0.0001, 0.001],
    'activation': ['relu', 'tanh']
}

mlp = MLPClassifier()

grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X75_train, y75_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

Best parameters found:  {'activation': 'relu', 'alpha': 0.001, 'batch_size': 64, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 500}
Best cross-validation accuracy: 0.90


In [19]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp75 = MLPClassifier(
    max_iter=500,
    batch_size=64,
    hidden_layer_sizes=(100,),
    learning_rate='constant',
    learning_rate_init=0.001,
    alpha=0.001,
    activation='relu'
)
mlp75.fit(X75_train, y75_train)

In [20]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp75.predict(features)

In [21]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

mlp_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp75_pred = mlp75.predict(X75_test)
mlp_rdr75_pred = mlp_rdr75.predict(X75_test)

#### 2.3.6. RDR against TabNet Classifier

In [22]:
# CREATE AND TRAIN TabNet Classifier
tabnet75 = TabNetClassifier()
tabnet75.fit(X75_train.values, y75_train, max_epochs=100)



epoch 0  | loss: 0.48026 |  0:00:02s
epoch 1  | loss: 0.14609 |  0:00:05s
epoch 2  | loss: 0.12288 |  0:00:08s
epoch 3  | loss: 0.11581 |  0:00:10s
epoch 4  | loss: 0.11147 |  0:00:13s
epoch 5  | loss: 0.10922 |  0:00:16s
epoch 6  | loss: 0.11104 |  0:00:19s
epoch 7  | loss: 0.10934 |  0:00:21s
epoch 8  | loss: 0.10997 |  0:00:24s
epoch 9  | loss: 0.10799 |  0:00:27s
epoch 10 | loss: 0.10749 |  0:00:30s
epoch 11 | loss: 0.10572 |  0:00:32s
epoch 12 | loss: 0.10353 |  0:00:35s
epoch 13 | loss: 0.10218 |  0:00:38s
epoch 14 | loss: 0.10252 |  0:00:41s
epoch 15 | loss: 0.10114 |  0:00:44s
epoch 16 | loss: 0.10225 |  0:00:46s
epoch 17 | loss: 0.101   |  0:00:49s
epoch 18 | loss: 0.10582 |  0:00:52s
epoch 19 | loss: 0.10172 |  0:00:55s
epoch 20 | loss: 0.10103 |  0:00:57s
epoch 21 | loss: 0.10102 |  0:01:00s
epoch 22 | loss: 0.09971 |  0:01:03s
epoch 23 | loss: 0.09761 |  0:01:06s
epoch 24 | loss: 0.09698 |  0:01:09s
epoch 25 | loss: 0.09757 |  0:01:12s
epoch 26 | loss: 0.09706 |  0:01:14s
e

In [23]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet75.predict(features.values)

In [24]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['cam_col','plate'],
        comp_operator='>=')

tabnet_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet75_pred = tabnet75.predict(X75_test.values)
tabnet_rdr75_pred = tabnet_rdr75.predict(X75_test)

## 5. Accuracy Comparison between RDR model and conventional models
To streamline the process, the accuracy comparison for each sample size will be presented by invoking the `display_accuracy` function.

In [25]:
def display_accuracy(
        y_test,
        dtf_pred, dtf_rdr_pred,
        rf_pred, rf_rdr_pred,
        xgb_pred, xgb_rdr_pred,
        svm_pred, svm_rdr_pred,
        mlp_pred, mlp_rdr_pred,
        tabnet_pred, tabnet_rdr_pred
    ) -> None:

    accuracy_table = []

    accuracy_table.append([
        "Decision Tree",
        metrics.accuracy_score(y_test,dtf_pred),
        metrics.accuracy_score(y_test, dtf_rdr_pred),
        metrics.accuracy_score(dtf_pred, dtf_rdr_pred)
    ])

    accuracy_table.append([
        "Random Forest",
        metrics.accuracy_score(y_test,rf_pred),
        metrics.accuracy_score(y_test, rf_rdr_pred),
        metrics.accuracy_score(rf_pred, rf_rdr_pred)
    ])

    accuracy_table.append([
        "XGBoost",
        metrics.accuracy_score(y_test,xgb_pred),
        metrics.accuracy_score(y_test, xgb_rdr_pred),
        metrics.accuracy_score(xgb_pred, xgb_rdr_pred)
    ])

    accuracy_table.append([
        "Support Vector Machine",
        metrics.accuracy_score(y_test,svm_pred),
        metrics.accuracy_score(y_test, svm_rdr_pred),
        metrics.accuracy_score(svm_pred, svm_rdr_pred)
    ])

    accuracy_table.append([
        "Multi-layer Perceptron",
        metrics.accuracy_score(y_test,mlp_pred),
        metrics.accuracy_score(y_test, mlp_rdr_pred),
        metrics.accuracy_score(mlp_pred, mlp_rdr_pred)
    ])

    accuracy_table.append([
        "TabNet",
        metrics.accuracy_score(y_test,tabnet_pred),
        metrics.accuracy_score(y_test, tabnet_rdr_pred),
        metrics.accuracy_score(tabnet_pred, tabnet_rdr_pred)
    ])

    accuracy_table = pd.DataFrame(accuracy_table, columns=["Model", "Accuracy", "RDR Accuracy", "Difference"])
    pd.set_option('display.precision', 15)

    display(accuracy_table)

    return

In [26]:
print("Accuracy comparison for 75% sample size:")
display_accuracy(
    y75_test,
    dtf75_pred, dtf_rdr75_pred,
    rf75_pred, rf_rdr75_pred,
    xgb75_pred, xgb_rdr75_pred,
    svm75_pred, svm_rdr75_pred,
    mlp75_pred, mlp_rdr75_pred,
    tabnet75_pred, tabnet_rdr75_pred
)

Accuracy comparison for 75% sample size:


Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.964159003692709,0.864021432191731,0.881760915212512
1,Random Forest,0.979364274853378,0.890232423430599,0.903120700890594
2,XGBoost,0.973064948229672,0.872710158569256,0.890739265802621
3,Support Vector Machine,0.871334443559482,0.812395916298603,0.871117225400043
4,Multi-layer Perceptron,0.888639490261386,0.824053290855115,0.892549417131272
5,TabNet,0.966548403446528,0.861342408225328,0.882267757584534
