# Testing - Imitation
In this code, we aim to demonstrate how accurately the RDR model created by the proxy replicates the performance of the original model. By evaluating the accuracy, we can assess whether the explanations provided by the RDR model effectively represent the behavior of the original model.

In [1]:
import sys
sys.path.append('../src/')
from rdr import RDR

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Import Model Libraries

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

## 1. Load dataset

### 1.1. Load the dataset and store it in a variable named **df**

In [4]:
df = pd.read_csv('../data/star_classification.csv')
display(df.head())
display(df.info())

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   obj_ID       100000 non-null  float64
 1   alpha        100000 non-null  float64
 2   delta        100000 non-null  float64
 3   u            100000 non-null  float64
 4   g            100000 non-null  float64
 5   r            100000 non-null  float64
 6   i            100000 non-null  float64
 7   z            100000 non-null  float64
 8   run_ID       100000 non-null  int64  
 9   rerun_ID     100000 non-null  int64  
 10  cam_col      100000 non-null  int64  
 11  field_ID     100000 non-null  int64  
 12  spec_obj_ID  100000 non-null  float64
 13  class        100000 non-null  object 
 14  redshift     100000 non-null  float64
 15  plate        100000 non-null  int64  
 16  MJD          100000 non-null  int64  
 17  fiber_ID     100000 non-null  int64  
dtypes: float64(10), int64(7),

None

### 1.2. Define and train Label Encoder

In [5]:
le = LabelEncoder()
le.fit(df['class'])

## 2. Testing RDR model's imitation accuracy
At this phase, the RDR model will be created and compared against the original models. The conventional models to be used consists of:

**White box Model**
- Decision Tree Classifier

**Black box Model**
- Random Forest
- Support Vector Machine
- XGBoost
- MLP
- TabNet

Furthermore, testing will be carried out on data samples which differs from 25%, 50%, 75%, and 100% (its original size).

### 2.1. Testing with sample data size : 25%

In [6]:
# Sample size : 25%
df25 = df.sample(frac=0.25, random_state=42)

# define features and label
label = df25['class']
features = df25.drop(columns=['obj_ID', 'class', 'spec_obj_ID'], axis=1)

# split data for training and testing model
# train : test = 80 : 20
X25_train, X25_test, y25_train, y25_test = train_test_split(features,
                                                            label,
                                                            test_size=0.2,
                                                            random_state=42)

#### 2.1.1. RDR against Decision Tree Classifier

In [7]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf25 = DecisionTreeClassifier()
dtf25.fit(X25_train, y25_train)

In [8]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf25.predict(features)

In [9]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

dtf_rdr25 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dtf25_pred = dtf25.predict(X25_test)
dtf_rdr25_pred = dtf_rdr25.predict(X25_test)

#### 2.1.2. RDR against Random Forest Classifier

In [10]:
# CREATE AND TRAIN RANDOM FOREST CLASSIFIER
rf25 = RandomForestClassifier()
rf25.fit(X25_train, y25_train)

In [11]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf25.predict(features)

In [12]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

rf_rdr25 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
rf25_pred = rf25.predict(X25_test)
rf_rdr25_pred = rf_rdr25.predict(X25_test)

#### 2.1.3. RDR against Support Vector Machine

In [13]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm25 = SVC()
svm25.fit(X25_train, y25_train)

In [14]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm25.predict(features)

In [15]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

svm_rdr25 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
svm25_pred = svm25.predict(X25_test)
svm_rdr25_pred = svm_rdr25.predict(X25_test)

#### 2.1.4. RDR against XGBoost Classifier

In [16]:
# CREATE AND TRAIN XGBOOST CLASSIFIER
train_dmatrix = xgb.DMatrix(data=X25_train, label=le.transform(y25_train))
xgboost = xgb.train({}, train_dmatrix)

In [17]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
dmatrix = xgb.DMatrix(
            data=features,
            label=le.transform(label))

xgb25_pred = xgboost.predict(dmatrix)
prediction_dataset = le.inverse_transform(np.round(xgb25_pred).astype(int))

In [18]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

xgb_rdr25 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dmatrix = xgb.DMatrix(
            data=X25_test,
            label=le.transform(y25_test))

xgb25_pred = xgboost.predict(dmatrix)
xgb25_pred = le.inverse_transform(np.round(xgb25_pred).astype(int))

xgb_rdr25_pred = xgb_rdr25.predict(X25_test)

#### 2.1.5. RDR against Multi-layer Perceptron (MLP) Classifier

In [19]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp25 = MLPClassifier(max_iter=100, batch_size=5, hidden_layer_sizes=(10, 15, 30), learning_rate='constant', learning_rate_init=0.1)
mlp25.fit(X25_train, y25_train)

In [20]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp25.predict(features)

In [21]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

mlp_rdr25 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp25_pred = mlp25.predict(X25_test)
mlp_rdr25_pred = mlp_rdr25.predict(X25_test)

#### 2.1.6. RDR against TabNet Classifier

In [22]:
# CREATE AND TRAIN TabNet Classifier
tabnet25 = TabNetClassifier()
tabnet25.fit(X25_train.values, y25_train, max_epochs=100)



epoch 0  | loss: 1.2078  |  0:00:02s
epoch 1  | loss: 0.67361 |  0:00:04s
epoch 2  | loss: 0.37872 |  0:00:06s
epoch 3  | loss: 0.23713 |  0:00:08s
epoch 4  | loss: 0.18131 |  0:00:09s
epoch 5  | loss: 0.1521  |  0:00:11s
epoch 6  | loss: 0.13802 |  0:00:13s
epoch 7  | loss: 0.14295 |  0:00:15s
epoch 8  | loss: 0.1416  |  0:00:17s
epoch 9  | loss: 0.13622 |  0:00:18s
epoch 10 | loss: 0.12737 |  0:00:20s
epoch 11 | loss: 0.12169 |  0:00:21s
epoch 12 | loss: 0.12295 |  0:00:23s
epoch 13 | loss: 0.12302 |  0:00:25s
epoch 14 | loss: 0.12051 |  0:00:26s
epoch 15 | loss: 0.12238 |  0:00:28s
epoch 16 | loss: 0.13927 |  0:00:30s
epoch 17 | loss: 0.1193  |  0:00:31s
epoch 18 | loss: 0.11686 |  0:00:33s
epoch 19 | loss: 0.11142 |  0:00:34s
epoch 20 | loss: 0.11236 |  0:00:36s
epoch 21 | loss: 0.11126 |  0:00:38s
epoch 22 | loss: 0.11151 |  0:00:39s
epoch 23 | loss: 0.10975 |  0:00:41s
epoch 24 | loss: 0.11338 |  0:00:42s
epoch 25 | loss: 0.10981 |  0:00:44s
epoch 26 | loss: 0.10524 |  0:00:46s
e

In [23]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet25.predict(features.values)

In [24]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

tabnet_rdr25 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet25_pred = tabnet25.predict(X25_test.values)
tabnet_rdr25_pred = tabnet_rdr25.predict(X25_test)

### 2.2. Testing with sample data size : 50%

In [25]:
# Sample size : 50%
df50 = df.sample(frac=0.5, random_state=42)

# define features and label
label = df50['class']
features = df50.drop(columns=['obj_ID', 'class', 'spec_obj_ID'], axis=1)

# split data for training and testing model
# train : test = 80 : 20
X50_train, X50_test, y50_train, y50_test = train_test_split(features,
                                                            label, 
                                                            test_size=0.2,
                                                            random_state=42)

#### 2.2.1. RDR against Decision Tree Classifier

In [26]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf50 = DecisionTreeClassifier()
dtf50.fit(X50_train, y50_train)

In [27]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf50.predict(features)

In [28]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

dtf_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dtf50_pred = dtf50.predict(X50_test)
dtf_rdr50_pred = dtf_rdr50.predict(X50_test)

#### 2.2.2. RDR against Random Forest Classifier

In [29]:
# CREATE AND TRAIN RANDOM FOREST CLASSIFIER
rf50 = RandomForestClassifier()
rf50.fit(X50_train, y50_train)

In [30]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf50.predict(features)

In [31]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

rf_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
rf50_pred = rf50.predict(X50_test)
rf_rdr50_pred = rf_rdr50.predict(X50_test)

#### 2.2.3. RDR against Support Vector Machine

In [32]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm50 = SVC()
svm50.fit(X50_train, y50_train)

In [33]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm50.predict(features)

In [34]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

svm_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
svm50_pred = svm50.predict(X50_test)
svm_rdr50_pred = svm_rdr50.predict(X50_test)

#### 2.2.4. RDR against XGBoost Classifier

In [35]:
# CREATE AND TRAIN XGBOOST CLASSIFIER
train_dmatrix = xgb.DMatrix(data=X50_train, label=le.transform(y50_train))
xgboost = xgb.train({}, train_dmatrix)

In [36]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
dmatrix = xgb.DMatrix(
            data=features,
            label=le.transform(label))

xgb50_pred = xgboost.predict(dmatrix)
prediction_dataset = le.inverse_transform(np.round(xgb50_pred).astype(int))

In [37]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

xgb_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dmatrix = xgb.DMatrix(
            data=X50_test,
            label=le.transform(y50_test))

xgb50_pred = xgboost.predict(dmatrix)
xgb50_pred = le.inverse_transform(np.round(xgb50_pred).astype(int))

xgb_rdr50_pred = xgb_rdr50.predict(X50_test)

#### 2.2.5. RDR against Multi-layer Perceptron (MLP) Classifier

In [38]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp50 = MLPClassifier(max_iter=100, batch_size=5, hidden_layer_sizes=(10, 15, 30), learning_rate='constant', learning_rate_init=0.1)
mlp50.fit(X50_train, y50_train)

In [39]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp50.predict(features)

In [40]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

mlp_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp50_pred = mlp50.predict(X50_test)
mlp_rdr50_pred = mlp_rdr50.predict(X50_test)

#### 2.2.6. RDR against TabNet Classifier

In [41]:
# CREATE AND TRAIN TabNet Classifier
tabnet50 = TabNetClassifier()
tabnet50.fit(X50_train.values, y50_train, max_epochs=100)



epoch 0  | loss: 0.95667 |  0:00:03s
epoch 1  | loss: 0.30905 |  0:00:06s
epoch 2  | loss: 0.20892 |  0:00:08s
epoch 3  | loss: 0.18149 |  0:00:11s
epoch 4  | loss: 0.15121 |  0:00:14s
epoch 5  | loss: 0.1353  |  0:00:17s
epoch 6  | loss: 0.12887 |  0:00:19s
epoch 7  | loss: 0.12627 |  0:00:22s
epoch 8  | loss: 0.12215 |  0:00:24s
epoch 9  | loss: 0.12237 |  0:00:28s
epoch 10 | loss: 0.11874 |  0:00:31s
epoch 11 | loss: 0.11662 |  0:00:34s
epoch 12 | loss: 0.11662 |  0:00:37s
epoch 13 | loss: 0.11328 |  0:00:39s
epoch 14 | loss: 0.11024 |  0:00:42s
epoch 15 | loss: 0.11176 |  0:00:44s
epoch 16 | loss: 0.10781 |  0:00:48s
epoch 17 | loss: 0.11074 |  0:00:51s
epoch 18 | loss: 0.1114  |  0:00:54s
epoch 19 | loss: 0.10859 |  0:00:57s
epoch 20 | loss: 0.11    |  0:01:00s
epoch 21 | loss: 0.10885 |  0:01:02s
epoch 22 | loss: 0.10907 |  0:01:05s
epoch 23 | loss: 0.11115 |  0:01:08s
epoch 24 | loss: 0.10855 |  0:01:10s
epoch 25 | loss: 0.10853 |  0:01:14s
epoch 26 | loss: 0.10976 |  0:01:16s
e

In [42]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet50.predict(features.values)

In [43]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

tabnet_rdr50 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet50_pred = tabnet50.predict(X50_test.values)
tabnet_rdr50_pred = tabnet_rdr50.predict(X50_test)

### 2.3. Testing with sample data size : 75%

In [44]:
# Sample size : 75%
df75 = df.sample(frac=0.75, random_state=42)

# define features and label
label = df75['class']
features = df75.drop(columns=['obj_ID', 'class', 'spec_obj_ID'], axis=1)

# split data for training and testing model
# train : test = 80 : 20
X75_train, X75_test, y75_train, y75_test = train_test_split(features,
                                                            label, 
                                                            test_size=0.2,
                                                            random_state=42)

#### 2.3.1. RDR against Decision Tree Classifier

In [45]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf75 = DecisionTreeClassifier()
dtf75.fit(X75_train, y75_train)

In [46]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf75.predict(features)

In [47]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

dtf_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dtf75_pred = dtf75.predict(X75_test)
dtf_rdr75_pred = dtf_rdr75.predict(X75_test)

#### 2.3.2. RDR against Random Forest Classifier

In [48]:
# CREATE AND TRAIN RANDOM FOREST CLASSIFIER
rf75 = RandomForestClassifier()
rf75.fit(X75_train, y75_train)

In [49]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf75.predict(features)

In [50]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

rf_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
rf75_pred = rf75.predict(X75_test)
rf_rdr75_pred = rf_rdr75.predict(X75_test)

#### 2.3.3. RDR against Support Vector Machine

In [51]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm75 = SVC()
svm75.fit(X75_train, y75_train)

In [52]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm75.predict(features)

In [53]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

svm_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
svm75_pred = svm75.predict(X75_test)
svm_rdr75_pred = svm_rdr75.predict(X75_test)

#### 2.3.4. RDR against XGBoost Classifier

In [54]:
# CREATE AND TRAIN XGBOOST CLASSIFIER
train_dmatrix = xgb.DMatrix(data=X75_train, label=le.transform(y75_train))
xgboost = xgb.train({}, train_dmatrix)

In [55]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
dmatrix = xgb.DMatrix(
            data=features,
            label=le.transform(label))

xgb75_pred = xgboost.predict(dmatrix)
prediction_dataset = le.inverse_transform(np.round(xgb75_pred).astype(int))

In [56]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

xgb_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dmatrix = xgb.DMatrix(
            data=X75_test,
            label=le.transform(y75_test))

xgb75_pred = xgboost.predict(dmatrix)
xgb75_pred = le.inverse_transform(np.round(xgb75_pred).astype(int))

xgb_rdr75_pred = xgb_rdr75.predict(X75_test)

#### 2.3.5. RDR against Multi-layer Perceptron (MLP) Classifier

In [57]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp75 = MLPClassifier(max_iter=100, batch_size=5, hidden_layer_sizes=(10, 15, 30), learning_rate='constant', learning_rate_init=0.1)
mlp75.fit(X75_train, y75_train)

In [58]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp75.predict(features)

In [59]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

mlp_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp75_pred = mlp75.predict(X75_test)
mlp_rdr75_pred = mlp_rdr75.predict(X75_test)

#### 2.3.6. RDR against TabNet Classifier

In [60]:
# CREATE AND TRAIN TabNet Classifier
tabnet75 = TabNetClassifier()
tabnet75.fit(X75_train.values, y75_train, max_epochs=100)



epoch 0  | loss: 0.75331 |  0:00:04s
epoch 1  | loss: 0.16283 |  0:00:08s
epoch 2  | loss: 0.12005 |  0:00:12s
epoch 3  | loss: 0.11645 |  0:00:16s
epoch 4  | loss: 0.11195 |  0:00:20s
epoch 5  | loss: 0.11037 |  0:00:23s
epoch 6  | loss: 0.10488 |  0:00:27s
epoch 7  | loss: 0.10585 |  0:00:31s
epoch 8  | loss: 0.10283 |  0:00:35s
epoch 9  | loss: 0.10062 |  0:00:39s
epoch 10 | loss: 0.10087 |  0:00:43s
epoch 11 | loss: 0.09969 |  0:00:47s
epoch 12 | loss: 0.09757 |  0:00:51s
epoch 13 | loss: 0.0986  |  0:00:55s
epoch 14 | loss: 0.09895 |  0:00:59s
epoch 15 | loss: 0.0978  |  0:01:03s
epoch 16 | loss: 0.09596 |  0:01:07s
epoch 17 | loss: 0.09597 |  0:01:10s
epoch 18 | loss: 0.09473 |  0:01:14s
epoch 19 | loss: 0.09513 |  0:01:18s
epoch 20 | loss: 0.09604 |  0:01:22s
epoch 21 | loss: 0.09315 |  0:01:26s
epoch 22 | loss: 0.09377 |  0:01:30s
epoch 23 | loss: 0.09338 |  0:01:34s
epoch 24 | loss: 0.09452 |  0:01:38s
epoch 25 | loss: 0.09286 |  0:01:42s
epoch 26 | loss: 0.09506 |  0:01:46s
e

In [61]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet75.predict(features.values)

In [62]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

tabnet_rdr75 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet75_pred = tabnet75.predict(X75_test.values)
tabnet_rdr75_pred = tabnet_rdr75.predict(X75_test)

### 2.4. Testing with sample data size : 100%

In [63]:
# Sample size : 100%
df100 = df

# define features and label
label = df100['class']
features = df100.drop(columns=['obj_ID', 'class', 'spec_obj_ID'], axis=1)

# split data for training and testing model
# train : test = 80 : 20
X100_train, X100_test, y100_train, y100_test = train_test_split(features,
                                                            label, 
                                                            test_size=0.2,
                                                            random_state=42)

#### 2.4.1. RDR against Decision Tree Classifier

In [64]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf100 = DecisionTreeClassifier()
dtf100.fit(X100_train, y100_train)

In [65]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf100.predict(features)

In [66]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

dtf_rdr100 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dtf100_pred = dtf100.predict(X100_test)
dtf_rdr100_pred = dtf_rdr100.predict(X100_test)

#### 2.4.2. RDR against Random Forest Classifier

In [67]:
# CREATE AND TRAIN RANDOM FOREST CLASSIFIER
rf100 = RandomForestClassifier()
rf100.fit(X100_train, y100_train)

In [68]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf100.predict(features)

In [69]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

rf_rdr100 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
rf100_pred = rf100.predict(X100_test)
rf_rdr100_pred = rf_rdr100.predict(X100_test)

#### 2.4.3. RDR against Support Vector Machine

In [70]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm100 = SVC()
svm100.fit(X100_train, y100_train)

In [71]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm100.predict(features)

In [72]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

svm_rdr100 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
svm100_pred = svm100.predict(X100_test)
svm_rdr100_pred = svm_rdr100.predict(X100_test)

#### 2.4.4. RDR against XGBoost Classifier

In [73]:
# CREATE AND TRAIN XGBOOST CLASSIFIER
train_dmatrix = xgb.DMatrix(data=X100_train, label=le.transform(y100_train))
xgboost = xgb.train({}, train_dmatrix)

In [74]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
dmatrix = xgb.DMatrix(
            data=features,
            label=le.transform(label))

xgb100_pred = xgboost.predict(dmatrix)
prediction_dataset = le.inverse_transform(np.round(xgb100_pred).astype(int))

In [75]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

xgb_rdr100 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dmatrix = xgb.DMatrix(
            data=X100_test,
            label=le.transform(y100_test))

xgb100_pred = xgboost.predict(dmatrix)
xgb100_pred = le.inverse_transform(np.round(xgb100_pred).astype(int))

xgb_rdr100_pred = xgb_rdr100.predict(X100_test)

#### 2.4.5. RDR against Multi-layer Perceptron (MLP) Classifier

In [76]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp100 = MLPClassifier(max_iter=100, batch_size=5, hidden_layer_sizes=(10, 15, 30), learning_rate='constant', learning_rate_init=0.1)
mlp100.fit(X100_train, y100_train)

In [77]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp100.predict(features)

In [78]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

mlp_rdr100 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp100_pred = mlp100.predict(X100_test)
mlp_rdr100_pred = mlp_rdr100.predict(X100_test)

#### 2.4.6. RDR against TabNet Classifier

In [79]:
# CREATE AND TRAIN TabNet Classifier
tabnet100 = TabNetClassifier()
tabnet100.fit(X100_train.values, y100_train, max_epochs=100)



epoch 0  | loss: 0.63079 |  0:00:04s
epoch 1  | loss: 0.16416 |  0:00:09s
epoch 2  | loss: 0.13044 |  0:00:15s
epoch 3  | loss: 0.12433 |  0:00:20s
epoch 4  | loss: 0.11531 |  0:00:25s
epoch 5  | loss: 0.11613 |  0:00:31s
epoch 6  | loss: 0.11389 |  0:00:36s
epoch 7  | loss: 0.10829 |  0:00:41s
epoch 8  | loss: 0.10638 |  0:00:46s
epoch 9  | loss: 0.10367 |  0:00:51s
epoch 10 | loss: 0.10189 |  0:00:56s
epoch 11 | loss: 0.10101 |  0:01:01s
epoch 12 | loss: 0.10089 |  0:01:06s
epoch 13 | loss: 0.10098 |  0:01:11s
epoch 14 | loss: 0.09818 |  0:01:16s
epoch 15 | loss: 0.10353 |  0:01:21s
epoch 16 | loss: 0.1001  |  0:01:26s
epoch 17 | loss: 0.0979  |  0:01:31s
epoch 18 | loss: 0.09776 |  0:01:37s
epoch 19 | loss: 0.09753 |  0:01:42s
epoch 20 | loss: 0.09664 |  0:01:46s
epoch 21 | loss: 0.09761 |  0:01:52s
epoch 22 | loss: 0.09591 |  0:01:57s
epoch 23 | loss: 0.0949  |  0:02:05s
epoch 24 | loss: 0.09545 |  0:02:13s
epoch 25 | loss: 0.0935  |  0:02:26s
epoch 26 | loss: 0.0945  |  0:02:34s
e

In [80]:
# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet100.predict(features.values)

In [81]:
# CREATE AND TRAIN RDR MODEL
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

tabnet_rdr100 = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet100_pred = tabnet100.predict(X100_test.values)
tabnet_rdr100_pred = tabnet_rdr100.predict(X100_test)

## 5. Accuracy Comparison between RDR model and conventional models
To streamline the process, the accuracy comparison for each sample size will be presented by invoking the `display_accuracy` function.

In [82]:
def display_accuracy(
        y_test,
        dtf_pred, dtf_rdr_pred,
        rf_pred, rf_rdr_pred,
        xgb_pred, xgb_rdr_pred,
        svm_pred, svm_rdr_pred,
        mlp_pred, mlp_rdr_pred,
        tabnet_pred, tabnet_rdr_pred
    ) -> None:

    accuracy_table = []

    accuracy_table.append([
        "Decision Tree",
        metrics.accuracy_score(y_test,dtf_pred),
        metrics.accuracy_score(y_test, dtf_rdr_pred),
        metrics.accuracy_score(dtf_pred, dtf_rdr_pred)
    ])

    accuracy_table.append([
        "Random Forest",
        metrics.accuracy_score(y_test,rf_pred),
        metrics.accuracy_score(y_test, rf_rdr_pred),
        metrics.accuracy_score(rf_pred, rf_rdr_pred)
    ])

    accuracy_table.append([
        "XGBoost",
        metrics.accuracy_score(y_test,xgb_pred),
        metrics.accuracy_score(y_test, xgb_rdr_pred),
        metrics.accuracy_score(xgb_pred, xgb_rdr_pred)
    ])

    accuracy_table.append([
        "Support Vector Machine",
        metrics.accuracy_score(y_test,svm_pred),
        metrics.accuracy_score(y_test, svm_rdr_pred),
        metrics.accuracy_score(svm_pred, svm_rdr_pred)
    ])

    accuracy_table.append([
        "Multi-layer Perceptron",
        metrics.accuracy_score(y_test,mlp_pred),
        metrics.accuracy_score(y_test, mlp_rdr_pred),
        metrics.accuracy_score(mlp_pred, mlp_rdr_pred)
    ])

    accuracy_table.append([
        "TabNet",
        metrics.accuracy_score(y_test,tabnet_pred),
        metrics.accuracy_score(y_test, tabnet_rdr_pred),
        metrics.accuracy_score(tabnet_pred, tabnet_rdr_pred)
    ])

    accuracy_table = pd.DataFrame(accuracy_table, columns=["Model", "Accuracy", "RDR Accuracy", "Difference"])
    pd.set_option('display.precision', 15)

    display(accuracy_table)

    return

In [83]:
print("Accuracy comparison for 25% sample size:")
display_accuracy(
    y25_test,
    dtf25_pred, dtf_rdr25_pred,
    rf25_pred, rf_rdr25_pred,
    xgb25_pred, xgb_rdr25_pred,
    svm25_pred, svm_rdr25_pred,
    mlp25_pred, mlp_rdr25_pred,
    tabnet25_pred, tabnet_rdr25_pred
)

Accuracy comparison for 25% sample size:


Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.966,0.857,0.8754
1,Random Forest,0.9762,0.8594,0.876
2,XGBoost,0.9704,0.8808,0.9028
3,Support Vector Machine,0.5988,0.5988,1.0
4,Multi-layer Perceptron,0.5988,0.5988,1.0
5,TabNet,0.8102,0.7496,0.9078


In [84]:
print("Accuracy comparison for 50% sample size:")
display_accuracy(
    y50_test,
    dtf50_pred, dtf_rdr50_pred,
    rf50_pred, rf_rdr50_pred,
    xgb50_pred, xgb_rdr50_pred,
    svm50_pred, svm_rdr50_pred,
    mlp50_pred, mlp_rdr50_pred,
    tabnet50_pred, tabnet_rdr50_pred
)

Accuracy comparison for 50% sample size:


Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.9653,0.8721,0.8887
1,Random Forest,0.9765,0.8591,0.8755
2,XGBoost,0.9691,0.8672,0.8883
3,Support Vector Machine,0.5925,0.5925,1.0
4,Multi-layer Perceptron,0.5925,0.5925,1.0
5,TabNet,0.9056,0.8617,0.9285


In [85]:
print("Accuracy comparison for 75% sample size:")
display_accuracy(
    y75_test,
    dtf75_pred, dtf_rdr75_pred,
    rf75_pred, rf_rdr75_pred,
    xgb75_pred, xgb_rdr75_pred,
    svm75_pred, svm_rdr75_pred,
    mlp75_pred, mlp_rdr75_pred,
    tabnet75_pred, tabnet_rdr75_pred
)

Accuracy comparison for 75% sample size:


Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.964133333333333,0.860866666666667,0.881466666666667
1,Random Forest,0.976,0.873933333333333,0.889866666666667
2,XGBoost,0.969866666666667,0.884733333333333,0.9034
3,Support Vector Machine,0.594533333333333,0.594533333333333,1.0
4,Multi-layer Perceptron,0.594533333333333,0.594533333333333,1.0
5,TabNet,0.902466666666667,0.8072,0.881066666666667


In [86]:
print("Accuracy comparison for 100% sample size:")
display_accuracy(
    y100_test,
    dtf100_pred, dtf_rdr100_pred,
    rf100_pred, rf_rdr100_pred,
    xgb100_pred, xgb_rdr100_pred,
    svm100_pred, svm_rdr100_pred,
    mlp100_pred, mlp_rdr100_pred,
    tabnet100_pred, tabnet_rdr100_pred
)

Accuracy comparison for 100% sample size:


Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.9637,0.8807,0.89965
1,Random Forest,0.97685,0.86185,0.87785
2,XGBoost,0.96935,0.86465,0.88445
3,Support Vector Machine,0.593,0.593,1.0
4,Multi-layer Perceptron,0.593,0.593,1.0
5,TabNet,0.655,0.63205,0.91815
