# Testing phase - Star Classification.csv

In [18]:
import sys
sys.path.append('../src/')
from rdr import RDR

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Import Model Libraries
White box Model :
- Decision Tree Classifier

Black box Model :
- Random Forest
- Support Vector Machine
- XGBoost
- MLP
- TabNet

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

## 1. Load dataset

In [21]:
df = pd.read_csv('../data/star_classification.csv')
display(df.head())
display(df.isnull().sum())
# df = df.dropna()
display(df.shape)

# df = df.sample(20000, random_state=42)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237660961327743e+18,135.6891066036,32.4946318397087,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777369295182e+18,GALAXY,0.6347936,5812,56354,171
1,1.237664879951151e+18,144.826100550256,31.274184894493896,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014203670733e+19,GALAXY,0.779136,10445,58158,427
2,1.23766096133043e+18,142.188789562506,35.5824441819976,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200256025549e+18,GALAXY,0.6441945,4576,55592,299
3,1.237663478724298e+18,338.741037753146,-0.402827574587482,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107141295442e+19,GALAXY,0.9323456,9149,58039,775
4,1.237680272041378e+18,345.282593210935,21.1838656010284,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891864880783317e+18,GALAXY,0.1161227,6121,56187,842


obj_ID         0
alpha          0
delta          0
u              0
g              0
r              0
i              0
z              0
run_ID         0
rerun_ID       0
cam_col        0
field_ID       0
spec_obj_ID    0
class          0
redshift       0
plate          0
MJD            0
fiber_ID       0
dtype: int64

(100000, 18)

In [22]:
label = df['class']
features = df.drop(columns=['obj_ID', 'class', 'spec_obj_ID'], axis=1)
le = LabelEncoder()
le.fit(label)
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)

## 2. Comparison between conventional models with RDR model

### 2.0. Create RDR base model

In [23]:
rdr = RDR(
        antecedent='GALAXY',
        categorical_attr=['run_ID','rerun_ID','cam_col','field_ID','plate','fiber_ID'],
        comp_operator='>=')

### 2.1. Decision Tree vs RDR

In [24]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf = DecisionTreeClassifier()
dtf.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf.predict(features)

In [25]:
# CREATE AND TRAIN RDR MODEL
dtf_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# # PREDICT USING DECISION TREE AND RDR
dtf_pred = dtf.predict(X_test)
dtf_rdr_pred = dtf_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,dtf_pred))
print(metrics.accuracy_score(y_test, dtf_rdr_pred))
print(metrics.accuracy_score(dtf_pred, dtf_rdr_pred))

0.9631
0.8521
0.8717


### 2.2. Random Forest vs RDR

In [26]:
# CREATE AND TRAIN RANDOM FOREST CLASSIFIER
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf.predict(features)

In [27]:
# CREATE AND TRAIN RDR MODEL
rf_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
rf_pred = rf.predict(X_test)
rf_rdr_pred = rf_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,rf_pred))
print(metrics.accuracy_score(y_test, rf_rdr_pred))
print(metrics.accuracy_score(rf_pred, rf_rdr_pred))

0.9772
0.94805
0.96895


### 2.3. SVM vs RDR

In [28]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm = SVC()
svm.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm.predict(features)

In [29]:
# CREATE AND TRAIN RDR MODEL
svm_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
svm_pred = svm.predict(X_test)
svm_rdr_pred = svm_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,svm_pred))
print(metrics.accuracy_score(y_test, svm_rdr_pred))
print(metrics.accuracy_score(svm_pred, svm_rdr_pred))

0.593
0.593
1.0


### 2.4. XGBoost vs RDR

In [30]:
# CREATE AND TRAIN XGBOOST CLASSIFIER
train_dmatrix = xgb.DMatrix(data=X_train, label=le.transform(y_train))
xgboost = xgb.train({}, train_dmatrix)

dmatrix = xgb.DMatrix(data=features, label=le.transform(label))
xgb_pred = xgboost.predict(dmatrix)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = le.inverse_transform(np.round(xgb_pred).astype(int))

In [31]:
# CREATE AND TRAIN RDR MODEL
xgb_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT USING DECISION TREE AND RDR
dmatrix = xgb.DMatrix(data=X_test, label=le.transform(y_test))
xgb_pred = xgboost.predict(dmatrix)
xgb_pred = le.inverse_transform(np.round(xgb_pred).astype(int))

xgb_rdr_pred = xgb_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,xgb_pred))
print(metrics.accuracy_score(y_test, xgb_rdr_pred))
print(metrics.accuracy_score(xgb_pred, xgb_rdr_pred))

0.96935
0.9639
0.9923


### 2.5. MLP vs RDR

In [32]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp = MLPClassifier(max_iter=100, batch_size=5, hidden_layer_sizes=(10, 15, 30), learning_rate='constant', learning_rate_init=0.1)
mlp.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp.predict(features)

In [33]:
# CREATE AND TRAIN RDR MODEL
mlp_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp_pred = mlp.predict(X_test)
mlp_rdr_pred = mlp_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,mlp_pred))
print(metrics.accuracy_score(y_test, mlp_rdr_pred))
print(metrics.accuracy_score(mlp_pred, mlp_rdr_pred))

0.593
0.593
1.0


### 2.6. TabNet vs RDR

In [34]:
# CREATE AND TRAIN TabNet Classifier
tabnet = TabNetClassifier()
tabnet.fit(X_train.values, y_train, max_epochs=100)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet.predict(features.values)



epoch 0  | loss: 0.63079 |  0:00:06s
epoch 1  | loss: 0.16416 |  0:00:14s
epoch 2  | loss: 0.13044 |  0:00:23s
epoch 3  | loss: 0.12433 |  0:00:31s
epoch 4  | loss: 0.11531 |  0:00:37s
epoch 5  | loss: 0.11613 |  0:00:43s
epoch 6  | loss: 0.11389 |  0:00:49s
epoch 7  | loss: 0.10829 |  0:00:54s
epoch 8  | loss: 0.10638 |  0:00:59s
epoch 9  | loss: 0.10367 |  0:01:04s
epoch 10 | loss: 0.10189 |  0:01:09s
epoch 11 | loss: 0.10101 |  0:01:14s
epoch 12 | loss: 0.10089 |  0:01:18s
epoch 13 | loss: 0.10098 |  0:01:23s
epoch 14 | loss: 0.09818 |  0:01:28s
epoch 15 | loss: 0.10353 |  0:01:32s
epoch 16 | loss: 0.1001  |  0:01:37s
epoch 17 | loss: 0.0979  |  0:01:42s
epoch 18 | loss: 0.09776 |  0:01:46s
epoch 19 | loss: 0.09753 |  0:01:52s
epoch 20 | loss: 0.09664 |  0:01:57s
epoch 21 | loss: 0.09761 |  0:02:01s
epoch 22 | loss: 0.09591 |  0:02:06s
epoch 23 | loss: 0.0949  |  0:02:12s
epoch 24 | loss: 0.09545 |  0:02:18s
epoch 25 | loss: 0.0935  |  0:02:23s
epoch 26 | loss: 0.0945  |  0:02:28s
e

In [35]:
# CREATE AND TRAIN RDR MODEL
tabnet_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet_pred = tabnet.predict(X_test.values)
tabnet_rdr_pred = tabnet_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,tabnet_pred))
print(metrics.accuracy_score(y_test, tabnet_rdr_pred))
print(metrics.accuracy_score(tabnet_pred, tabnet_rdr_pred))

0.655
0.6495
0.967


## 5. Accuracy Comparison between RDR model and conventional models

In [36]:
dtf_accuracy = metrics.accuracy_score(y_test,dtf_pred)
rf_accuracy = metrics.accuracy_score(y_test,rf_pred)
xgb_accuracy = metrics.accuracy_score(y_test,xgb_pred)
svm_accuracy = metrics.accuracy_score(y_test,svm_pred)
mlp_accuracy = metrics.accuracy_score(y_test,mlp_pred)
tabnet_accuracy = metrics.accuracy_score(y_test,tabnet_pred)

dtf_rdr_accuracy = metrics.accuracy_score(y_test, dtf_rdr_pred)
rf_rdr_accuracy = metrics.accuracy_score(y_test, rf_rdr_pred)
xgb_rdr_accuracy = metrics.accuracy_score(y_test, xgb_rdr_pred)
svm_rdr_accuracy = metrics.accuracy_score(y_test, svm_rdr_pred)
mlp_rdr_accuracy = metrics.accuracy_score(y_test, mlp_rdr_pred)
tabnet_rdr_accuracy = metrics.accuracy_score(y_test, tabnet_rdr_pred)

accuracy_table = []
accuracy_table.append([
    "Decision Tree",
    dtf_accuracy,
    dtf_rdr_accuracy,
    metrics.accuracy_score(dtf_pred, dtf_rdr_pred)
])

accuracy_table.append([
    "Random Forest",
    rf_accuracy,
    rf_rdr_accuracy,
    metrics.accuracy_score(rf_pred, rf_rdr_pred)
])

accuracy_table.append([
    "XGBoost",
    xgb_accuracy,
    xgb_rdr_accuracy,
    metrics.accuracy_score(xgb_pred, xgb_rdr_pred)
])

accuracy_table.append([
    "Support Vector Machine",
    svm_accuracy,
    svm_rdr_accuracy,
    metrics.accuracy_score(svm_pred, svm_rdr_pred)
])

accuracy_table.append([
    "Multi-layer Perceptron",
    mlp_accuracy,
    mlp_rdr_accuracy,
    metrics.accuracy_score(mlp_pred, mlp_rdr_pred)
])

accuracy_table.append([
    "TabNet",
    tabnet_accuracy,
    tabnet_rdr_accuracy,
    metrics.accuracy_score(tabnet_pred, tabnet_rdr_pred)
])

accuracy_table = pd.DataFrame(accuracy_table, columns=["Model", "Accuracy", "RDR Accuracy", "Difference"])
pd.set_option('display.precision', 15)

display(accuracy_table)

Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.9631,0.8521,0.8717
1,Random Forest,0.9772,0.94805,0.96895
2,XGBoost,0.96935,0.9639,0.9923
3,Support Vector Machine,0.593,0.593,1.0
4,Multi-layer Perceptron,0.593,0.593,1.0
5,TabNet,0.655,0.6495,0.967
