# Testing phase - Gender Classification.csv

In [142]:
import sys
sys.path.append('../src/')
from rdr import RDR

In [143]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Disable truncation so displayed DataFrames show every column and every row.
for _option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option_name, None)

## Import Model Libraries
White-box model:
- Decision Tree Classifier

Black-box models:
- Random Forest
- Support Vector Machine
- XGBoost
- MLP
- TabNet

In [144]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

## 1. Load dataset

In [145]:
# Read the raw CSV, then show a preview of the first rows followed by the
# number of missing values in each column.
df = pd.read_csv('../data/gender_classification.csv')
preview_rows = df.head()
missing_per_column = df.isnull().sum()
display(preview_rows)
display(missing_per_column)

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


long_hair                    0
forehead_width_cm            0
forehead_height_cm           0
nose_wide                    0
nose_long                    0
lips_thin                    0
distance_nose_to_lip_long    0
gender                       0
dtype: int64

In [146]:
# Separate the target column from the predictor columns.
features = df.drop(columns='gender')
label = df['gender']

# Fit the encoder on the string labels; it is used later to convert them to
# integer codes (and back) for the XGBoost native API.
le = LabelEncoder().fit(label)

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=42,
)

## 2. Comparison of conventional models with the RDR model

### 2.0. Create RDR base model

In [147]:
# Single RDR instance that every comparison section below calls `.fit` on.
# NOTE(review): the semantics of `antecedent` and `total_precedent` are defined
# in ../src/rdr and are not visible from this notebook - confirm before tuning.
categorical_columns = [
    'long_hair',
    'nose_wide',
    'nose_long',
    'lips_thin',
    'distance_nose_to_lip_long',
]
rdr = RDR(
    antecedent="Male",
    categorical_attr=categorical_columns,
    total_precedent=1,
)

### 2.1. Decision Tree vs RDR

In [148]:
# CREATE AND TRAIN DECISION TREE CLASSIFIER
dtf = DecisionTreeClassifier()
dtf.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = dtf.predict(features)

In [149]:
# CREATE AND TRAIN RDR MODEL
dtf_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - DECISION TREE AND RDR
dtf_pred = dtf.predict(X_test)
dtf_rdr_pred = dtf_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,dtf_pred))
print(metrics.accuracy_score(y_test, dtf_rdr_pred))
print(metrics.accuracy_score(dtf_pred, dtf_rdr_pred))

0.9530469530469531
0.919080919080919
0.9440559440559441


### 2.2. Random Forest vs RDR

In [150]:
# CREATE AND TRAIN Random Forest CLASSIFIER
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = rf.predict(features)

In [151]:
# CREATE AND TRAIN RDR MODEL
rf_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - Random Forest AND RDR
rf_pred = rf.predict(X_test)
rf_rdr_pred = rf_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,rf_pred))
print(metrics.accuracy_score(y_test, rf_rdr_pred))
print(metrics.accuracy_score(rf_pred, rf_rdr_pred))

0.9600399600399601
0.9590409590409591
0.995004995004995


### 2.3. SVM vs RDR

In [152]:
# CREATE AND TRAIN SUPPORT VECTOR MACHINE
svm = SVC()
svm.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = svm.predict(features)

In [153]:
# CREATE AND TRAIN RDR MODEL
svm_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - SVM AND RDR
svm_pred = svm.predict(X_test)
svm_rdr_pred = svm_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,svm_pred))
print(metrics.accuracy_score(y_test, svm_rdr_pred))
print(metrics.accuracy_score(svm_pred, svm_rdr_pred))

0.9630369630369631
0.962037962037962
0.999000999000999


### 2.4. XGBoost vs RDR

In [154]:
# CREATE AND TRAIN XGBoost Classifier
train_dmatrix = xgb.DMatrix(data=X_train, label=le.transform(y_train))
xgboost = xgb.train({}, train_dmatrix)

dmatrix = xgb.DMatrix(data=features, label=le.transform(label))
xgb_pred = xgboost.predict(dmatrix)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = le.inverse_transform(np.round(xgb_pred).astype(int))

In [155]:
# CREATE AND TRAIN RDR MODEL
xgb_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - XGBoost AND RDR
dmatrix = xgb.DMatrix(data=X_test, label=le.transform(y_test))
xgb_pred = xgboost.predict(dmatrix)
xgb_pred = le.inverse_transform(np.round(xgb_pred).astype(int))

xgb_rdr_pred = xgb_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,xgb_pred))
print(metrics.accuracy_score(y_test, xgb_rdr_pred))
print(metrics.accuracy_score(xgb_pred, xgb_rdr_pred))

0.965034965034965
0.965034965034965
1.0


### 2.5. MLP vs RDR

In [156]:
# CREATE AND TRAIN MLP CLASSIFIER
mlp = MLPClassifier(max_iter=100, batch_size=5, hidden_layer_sizes=(10, 15, 30), learning_rate='constant', learning_rate_init=0.1)
mlp.fit(X_train, y_train)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = mlp.predict(features)

In [157]:
# CREATE AND TRAIN RDR MODEL
mlp_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - MLP AND RDR
mlp_pred = mlp.predict(X_test)
mlp_rdr_pred = mlp_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,mlp_pred))
print(metrics.accuracy_score(y_test, mlp_rdr_pred))
print(metrics.accuracy_score(mlp_pred, mlp_rdr_pred))

0.4985014985014985
0.4985014985014985
1.0


### 2.6. TabNet vs RDR

In [158]:
# CREATE AND TRAIN TabNet Classifier
tabnet = TabNetClassifier()
tabnet.fit(X_train.values, y_train, max_epochs=100)

# CREATE PREDICTION DATASET - USING ALL DATA IN DATASET
prediction_dataset = tabnet.predict(features.values)



epoch 0  | loss: 0.5788  |  0:00:00s
epoch 1  | loss: 0.31026 |  0:00:00s
epoch 2  | loss: 0.22278 |  0:00:01s
epoch 3  | loss: 0.16579 |  0:00:01s
epoch 4  | loss: 0.15127 |  0:00:01s
epoch 5  | loss: 0.13236 |  0:00:01s
epoch 6  | loss: 0.12415 |  0:00:02s
epoch 7  | loss: 0.11101 |  0:00:02s
epoch 8  | loss: 0.09981 |  0:00:02s
epoch 9  | loss: 0.10481 |  0:00:03s
epoch 10 | loss: 0.0984  |  0:00:03s
epoch 11 | loss: 0.09262 |  0:00:03s
epoch 12 | loss: 0.08043 |  0:00:04s
epoch 13 | loss: 0.08535 |  0:00:04s
epoch 14 | loss: 0.08673 |  0:00:05s
epoch 15 | loss: 0.087   |  0:00:05s
epoch 16 | loss: 0.0859  |  0:00:05s
epoch 17 | loss: 0.08796 |  0:00:05s
epoch 18 | loss: 0.07321 |  0:00:06s
epoch 19 | loss: 0.07651 |  0:00:06s
epoch 20 | loss: 0.08354 |  0:00:06s
epoch 21 | loss: 0.07676 |  0:00:07s
epoch 22 | loss: 0.07898 |  0:00:07s
epoch 23 | loss: 0.07603 |  0:00:07s
epoch 24 | loss: 0.07628 |  0:00:08s
epoch 25 | loss: 0.0684  |  0:00:08s
epoch 26 | loss: 0.07148 |  0:00:08s
e

In [159]:
# CREATE AND TRAIN RDR MODEL
tabnet_rdr = rdr.fit(features, pd.DataFrame(prediction_dataset))

# PREDICT - TabNet AND RDR
tabnet_pred = tabnet.predict(X_test.values)
tabnet_rdr_pred = tabnet_rdr.predict(X_test)

print(metrics.accuracy_score(y_test,tabnet_pred))
print(metrics.accuracy_score(y_test, tabnet_rdr_pred))
print(metrics.accuracy_score(tabnet_pred, tabnet_rdr_pred))

0.9310689310689311
0.9280719280719281
0.995004995004995


## 3. Accuracy comparison between the RDR model and conventional models

In [160]:
# Accuracy of each conventional model against the true test labels.
dtf_accuracy = metrics.accuracy_score(y_test, dtf_pred)
rf_accuracy = metrics.accuracy_score(y_test, rf_pred)
xgb_accuracy = metrics.accuracy_score(y_test, xgb_pred)
svm_accuracy = metrics.accuracy_score(y_test, svm_pred)
mlp_accuracy = metrics.accuracy_score(y_test, mlp_pred)
tabnet_accuracy = metrics.accuracy_score(y_test, tabnet_pred)

# Accuracy of each RDR surrogate against the true test labels.
dtf_rdr_accuracy = metrics.accuracy_score(y_test, dtf_rdr_pred)
rf_rdr_accuracy = metrics.accuracy_score(y_test, rf_rdr_pred)
xgb_rdr_accuracy = metrics.accuracy_score(y_test, xgb_rdr_pred)
svm_rdr_accuracy = metrics.accuracy_score(y_test, svm_rdr_pred)
mlp_rdr_accuracy = metrics.accuracy_score(y_test, mlp_rdr_pred)
tabnet_rdr_accuracy = metrics.accuracy_score(y_test, tabnet_rdr_pred)

# One row per model. The last column is the share of test rows on which the
# surrogate and the original model predict the same label, so 1.0 means the
# predictions are identical. It was previously labelled "Difference", which
# reads as the opposite of what the number measures.
accuracy_table = pd.DataFrame(
    [
        ["Decision Tree", dtf_accuracy, dtf_rdr_accuracy,
         metrics.accuracy_score(dtf_pred, dtf_rdr_pred)],
        ["Random Forest", rf_accuracy, rf_rdr_accuracy,
         metrics.accuracy_score(rf_pred, rf_rdr_pred)],
        ["XGBoost", xgb_accuracy, xgb_rdr_accuracy,
         metrics.accuracy_score(xgb_pred, xgb_rdr_pred)],
        ["Support Vector Machine", svm_accuracy, svm_rdr_accuracy,
         metrics.accuracy_score(svm_pred, svm_rdr_pred)],
        ["Multi-layer Perceptron", mlp_accuracy, mlp_rdr_accuracy,
         metrics.accuracy_score(mlp_pred, mlp_rdr_pred)],
        ["TabNet", tabnet_accuracy, tabnet_rdr_accuracy,
         metrics.accuracy_score(tabnet_pred, tabnet_rdr_pred)],
    ],
    columns=["Model", "Accuracy", "RDR Accuracy", "Agreement"],
)

# Show the scores at full precision so small gaps between models stay visible.
pd.set_option('display.precision', 15)

display(accuracy_table)

Unnamed: 0,Model,Accuracy,RDR Accuracy,Difference
0,Decision Tree,0.953046953046953,0.919080919080919,0.944055944055944
1,Random Forest,0.96003996003996,0.959040959040959,0.995004995004995
2,XGBoost,0.965034965034965,0.965034965034965,1.0
3,Support Vector Machine,0.963036963036963,0.962037962037962,0.999000999000999
4,Multi-layer Perceptron,0.498501498501498,0.498501498501498,1.0
5,TabNet,0.931068931068931,0.928071928071928,0.995004995004995
