## Import libraries

In [1]:
from badacost import BAdaCost

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.tree import DecisionTreeClassifier

In [4]:
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from sklearn.metrics import confusion_matrix

## Car Evaluation dataset

In [7]:
data = pd.read_csv('car_data/car.data', names = ['buying', 'maint', 'doors', 'persons', 
                                                 'lug_boot', 'safety', 'label'])

In [8]:
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [9]:
data.shape

(1728, 7)

In [10]:
pd.value_counts(data['label'])

unacc    1210
acc       384
good       69
vgood      65
Name: label, dtype: int64

In [11]:
# Encode Data
data.buying.replace(('vhigh','high','med','low'),(0,1,2,3), inplace=True)
data.maint.replace(('vhigh','high','med','low'),(0,1,2,3), inplace=True)
data.doors.replace(('2','3','4','5more'),(0,1,2,3), inplace=True)
data.persons.replace(('2','4','more'),(0,1,2), inplace=True)
data.lug_boot.replace(('small','med','big'),(0,1,2), inplace=True)
data.safety.replace(('low','med','high'),(0,1,2), inplace=True)
data.label.replace(('unacc','acc','good','vgood'),(0,1,2,3), inplace=True)
    

In [12]:
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,label
0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,2,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,1,0


In [13]:
from sklearn.ensemble import RandomForestClassifier
tree = RandomForestClassifier(n_estimators=2,max_depth=5)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['label']), data['label'], test_size=0.2)

In [15]:
C1 = np.matrix([[0,1,1,1],[1,0,1,1],[1,1,0,1],[1,1,1,0]])

In [16]:
bada1 = BAdaCost(tree,200,0.001,C1,0.000001)

In [17]:
bada1.fit(np.array(X_train),np.array(y_train))

In [18]:
preds_train = bada1.predict(X_train)

In [19]:
C = confusion_matrix(y_train,preds_train)

In [20]:
C

array([[918,  54,   0,   0],
       [ 36, 264,   0,   0],
       [  0,  55,   0,   0],
       [  0,  55,   0,   0]])

In [21]:
accuracy_score(y_train,preds_train)

0.8552821997105644

In [22]:
accuracy_score(y_test,bada1.predict(X_test))

0.8439306358381503

In [23]:
#construct new weight matrix
k = 0
F = []
for i in C:
    F.append(C[k]/sum(i))
    k += 1
F = np.matrix(F)
for i in range(4):
    F[i,i]=0

In [24]:
F

matrix([[0.        , 0.05555556, 0.        , 0.        ],
        [0.12      , 0.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ]])

In [25]:
bada2 = BAdaCost(tree,200,0.001,F,0.000001)

In [26]:
bada2.fit(np.array(X_train),np.array(y_train))

In [27]:
preds_train2 = bada2.predict(X_train)

In [28]:
confusion_matrix(y_train,preds_train2)

array([[902,  36,   9,  25],
       [ 93, 196,   2,   9],
       [  0,  36,  15,   4],
       [  3,   0,   0,  52]])

In [29]:
accuracy_score(y_train,preds_train2)

0.8429811866859623

In [30]:
accuracy_score(y_test,bada2.predict(X_test))

0.838150289017341

## Chess Dataset

In [35]:
chess_data = pd.read_csv('chess_data/kr-vs-kp.data',names=['bkblk','bknwy','bkon8','bkona','bkspr',
                                                           'bkxbq','bkxcr','bkxwp','blxwp','bxqsq',
                                                           'cntxt','dsopp','dwipd','hdchk','katri',
                                                           'mulch','qxmsq','r2ar8','reskd','reskr',
                                                           'rimmx','rkxwp','rxmsq','simpl','skach',
                                                           'skewr','skrxp','spcop','stlmt','thrsk',
                                                           'wkcti','wkna8','wknck','wkovl','wkpos',
                                                           'wtoeg','label'])

In [37]:
chess_data.head()

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,label
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won


In [38]:
chess_data.shape

(3196, 37)

In [43]:
# Encode Data
for i in chess_data.columns:
    unique_val = np.unique(chess_data[i])
    N = len(unique_val)
    chess_data[i].replace(unique_val,range(N), inplace=True)

In [45]:
chess_data.head()

Unnamed: 0,bkblk,bknwy,bkon8,bkona,bkspr,bkxbq,bkxcr,bkxwp,blxwp,bxqsq,...,spcop,stlmt,thrsk,wkcti,wkna8,wknck,wkovl,wkpos,wtoeg,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
2,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,1
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1


In [57]:
pd.value_counts(chess_data.label)

1    1669
0    1527
Name: label, dtype: int64

In [90]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(chess_data.drop(columns=['label']), chess_data['label'], 
                                                        test_size=0.2)

In [91]:
C2 = np.matrix([[0,1],[1,0]])

In [92]:
bada3 = BAdaCost(tree,200,0.001,C2,0.000001)

In [93]:
bada3.fit(np.array(X_train2),np.array(y_train2))

In [94]:
preds_train_chess = bada3.predict(X_train2)

In [95]:
C_chess = confusion_matrix(y_train2,preds_train_chess)

In [96]:
C_chess

array([[1052,  182],
       [  48, 1274]])

In [97]:
accuracy_score(preds_train_chess,y_train2)

0.9100156494522692

In [98]:
accuracy_score(y_test2,bada3.predict(X_test2))

0.90625

In [99]:
#construct new weight matrix
k = 0
F2 = []
for i in C_chess:
    F2.append(C_chess[k]/sum(i))
    k += 1
F2 = np.matrix(F2)
for i in range(2):
    F2[i,i]=0

In [100]:
np.transpose(F2)

matrix([[0.        , 0.03630862],
        [0.14748784, 0.        ]])

In [101]:
bada4 = BAdaCost(tree,200,0.001,0.1*F2,0.000001)

In [102]:
bada4.fit(np.array(X_train2),np.array(y_train2))

In [103]:
preds_train_chess2 = bada4.predict(X_train2)

In [104]:
accuracy_score(y_train2,preds_train_chess2)

0.09898278560250391

In [105]:
confusion_matrix(y_train2,preds_train_chess2)

array([[ 144, 1090],
       [1213,  109]])