In [15]:
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score
import numpy as np
import scipy.sparse
import collections
# Training split: sparse feature matrix plus its label array.
X_train, y_train = scipy.sparse.load_npz("training_data.npz"), np.load("training_labels.npy")
# Test split, stored in the same two-file layout.
X_test, y_test = scipy.sparse.load_npz("test_data.npz"), np.load("test_labels.npy")

In [16]:
# Baseline: a decision tree with default settings, fitted on the training set as loaded.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ from one run to the next.
dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
dtPred = dt.predict(X_test)
# Show how many test rows were assigned to each class, then the balanced accuracy.
print(collections.Counter(dtPred))
print(balanced_accuracy_score(y_test, dtPred))

Counter({0: 121, 1: 10})
0.8333333333333333


In [17]:
# Same tree, but with class_weight="balanced" so scikit-learn reweights the
# classes inversely to their frequency in y_train.
# random_state is pinned so repeated runs build the same tree.
dtWeighted = tree.DecisionTreeClassifier(class_weight="balanced", random_state=0)
dtWeighted.fit(X_train, y_train)
dtWeightedPred = dtWeighted.predict(X_test)
# Show how many test rows were assigned to each class, then the balanced accuracy.
print(collections.Counter(dtWeightedPred))
print(balanced_accuracy_score(y_test, dtWeightedPred))

Counter({0: 114, 1: 17})
0.9913793103448276


In [18]:
from sklearn.utils import resample
# Split the training rows by class so class 1 can be oversampled separately.
# The matrix is densified because the per-class blocks are later stacked with
# np.concatenate, which operates on ndarrays.
X_train_np = X_train.toarray()
# Vectorised lookup of the row positions of each class (kept as Python lists);
# this replaces a Python-level loop over every label.
class_0_indices = np.flatnonzero(y_train == 0).tolist()
class_1_indices = np.flatnonzero(y_train == 1).tolist()
# Plain int: used both as a list repeat count and as resample's n_samples.
size_class_0 = len(class_0_indices)
X_train_class_0 = X_train_np[class_0_indices, :]
y_train_class_0 = [0] * size_class_0
X_train_class_1 = X_train_np[class_1_indices, :]

In [19]:
# Oversample class 1 with replacement until it has as many rows as class 0.
# The draw is seeded so the oversampled training set is identical on every run.
X_train_class_1_resampled = resample(
    X_train_class_1, replace=True, n_samples=size_class_0, random_state=0
)
# One label per resampled row.
y_train_class_1_resampled = [1] * size_class_0

In [20]:
# Stack the class-0 rows on top of the oversampled class-1 rows;
# the label list is joined in the same order so rows and labels stay aligned.
X_train_resampled = np.concatenate((X_train_class_0, X_train_class_1_resampled), axis=0)
y_train_resampled = [*y_train_class_0, *y_train_class_1_resampled]

In [21]:
from scipy import sparse
# Convert the stacked dense rows to CSR format before fitting.
X_train_resampled = scipy.sparse.csr_matrix(X_train_resampled)

In [22]:
# Default decision tree trained on the oversampled training set.
# random_state is pinned so repeated runs build the same tree.
dtResampled = tree.DecisionTreeClassifier(random_state=0)
dtResampled.fit(X_train_resampled, y_train_resampled)
dtResampledPred = dtResampled.predict(X_test)
# Show how many test rows were assigned to each class, then the balanced accuracy.
print(collections.Counter(dtResampledPred))
print(balanced_accuracy_score(y_test, dtResampledPred))

Counter({0: 114, 1: 17})
0.9913793103448276


In [23]:
# Undersampling: keep every class-1 row and draw an equally sized subset of class 0.
# The matrix is densified because the per-class blocks are later stacked with
# np.concatenate, which operates on ndarrays.
X_train_np = X_train.toarray()
# Vectorised lookup of the row positions of each class (kept as Python lists);
# this replaces a Python-level loop over every label.
class_0_indices = np.flatnonzero(y_train == 0).tolist()
class_1_indices = np.flatnonzero(y_train == 1).tolist()
# Plain int: used both as a list repeat count and as resample's n_samples.
size_class_1 = len(class_1_indices)
X_train_class_1 = X_train_np[class_1_indices, :]
y_train_class_1 = [1] * size_class_1
X_train_class_0 = X_train_np[class_0_indices, :]
# Sample without replacement so no class-0 row is picked twice;
# seeded so the same subset is drawn on every run.
X_train_class_0_downsampled = resample(
    X_train_class_0, replace=False, n_samples=size_class_1, random_state=0
)
y_train_class_0_downsampled = [0] * size_class_1

In [24]:
# Stack all class-1 rows on top of the class-0 subset;
# the label list is joined in the same order so rows and labels stay aligned.
X_train_downsampled = np.concatenate((X_train_class_1, X_train_class_0_downsampled), axis=0)
y_train_downsampled = [*y_train_class_1, *y_train_class_0_downsampled]

In [25]:
# Convert the stacked dense rows to CSR format before fitting.
X_train_downsampled = scipy.sparse.csr_matrix(X_train_downsampled)

In [26]:
# Default decision tree trained on the undersampled training set.
# random_state is pinned so repeated runs build the same tree.
dtDownsampled = tree.DecisionTreeClassifier(random_state=0)
dtDownsampled.fit(X_train_downsampled, y_train_downsampled)
dtDownsampledPred = dtDownsampled.predict(X_test)
# Show how many test rows were assigned to each class, then the balanced accuracy.
print(collections.Counter(dtDownsampledPred))
print(balanced_accuracy_score(y_test, dtDownsampledPred))

Counter({0: 100, 1: 31})
0.9310344827586207


In [29]:
from imblearn.ensemble import BalancedBaggingClassifier
# Bagging ensemble of decision trees from imbalanced-learn, fitted on the
# original (non-resampled) training data; balancing is delegated to the ensemble.
# The tree is passed positionally: its keyword was renamed from `base_estimator`
# to `estimator` in imbalanced-learn 0.10, and the positional form works on
# both sides of that rename.
# random_state is pinned so the ensemble is the same on every run.
balancedclf = BalancedBaggingClassifier(
    tree.DecisionTreeClassifier(),
    sampling_strategy='auto',
    replacement=True,
    random_state=0,
)
balancedclf.fit(X_train, y_train)
balancedclfPred = balancedclf.predict(X_test)
# Show how many test rows were assigned to each class, then the balanced accuracy.
print(collections.Counter(balancedclfPred))
print(balanced_accuracy_score(y_test, balancedclfPred))

Counter({0: 113, 1: 18})
0.9494252873563218
