In [None]:
import joblib,copy,alipy
import numpy as np
import pandas as pd
from alipy import ToolBox
from matplotlib import pyplot as plt
from numpy import array,loadtxt
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn import model_selection,metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline 
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import precision_score,accuracy_score,recall_score,confusion_matrix,f1_score
from mlxtend.feature_selection import ColumnSelector
from alipy.query_strategy import QueryInstanceGraphDensity,QueryInstanceDensityWeighted,QueryInstanceUncertainty
import warnings
warnings.filterwarnings("ignore")

In [None]:
# define the DMF model
def stmodel(num):
    """Build the stacked ("DMF") classifier used by every experiment below.

    Three base learners -- a random forest, an XGBoost classifier and a
    Gaussian naive Bayes model -- are each wrapped in a pipeline whose first
    step keeps only the leading ``num`` feature columns.  mlxtend's
    ``StackingClassifier`` combines them with a logistic-regression
    meta-classifier.

    Parameters
    ----------
    num : int
        Number of leading columns (indices ``0 .. num-1``) passed to each
        base learner.  Must be at least 1.

    Returns
    -------
    StackingClassifier
        A newly constructed, unfitted stacking ensemble.

    Raises
    ------
    ValueError
        If ``num`` is smaller than 1, because ``range(num)`` would then
        select no columns and leave the base learners nothing to train on.
    """
    if num < 1:
        raise ValueError(f"num must be a positive feature count, got {num!r}")

    sclf = RandomForestClassifier(oob_score=True,n_jobs=-1)
    sxgb = XGBClassifier(eval_metric=['logloss','auc','error'],max_depth=20,n_jobs=-1)
    sgnb = GaussianNB()

    # Every base learner sees the same leading `num` columns.
    pipe1 = make_pipeline(ColumnSelector(cols=range(num)),sclf)
    pipe2 = make_pipeline(ColumnSelector(cols=range(num)),sxgb)
    pipe3 = make_pipeline(ColumnSelector(cols=range(num)),sgnb)

    stack = StackingClassifier(classifiers=[pipe1,pipe2,pipe3], meta_classifier=LogisticRegression(solver="lbfgs"))
    return stack

In [None]:
# Load the reduced feature matrix and the corresponding labels
X = np.loadtxt('reduction50.csv')
y = np.loadtxt('y_label.csv')

# Fail fast on malformed input: the cells below index X as X[idx, :] and
# y as y[idx] with the same indices, so X must be 2-D and row-aligned with y.
assert X.ndim == 2, f"expected a 2-D feature matrix, got shape {X.shape}"
assert y.ndim >= 1 and y.shape[0] == X.shape[0], (
    f"X has {X.shape[0]} rows but y has shape {y.shape}"
)

# Uncertainty sampling (UNSM)

In [None]:
# Create the query ToolBox and set its saving_path
alibox = ToolBox(
    X=X,
    y=y,
    query_type='AllLabels',
    saving_path='./',
)

# Create 20 splits with test_ratio=0.3 and initial_label_rate=0.001
alibox.split_AL(
    test_ratio=0.3,
    initial_label_rate=0.001,
    split_count=20,
)

# DMF classifier restricted to the first 10 features
model = stmodel(10)

# Query budget: each fold stops after 100 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 100)

def main_loop(alibox, strategy, round):
    """Run the query loop for one fold and return its state saver.

    Reads the notebook-level globals ``X``, ``y``, ``model`` and
    ``stopping_criterion`` at call time.

    Parameters
    ----------
    alibox : ToolBox
        Provides the fold split (``get_split``), the state saver
        (``get_stateio``), the metric (``calc_performance_metric``) and the
        ``State`` constructor.
    strategy : object
        Query strategy; called as
        ``select(label_index=..., unlabel_index=..., batch_size=1)``.
    round : int
        Index of the fold to run.  The name shadows the built-in inside this
        function only; it is kept so existing callers keep working.

    Returns
    -------
    The saver from ``alibox.get_stateio(round)``, holding the initial
    accuracy and one state per query.
    """
    # Get the data split and the intermediate-results saver for this fold
    _train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    saver = alibox.get_stateio(round)

    def fit_and_score():
        # Retrain on the currently labelled pool, then score on the test set.
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        return alibox.calc_performance_metric(y_true=y[test_idx],
                                              y_pred=pred,
                                              performance_metric='accuracy_score')

    # Set initial performance point
    saver.set_initial_point(fit_and_score())

    try:
        while not stopping_criterion.is_stop():
            # Select one instance from the unlabelled pool.  No `model` is
            # passed here, so `model` above is used for evaluation only.
            # NOTE(review): the UNID cell passes model=model to select();
            # confirm that omitting it here is intended.
            select_ind = strategy.select(label_index=label_ind, unlabel_index=unlab_ind, batch_size=1)
            label_ind.update(select_ind)
            unlab_ind.difference_update(select_ind)

            # Update the model and measure performance with the new label set
            accuracy = fit_and_score()

            # Record the query and its resulting performance
            st = alibox.State(select_index=select_ind, performance=accuracy)
            saver.add_state(st)

            # Pass the current progress to the stopping criterion object
            stopping_criterion.update_information(saver)
    finally:
        # Always reset the shared criterion, even if a query/fit step raises
        # or the cell is interrupted, so the next fold gets its full budget.
        stopping_criterion.reset()
    return saver

unc_result = []

# `fold` (not `round`) so the built-in round() is not shadowed at notebook scope.
for fold in range(10):
    # Use defined strategy
    unc = alibox.get_query_strategy(strategy_name="QueryInstanceUncertainty")
    # main_loop fetches the split for this fold itself, so it is not fetched here.
    unc_result.append(copy.deepcopy(main_loop(alibox, unc, fold)))

# get the query results
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
analyser.add_method(method_name='UNSM', method_results=unc_result)
print(analyser)
# show the figure
analyser.plot_learning_curves(title='Uncertainty Sampling', std_area=False)

# UNSM + Graph density (UNGD)

In [None]:
# Separate ToolBox (and therefore separate splits) for the UNGD experiment
alibox1 = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='./')

alibox1.split_AL(test_ratio=0.3, initial_label_rate=0.001, split_count=20)

# Use 10 features, as the UNSM and UNID cells do: the three curves are
# compared in one "F10" figure, so the classifier must be configured alike.
# (This was stmodel(50).)
model  = stmodel(10)

# The cost budget is 100 queries per fold
stopping_criterion = alibox1.get_stopping_criterion('num_of_queries', 100)
            
def main_loop(alibox1, strategy, round):
    """Run the query loop for one fold and return its state saver.

    This re-defines the ``main_loop`` from the UNSM cell with the same logic.
    It reads the notebook-level globals ``X``, ``y``, ``model`` and
    ``stopping_criterion`` at call time.

    Parameters
    ----------
    alibox1 : ToolBox
        Provides the fold split (``get_split``), the state saver
        (``get_stateio``), the metric (``calc_performance_metric``) and the
        ``State`` constructor.
    strategy : object
        Query strategy; called as
        ``select(label_index=..., unlabel_index=..., batch_size=1)``.
    round : int
        Index of the fold to run.  The name shadows the built-in inside this
        function only; it is kept so existing callers keep working.

    Returns
    -------
    The saver from ``alibox1.get_stateio(round)``, holding the initial
    accuracy and one state per query.
    """
    _train_idx, test_idx, label_ind, unlab_ind = alibox1.get_split(round)
    saver = alibox1.get_stateio(round)

    def fit_and_score():
        # Retrain on the currently labelled pool, then score on the test set.
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        return alibox1.calc_performance_metric(y_true=y[test_idx],
                                               y_pred=pred,
                                               performance_metric='accuracy_score')

    saver.set_initial_point(fit_and_score())

    try:
        while not stopping_criterion.is_stop():
            # Query one instance and move it from the unlabelled to the labelled pool
            select_ind = strategy.select(label_index=label_ind, unlabel_index=unlab_ind, batch_size=1)
            label_ind.update(select_ind)
            unlab_ind.difference_update(select_ind)

            accuracy = fit_and_score()

            st = alibox1.State(select_index=select_ind, performance=accuracy)
            saver.add_state(st)
            stopping_criterion.update_information(saver)
    finally:
        # Always reset the shared criterion, even if a query/fit step raises
        # or the cell is interrupted, so the next fold gets its full budget.
        stopping_criterion.reset()
    return saver

denG_result = []

# `fold` (not `round`) so the built-in round() is not shadowed at notebook scope.
# Folds 0-4: use UNSM to query the most uncertain points.
for fold in range(5):
    # main_loop fetches the split for this fold itself, so it is not fetched here.
    unc1 = alibox1.get_query_strategy(strategy_name="QueryInstanceUncertainty")
    denG_result.append(copy.deepcopy(main_loop(alibox1, unc1, fold)))

# Folds 5-9: use Graph Density to query dense sample points.
for fold in range(5, 10):
    # Only the training indices of the split are needed to build the strategy.
    train_idx, *_ = alibox1.get_split(fold)
    denG = alibox1.get_query_strategy(strategy_name="QueryInstanceGraphDensity", train_idx=train_idx)

    denG_result.append(copy.deepcopy(main_loop(alibox1, denG, fold)))

# Results of both halves are reported together as one method.
analyser1 = alibox1.get_experiment_analyser(x_axis='num_of_queries')
analyser1.add_method(method_name='UNGD', method_results=denG_result)
print(analyser1)
analyser1.plot_learning_curves(title='Uncertainty & GraphDensity', std_area=False)

# UNSM + Information density (UNID)

In [None]:
%%time
# Separate ToolBox (and therefore separate splits) for the UNID experiment
alibox2= ToolBox(X=X, y=y, query_type='AllLabels', saving_path='./')

alibox2.split_AL(test_ratio=0.3, initial_label_rate=0.001, split_count=20)

# DMF classifier restricted to the first 10 features
model = stmodel(10)

# The cost budget is 100 queries per fold
stopping_criterion = alibox2.get_stopping_criterion('num_of_queries',100)

# NOTE(review): the keyword spelling `uncertainty_meansure` is kept exactly as
# written; it presumably matches the library's own parameter name -- verify
# against the installed alipy before "correcting" it.
denWStrategy = alibox2.get_query_strategy(strategy_name='QueryInstanceDensityWeighted',
                                               uncertainty_meansure='entropy')
denW_result = []


def _unid_fit_and_score(label_ind, test_idx):
    """Retrain `model` on the labelled pool and return test-set accuracy."""
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
    pred = model.predict(X[test_idx, :])
    return alibox2.calc_performance_metric(y_true=y[test_idx],y_pred=pred,
                                           performance_metric='accuracy_score')


# `fold` (not `round`) so the built-in round() is not shadowed at notebook scope.
for fold in range(10):
    _train_idx, test_idx, label_ind, unlab_ind = alibox2.get_split(fold)
    saver = alibox2.get_stateio(fold)

    # Set initial performance point
    saver.set_initial_point(_unid_fit_and_score(label_ind, test_idx))

    try:
        while not stopping_criterion.is_stop():
            # The current DMF model is handed to the strategy for this query
            select_ind = denWStrategy.select(label_ind, unlab_ind, model=model, batch_size=1)
            label_ind.update(select_ind)
            unlab_ind.difference_update(select_ind)

            accuracy = _unid_fit_and_score(label_ind, test_idx)

            st = alibox2.State(select_index=select_ind, performance=accuracy)
            saver.add_state(st)
            # Persist intermediate results after every query
            saver.save()
            stopping_criterion.update_information(saver)
    finally:
        # Always reset the shared criterion, even if a query/fit step raises
        # or the cell is interrupted, so the next fold gets its full budget.
        stopping_criterion.reset()
    denW_result.append(copy.deepcopy(saver))

analyser2 = alibox2.get_experiment_analyser(x_axis='num_of_queries')
analyser2.add_method(method_name='Uncertainty & DensityWeighted', method_results=denW_result)
print(analyser2)
analyser2.plot_learning_curves(title='UNID', std_area=True)

In [None]:
from alipy.experiment import ExperimentAnalyser
# Collect the results of the three experiments in one analyser
anal1 = ExperimentAnalyser(x_axis='num_of_queries')
anal1.add_method('UNSM', unc_result)
anal1.add_method('UNID',denW_result)
anal1.add_method('UNGD', denG_result)

# Draw the curves without showing them yet, then restyle the current figure
PLOT_FONT = 'Times New Roman'
anal1.plot_learning_curves(title='F10 - Learning curves', std_area=True,show=False)
plt.title('F10 - Learning curves',fontproperties=PLOT_FONT,fontsize=14)
plt.yticks(fontproperties=PLOT_FONT,fontsize=12)
plt.xticks(fontproperties=PLOT_FONT,fontsize=12)
plt.xlabel('Number of queries',fontproperties=PLOT_FONT,fontsize=14)
plt.ylabel('Performance',fontproperties=PLOT_FONT,fontsize=14)
# `prop` is documented as a FontProperties or a dict, not a bare string, so
# the family is passed as a dict; 'lower right' is the named form of loc=4.
plt.legend(loc='lower right',prop={'family': PLOT_FONT})
plt.show()