# Specify and fit an active learning pipeline
In this example we will specify and evaluate an active learning pipeline. You can set all parameters manually, such as the learning algorithm and query strategy to be used, the seeds, the size of the train/test split, number of iterations, number of queries per iteration, etc.

In [1]:
from alpbench.benchmark.ActiveLearningScenario import ActiveLearningScenario
from alpbench.benchmark.ActiveLearningSetting import ActiveLearningSetting
from alpbench.pipeline.ActiveLearningPipeline import ActiveLearningPipeline
from alpbench.pipeline.Oracle import Oracle
from alpbench.pipeline.QueryStrategy import MarginQueryStrategy
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import accuracy_score

### Set up scenario and setting

In [2]:
# --- Scenario: which dataset to use and which seeds to apply -----------------
SCENARIO_ID = 1
OPENML_ID = 31  # OpenML dataset identifier handed to the scenario
SEED = 44
TRAIN_SPLIT_SEED = 43
TEST_SPLIT_SEED = 42

# --- Setting: sizes of the splits and the labeling budget --------------------
SETTING_ID = 1337
SETTING_NAME = "TestSetting"
SETTING_TEST_SIZE = 0.3  # presumably the fraction of data held out for testing — confirm
SETTING_TRAIN_TYPE = "absolute"
SETTING_TRAIN_SIZE = 10  # initially labeled size; presumably a count, given "absolute" — confirm
NUMBER_OF_QUERIES = 5  # queries per iteration
NUMBER_OF_IT = 10  # number of active learning iterations
FACTOR = -1  # NOTE(review): meaning is not visible in this notebook — confirm

In [3]:
# Collect the experiment parameters defined above (sizes, iteration count,
# queries per iteration, factor) in a single setting object.
alsetting = ActiveLearningSetting(
    setting_id=SETTING_ID,
    setting_name=SETTING_NAME,
    setting_labeled_train_size=SETTING_TRAIN_SIZE,
    setting_train_type=SETTING_TRAIN_TYPE,
    setting_test_size=SETTING_TEST_SIZE,
    number_of_iterations=NUMBER_OF_IT,
    number_of_queries=NUMBER_OF_QUERIES,
    factor=FACTOR,
)

In [4]:
# Show the setting's repr to double-check the configured values.
alsetting

<ActiveLearningSetting> {'setting_id': 1337, 'setting_name': 'TestSetting', 'setting_labeled_train_size': 10.0, 'setting_train_type': 'absolute', 'setting_test_size': 0.3, 'number_of_iterations': 10, 'number_of_queries': 5, 'factor': -1}

In [5]:
# Combine the setting with the OpenML dataset id and the three seeds
# (test split, train split, general seed) into a scenario.
alscenario = ActiveLearningScenario(
    scenario_id=SCENARIO_ID,
    openml_id=OPENML_ID,
    test_split_seed=TEST_SPLIT_SEED,
    train_split_seed=TRAIN_SPLIT_SEED,
    seed=SEED,
    setting=alsetting,
)

### Split data

In [6]:
# Ask the scenario for its data split. Judging by how the parts are used below:
# (X_l, y_l) are passed to active_fit as the labeled data, (X_u, y_u) are the pool
# given to the oracle, and (X_test, y_test) are used for the final evaluation.
X_l, y_l, X_u, y_u, X_test, y_test = alscenario.get_data_split()

### Specify active learning pipeline and run procedure

In [7]:
# We choose a random forest as the learning algorithm and margin sampling as
# the query strategy.

print("define query strategy")
# NOTE(review): 42 is presumably the strategy's seed — confirm against MarginQueryStrategy.
query_strategy = MarginQueryStrategy(42)
print("setup learner")
# Seed the forest with the scenario seed; an unseeded forest draws fresh randomness on
# every run, so the learner itself would not be reproducible.
learner = RF(n_estimators=100, random_state=SEED)

# Wire learner and query strategy together, using the same budget, iteration count and
# queries per iteration that were given to the setting.
ALP = ActiveLearningPipeline(
    learner=learner,
    query_strategy=query_strategy,
    init_budget=SETTING_TRAIN_SIZE,
    num_iterations=NUMBER_OF_IT,
    num_queries_per_iteration=NUMBER_OF_QUERIES,
    initially_labeled_indices=alscenario.labeled_indices,
)

# The oracle is built from the unlabeled pool and its labels and is handed to active_fit.
oracle = Oracle(X_u, y_u)
print("fit active learning pipeline")
ALP.active_fit(X_l, y_l, X_u, oracle)

define query strategy
setup learner
fit active learning pipeline


In [8]:
# Predict on the test data with the fitted pipeline and report its accuracy.
y_hat = ALP.predict(X=X_test)
print("final test acc", accuracy_score(y_test, y_hat))

final test acc 0.6966666666666667
