In [None]:
%pip install small-text[transformers]==1.3.0

# setfit is an optional dependency and must be installed as well
%pip install setfit>=0.5.0
%pip install datasets
%pip install matplotlib

import logging

import datasets

import matplotlib.pyplot as plt

plt.rc('figure', titlesize=22)
plt.rc('axes', titlesize=22, labelsize=20, linewidth=1.2)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('legend', fontsize=16)
plt.rc('lines', linewidth=2)

datasets.logging.set_verbosity_error()

# disables the progress bar for notebooks: https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET

for logger_name in ['setfit.modeling', 'setfit.trainer']:
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.ERROR)

In [2]:
import datasets
import logging
import numpy as np
from small_text import TextDataset
from small_text.integrations.transformers.classifiers.setfit import SetFitModelArguments
from small_text.integrations.transformers.classifiers.factories import SetFitClassificationFactory
from small_text import (
    PoolBasedActiveLearner, 
    random_initialization_balanced,
    BreakingTies,
    SubsamplingQueryStrategy,
    PredictionEntropy,
)
import gc
import torch
from sklearn.metrics import accuracy_score


In [None]:
datasets.logging.get_verbosity = lambda: logging.NOTSET

dataset1 = datasets.load_dataset("SetFit/toxic_conversations_50k")

dataset2 = datasets.load_dataset("SetFit/tweet_eval_stance_abortion")
dataset3 = datasets.load_dataset("SetFit/catalonia_independence_es")

raw_dataset = dataset1
raw_dataset['train']=raw_dataset['train'].shuffle(seed=42).select(range(10 * 10))
raw_dataset['test']=raw_dataset['train'].shuffle(seed=42).select(range(10 * 2))
num_classes = np.unique(raw_dataset['train']['label']).shape[0]

print('First 10 training samples:\n')
for i in range(10):
    print(raw_dataset['train']['label'][i], ' ', raw_dataset['train']['text'][i])

In [None]:
raw_dataset['train']=raw_dataset['train'].shuffle(seed=42).select(range(10 * 10))
raw_dataset['test']=raw_dataset['test'].shuffle(seed=42).select(range(10 * 2))


num_classes = np.unique(raw_dataset['train']['label']).shape[0]
target_labels = np.arange(num_classes)



train = TextDataset.from_arrays(raw_dataset['train']['text'], np.array(raw_dataset['train']['label']), target_labels=target_labels)
test = TextDataset.from_arrays(raw_dataset['test']['text'], np.array(raw_dataset['test']['label']), target_labels=target_labels)

In [11]:
sentence_transformer_model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'
setfit_model_args = SetFitModelArguments(sentence_transformer_model_name)
clf_factory = SetFitClassificationFactory(setfit_model_args, 
                                          num_classes)

In [12]:
# define a query strategy and initialize a pool-based active learner
query_strategy = SubsamplingQueryStrategy(BreakingTies())

# suppress progress bars in jupyter notebook
setfit_train_kwargs = {'show_progress_bar': False}
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train, fit_kwargs={'setfit_train_kwargs': setfit_train_kwargs})


# simulate a warm start
def initialize_active_learner(active_learner, y_train):

    x_indices_initial = random_initialization_balanced(y_train, n_samples=20)
    y_initial = y_train[x_indices_initial]

    active_learner.initialize_data(x_indices_initial, y_initial)

    return x_indices_initial


initial_indices = initialize_active_learner(active_learner, train.y)
labeled_indices = initial_indices

In [13]:
from scipy.stats import entropy
from small_text.utils.data import list_length
import pandas as pd
num_queries = 5

def get_entropy_labeled(row, model,labeled_indices):
  p = model.predict_proba(row)
  entr = np.apply_along_axis(lambda x: entropy(x), 1, p) 
  query2 = pd.DataFrame(entr,labeled_indices)
  query3 = np.array(query2.nlargest(1,0).index.values, dtype=np.int64)
 
  return query3
  

def evaluate(active_learner, train, test,labeled_indices):

    y_pred = active_learner.classifier.predict(train)
    y_prob = get_entropy_labeled(train,active_learner.classifier,labeled_indices)

  
    y_pred_test = active_learner.classifier.predict(test)
    
    test_acc = accuracy_score(y_pred_test, test.y)

    print('Train accuracy: {:.2f}'.format(accuracy_score(y_pred, train.y)))
    print('Test accuracy: {:.2f}'.format(test_acc))
    
    return test_acc, y_prob


results = []
eval, mix_query = evaluate(active_learner, train[labeled_indices], test,labeled_indices)
results.append(eval)

    
for i in range(num_queries):
    # ...where each iteration consists of labelling 5 samples
    indices_queried = active_learner.query(num_samples=5)

    # Simulate user interaction here. Replace this for real-world usage.
    y = train.y[indices_queried]
    
    
    yy=train.y[mix_query]


    # Return the labels for the current query to the active learner.
    active_learner.update(y)

    #update the examples for which the model is uncertain about
    active_learner.update_label_at(mix_query[0],yy[0],retrain=True)

    labeled_indices = np.concatenate([indices_queried, labeled_indices])
    gc.collect()
    torch.cuda.empty_cache()
    print('---------------')
    print(f'Iteration #{i} ({len(labeled_indices)} samples)')
    eval, mix_query = evaluate(active_learner, train[labeled_indices], test,labeled_indices)
    results.append(eval)

Train accuracy: 1.00
Test accuracy: 0.85
---------------
Iteration #0 (25 samples)
Train accuracy: 1.00
Test accuracy: 1.00
---------------
Iteration #1 (30 samples)
Train accuracy: 1.00
Test accuracy: 1.00
---------------
Iteration #2 (35 samples)
Train accuracy: 1.00
Test accuracy: 1.00
---------------
Iteration #3 (40 samples)
Train accuracy: 1.00
Test accuracy: 1.00
---------------
Iteration #4 (45 samples)
Train accuracy: 1.00
Test accuracy: 1.00
