In [85]:
import numpy as np
import argparse
import torch

from transformers import AutoTokenizer

from small_text import (
    EmptyPoolException,
    PoolBasedActiveLearner,
    PoolExhaustedException,
    RandomSampling,
    random_initialization_balanced
)
from small_text.query_strategies.strategies import (QueryStrategy,
                                                    RandomSampling,
                                                    ConfidenceBasedQueryStrategy,
                                                    LeastConfidence,
                                                    EmbeddingBasedQueryStrategy,
                                                    EmbeddingKMeans)
from learner_functions import run_multiple_experiments
from preprocess import (data_loader,
                        df_to_dict)
from SL_transformers_workaround import genenrate_val_indices
from small_text.integrations.transformers.classifiers.classification import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.integrations.transformers.datasets import TransformersDataset



In [59]:
6144/512

12.0

In [79]:
def _genenrate_start_indices(train_dict, args):
    if args.cold_strategy =='TrueRandom':
        indices_neg_label = np.where(train_dict['target'] == 0)[0]
        indices_pos_label = np.where(train_dict['target'] == 1)[0]
        all_indices = np.concatenate([indices_neg_label, indices_pos_label])
        x_indices_initial = np.random.choice(all_indices,
                                            args.init_n,
                                            replace=False)
    # Balanced Random Choice Based on Known Class label
    elif args.cold_strategy == 'BalancedRandom': 
        indices_neg_label = np.where(train_dict['target'] == 0)[0]
        indices_pos_label = np.where(train_dict['target'] == 1)[0]
        selected_neg_label = np.random.choice(indices_neg_label,
                                                int(args.init_n/2),
                                                replace=False)
        selected_pos_label = np.random.choice(indices_pos_label,
                                                int(args.init_n/2),
                                                replace=False)
        x_indices_initial = np.concatenate([selected_neg_label, selected_pos_label])
    # Balanced Random Choice Based on Keywords (Weak label)
    elif args.cold_strategy == 'BalancedWeak': 
        indices_neg_label = np.where(train_dict['weak_target'] == 0)[0]
        indices_pos_label = np.where(train_dict['weak_target'] == 1)[0]
        if len(indices_pos_label) > int(args.init_n/2):
            selected_neg_label = np.random.choice(indices_neg_label,
                                                    int(args.init_n/2),
                                                    replace=False)
            selected_pos_label = np.random.choice(indices_pos_label,
                                                    int(args.init_n/2),
                                                    replace=False)
        # If limit reached, take as many positive as possible and pad with negatives
        else:
            selected_pos_label = np.random.choice(indices_pos_label,
                                                    len(indices_pos_label),
                                                    replace=False)
            selected_neg_label = np.random.choice(indices_neg_label,
                                                    int(args.init_n) - len(indices_pos_label),
                                                    replace=False)
        x_indices_initial = np.concatenate([selected_neg_label, selected_pos_label])
    else:
        print('Invalid Cold Start Policy')
    # Set x and y initial
    x_indices_initial = x_indices_initial.astype(int)
    y_initial = np.array([train_dict['target'][i] for i in x_indices_initial])
    print('y selected', train_dict['target'][x_indices_initial])
    print(f'Starting imbalance (train): {np.round(np.mean(y_initial),4)}')
    # Set validation indices for transformers framework
    val_indices = genenrate_val_indices(y_initial)
    
    return x_indices_initial, y_initial, val_indices

def dict_to_transformer_dataset(data_dict, tokenizer):
    encodings = tokenizer(data_dict['data'], truncation=True, padding=True)
    return TransformersDataset(
        [(torch.tensor(input_ids).reshape(1, -1), torch.tensor(attention_mask).reshape(1, -1), labels) 
         for input_ids, attention_mask, labels in 
              zip(encodings['input_ids'], encodings['attention_mask'], data_dict['target'])
              ]
        )
    

def parse_args():
    parser=argparse.ArgumentParser(description="Active Learning Experiment Runner with Transformers Integration")
    parser.add_argument('--method', type = str, metavar ="", default = 'AL', help="Supervised == SL or Active == AL")
    parser.add_argument('--framework', type = str, metavar ="", default = 'TF', help="Transformers == TF or SkLearn == SK")
    parser.add_argument('--datadir', type = str, metavar ="",default = './data/', help="Path to directory with data files")
    parser.add_argument('--dataset', type = str, metavar ="",default = 'wiki', help="Name of dataset")
    parser.add_argument('--outdir', type = str, metavar ="",default = './results/', help="Path to output directory for storing results")
    parser.add_argument('--transformer_model', type = str, metavar ="",default = 'distilbert-base-uncased', help="Name of HuggingFace transformer model")
    parser.add_argument('--n_epochs', type = int, metavar ="",default =  5, help = "Number of epochs for model training")
    parser.add_argument('--batch_size', type = int, metavar ="", default = 16, help = 'Number of samples per batch')
    parser.add_argument('--eval_steps', type = int, metavar ="", default = 20000, help = 'Evaluation after a number of training steps')
    parser.add_argument('--class_imbalance', type = int, metavar ="", default = 50, help = 'Class imbalance desired in train dataset')
    parser.add_argument('--init_n', type = int, metavar ="", default = 20, help = 'Initial batch size for training')
    parser.add_argument('--cold_strategy', metavar ="", default = 'BalancedWeak', help = 'Method of cold start to select initial examples')
    parser.add_argument('--query_n', type = int, metavar ="", default = 100, help = 'Batch size per active learning query for training')
    parser.add_argument('--query_strategy', metavar ="", default = 'LeastConfidence()', help = 'Method of active learning query for training')
    parser.add_argument('--train_n', type = int, metavar ="", default = 20000, help = 'Total number of training examples')
    parser.add_argument('--test_n', type = int, metavar ="", default = 5000, help = 'Total number of testing examples')
    parser.add_argument('--run_n', type = int, metavar ="", default = 5, help = 'Number of times to run each model')
    args=parser.parse_args()
    print("the inputs are:")
    for arg in vars(args):
        print("{} is {}".format(arg, getattr(args, arg)))
    return args

In [80]:
args=parse_args()
args.framework = 'TF'
# Load data
train_df, test_dfs = data_loader(args)
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model, cache_dir='.cache/')    
tokenizer.add_special_tokens({'additional_special_tokens': ["[URL]", "[EMOJI]", "[USER]"]})
train_dict = df_to_dict('train', train_df)
indices_initial, y_initial, val_indices = _genenrate_start_indices(train_dict, args)

train_trans_dataset = dict_to_transformer_dataset(train_dict, tokenizer)


the inputs are:
method is AL
framework is /Users/raymond/Library/Jupyter/runtime/kernel-v2-4635LG7dVw9xiuV7.json
datadir is ./data/
dataset is wiki
outdir is ./results/
transformer_model is distilbert-base-uncased
n_epochs is 5
batch_size is 16
eval_steps is 20000
class_imbalance is 50
init_n is 20
cold_strategy is BalancedWeak
query_n is 100
query_strategy is LeastConfidence()
train_n is 20000
test_n is 5000
run_n is 5
y selected [1 1 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1]
Starting imbalance (train): 0.7
Starting imbalance: 0.7
Setting val indices




In [82]:
transformer_model = TransformerModelArguments(args.transformer_model)
clf_factory = TransformerBasedClassificationFactory(transformer_model,
                                                    num_classes=2,
                                                    kwargs={
                                                        'device': 'mps', 
                                                        'num_epochs': args.n_epochs,
                                                        # 'mini_batch_size': 16,
                                                        'class_weight': 'balanced'
                                                    })
query_strategy = eval(args.query_strategy)
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train_trans_dataset)


In [83]:
indices_labeled = active_learner.initialize_data(indices_initial, y_initial, indices_validation=val_indices)

torch.Size([1, 512])
<class 'small_text.integrations.transformers.datasets.TransformersDatasetView'>
<class 'small_text.integrations.transformers.datasets.TransformersDatasetView'>


KeyboardInterrupt: 

In [84]:
train_trans_dataset[indices_initial]

<small_text.integrations.transformers.datasets.TransformersDatasetView at 0x2dae01690>

In [81]:
train_trans_dataset.data[0]

(tensor([[  101,  1036,  8840,  2140,  1010,  2145,  9345,  4667,  2026,  3160,
           1029,  2073,  2003,  1996,  3120,  2008,  1036,  1036, 14163, 20051,
           3406,  1036,  1036,  2003,  1037,  1036,  1036, 14286, 11701,  1036,
           1036,  2744,  1029,  1996,  2783,  1036,  1036,  3120,  1036,  1036,
           7604,  2178,  3120,  2029,  2987,  1005,  1056,  2360,  2012,  2035,
           2054,  1996,  2034,  3120,  1036,  1036, 14964,  1036,  1036,  1012,
           1036,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [73]:
a = tokenizer(['hello you', 'nice to meet you'], padding=True)['input_ids']
[torch.tensor(b) for b in a]

[tensor([ 101, 7592, 2017,  102,    0,    0]),
 tensor([ 101, 3835, 2000, 3113, 2017,  102])]

In [16]:
args.framework

'/Users/raymond/Library/Jupyter/runtime/kernel-v2-4635c68Se6l7zTgJ.json'

In [34]:
train_encodings = tokenizer(train_dict['data'], truncation=True, padding=True)

In [37]:
train_full = TextDataset(train_encodings, train_dict['target'])


In [38]:
val_indices = _genenrate_val_indices(train_dict['target'])
indices = np.arange(len(train_dict['target']))
val_mask = np.isin(indices, val_indices)
train_indices = indices[~val_mask]

train_dataset = Subset(train_full, train_indices)
val_dataset = Subset(train_full, val_indices)

Starting imbalance: 0.5
Setting val indices


In [4]:
indices_neg_label = np.where(train_full.y == 0)[0]
indices_pos_label = np.where(train_full.y == 1)[0]
all_indices = np.concatenate([indices_neg_label, indices_pos_label])
np.random.shuffle(all_indices)
x_indices_initial = all_indices.astype(int)
y_initial = np.array([train_full.y[i] for i in x_indices_initial])
print(f'Starting imbalance: {np.round(np.mean(y_initial),2)}')
print('Setting val indices')
val_indices = np.concatenate([np.random.choice(indices_pos_label, 
                                                int(0.1*len(indices_pos_label)),
                                                replace=False),
                                np.random.choice(indices_neg_label,
                                                int(0.1*len(indices_neg_label)),
                                                replace=False)
                                ])
indices = np.arange(x_indices_initial.shape[0])
print(indices.shape)
mask = np.isin(indices, val_indices)
print(mask.shape)

Starting imbalance: 0.5
Setting val indices
(20000,)
(20000,)


In [5]:
train = train_full[indices[~mask]]
valid = train_full[indices[mask]]
train_dataset = TensorDataset(torch.concat(train.x, dim=0), torch.Tensor(train.y))
valid_dataset = TensorDataset(torch.concat(valid.x, dim=0), torch.Tensor(valid.y))


In [6]:
import datasets
import torch

import numpy as np

from matplotlib import rcParams

datasets.logging.set_verbosity_error()

# disables the progress bar for notebooks: https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET

# set matplotlib params
rcParams.update({'xtick.labelsize': 14, 'ytick.labelsize': 14, 'axes.labelsize': 16})

# fix the random seed
seed = 2022
torch.manual_seed(seed)
np.random.seed(seed)

In [7]:
import logging
import numpy as np


from transformers import AutoTokenizer

from small_text import TransformersDataset


raw_dataset = datasets.load_dataset('rotten_tomatoes')
num_classes = raw_dataset['train'].features['label'].num_classes

transformer_model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(
    transformer_model_name
)


target_labels = np.arange(num_classes)

train = TransformersDataset.from_arrays(raw_dataset['train']['text'],
                                        raw_dataset['train']['label'],
                                        tokenizer,
                                        max_length=60,
                                        target_labels=target_labels)
test = TransformersDataset.from_arrays(raw_dataset['test']['text'], 
                                       raw_dataset['test']['label'],
                                       tokenizer,
                                       max_length=60,
                                       target_labels=target_labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




(tensor([[ 101, 2178, 3793, 7099, 1012,  102,    0,    0]], device='mps:0'),
 tensor([[1, 1, 1, 1, 1, 1, 0, 0]], device='mps:0'),
 tensor([1], device='mps:0'))