In [2]:
import argparse
import torch
import json
import logging
import random
import os
import numpy as np

from transformers import AutoTokenizer
from datetime import datetime
from preprocess import data_loader, df_to_dict
from evaluation import evaluate
from learner_functions import perform_active_learning
from dataset_utils import genenrate_start_indices, dict_to_transformer_dataset

from small_text import (
    EmptyPoolException,
    PoolBasedActiveLearner,
    PoolExhaustedException,
    RandomSampling,
    random_initialization_balanced
)
from small_text.query_strategies.strategies import (QueryStrategy,
                                                    RandomSampling,
                                                    ConfidenceBasedQueryStrategy,
                                                    LeastConfidence,
                                                    EmbeddingBasedQueryStrategy,
                                                    EmbeddingKMeans,
                                                    ContrastiveActiveLearning)
from small_text.integrations.transformers.classifiers.classification import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory

In [5]:

def parse_args():
    parser=argparse.ArgumentParser(description="Active Learning Experiment Runner with Transformers Integration")
    parser.add_argument('--method', type = str, metavar ="", default = 'AL', help="Supervised == SL or Active == AL")
    parser.add_argument('--framework', type = str, metavar ="", default = 'TF', help="Transformers == TF or SkLearn == SK")
    parser.add_argument('--datadir', type = str, metavar ="",default = './data/', help="Path to directory with data files")
    parser.add_argument('--dataset', type = str, metavar ="",default = 'wiki', help="Name of dataset")
    parser.add_argument('--outdir', type = str, metavar ="",default = './results/', help="Path to output directory for storing results")
    parser.add_argument('--transformer_model', type = str, metavar ="",default = 'distilroberta-base', help="Name of HuggingFace transformer model")
    parser.add_argument('--n_epochs', type = int, metavar ="",default =  5, help = "Number of epochs for model training")
    parser.add_argument('--batch_size', type = int, metavar ="", default = 16, help = 'Number of samples per batch')
    parser.add_argument('--eval_steps', type = int, metavar ="", default = 20000, help = 'Evaluation after a number of training steps')
    parser.add_argument('--class_imbalance', type = int, metavar ="", default = 50, help = 'Class imbalance desired in train dataset')
    parser.add_argument('--init_n', type = int, metavar ="", default = 20, help = 'Initial batch size for training')
    parser.add_argument('--cold_strategy', metavar ="", default = 'BalancedRandom', help = 'Method of cold start to select initial examples')
    parser.add_argument('--query_n', type = int, metavar ="", default = 100, help = 'Batch size per active learning query for training')
    parser.add_argument('--query_strategy', metavar ="", default = 'LeastConfidence()', help = 'Method of active learning query for training')
    parser.add_argument('--train_n', type = int, metavar ="", default = 20000, help = 'Total number of training examples')
    parser.add_argument('--test_n', type = int, metavar ="", default = 5000, help = 'Total number of testing examples')
    parser.add_argument('--run_n', type = int, metavar ="", default = 5, help = 'Number of times to run each model')
    # parser.add_argument("-f", "--fff", help="a dummy argument to fool ipython", default="1")
    args=parser.parse_args()
    print("the inputs are:")
    for arg in vars(args):
        print("{} is {}".format(arg, getattr(args, arg)))
    return args

In [6]:
current_datetime = datetime.now()
args=parse_args()
args.framework = 'TF'
EXP_DIR = f'{args.outdir}/{args.method}_{args.framework}_{args.dataset}_{args.class_imbalance}_{args.train_n}'
if not os.path.exists(EXP_DIR):
    os.makedirs(EXP_DIR)
output = {}
for arg in vars(args):
    output[arg] = getattr(args, arg)
    
logging.basicConfig(filename=f"{EXP_DIR}/log.txt",level=logging.DEBUG)
logging.captureWarnings(True)
logf = open(f"{EXP_DIR}/err.log", "w")
with open(f'{EXP_DIR}/START_{current_datetime}.json', 'w') as fp:
    json.dump(output, fp)

the inputs are:
method is AL
framework is /home/raymond/.local/share/jupyter/runtime/kernel-v2-550fBo6Yk8T3J13.json
datadir is ./data/
dataset is wiki
outdir is ./results/
transformer_model is distilbert-base-uncased
n_epochs is 5
batch_size is 16
eval_steps is 20000
class_imbalance is 50
init_n is 20
cold_strategy is BalancedRandom
query_n is 100
query_strategy is LeastConfidence()
train_n is 20000
test_n is 5000
run_n is 5


In [7]:
results_dict = {}
predictions_dict = {}
# Load data
train_df, test_dfs = data_loader(args)
for run in range(1):
    seed_value = run
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    
    print(f'----RUN {run}: {args.method} LEARNER----')
    print(f'----Seed: {seed_value}----')

    tokenizer = AutoTokenizer.from_pretrained(args.transformer_model, cache_dir='.cache/')    
    tokenizer.add_special_tokens({'additional_special_tokens': ["[URL]", "[EMOJI]", "[USER]"]})
    
    test_datasets = {}
    matching_indexes = {}
    for j in test_dfs.keys():
        matching_indexes[j] = test_dfs[j].index.tolist()
        data_dict = df_to_dict('test', test_dfs[j])
        processed_data = dict_to_transformer_dataset(data_dict, tokenizer)
        test_datasets[j] = processed_data

----RUN 0: AL LEARNER----
----Seed: 0----


In [8]:
train_dict = df_to_dict('train', train_df)
indices_initial, y_initial, val_indices = genenrate_start_indices(train_dict, args)

train_trans_dataset = dict_to_transformer_dataset(train_dict, tokenizer)

transformer_model = TransformerModelArguments(args.transformer_model)
clf_factory = TransformerBasedClassificationFactory(transformer_model,
                                                    num_classes=2,
                                                    kwargs={
                                                        'device': 'cuda', 
                                                        'num_epochs': args.n_epochs,
                                                        'mini_batch_size': args.batch_size,
                                                        'class_weight': 'balanced'
                                                    })


y selected [0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]
Starting imbalance (train): 0.5
Starting imbalance: 0.5
Setting val indices


In [9]:
transformer_model.model

AttributeError: 'TransformerBasedClassificationFactory' object has no attribute 'classifier'

In [10]:
indices_labeled = active_learner.initialize_data(indices_initial, y_initial, indices_validation=val_indices)

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[URL]', '[USER]', '[EMOJI]']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30522: AddedToken("[URL]", rstrip=False, lstrip=False, 

In [10]:
query_strategy = eval(args.query_strategy)
active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train_trans_dataset)

print('\n----Initalising----\n')
iter_results_dict = {}
iter_preds_dict = {}
indices_labeled = active_learner.initialize_data(indices_initial, y_initial, indices_validation=val_indices)
print('Learner initalized ok.')


----Initalising----

Learner initalized ok.


In [23]:
active_learner.classifier

<small_text.integrations.transformers.classifiers.classification.TransformerBasedClassification at 0x7f8089ed1bd0>

In [22]:
train_trans_dataset[0].data

[(tensor([[  101,  1036,  8840,  2140,  1010,  2145,  9345,  4667,  2026,  3160,
            1029,  2073,  2003,  1996,  3120,  2008,  1036,  1036, 14163, 20051,
            3406,  1036,  1036,  2003,  1037,  1036,  1036, 14286, 11701,  1036,
            1036,  2744,  1029,  1996,  2783,  1036,  1036,  3120,  1036,  1036,
            7604,  2178,  3120,  2029,  2987,  1005,  1056,  2360,  2012,  2035,
            2054,  1996,  2034,  3120,  1036,  1036, 14964,  1036,  1036,  1012,
            1036,   102,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,    

In [16]:
args.framework

'/Users/raymond/Library/Jupyter/runtime/kernel-v2-4635c68Se6l7zTgJ.json'

In [34]:
train_encodings = tokenizer(train_dict['data'], truncation=True, padding=True)

In [37]:
train_full = TextDataset(train_encodings, train_dict['target'])


In [38]:
val_indices = _genenrate_val_indices(train_dict['target'])
indices = np.arange(len(train_dict['target']))
val_mask = np.isin(indices, val_indices)
train_indices = indices[~val_mask]

train_dataset = Subset(train_full, train_indices)
val_dataset = Subset(train_full, val_indices)

Starting imbalance: 0.5
Setting val indices


In [4]:
indices_neg_label = np.where(train_full.y == 0)[0]
indices_pos_label = np.where(train_full.y == 1)[0]
all_indices = np.concatenate([indices_neg_label, indices_pos_label])
np.random.shuffle(all_indices)
x_indices_initial = all_indices.astype(int)
y_initial = np.array([train_full.y[i] for i in x_indices_initial])
print(f'Starting imbalance: {np.round(np.mean(y_initial),2)}')
print('Setting val indices')
val_indices = np.concatenate([np.random.choice(indices_pos_label, 
                                                int(0.1*len(indices_pos_label)),
                                                replace=False),
                                np.random.choice(indices_neg_label,
                                                int(0.1*len(indices_neg_label)),
                                                replace=False)
                                ])
indices = np.arange(x_indices_initial.shape[0])
print(indices.shape)
mask = np.isin(indices, val_indices)
print(mask.shape)

Starting imbalance: 0.5
Setting val indices
(20000,)
(20000,)


In [5]:
train = train_full[indices[~mask]]
valid = train_full[indices[mask]]
train_dataset = TensorDataset(torch.concat(train.x, dim=0), torch.Tensor(train.y))
valid_dataset = TensorDataset(torch.concat(valid.x, dim=0), torch.Tensor(valid.y))


In [6]:
import datasets
import torch

import numpy as np

from matplotlib import rcParams

datasets.logging.set_verbosity_error()

# disables the progress bar for notebooks: https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET

# set matplotlib params
rcParams.update({'xtick.labelsize': 14, 'ytick.labelsize': 14, 'axes.labelsize': 16})

# fix the random seed
seed = 2022
torch.manual_seed(seed)
np.random.seed(seed)

In [7]:
import logging
import numpy as np


from transformers import AutoTokenizer

from small_text import TransformersDataset


raw_dataset = datasets.load_dataset('rotten_tomatoes')
num_classes = raw_dataset['train'].features['label'].num_classes

transformer_model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(
    transformer_model_name
)


target_labels = np.arange(num_classes)

train = TransformersDataset.from_arrays(raw_dataset['train']['text'],
                                        raw_dataset['train']['label'],
                                        tokenizer,
                                        max_length=60,
                                        target_labels=target_labels)
test = TransformersDataset.from_arrays(raw_dataset['test']['text'], 
                                       raw_dataset['test']['label'],
                                       tokenizer,
                                       max_length=60,
                                       target_labels=target_labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




(tensor([[ 101, 2178, 3793, 7099, 1012,  102,    0,    0]], device='mps:0'),
 tensor([[1, 1, 1, 1, 1, 1, 0, 0]], device='mps:0'),
 tensor([1], device='mps:0'))