In [1]:
import tqdm
import datasets

In [2]:
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
# When True, the labelled data is cut down to a 5000-row random sample before splitting (fast iteration).
DEV = False
# Model identifier handed to FastFitTrainer as model_name_or_path; also reused to build the output/save directory.
model_name = "avsolatorio/GIST-small-Embedding-v0"

In [4]:
def import_labelled_data(path="data/labelled/data.json", group_relevant=True):
    """Load the labelled JSON file and add a binary ``relevance`` column.

    Args:
        path: Location of the JSON file; it is read with latin-1 encoding and
            must contain a ``class`` column.
        group_relevant: Currently has no effect - the ``relevance`` column is
            always added. Kept in the signature so existing callers keep working.
            # TODO(review): decide what ``group_relevant=False`` should mean.

    Returns:
        The loaded DataFrame with an extra ``relevance`` column: ``"irrelevant"``
        where ``class`` is ``"irrelevant"``, ``"relevant"`` for every other
        non-null class, and missing where ``class`` itself is missing.
    """
    data = pd.read_json(path, encoding="latin-1")
    labels = data["class"]
    # Keep "irrelevant" as-is and collapse every other class into "relevant".
    relevance = labels.where(labels == "irrelevant", "relevant")
    # A missing class must not be silently counted as "relevant": propagate the gap.
    data["relevance"] = relevance.where(labels.notna())
    return data


# train/test splitting helper used further down in this cell
from sklearn.model_selection import train_test_split

data = import_labelled_data(path="../../data/labelled/data.json", group_relevant=False)

# drop null classes
data = data.dropna(subset=["class"])


if DEV:
    # Seeded so the DEV subsample is the same on every run, and capped at the
    # number of available rows so a small dataset does not make sample() raise.
    data = data.sample(n=min(5000, len(data)), random_state=42)


# train test split: hold out 20% for test, then 20% of the remainder for validation
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

data.head()

Unnamed: 0,url,text,class,relevance
0,https://www.wetlands.org/wp-content/uploads/20...,\n \n \nFlamingo\nFlamingo\nFlamingo\nFlaming...,Birds,relevant
1,https://www.wetlands.org/publications/flamingo...,\n\n \n \n \n \n \n \nABOUT THE GROUP \n \nThe...,Birds,relevant
2,https://www.wetlands.org/publications/the-stat...,\n\n\n\n\n\n(FIRST PAGE) \n \n \n \n \nTHE STA...,Birds,relevant
3,https://www.sciencedirect.com/science/article/...,\nPlease contact us via our\nsupport center fo...,Mammals,relevant
4,https://www.wetlands.org/publications/strategi...,Strategies for wise use of Wetlands:\nBest Pra...,Wetlands,relevant


In [5]:
from chunking import chunk_dataset_and_explode


# roughly 4 characters per token
# (presumably max_len is a character count - TODO confirm against chunking.py)
max_len = 2048
# overlap handed to the chunker: 20% of max_len, computed once for all splits
chunk_overlap = int(max_len * 0.2)


def _chunk_split(frame):
    """Chunk one split with the shared max_len / overlap settings."""
    return chunk_dataset_and_explode(frame, max_len=max_len, overlap=chunk_overlap)


train_data = _chunk_split(train_data)
test_data = _chunk_split(test_data)
val_data = _chunk_split(val_data)

In [6]:
from datasets import Dataset

# Convert each chunked DataFrame into a Dataset, tagging it with its split name.
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame, split=split_name)
    for frame, split_name in (
        (train_data, "train"),
        (test_data, "test"),
        (val_data, "val"),
    )
)

train_dataset

Dataset({
    features: ['chunk_id', 'url', 'text', 'class', 'relevance'],
    num_rows: 93801
})

In [7]:
from fastfit import sample_dataset, FastFitTrainer


# Few-shot training set: 150 examples per `relevance` label, fixed seed.
train_dataset = sample_dataset(train_dataset, label_column='relevance', num_samples_per_label=150, seed=42)
# Evaluate on fixed-seed random subsets. The sizes are capped at the number of
# available rows so a small split (e.g. when DEV is on) does not make select() fail.
val_dataset = val_dataset.shuffle(seed=42).select(range(min(200, len(val_dataset))))
test_dataset = test_dataset.shuffle(seed=42).select(range(min(1500, len(test_dataset))))

In [8]:

#! had to modify FastFitTrainer at /fastfit/train.py, line 879, to add trust_remote_code=True to the loading of the 'accuracy' metric
#! don't know why it's not default, since accuracy is the default in fastfit

#* note that since SetFit uses evaluation_strategy as the argument name rather than eval_strategy
#* I had to change it in the FastFitTrainer call below
#* if using the latest transformers version (transformers>=4.41.0), use eval_strategy

#! another change in FastFitTrainer, also at line 879; commented out the fixed version above
#! since load_metric is deprecated in favour of evaluate.load()
#! using evaluate means we can use evaluate.combine(), which lets us calculate multiple metrics at once
#! also, add the ability to just send in our own compute_metrics function
#! essentially, copy the below code to replace line 879

### Insert into line 879.

```python
        # metric = load_metric(self.data_args.metric_name, experiment_id=uuid.uuid4())
        from evaluate import load

        # `metric_name` may be one metric name or a list/tuple of names. Normalise it to a
        # list up front so `metrics` is always defined before compute_metrics closes over it;
        # anything else fails here with a clear message instead of a NameError during evaluation.
        metric_names = self.data_args.metric_name
        if isinstance(metric_names, str):  # single metric name
            metric_names = [metric_names]
        elif not isinstance(metric_names, (list, tuple)):
            raise TypeError(
                "metric_name must be a str or a list/tuple of str, "
                f"got {type(metric_names).__name__}"
            )
        # one loaded metric per requested name, so several metrics are reported per evaluation
        metrics = [load(name, experiment_id=uuid.uuid4()) for name in metric_names]

        # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
        # predictions and label_ids field) and has to return a dictionary string to float.
        def compute_metrics(p: EvalPrediction):
            """Compute every loaded metric for one evaluation pass and merge the results into one dict."""
            predictions = (
                p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            )
            # regression keeps the raw (squeezed) outputs; classification takes the argmax over axis 1
            predictions = (
                np.squeeze(predictions)
                if self.is_regression
                else np.argmax(predictions, axis=1)
            )
            references = p.label_ids

            results = {}

            for metric in metrics:
                if metric.name != 'accuracy':
                    # every metric other than accuracy is computed with macro averaging
                    results.update(metric.compute(predictions=predictions, references=references, average='macro'))
                else:
                    results.update(metric.compute(predictions=predictions, references=references))

            return results
    ```

In [9]:
# same args as the huggingface TrainingArguments


# All FastFitTrainer arguments gathered in one mapping, grouped by purpose.
trainer_kwargs = {
    # model and data
    "model_name_or_path": model_name,
    "train_dataset": train_dataset,
    "validation_dataset": val_dataset,
    "test_dataset": test_dataset,
    "label_column_name": "relevance",
    "text_column_name": "text",
    "max_text_length": 2048,
    # output location
    "output_dir": f'models/{model_name}',
    "overwrite_output_dir": True,
    # optimisation schedule
    "num_train_epochs": 20,
    "per_device_train_batch_size": 64,
    "per_device_eval_batch_size": 64,
    "fp16": True,
    "seed": 42,
    "num_repeats": 1,
    # evaluation and logging
    "evaluation_strategy": "epoch",
    "logging_steps": 1,
    "metric_name": ['precision', 'accuracy', 'f1'],
}

trainer = FastFitTrainer(**trainer_kwargs)





Flattening the indices:   0%|          | 0/200 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1500 [00:00<?, ? examples/s]

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/300 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/200 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/1500 [00:00<?…

Running tokenizer on dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [10]:
import torch
# Release cached GPU memory that PyTorch is holding but not using, ahead of the training cell below.
torch.cuda.empty_cache()

In [11]:

#! another fastfit library modification
#! in /fastfit/train.py, line 971, change ignore_keys_for_eval from type set to a list
#! since it gets concatenated to a list later on
#! note that since we've added lines above, this is now line 981
#! the line beginning ignore_keys_for_eval={"doc_input_ids","doc_attention_mask","labels"}


# Run training; the returned model is what gets written to disk later via save_pretrained.
model = trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Accuracy,F1
1,4.299,4.486179,0.97191,0.975,0.974536
2,3.9602,4.249847,0.988372,0.99,0.98977
3,3.8898,4.215579,0.988372,0.99,0.98977
4,3.8389,4.182316,0.982759,0.985,0.984678
5,3.8206,4.225079,0.982759,0.985,0.984678
6,3.8141,4.203447,0.982759,0.985,0.984678
7,3.8089,4.185375,0.982759,0.985,0.984678
8,3.7995,4.189353,0.982759,0.985,0.984678
9,3.7977,4.207273,0.982759,0.985,0.984678
10,3.7905,4.220812,0.982759,0.985,0.984678


***** train metrics *****
  epoch                    =       20.0
  total_flos               =        0GF
  train_loss               =     4.1634
  train_runtime            = 0:01:05.49
  train_samples            =        300
  train_samples_per_second =      91.61
  train_steps_per_second   =      1.527


In [12]:
# Evaluation metrics as a dict (read below via results["eval_accuracy"]).
# NOTE(review): presumably scored on the 200-row validation subset (eval_samples = 200 in the recorded output) - confirm.
results = trainer.evaluate()

***** eval metrics *****
  epoch                   =       20.0
  eval_accuracy           =      0.985
  eval_f1                 =     0.9847
  eval_loss               =     4.2296
  eval_precision          =     0.9828
  eval_runtime            = 0:00:01.04
  eval_samples            =        200
  eval_samples_per_second =    191.977
  eval_steps_per_second   =       3.84


In [13]:
# Report the accuracy entry from the evaluation results computed above.
print(f'Accuracy: {results["eval_accuracy"]}')

Accuracy: 0.985


In [14]:
# Save the trained model under the same models/<model_name> path that was given to the trainer as output_dir.
model.save_pretrained(f'models/{model_name}')

In [17]:
# Keep a handle on the tokenizer held by the trainer.
tokenizer = trainer.tokenizer

In [18]:
# NOTE(review): leftover debugging introspection - it lists every tokenizer attribute and
# produces a very long output; consider removing this cell before sharing the notebook.
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_compile_jinja_template',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_mask_token',
 '

In [16]:
# Score the model on the test subset (test_samples = 1500 in the recorded output).
# Note: this rebinds `results`, replacing the evaluation metrics stored earlier.
results = trainer.test()

***** test metrics *****
  epoch                   =       20.0
  eval_accuracy           =     0.9833
  eval_f1                 =     0.9819
  eval_loss               =     4.3387
  eval_precision          =     0.9802
  eval_runtime            = 0:00:07.06
  eval_samples_per_second =    212.386
  eval_steps_per_second   =      3.398
  test_samples            =       1500
