In [1]:
import tqdm
import datasets

In [1]:
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
# Notebook-wide switches read by the cells below.
DEV = False  # True -> the loading cell trains on a random sample of the labelled data
classifier_type = "relevance"  # name of the label column passed to the sampler and trainer

# Embedding checkpoint that gets fine-tuned.
model_name = "avsolatorio/GIST-small-Embedding-v0"
# model_name = 'nomic-ai/nomic-embed-text-v1' # work on larger context sizes

In [4]:
def import_labelled_data(path="data/labelled/data.json", group_relevant=True):
    """Load the labelled corpus and derive a binary ``relevance`` column.

    Parameters
    ----------
    path : str
        Location of the JSON file. It is read with ``latin-1`` encoding and
        must contain a ``class`` column.
    group_relevant : bool
        Accepted for backward compatibility but never consulted: the
        ``relevance`` column is always added.
        NOTE(review): decide whether this flag should gate the grouping or
        be removed.

    Returns
    -------
    pandas.DataFrame
        The loaded frame plus a ``relevance`` column: ``"irrelevant"`` where
        ``class`` is ``"irrelevant"``, ``"relevant"`` for every other label,
        and missing where ``class`` itself is missing.
    """
    data = pd.read_json(path, encoding="latin-1")
    labels = data["class"]
    # Keep "irrelevant" as-is and collapse every other label to "relevant".
    relevance = labels.where(labels == "irrelevant", "relevant")
    # An unlabelled row has unknown relevance; do not silently call it
    # "relevant" (NaN != "irrelevant" is True, which is what used to happen).
    data["relevance"] = relevance.where(labels.notna())
    return data


data = import_labelled_data(path="../../data/labelled/data.json", group_relevant=False)

# drop null classes
data = data.dropna(subset=["class"])


if DEV:
    # Seeded so DEV runs are repeatable, and capped so a corpus with fewer
    # than 5000 rows does not make `sample` raise.
    data = data.sample(min(5000, len(data)), random_state=42)


# train test split
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for test; 20% of the remainder becomes
# validation, i.e. a 64/16/20 train/val/test split of `data`.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

data

Unnamed: 0,url,text,class,relevance
0,,\n \n \n Control of freshwater \n invasi...,Invasive Fish,relevant
1,,1 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,Marine,relevant
2,,"1 \n \nPhilip A. Martin, Ricardo Rocha, \nRebe...",Shrubland,relevant
3,PDF-PLACEHOLDER,1 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,..,relevant
4,,1 \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,Marine,relevant
...,...,...,...,...
9533,https://deadliestwebattacks.com/assets/devops-...,"DevOps Is Automation, DevSecOps Is People Mike...",irrelevant,irrelevant
9534,https://www.daytonsuperior.com/docs/default-so...,Yeti-Anchor® Precast & Tilt-Up TECHNICAL DATA ...,irrelevant,irrelevant
9535,https://dce-uae.com/wp-content/uploads/2024/04...,MAŁOPOLSKA 14 UNESCO World Heritage List sites...,irrelevant,irrelevant
9536,https://www.dekosrl.com/wp-content/uploads/202...,"CHAPTER ONE TheCompany 4 Mission: know-how, pe...",irrelevant,irrelevant


In [5]:
from chunking import chunk_dataset_and_explode


# roughly 4 characters per token
max_len = 512
# Consecutive chunks share 20% of the chunk length.
chunk_overlap = int(max_len * 0.2)

# Same chunking settings for every split, applied in train/test/val order.
train_data, test_data, val_data = (
    chunk_dataset_and_explode(split_frame, max_len=max_len, overlap=chunk_overlap)
    for split_frame in (train_data, test_data, val_data)
)

In [6]:
from datasets import Dataset

# Wrap each chunked frame in a `Dataset`, tagging it with its split name.
frames_by_split = (
    (train_data, "train"),
    (test_data, "test"),
    (val_data, "val"),
)
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame, split=split_name)
    for frame, split_name in frames_by_split
)

train_dataset

Dataset({
    features: ['chunk_id', 'url', 'text', 'class', 'relevance'],
    num_rows: 367046
})

In [7]:
from fastfit import sample_dataset, FastFitTrainer


# Evaluation subset sizes, capped at the rows actually available so that
# `select` never indexes past the end of a small split (e.g. in DEV runs).
n_val = min(500, len(val_dataset))
n_test = min(1500, len(test_dataset))

# NOTE: this cell rebinds its own inputs, so re-running it without re-running
# the previous cell samples from the already-reduced datasets.
train_dataset = sample_dataset(
    train_dataset,
    label_column=classifier_type,
    num_samples_per_label=250,
    seed=42,
)
val_dataset = val_dataset.shuffle(seed=42).select(range(n_val))
test_dataset = test_dataset.shuffle(seed=42).select(range(n_test))

In [8]:

#! had to modify FastFitTrainer in /fastfit/train.py, line 879, to add trust_remote_code=True when loading the 'accuracy' metric
#! don't know why that's not the default, since accuracy is the default metric in fastfit

#* note that since SetFit uses evaluation_strategy as the argument name rather than eval_strategy
#* I had to change it in the FastFitTrainer call below
#* if using the latest transformers version (transformers>=4.41.0), use eval_strategy

#! another change in FastFitTrainer, also at line 879; commented out the fixed version above
#! since load_metric is deprecated in favour of evaluate.load()
#! using evaluate means we can use evaluate.combine(), which lets us calculate multiple metrics at once
#! also, add the ability to just send in our own compute_metrics function
#! essentially, copy the below code to replace line 879

### Insert into line 879.

```python
        # metric = load_metric(self.data_args.metric_name, experiment_id=uuid.uuid4())
        from evaluate import load

        # Accept either one metric name or a sequence of names. Anything else
        # is rejected here, instead of leaving `metrics` undefined and failing
        # later inside compute_metrics.
        metric_names = self.data_args.metric_name
        if isinstance(metric_names, str):  # single metric name
            metric_names = [metric_names]
        elif not isinstance(metric_names, (list, tuple)):  # compute multiple metrics
            raise ValueError(
                "metric_name must be a string or a list of strings, got "
                f"{type(metric_names).__name__}"
            )
        metrics = [load(name, experiment_id=uuid.uuid4()) for name in metric_names]

        # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
        # predictions and label_ids field) and has to return a dictionary string to float.
        def compute_metrics(p: EvalPrediction):
            """Compute every configured metric and merge the results into one dict."""
            predictions = (
                p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            )
            predictions = (
                np.squeeze(predictions)
                if self.is_regression
                else np.argmax(predictions, axis=1)
            )
            references = p.label_ids

            results = {}

            for metric in metrics:
                if metric.name != 'accuracy':
                    # Every metric other than accuracy is called with macro
                    # averaging. NOTE(review): a metric that does not accept
                    # `average` will raise here.
                    results.update(metric.compute(predictions=predictions, references=references,average='macro'))
                else:
                    results.update(metric.compute(predictions=predictions, references=references))

            return results
```

In [9]:
# same args as the huggingface TrainingArguments

# Datasets and the columns the trainer reads from them.
data_kwargs = dict(
    train_dataset=train_dataset,
    validation_dataset=val_dataset,
    test_dataset=test_dataset,
    label_column_name=classifier_type,
    text_column_name="text",
    max_text_length=512,
)

# Optimisation schedule and batch sizes.
schedule_kwargs = dict(
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_repeats=1,
    fp16=True,
)

# Evaluate and log every step, checkpoint every 4 steps, and reload the best
# checkpoint once training finishes.
monitoring_kwargs = dict(
    evaluation_strategy="steps",
    eval_steps=1,
    save_strategy="steps",
    save_steps=4,
    logging_steps=1,
    metric_name=['precision','accuracy','f1'],
    load_best_model_at_end=True,
)

trainer = FastFitTrainer(
    model_name_or_path=model_name,
    output_dir=f'models/{classifier_type}/{model_name}',
    overwrite_output_dir=True,
    **data_kwargs,
    **schedule_kwargs,
    **monitoring_kwargs,
)





Flattening the indices:   0%|          | 0/500 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1500 [00:00<?, ? examples/s]

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/500 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/500 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/1500 [00:00<?…

Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [10]:
# Ask PyTorch to release its cached CUDA memory before the training cell runs.
import torch
torch.cuda.empty_cache()

In [11]:

#! another fastfit library modification
#! in /fastfit/train.py, originally line 971 (line 981 once the metric snippet above has been pasted in),
#! change ignore_keys_for_eval from a set to a list, since it gets concatenated to a list later on
#! the line to edit begins: ignore_keys_for_eval={"doc_input_ids","doc_attention_mask","labels"}


# Run fine-tuning; the returned object is kept as `model` and saved in a later cell.
model = trainer.train()



Step,Training Loss,Validation Loss,Precision,Accuracy,F1
1,5.5903,5.533229,0.694402,0.716,0.691084
2,5.2311,5.218047,0.807495,0.82,0.812184
3,4.986,5.016044,0.869274,0.88,0.875206
4,4.8373,4.865735,0.923473,0.932,0.927989
5,4.7468,4.754911,0.934177,0.942,0.938517
6,4.6452,4.666359,0.960633,0.966,0.96381
7,4.5642,4.594436,0.960633,0.966,0.96381
8,4.324,4.533224,0.958878,0.964,0.961641
9,4.4359,4.483872,0.961375,0.966,0.963733
10,4.4161,4.443537,0.96568,0.968,0.96568


***** train metrics *****
  epoch                    =       20.0
  total_flos               =        0GF
  train_loss               =     4.1944
  train_runtime            = 0:07:50.40
  train_samples            =        500
  train_samples_per_second =     21.258
  train_steps_per_second   =       0.34


In [12]:
# Evaluate the trained model; the metrics are kept in `results` and read by the next cell.
results = trainer.evaluate()

***** eval metrics *****
  epoch                   =       20.0
  eval_accuracy           =      0.974
  eval_f1                 =      0.972
  eval_loss               =     4.3281
  eval_precision          =     0.9748
  eval_runtime            = 0:00:02.09
  eval_samples            =        500
  eval_samples_per_second =    238.564
  eval_steps_per_second   =      3.817


In [13]:
# Report the accuracy entry of the metrics returned by trainer.evaluate().
print(f'Accuracy: {results["eval_accuracy"]}')

Accuracy: 0.974


In [14]:
# Persist the trained model under the same path that was passed to the trainer as output_dir.
model.save_pretrained(f'models/{classifier_type}/{model_name}')

In [15]:
# Score the held-out test set. NOTE: this rebinds `results`, replacing the
# metrics stored by the earlier trainer.evaluate() cell.
results = trainer.test()

***** test metrics *****
  epoch                   =       20.0
  eval_accuracy           =      0.982
  eval_f1                 =     0.9801
  eval_loss               =     4.3242
  eval_precision          =      0.979
  eval_runtime            = 0:00:06.09
  eval_samples_per_second =    246.289
  eval_steps_per_second   =      3.941
  test_samples            =       1500
