In [1]:
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
classifier_type = 'relevance'
model_name = "avsolatorio/GIST-small-Embedding-v0"

In [3]:
def import_labelled_data(path="data/labelled/data.json"):
    data = pd.read_json(path, encoding="latin-1")
    return data


data = import_labelled_data(path="../../data/labelled/data.json", )



if classifier_type == 'multiclasses':
    # drop irrelevant parts
    data = data[data["relevance"] != "irrelevant"]


# train test split
from sklearn.model_selection import train_test_split

# 0.49, 0.21, 0.3 split
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.3, random_state=42)

display(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20901 entries, 0 to 20900
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           20901 non-null  object
 1   text          20901 non-null  object
 2   relevance     20901 non-null  object
 3   multiclasses  20901 non-null  object
dtypes: object(4)
memory usage: 653.3+ KB


None

Unnamed: 0,url,text,relevance,multiclasses
0,https://www.conservationevidence.com/synopsis/...,1 \n \n \n2 \n \n \nSubtidal Benthic Invertebr...,relevant,[Marine Invertebrates]
1,https://www.conservationevidence.com/synopsis/...,\n \n \n Control of freshwater \n invasi...,relevant,"[Fish, Rivers and Lakes, Invasive]"
2,https://www.conservationevidence.com/synopsis/...,1 \n \nGrassland Conservation \n2 \n \nGrassla...,relevant,[Grassland]
3,https://www.conservationevidence.com/synopsis/...,\n \n \nii \n \n \n \n \n \n \n \nPrimate Co...,relevant,[Mammals]
4,https://www.conservationevidence.com/synopsis/...,CONSERVATION EVIDENCE SERIES SYNOPSES\nTerrest...,relevant,[Mammals]


In [4]:
from chunking import chunk_dataset_and_explode


# roughly 4 characters per token
max_len = 512

train_data = chunk_dataset_and_explode(train_data, max_len=max_len, overlap=int(max_len * 0.2))
test_data = chunk_dataset_and_explode(test_data, max_len=max_len, overlap=int(max_len * 0.2))
val_data = chunk_dataset_and_explode(val_data, max_len=max_len, overlap=int(max_len * 0.2))

In [5]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_data, split="train")
test_dataset = Dataset.from_pandas(test_data, split="test")
val_dataset = Dataset.from_pandas(val_data, split="val")

train_dataset

Dataset({
    features: ['chunk_id', 'url', 'text', 'relevance', 'multiclasses'],
    num_rows: 1660297
})

In [6]:
from fastfit import sample_dataset, FastFitTrainer

if classifier_type == 'relevance':
    num_samples_per_label = 500
elif classifier_type == 'multiclasses':
    num_samples_per_label = 50

train_dataset = sample_dataset(train_dataset, label_column=classifier_type,num_samples_per_label=num_samples_per_label,seed=42)
val_dataset = val_dataset.select(range(500)).shuffle(seed=42)
test_dataset = test_dataset.select(range(500)).shuffle(seed=42)

# train_dataset, val_dataset, test_dataset = train_dataset.shuffle(seed=42), val_dataset.shuffle(seed=42), test_dataset.shuffle(seed=42)

In [7]:

#! had to modify FastFitTrainer to at /fastfit/train.py, line 879, to add trust_remote_code=True to the loading of 'accuracy' metrics
#! don't know why it's not default, since accuracy is the default in fastfit



#! IMPORTANT: another change in FastFitTrainer, also at line 879; comment out and replace the fixed version above
#! since load_metric is deprecated in favour of evaluate.load()
#! added functionality for sending in multiple metrics to evaluate at once
#! added macro averages for non-accuracy metrics too
#! essentially, copy the below code to replace line 879

### Insert into line 879.

```python
        # metric = load_metric(self.data_args.metric_name, experiment_id=uuid.uuid4())
        from evaluate import combine, load
        if type(self.data_args.metric_name) == str: # single metric name
            metrics = [load(self.data_args.metric_name, experiment_id=uuid.uuid4())]
        elif type(self.data_args.metric_name) == list: # compute multiple metrics
            metrics = [load(metric,experiment_id=uuid.uuid4()) for metric in self.data_args.metric_name]

        # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
        # predictions and label_ids field) and has to return a dictionary string to float.
        def compute_metrics(p: EvalPrediction):
            predictions = (
                p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            )
            predictions = (
                np.squeeze(predictions)
                if self.is_regression
                else np.argmax(predictions, axis=1)
            )
            references = p.label_ids

            results = {}

            for metric in metrics:
                if metric.name != 'accuracy':
                    results.update(metric.compute(predictions=predictions, references=references,average='macro'))
                else:
                    results.update(metric.compute(predictions=predictions, references=references))

            return results
    ```

In [8]:
# same args as the huggingface TrainingArguments


trainer = FastFitTrainer(
    model_name_or_path=model_name,
    train_dataset=train_dataset,
    validation_dataset=val_dataset,
    test_dataset=test_dataset,
    output_dir=f'models/{classifier_type}/{model_name}',
    overwrite_output_dir=True,
    label_column_name=classifier_type,
    text_column_name="text",
    num_train_epochs=5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    max_text_length=512,
    num_repeats=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    logging_strategy='epoch',
    metric_name=['precision','recall','f1','accuracy'],
    load_best_model_at_end=True
)





Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/500 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/500 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/500 [00:00<?,…

Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

print(torch.version.cuda)

torch.cuda.empty_cache()
print(trainer.model.device)

True
0
NVIDIA GeForce RTX 3090
12.1
cuda:0


In [10]:

#! another fastfit library modification
#! in /fastfit/train.py, line 971, change ignore_keys_for_eval from type set to a list
#! since it gets concatenated to a list later on
#! note that since we've added lines above, this is now line 981
#! the line beginning ignore_keys_for_eval={"doc_input_ids","doc_attention_mask","labels"}


model = trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,4.8977,4.587396,0.980287,0.976293,0.977804,0.978
2,4.4068,4.358639,0.971831,0.965517,0.96765,0.968
3,4.3092,4.313827,0.971831,0.965517,0.96765,0.968
4,4.2508,4.314916,0.968531,0.961207,0.963575,0.964
5,4.2352,4.300236,0.976868,0.971983,0.973748,0.974


***** train metrics *****
  epoch                    =        5.0
  total_flos               =        0GF
  train_loss               =     4.4199
  train_runtime            = 0:00:32.40
  train_samples            =        500
  train_samples_per_second =      77.15
  train_steps_per_second   =      1.234


In [11]:
results = trainer.evaluate()

***** eval metrics *****
  epoch                   =        5.0
  eval_accuracy           =      0.974
  eval_f1                 =     0.9737
  eval_loss               =     4.3002
  eval_precision          =     0.9769
  eval_recall             =      0.972
  eval_runtime            = 0:00:01.98
  eval_samples            =        500
  eval_samples_per_second =    251.886
  eval_steps_per_second   =       4.03


In [12]:
print(f'Accuracy: {results["eval_accuracy"]}')

Accuracy: 0.974


In [13]:
model.save_pretrained(f'models/{classifier_type}/{model_name}')

In [14]:
results = trainer.test()

***** test metrics *****
  epoch                   =        5.0
  eval_accuracy           =        1.0
  eval_f1                 =        1.0
  eval_loss               =     4.7645
  eval_precision          =        1.0
  eval_recall             =        1.0
  eval_runtime            = 0:00:01.99
  eval_samples_per_second =    250.794
  eval_steps_per_second   =      4.013
  test_samples            =        500


In [15]:
# predict on the test set
model = trainer.export_model()
model.cuda()

FastFit(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_af