### Separating Actions from conservation-adjacent texts
'Irrelevant' now represents non-actions (conservation-adjacent text), while 'Relevant' represents action evidence.

In [8]:
from tqdm.notebook import tqdm
import pandas as pd

In [9]:
# DEV toggles a quick development run: it selects the "small" GIST embedding
# model here and is used by later cells to subsample data and pick output paths.
DEV = True

model_name = (
    "avsolatorio/GIST-small-Embedding-v0"
    if DEV
    else "avsolatorio/GIST-Embedding-v0"
)

In [10]:
def import_labelled_data(path="data/level-1.5/merged/data.json"):
    """Load the labelled dataset from a JSON file into a DataFrame.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_json``.
    """
    # The file is decoded as latin-1.
    # NOTE(review): JSON is commonly UTF-8 -- confirm latin-1 is intended.
    return pd.read_json(path, encoding="latin-1")


data = import_labelled_data(path="../../../data/level-1.5/merged/data.json")



# train test split
from sklearn.model_selection import train_test_split

# 0.49, 0.21, 0.3 split (train / val / test):
# 30% of the rows are held out for test, then 30% of the remaining 70% for validation.
# The split happens on whole rows here; chunking is applied to each split afterwards.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.3, random_state=42)

# DataFrame.info() prints its summary and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None" in the cell output.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10436 entries, 0 to 10435
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           10436 non-null  object 
 1   text          10436 non-null  object 
 2   relevance     10436 non-null  object 
 3   multiclasses  10436 non-null  object 
 4   score         223 non-null    float64
dtypes: float64(1), object(4)
memory usage: 407.8+ KB


None

Unnamed: 0,url,text,relevance,multiclasses,score
0,https://www.conservationevidence.com/synopsis/...,1 \n \n \n2 \n \n \nSubtidal Benthic Invertebr...,relevant,[Marine Invertebrates],
1,https://www.conservationevidence.com/synopsis/...,\n \n \n Control of freshwater \n invasi...,relevant,"[Fish, Rivers and Lakes, Invasive]",
2,https://www.conservationevidence.com/synopsis/...,1 \n \nGrassland Conservation \n2 \n \nGrassla...,relevant,[Grassland],
3,https://www.conservationevidence.com/synopsis/...,\n \n \nii \n \n \n \n \n \n \n \nPrimate Co...,relevant,[Mammals],
4,https://www.conservationevidence.com/synopsis/...,CONSERVATION EVIDENCE SERIES SYNOPSES\nTerrest...,relevant,[Mammals],


In [11]:
from chunking import chunk_dataset_and_explode


# roughly 4 characters per token
max_len = 512

# Chunk every split with identical settings: chunks of at most `max_len`
# with an overlap of 20% of `max_len` between consecutive chunks.
# The splits are processed in the order train, test, val.
train_data, test_data, val_data = (
    chunk_dataset_and_explode(split_frame, max_len=max_len, overlap=int(max_len * 0.2))
    for split_frame in (train_data, test_data, val_data)
)

  0%|          | 0/5113 [00:00<?, ?it/s]

  0%|          | 0/3131 [00:00<?, ?it/s]

  0%|          | 0/2192 [00:00<?, ?it/s]

In [12]:
def stratified_sample(dataset,label_column: str = 'relevance',num_samples_per_label: int = 100):
    """Draw up to ``num_samples_per_label`` rows for every label value.

    The frame is shuffled first, then each label group is sampled without
    replacement. Groups smaller than ``num_samples_per_label`` are kept whole.
    Both steps use ``random_state=42`` so the result is reproducible.

    Args:
        dataset: DataFrame containing ``label_column``.
        label_column: Column whose distinct values define the groups.
        num_samples_per_label: Maximum number of rows taken from each group.

    Returns:
        A new DataFrame with the sampled rows, ordered by sorted label value,
        with the original columns and a fresh 0..n-1 index.
    """
    shuffled = dataset.sample(frac=1, random_state=42)

    # Sample each group explicitly instead of using
    # ``groupby(...).apply(..., include_groups=True)``: that keyword only exists
    # in pandas 2.2, is deprecated there, and is rejected by later releases.
    # Iterating the groups draws the same rows on any pandas version.
    per_label = [
        group.sample(min(num_samples_per_label, len(group)), random_state=42)
        for _, group in shuffled.groupby(label_column)
    ]

    if not per_label:
        # No groups (e.g. an empty frame): pd.concat would raise on an empty
        # list, so return an empty frame with the same columns instead.
        return shuffled.iloc[0:0].reset_index(drop=True)

    return pd.concat(per_label).reset_index(drop=True)

if not DEV:
    # Full run: shuffle train and test in full, cap validation at 500 rows.
    train_data = train_data.sample(frac=1,random_state=42)
    val_data = val_data.sample(500,random_state=42)
    test_data = test_data.sample(frac=1,random_state=42)
else:
    # Development run: up to 500 training rows per label, plus small
    # (non-stratified) random validation and test subsets.
    train_data = stratified_sample(train_data,num_samples_per_label=500)
    val_data = val_data.sample(100,random_state=42)
    test_data = test_data.sample(200,random_state=42)


In [13]:
# Preview the training frame after chunking and sampling.
train_data

Unnamed: 0,chunk_id,url,text,relevance,multiclasses,score
0,10130,https://www.wetlands.org/publications/flamingo...,chilensis \nSite \nLat. \nLong. \n2008 \n200...,irrelevant,[Birds],
1,10364,https://www.zooreach.org/ZOO_WILD_Activities/2...,guide – how best you can use this The Sahyādr...,irrelevant,[],0.984325
2,10369,https://www.forestfoundation.ph/wp-content/upl...,ish Philippine tropical forests. %e TFCA its g...,irrelevant,[],0.982479
3,10352,https://digitalarchive.worldfishcenter.org/bit...,"10,480 (3.6%) 22 ▪ area, topography and soils...",irrelevant,[],0.986896
4,10382,https://www.wyomingwildsheep.org/wp-content/up...,tiful area of Wyoming with hunter when we acce...,irrelevant,[],0.979519
...,...,...,...,...,...,...
995,4,https://www.conservationevidence.com/synopsis/...,85 fawns/adult female; before control: 0.84 fa...,relevant,[Mammals],
996,2,https://www.conservationevidence.com/synopsis/...,...... \n83 \n2.23. Apply herbicide before see...,relevant,[Grassland],
997,3957,https://www.conservationevidence.com/individua...,"12,000 tadpoles were raised in captivity. Tad...",relevant,[Amphibians],
998,16,https://www.conservationevidence.com/synopsis/...,A review of a houbara bustard Chlamydotis undu...,relevant,[Birds],


In [14]:
from datasets import Dataset

# Convert each split to a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the features: the sampled
# frames carry a shuffled (non-range) index, which from_pandas would otherwise
# store as an extra `__index_level_0__` column, so the splits' schemas could differ.
train_dataset = Dataset.from_pandas(train_data, split="train", preserve_index=False)
test_dataset = Dataset.from_pandas(test_data, split="test", preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, split="val", preserve_index=False)

train_dataset

Dataset({
    features: ['chunk_id', 'url', 'text', 'relevance', 'multiclasses', 'score'],
    num_rows: 1000
})

In [15]:

#! had to modify FastFitTrainer at /fastfit/train.py, line 879, to add trust_remote_code=True when loading the 'accuracy' metric
#! unclear why this is not the default, since accuracy is the default metric in fastfit



#! IMPORTANT: another change in FastFitTrainer, also at line 879; comment out the original line and use the code below in place of the trust_remote_code fix described above,
#! since load_metric is deprecated in favour of evaluate.load()
#! added functionality for passing in multiple metrics to evaluate at once
#! added macro averages for the non-accuracy metrics too
#! essentially, copy the code below to replace line 879

### Insert into line 879.

```python
        # metric = load_metric(self.data_args.metric_name, experiment_id=uuid.uuid4())
        # `load_metric` is replaced by `evaluate.load`; every requested metric is loaded up front.
        from evaluate import load

        metric_names = self.data_args.metric_name
        if isinstance(metric_names, str):  # single metric name
            metric_names = [metric_names]
        elif not isinstance(metric_names, (list, tuple)):  # anything else would leave `metrics` undefined
            raise TypeError(
                f"metric_name must be a str or a list of str, got {type(metric_names).__name__}"
            )
        # One evaluate metric object per requested name, each with its own experiment id.
        metrics = [load(name, experiment_id=uuid.uuid4()) for name in metric_names]

        # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
        # predictions and label_ids field) and has to return a dictionary string to float.
        def compute_metrics(p: EvalPrediction):
            """Compute every loaded metric on the predictions and merge the results into one dict."""
            predictions = (
                p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            )
            # Regression keeps the raw (squeezed) outputs; classification takes the arg-max class id.
            predictions = (
                np.squeeze(predictions)
                if self.is_regression
                else np.argmax(predictions, axis=1)
            )
            references = p.label_ids

            results = {}

            for metric in metrics:
                if metric.name != 'accuracy':
                    # Every metric other than accuracy is macro-averaged over the classes.
                    results.update(metric.compute(predictions=predictions, references=references,average='macro'))
                else:
                    results.update(metric.compute(predictions=predictions, references=references))

            return results
```

In [16]:
from fastfit import FastFitTrainer

# same args as the huggingface TrainingArguments
# Development runs write to a separate `dev/` sub-directory.
output_dir = (
    f'models/relevance/dev/{model_name}'
    if DEV
    else f'models/relevance/{model_name}'
)

trainer = FastFitTrainer(
    # model and data
    model_name_or_path=model_name,
    train_dataset=train_dataset,
    validation_dataset=val_dataset,
    test_dataset=test_dataset,
    label_column_name='relevance',
    text_column_name="text",
    max_text_length=512,
    # output location
    output_dir=output_dir,
    overwrite_output_dir=True,
    # optimisation schedule
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_repeats=1,
    fp16=True,
    # evaluate, checkpoint and log once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy='epoch',
    # metrics; the best checkpoint is selected on precision and reloaded at the end
    metric_name=['precision','recall','f1','accuracy'],
    load_best_model_at_end=True,
    metric_for_best_model='precision',
)





Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/1000 [00:00<?…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/100 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/200 [00:00<?,…

Running tokenizer on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

In [17]:
import torch
# Report the CUDA environment. The device queries raise on a machine without
# CUDA, so they only run when a CUDA device is actually available.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

# CUDA version this torch build reports.
print(torch.version.cuda)

if cuda_available:
    torch.cuda.empty_cache()
# Device the trainer's model currently lives on.
print(trainer.model.device)

True
0
NVIDIA GeForce RTX 3090
12.1
cuda:0


In [18]:

#! another fastfit library modification:
#! in /fastfit/train.py, line 971, change ignore_keys_for_eval from a set to a list,
#! since it gets concatenated to a list later on
#! note that since we've added lines above, this is now line 981
#! it is the line beginning ignore_keys_for_eval={"doc_input_ids","doc_attention_mask","labels"}


# Run training; the returned object is kept as `model` and saved in a later cell.
model = trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,4.779,4.252465,0.94978,0.950321,0.949955,0.95
2,4.3283,4.226601,0.94,0.940705,0.939976,0.94
3,4.2099,4.205118,0.959936,0.959936,0.959936,0.96
4,4.1665,4.267502,0.959936,0.959936,0.959936,0.96
5,4.1429,4.305388,0.959936,0.959936,0.959936,0.96
6,4.1336,4.349062,0.959936,0.959936,0.959936,0.96
7,4.1294,4.436815,0.959936,0.959936,0.959936,0.96
8,4.1273,4.390973,0.959936,0.959936,0.959936,0.96
9,4.1262,4.3982,0.959936,0.959936,0.959936,0.96
10,4.1299,4.396714,0.959936,0.959936,0.959936,0.96


***** train metrics *****
  epoch                    =       10.0
  total_flos               =        0GF
  train_loss               =     4.2273
  train_runtime            = 0:01:23.01
  train_samples            =       1000
  train_samples_per_second =    120.462
  train_steps_per_second   =      1.927


In [19]:
# Evaluate the trained model and keep the returned metrics dict.
results = trainer.evaluate()

***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =       0.96
  eval_f1                 =     0.9599
  eval_loss               =     4.2051
  eval_precision          =     0.9599
  eval_recall             =     0.9599
  eval_runtime            = 0:00:00.49
  eval_samples            =        100
  eval_samples_per_second =    201.895
  eval_steps_per_second   =      4.038


In [20]:
# Show the full metrics dict returned by trainer.evaluate().
results

{'eval_loss': 4.205118179321289,
 'eval_precision': 0.9599358974358975,
 'eval_recall': 0.9599358974358975,
 'eval_f1': 0.9599358974358975,
 'eval_accuracy': 0.96,
 'eval_runtime': 0.4953,
 'eval_samples_per_second': 201.895,
 'eval_steps_per_second': 4.038,
 'epoch': 10.0,
 'eval_samples': 100}

In [21]:
# Pull the accuracy out of the evaluation metrics.
print(f'Accuracy: {results["eval_accuracy"]}')

Accuracy: 0.96


In [22]:
# Score the model on the test split.
# NOTE: this rebinds `results`, replacing the metrics from trainer.evaluate().
results = trainer.test()

***** test metrics *****
  epoch                   =       10.0
  eval_accuracy           =       0.93
  eval_f1                 =     0.9196
  eval_loss               =     4.4866
  eval_precision          =     0.9196
  eval_recall             =     0.9196
  eval_runtime            = 0:00:00.93
  eval_samples_per_second =    213.762
  eval_steps_per_second   =      4.275
  test_samples            =        200


In [23]:


from os import makedirs, path

# Save the trained model into the directory chosen for this run (`output_dir`:
# the `dev/` sub-directory when DEV is set, the main directory otherwise).
# The previous version always wrote to the hard-coded dev path as well, so a
# full (non-DEV) run overwrote the saved development model.
# exist_ok=True replaces the separate exists() check.
makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)