### Separating Actions from conservation-adjacent texts
'Irrelevant' now represents non-actions, 'Relevant' represents action evidence.

In [1]:
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
# Development switch: when DEV is on, the "small" GIST checkpoint is used;
# otherwise the non-small GIST checkpoint is selected.
DEV = True

model_name = "avsolatorio/GIST-small-Embedding-v0" if DEV else "avsolatorio/GIST-Embedding-v0"

In [3]:
def import_labelled_data(path="data/level-1.5/merged/data.json"):
    """Load the labelled dataset from a JSON file into a DataFrame.

    Args:
        path: Location of the JSON file; handed straight to ``pd.read_json``.

    Returns:
        The DataFrame built by ``pd.read_json``, with the file decoded as latin-1.
    """
    return pd.read_json(path, encoding="latin-1")


# Load the labelled corpus; the path is relative to the notebook's working directory.
data = import_labelled_data(path="../../../data/level-1.5/merged/data.json")

# train test split
from sklearn.model_selection import train_test_split

# 0.45 train, 0.15 val, 0.4 test
# (40% is held out for test first; 25% of the remaining 60% then becomes validation)
train_data, test_data = train_test_split(data, test_size=0.4, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in display() would additionally render a stray "None".
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8803 entries, 0 to 8802
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           8803 non-null   object 
 1   text          8803 non-null   object 
 2   relevance     8803 non-null   object 
 3   multiclasses  8803 non-null   object 
 4   score-lv1     207 non-null    float64
dtypes: float64(1), object(4)
memory usage: 344.0+ KB


None

Unnamed: 0,url,text,relevance,multiclasses,score-lv1
0,https://www.conservationevidence.com/synopsis/...,1 \n \n \n2 \n \n \nSubtidal Benthic Invertebr...,relevant,[Marine Invertebrates],
1,https://www.conservationevidence.com/synopsis/...,\n \n \n Control of freshwater \n invasi...,relevant,"[Fish, Rivers and Lakes, Invasive]",
2,https://www.conservationevidence.com/synopsis/...,1 \n \nGrassland Conservation \n2 \n \nGrassla...,relevant,[Grassland],
3,https://www.conservationevidence.com/synopsis/...,\n \n \nii \n \n \n \n \n \n \n \nPrimate Co...,relevant,[Mammals],
4,https://www.conservationevidence.com/synopsis/...,CONSERVATION EVIDENCE SERIES SYNOPSES\nTerrest...,relevant,[Mammals],


In [4]:
from chunking import chunk_dataset_and_explode


# roughly 4 characters per token
max_len = 512

# Every split is chunked with identical settings: windows of max_len with an
# overlap of 20% of max_len. The splits are processed in the order train, test, val.
train_data, test_data, val_data = (
    chunk_dataset_and_explode(split, max_len=max_len, overlap=int(max_len * 0.2))
    for split in (train_data, test_data, val_data)
)

  0%|          | 0/3960 [00:00<?, ?it/s]

  0%|          | 0/3522 [00:00<?, ?it/s]

  0%|          | 0/1321 [00:00<?, ?it/s]

In [5]:
# Class balance of the training split, counted per chunk (train_data was chunked in the previous cell).
train_data['relevance'].value_counts()

relevance
relevant      51706
irrelevant    39541
Name: count, dtype: int64

In [6]:
def stratified_sample(
    dataset,
    label_column: str = 'relevance',
    num_samples_per_label: int = 100,
):
    """Draw a reproducible random sample of at most ``num_samples_per_label`` rows per label.

    The frame is shuffled with a fixed seed, split by ``label_column``, and each
    group is sampled with the same fixed seed. Groups smaller than the requested
    size are returned whole.

    Args:
        dataset: DataFrame to sample from; it is not modified.
        label_column: Column whose values define the strata.
        num_samples_per_label: Upper bound on the rows kept for each label.

    Returns:
        A new DataFrame with all original columns and a fresh 0..n-1 index. Rows
        are ordered by label (groupby's sorted key order). An input with no
        groups yields an empty frame with the same columns.
    """
    shuffled = dataset.sample(frac=1, random_state=42)

    # Plain iteration over the groups hands each group over as a full sub-frame
    # (label column included), so no `include_groups` keyword is needed; that
    # keyword is not accepted uniformly across pandas versions.
    picks = [
        group.sample(min(num_samples_per_label, len(group)), random_state=42)
        for _, group in shuffled.groupby(label_column)
    ]
    if not picks:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return dataset.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks).reset_index(drop=True)


def sorted_stratified_sample(
    dataset,
    label_column: str = 'relevance',
    sorting_column: str = 'score-lv1',
    num_samples_per_label: int = 100,
):
    """Keep the top ``num_samples_per_label`` rows of every label, ranked by ``sorting_column``.

    The frame is sorted by ``sorting_column`` in descending order (missing values
    sort last under pandas' default) and the leading rows of each label group are
    kept. Groups smaller than the requested size are returned whole.

    Args:
        dataset: DataFrame to select from; it is not modified.
        label_column: Column whose values define the strata.
        sorting_column: Column used to rank rows within each label.
        num_samples_per_label: Upper bound on the rows kept for each label.

    Returns:
        A new DataFrame with all original columns and a fresh 0..n-1 index. Rows
        are ordered by label (groupby's sorted key order) and, within a label, by
        descending ``sorting_column``. An input with no groups yields an empty
        frame with the same columns.
    """
    ranked = dataset.sort_values(sorting_column, ascending=False)

    # Plain iteration over the groups hands each group over as a full sub-frame
    # (label column included), so no `include_groups` keyword is needed; that
    # keyword is not accepted uniformly across pandas versions.
    picks = [
        group.head(min(num_samples_per_label, len(group)))
        for _, group in ranked.groupby(label_column)
    ]
    if not picks:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return dataset.iloc[0:0].reset_index(drop=True)
    return pd.concat(picks).reset_index(drop=True)

# Shrink the splits for the current mode. Every draw uses random_state=42.
if not DEV:
    # Full run: train and test are only shuffled; validation is capped at 500 chunks.
    train_data = train_data.sample(frac=1, random_state=42)
    val_data = val_data.sample(500, random_state=42)
    test_data = test_data.sample(frac=1, random_state=42)
else:
    # DEV run: up to 250 training chunks per label, taken from the top of the
    # 'score-lv1' ranking, plus small random validation and test subsets.
    train_data = sorted_stratified_sample(
        train_data,
        label_column='relevance',
        sorting_column='score-lv1',
        num_samples_per_label=250,
    )
    val_data = val_data.sample(100, random_state=42)
    test_data = test_data.sample(200, random_state=42)


In [7]:
# Inspect the training frame after the sub-sampling step above.
train_data

Unnamed: 0,chunk_id,url,text,relevance,multiclasses,score-lv1
0,8600,https://files.worldwildlife.org/wwfcmsprod/fil...,PLOW 0 2 0 PRINT 2 orld Wildlife Fund’s 2020 P...,irrelevant,[],1.0
1,8600,https://files.worldwildlife.org/wwfcmsprod/fil...,report is based on an updat- accessibility of ...,irrelevant,[],1.0
2,8600,https://files.worldwildlife.org/wwfcmsprod/fil...,"5,000 2014 40,685,000 230,499,000 1,139,000 4,...",irrelevant,[],1.0
3,8599,https://www.wyomingwildsheep.org/wp-content/up...,"rically, native desert bighorn sheep occupied ...",irrelevant,[],1.0
4,8599,https://www.wyomingwildsheep.org/wp-content/up...,ected. Ninety- During winter rams consumed mor...,irrelevant,[],1.0
...,...,...,...,...,...,...
495,204,https://www.conservationevidence.com/individua...,bodies of 51 species were edible. The results ...,relevant,[],
496,204,https://www.conservationevidence.com/individua...,ched). Effects of trampling: Trampling signifi...,relevant,[],
497,204,https://www.conservationevidence.com/individua...,"nnual differences. In every year, the number o...",relevant,[],
498,204,https://www.conservationevidence.com/individua...,"trampled plots (195 and 189, respectively).Con...",relevant,[],


In [8]:
from datasets import Dataset

# Wrap each pandas split in a datasets.Dataset, tagging it with its split name.
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame, split=split_name)
    for frame, split_name in (
        (train_data, "train"),
        (test_data, "test"),
        (val_data, "val"),
    )
)

train_dataset

Dataset({
    features: ['chunk_id', 'url', 'text', 'relevance', 'multiclasses', 'score-lv1'],
    num_rows: 500
})

In [9]:

#! had to modify FastFitTrainer in /fastfit/train.py, line 879, to pass trust_remote_code=True when loading the 'accuracy' metric
#! not sure why this isn't the default, since accuracy is the default metric in fastfit



#! IMPORTANT: a further change to FastFitTrainer, also at line 879: comment out the fixed version above and replace it with the code below,
#! since load_metric is deprecated in favour of evaluate.load()
#! the replacement adds support for passing in multiple metrics to evaluate at once
#! and uses macro averages for the non-accuracy metrics too
#! essentially, copy the code below to replace line 879

### Insert into line 879.

```python
        # metric = load_metric(self.data_args.metric_name, experiment_id=uuid.uuid4())
        from evaluate import load

        # metric_name may be a single name or a list/tuple of names; normalise it to a
        # sequence so one code path loads them all. Anything else is rejected up front
        # instead of leaving `metrics` undefined until the first evaluation.
        metric_names = self.data_args.metric_name
        if isinstance(metric_names, str):
            metric_names = [metric_names]
        elif not isinstance(metric_names, (list, tuple)):
            raise TypeError(
                f"metric_name must be a str or a list of str, got {type(metric_names).__name__}"
            )
        metrics = [load(name, experiment_id=uuid.uuid4()) for name in metric_names]

        # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
        # predictions and label_ids field) and has to return a dictionary string to float.
        def compute_metrics(p: EvalPrediction):
            """Compute every loaded metric and merge the results into one dict.

            Predictions are squeezed for regression and arg-maxed over axis 1
            otherwise. 'accuracy' is computed without extra arguments; every
            other metric is computed with average='macro'.
            """
            predictions = (
                p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            )
            predictions = (
                np.squeeze(predictions)
                if self.is_regression
                else np.argmax(predictions, axis=1)
            )
            references = p.label_ids

            results = {}

            for metric in metrics:
                # Only the non-accuracy metrics are given an averaging mode.
                extra_kwargs = {} if metric.name == 'accuracy' else {'average': 'macro'}
                results.update(
                    metric.compute(predictions=predictions, references=references, **extra_kwargs)
                )

            return results
```

In [10]:
import torch

# Report the CUDA setup. The device queries raise when no CUDA device is usable,
# so they are only made once availability has been confirmed; this keeps the
# cell runnable on a CPU-only machine.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

# CUDA version PyTorch was built against.
print(torch.version.cuda)

# Release unused memory held by PyTorch's CUDA caching allocator.
torch.cuda.empty_cache()


True
0
NVIDIA GeForce RTX 3090
12.1


In [11]:
from fastfit import FastFitTrainer

# DEV runs write into a separate dev/ sub-tree so they stay apart from full runs.
output_dir = f'models/relevance/dev/{model_name}' if DEV else f'models/relevance/{model_name}'

# same args as the huggingface TrainingArguments
trainer_kwargs = dict(
    # model and data
    model_name_or_path=model_name,
    train_dataset=train_dataset,
    validation_dataset=val_dataset,
    test_dataset=test_dataset,
    output_dir=output_dir,
    overwrite_output_dir=True,
    label_column_name='relevance',
    text_column_name="text",
    # optimisation
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    max_text_length=512,
    num_repeats=1,
    # evaluate, checkpoint and log once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy='epoch',
    # metrics; the best checkpoint is chosen by recall and reloaded at the end
    metric_name=['precision','recall','f1','accuracy'],
    load_best_model_at_end=True,
    metric_for_best_model='recall',
    fp16=True,
)
trainer = FastFitTrainer(**trainer_kwargs)

print(trainer.model.device)





Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/500 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/100 [00:00<?,…

Running tokenizer on dataset to infer max length for both query and document:   0%|          | 0/200 [00:00<?,…

Running tokenizer on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

cuda:0


In [12]:

#! Another required modification to the installed fastfit library:
#! in /fastfit/train.py, originally line 971, change ignore_keys_for_eval from a set to a list,
#! since it gets concatenated to a list later on.
#! Note that since lines were added above it by the earlier patch, this is now line 981.
#! It is the line beginning ignore_keys_for_eval={"doc_input_ids","doc_attention_mask","labels"}


# Run training; the returned object is kept as `model` and saved at the end of the notebook.
model = trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,3.9898,4.762214,0.742188,0.726461,0.729167,0.74
2,3.554,4.818264,0.782853,0.786526,0.779647,0.78
3,3.4758,5.239364,0.714383,0.69237,0.693802,0.71
4,3.4442,5.705395,0.739389,0.728896,0.731294,0.74
5,3.4375,6.216745,0.729944,0.717532,0.719888,0.73
6,3.431,6.458174,0.727827,0.719968,0.721965,0.73
7,3.4388,6.645418,0.716667,0.711039,0.712644,0.72
8,3.4216,6.773955,0.727827,0.719968,0.721965,0.73
9,3.4316,6.847547,0.727827,0.719968,0.721965,0.73
10,3.4352,6.868251,0.727827,0.719968,0.721965,0.73


***** train metrics *****
  epoch                    =       10.0
  total_flos               =        0GF
  train_loss               =     3.5059
  train_runtime            = 0:00:54.96
  train_samples            =        500
  train_samples_per_second =     90.968
  train_steps_per_second   =      2.911


In [13]:
# Run the trainer's evaluation pass and keep the returned metrics dict
# (the recorded run reports 100 eval samples).
results = trainer.evaluate()

***** eval metrics *****
  epoch                   =       10.0
  eval_accuracy           =       0.78
  eval_f1                 =     0.7796
  eval_loss               =     4.8183
  eval_precision          =     0.7829
  eval_recall             =     0.7865
  eval_runtime            = 0:00:00.39
  eval_samples            =        100
  eval_samples_per_second =    251.615
  eval_steps_per_second   =      5.032


In [14]:
# Show the full metrics dictionary returned by trainer.evaluate().
results

{'eval_loss': 4.818263530731201,
 'eval_precision': 0.7828525641025641,
 'eval_recall': 0.786525974025974,
 'eval_f1': 0.7796474358974359,
 'eval_accuracy': 0.78,
 'eval_runtime': 0.3974,
 'eval_samples_per_second': 251.615,
 'eval_steps_per_second': 5.032,
 'epoch': 10.0,
 'eval_samples': 100}

In [15]:
# Headline accuracy taken from the metrics dict returned by trainer.evaluate().
print(f'Accuracy: {results["eval_accuracy"]}')

Accuracy: 0.78


In [16]:
# Run the trainer's test pass (the recorded run reports 200 test samples).
# Note: this rebinds `results`, replacing the metrics from trainer.evaluate().
results = trainer.test()

***** test metrics *****
  epoch                   =       10.0
  eval_accuracy           =      0.795
  eval_f1                 =     0.7944
  eval_loss               =     4.8178
  eval_precision          =     0.8114
  eval_recall             =     0.8028
  eval_runtime            = 0:00:00.69
  eval_samples_per_second =    287.624
  eval_steps_per_second   =      5.752
  test_samples            =        200


In [17]:


from os import makedirs, path

# Save the final model into the directory that matches the current mode, using
# the same DEV / non-DEV layout as the trainer's output_dir: the dev/ sub-tree
# for DEV runs, the main tree otherwise. Previously a non-DEV run also wrote a
# copy of the full model under dev/.
save_dir = f'models/relevance/dev/{model_name}' if DEV else f'models/relevance/{model_name}'

# exist_ok=True makes this safe to re-run when the directory already exists.
makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)