In [1]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.27.0-py2.py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting validators<1,>=0.2 (from streamlit->simpletransformers)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
[2K     

In [2]:
!pip install datasets
!pip install sentence_transformers
!pip install setfit

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=1ad06a6cd0f289a405fd4ba3980b98ca0e223ac945b8f05631075200da0d2bd4
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
Collecting setfit
  Downloading setfit-0.7.0-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m2.0 MB/s[0

# Libraries & Functions

In [3]:
from datasets import Dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset



In [4]:
import pandas as pd 
import numpy as np
from tqdm import tqdm

In [5]:
from simpletransformers.classification import ClassificationModel
import torch

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

def calculate_scores(y_test, y_pred, average = "binary"):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average = average)
    recall = recall_score(y_test, y_pred, average = average)
    f1 = f1_score(y_test, y_pred, average = average)
    return [accuracy, precision, recall, f1]

In [7]:
model_names = ["distilbert"]
checkpoint_names = ["distilbert-base-uncased"]

In [8]:
focus_names = ["Growth Form"]#, "Life Form"]
focus_codes = ["1.2.1"]#, "2.3.1"]

# Input Data

In [9]:
raw_datasets = dict()

## POWO Dataset

In [10]:
working_dir = "..//input//powo-gift-final//" 

df_POWO_Cat =  pd.read_excel(working_dir + "POWO_GIFT_Final.xlsx")
df_POWO_Cat_Preproc = df_POWO_Cat.drop_duplicates(subset = ["BERT_description"])
df_POWO_Cat_Preproc = df_POWO_Cat_Preproc[df_POWO_Cat_Preproc["BERT_description"].apply(lambda x: len(x.split(" ")))>10]
raw_datasets["POWO"] = df_POWO_Cat_Preproc

## WIKI Dataset

In [11]:
def fix_WIKI(name, description):
    for n in name.split(" "):
        description = str(description).replace(n.lower(), "")
    return description.strip()

In [12]:
working_dir = "..//input//wiki-gift-final//" 

df_WIKI_Cat =  pd.read_excel(working_dir + "WIKI_GIFT_Final.xlsx")
df_WIKI_Cat_Preproc = df_WIKI_Cat.drop_duplicates(subset = ["BERT_description"])
df_WIKI_Cat_Preproc["BERT_description"] = df_WIKI_Cat_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)
df_WIKI_Cat_Preproc = df_WIKI_Cat_Preproc[df_WIKI_Cat_Preproc["BERT_description"].apply(lambda x: len(str(x).split(" ")))>10]
raw_datasets["WIKI"] = df_WIKI_Cat_Preproc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_WIKI_Cat_Preproc["BERT_description"] = df_WIKI_Cat_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)


## Preprocess Datasets

In [13]:
label_map = {
    "Growth Form": {"herb": 0, "shrub": 1, "tree": 2},
    "Life Form": {"phanerophyte": 0, "chamaephyte": 1, "hemicryptophyte": 2, "cryptophyte": 3, "therophyte": 4},
}


In [14]:
preprocessed_dataset_dict = {}
sample_size = 5000
for focus_name, focus_code in zip(focus_names, focus_codes):
    for dataset_name in list(raw_datasets.keys()):
        labelencoder = LabelEncoder()

        dataset_masked = raw_datasets[dataset_name][raw_datasets[dataset_name][focus_code].notna()]
        dataset_masked = dataset_masked[dataset_masked[focus_code].apply(lambda x: x in label_map[focus_name].keys())].sample(sample_size)
        dataset_masked[focus_code + "_encoded"] = labelencoder.fit_transform(dataset_masked[focus_code])

        indices_train, indices_test \
            = train_test_split(dataset_masked.index.values, test_size=0.25, random_state=42)
            
        df_train = dataset_masked.loc[indices_train, ["BERT_description", focus_code + "_encoded"]]
        df_train.columns = ["text", "labels"]
        df_test = dataset_masked.loc[indices_test, ["BERT_description", focus_code + "_encoded"]]
        df_test.columns = ["text", "labels"]
        
        preprocessed_dataset_dict[dataset_name, focus_name, "train"] = df_train
        preprocessed_dataset_dict[dataset_name, focus_name, "validation"] = df_test

# Model Training & Evaluation

## DistilBERT

In [26]:
if torch.cuda.is_available():  # Tell PyTorch to use the GPU. 
    device = torch.device("cuda") 
    print('There are %d GPU(s) available.' % torch.cuda.device_count()) 
    print('We will use the GPU:', torch.cuda.get_device_name(0)) # If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
import gc
torch.cuda.empty_cache()
gc.collect()

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


30

In [27]:
results_list = []
FS_sample_sizes = [32, 128, 512]
for model_name, model_checkpoint in zip(model_names[:], checkpoint_names[:]):
    for dataset_name in list(raw_datasets.keys())[:]:
        for FS_sample_size in FS_sample_sizes:
            print(model_name, dataset_name)

            model = ClassificationModel(
                model_name,
                model_checkpoint,
                num_labels = preprocessed_dataset_dict[dataset_name, focus_name, "train"]["labels"].nunique(),
                args = {"num_train_epochs": 3, "train_batch_size":8, "eval_batch_size":8, "reprocess_input_data": True, "overwrite_output_dir": True, "save_model_every_epoch": False, "save_eval_checkpoints": False, "max_seq_length": 512}, #"weight_decay": 0.01, "learning_rate": 2e-5, 
            )
            # Train the model
            model.train_model(preprocessed_dataset_dict[dataset_name, focus_name, "train"].sample(FS_sample_size, random_state = 42))

            # Evaluate the model
            result, model_outputs, wrong_predictions = model.eval_model(preprocessed_dataset_dict[dataset_name, focus_name, "validation"])
            preprocessed_dataset_dict[dataset_name, focus_name, "validation"]["prediction"] = np.argmax(model_outputs, axis=1)
            results = calculate_scores(preprocessed_dataset_dict[dataset_name, focus_name, "validation"]["labels"], preprocessed_dataset_dict[dataset_name, focus_name, "validation"]["prediction"], average = "macro")
            results_list.append([dataset_name, focus_name, FS_sample_size] + results + [model_checkpoint])

            torch.cuda.empty_cache()
            gc.collect()


df_results = pd.DataFrame(results_list, columns=["Dataset", "Trait", "Sample Size", "Accuracy", "Precision", "Recall", "F1-Score", "Model"])

distilbert POWO


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/32 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


distilbert POWO


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/128 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


distilbert POWO


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/512 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/64 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/64 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

distilbert WIKI


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/32 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


distilbert WIKI


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/128 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/16 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


distilbert WIKI


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/512 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/64 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/64 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

In [28]:
df_results

Unnamed: 0,Dataset,Trait,Sample Size,Accuracy,Precision,Recall,F1-Score,Model
0,POWO,Growth Form,32,0.7128,0.2376,0.333333,0.27744,distilbert-base-uncased
1,POWO,Growth Form,128,0.7128,0.2376,0.333333,0.27744,distilbert-base-uncased
2,POWO,Growth Form,512,0.8848,0.819866,0.782708,0.796845,distilbert-base-uncased
3,WIKI,Growth Form,32,0.5224,0.174133,0.333333,0.228762,distilbert-base-uncased
4,WIKI,Growth Form,128,0.5232,0.507606,0.334234,0.230679,distilbert-base-uncased
5,WIKI,Growth Form,512,0.8736,0.865919,0.822443,0.838805,distilbert-base-uncased


In [29]:
df_results.to_excel("FewShotLearningTraitClassification_Encoder.xlsx", index = False)

## SetFit

In [15]:
if torch.cuda.is_available():  # Tell PyTorch to use the GPU. 
    device = torch.device("cuda") 
    print('There are %d GPU(s) available.' % torch.cuda.device_count()) 
    print('We will use the GPU:', torch.cuda.get_device_name(0)) # If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
import gc
torch.cuda.empty_cache()
gc.collect()

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


118

In [20]:
results_list = []
FS_sample_sizes = [32, 128, 512]
for model_name, model_checkpoint in zip(model_names[:], checkpoint_names[:]):
    for dataset_name in list(raw_datasets.keys())[:]:
        for FS_sample_size in FS_sample_sizes:
            print(model_name, dataset_name)

            # Load a SetFit model from Hub
            model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")
            
            train_dataset = Dataset.from_pandas(preprocessed_dataset_dict[dataset_name, focus_name, "train"].sample(FS_sample_size, random_state = 42))
            eval_dataset = Dataset.from_pandas(preprocessed_dataset_dict[dataset_name, focus_name, "validation"])

            # Create trainer
            trainer = SetFitTrainer(
                model=model,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                loss_class=CosineSimilarityLoss,
                metric="f1",
                metric_kwargs = {"average": "macro"},
                batch_size=8,
                num_iterations=20, 
                num_epochs=1, 
                column_mapping={"text": "text", "labels": "label"}
            )
            
            trainer.train()
            metrics = trainer.evaluate()

            results_list.append([dataset_name, focus_name, FS_sample_size, 0, 0, 0, metrics["f1"], model_checkpoint])

            torch.cuda.empty_cache()
            gc.collect()


df_results = pd.DataFrame(results_list, columns=["Dataset", "Trait", "Sample Size", "Accuracy", "Precision", "Recall", "F1-Score", "Model"])

distilbert POWO


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1280
  Num epochs = 1
  Total optimization steps = 160
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/160 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

distilbert POWO


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 5120
  Num epochs = 1
  Total optimization steps = 640
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

distilbert POWO


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 20480
  Num epochs = 1
  Total optimization steps = 2560
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2560 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

distilbert WIKI


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 1280
  Num epochs = 1
  Total optimization steps = 160
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/160 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

distilbert WIKI


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 5120
  Num epochs = 1
  Total optimization steps = 640
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/640 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

distilbert WIKI


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 20480
  Num epochs = 1
  Total optimization steps = 2560
  Total train batch size = 8


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2560 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

In [21]:
df_results

Unnamed: 0,Dataset,Trait,Sample Size,Accuracy,Precision,Recall,F1-Score,Model
0,POWO,Growth Form,32,0,0,0,0.306703,distilbert-base-uncased
1,POWO,Growth Form,128,0,0,0,0.794675,distilbert-base-uncased
2,POWO,Growth Form,512,0,0,0,0.835472,distilbert-base-uncased
3,WIKI,Growth Form,32,0,0,0,0.519236,distilbert-base-uncased
4,WIKI,Growth Form,128,0,0,0,0.821678,distilbert-base-uncased
5,WIKI,Growth Form,512,0,0,0,0.829715,distilbert-base-uncased


In [22]:
df_results.to_excel("FewShotLearningTraitClassification_SetFit.xlsx", index = False)