In [1]:
# Install into the running kernel's environment (%pip rather than !pip) and pin
# the release this notebook was last executed with, so a fresh kernel resolves
# the same library version.
%pip install simpletransformers==0.64.3

Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting transformers>=4.31.0 (from simpletransformers)
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.26.0-py2.py3-none-any.whl (8.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Col

# Libraries & Functions

In [2]:
import pandas as pd 
import numpy as np
from tqdm import tqdm

In [3]:
from simpletransformers.classification import ClassificationModel
import torch

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

def calculate_scores(y_test, y_pred, average = "binary"):
    """Score predictions against reference labels.

    Args:
        y_test: Reference labels.
        y_pred: Predicted labels, aligned with ``y_test``.
        average: Averaging mode forwarded to the precision, recall and F1
            metrics (accuracy takes no averaging mode).

    Returns:
        A list ``[accuracy, precision, recall, f1]``.
    """
    # Accuracy first, then the three metrics that share the ``average`` option,
    # evaluated in the order they appear in the returned list.
    scores = [accuracy_score(y_test, y_pred)]
    for averaged_metric in (precision_score, recall_score, f1_score):
        scores.append(averaged_metric(y_test, y_pred, average = average))
    return scores

In [5]:
# (model type, checkpoint identifier) pairs, listed in the order they are trained.
# The two parallel lists below are derived from this single table so the
# pairing cannot drift out of sync.
_model_specs = [
    ("distilbert", "ViktorDo/EcoBERT-Pretrained"),
    ("distilbert", "distilbert-base-uncased"),
    ("debertav2", "microsoft/deberta-v3-base"),
    ("electra", "google/electra-base-discriminator"),
]
model_names = [spec[0] for spec in _model_specs]
checkpoint_names = [spec[1] for spec in _model_specs]

# Input Data

In [6]:
# Registry of loaded datasets, keyed by dataset name; filled in by the loading cells.
raw_datasets = {}

In [7]:
# NOTE(review): not referenced by any other cell visible in this notebook; the
# description-length filters use a hard-coded bound of 10 instead — TODO confirm
# whether this constant is still needed.
description_threshold = 100

## POWO Dataset

In [8]:
working_dir = "..//input//powo-family//"

df_POWO_Fam = pd.read_excel(working_dir + "POWO_Family.xlsx")

# Keep one row per distinct description; .copy() detaches the result from the raw frame.
df_POWO_Fam_Preproc = df_POWO_Fam.drop_duplicates(subset = ["BERT_description"]).copy()

# Keep descriptions with more than 10 space-separated tokens. str() guards
# against non-string cells (e.g. NaN from an empty spreadsheet cell), which
# would otherwise fail on .split; this matches the WIKI loading cell.
powo_token_counts = df_POWO_Fam_Preproc["BERT_description"].apply(lambda x: len(str(x).split(" ")))
df_POWO_Fam_Preproc = df_POWO_Fam_Preproc[powo_token_counts > 10]

# The 10 most frequent families after filtering (value_counts sorts by descending count).
POWO_Filter = df_POWO_Fam_Preproc["family"].value_counts().keys().values[:10]

# Restrict to those families and draw 500 rows from each with a fixed seed.
# NOTE: sample(500) raises if a selected family has fewer than 500 rows.
df_POWO_Fam_Preproc = (
    df_POWO_Fam_Preproc[df_POWO_Fam_Preproc["family"].isin(POWO_Filter)]
    .groupby('family', group_keys=False)
    .apply(lambda x: x.sample(500, random_state = 42))
)
raw_datasets["POWO"] = df_POWO_Fam_Preproc

## WIKI Dataset

In [9]:
def fix_WIKI(name, description):
    """Remove the lower-cased tokens of ``name`` from ``description``.

    ``name`` is split on single spaces; each token is lower-cased and every
    occurrence of it is deleted from ``str(description)`` as a plain substring
    (so matches inside longer words are removed as well). Matching is
    case-sensitive against the lower-cased token.

    Args:
        name: Name whose tokens should be removed. Non-string values (e.g. NaN
            read from an empty cell) contribute no tokens instead of failing
            on ``.split``.
        description: Text to clean; always converted with ``str()``.

    Returns:
        The cleaned description with leading/trailing whitespace stripped.
    """
    description = str(description)
    # Empty tokens (from repeated spaces) are skipped; replacing "" is a no-op anyway.
    tokens = [n for n in name.split(" ") if n] if isinstance(name, str) else []
    for n in tokens:
        description = description.replace(n.lower(), "")
    return description.strip()

In [10]:
working_dir = "..//input//wiki-family//"

df_WIKI_Fam = pd.read_excel(working_dir + "WIKI_Family.xlsx")

# Keep one row per distinct description. The explicit .copy() makes the column
# assignment below write to an independent frame rather than to a slice of
# df_WIKI_Fam, which is what triggers pandas' SettingWithCopyWarning.
df_WIKI_Fam_Preproc = df_WIKI_Fam.drop_duplicates(subset = ["BERT_description"]).copy()

# Delete the tokens of each row's name from its description. Columns are read
# by label: integer keys on a label-indexed row (x[0], x[1]) rely on the
# deprecated positional fallback.
df_WIKI_Fam_Preproc["BERT_description"] = df_WIKI_Fam_Preproc[["name", "BERT_description"]].apply(
    lambda row: fix_WIKI(row["name"], row["BERT_description"]), axis = 1
)

# Keep descriptions with more than 10 space-separated tokens.
wiki_token_counts = df_WIKI_Fam_Preproc["BERT_description"].apply(lambda x: len(str(x).split(" ")))
df_WIKI_Fam_Preproc = df_WIKI_Fam_Preproc[wiki_token_counts > 10]

# The 10 most frequent families after filtering (value_counts sorts by descending count).
WIKI_Filter = df_WIKI_Fam_Preproc["family"].value_counts().keys().values[:10]

# Restrict to those families and draw 500 rows from each with a fixed seed.
# NOTE: sample(500) raises if a selected family has fewer than 500 rows.
df_WIKI_Fam_Preproc = (
    df_WIKI_Fam_Preproc[df_WIKI_Fam_Preproc["family"].isin(WIKI_Filter)]
    .groupby('family', group_keys=False)
    .apply(lambda x: x.sample(500, random_state = 42))
)
raw_datasets["WIKI"] = df_WIKI_Fam_Preproc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_WIKI_Fam_Preproc["BERT_description"] = df_WIKI_Fam_Preproc[["name", "BERT_description"]].apply(lambda x: fix_WIKI(x[0], x[1]), axis = 1)


## Preprocess Datasets

In [11]:
# Build the train/validation frames for every registered dataset.
# Result keys are (dataset_name, "train") and (dataset_name, "validation");
# each frame has exactly the columns "text" and "labels".
preprocessed_dataset_dict = {}
for dataset_name, dataset_df in raw_datasets.items():
    # Integer-encode the family names; a fresh encoder is fitted per dataset
    # and the encoded column is written back onto the registered frame.
    labelencoder = LabelEncoder()
    dataset_df["family_encoded"] = labelencoder.fit_transform(dataset_df["family"])

    # 75/25 split over the row index labels, with a fixed seed.
    indices_train, indices_test = train_test_split(
        dataset_df.index.values, test_size=0.25, random_state=42
    )

    # Select the two model columns for each split, then give both the same column names.
    df_train, df_test = (
        dataset_df.loc[split_indices, ["BERT_description", "family_encoded"]]
        for split_indices in (indices_train, indices_test)
    )
    df_train.columns = ["text", "labels"]
    df_test.columns = ["text", "labels"]

    preprocessed_dataset_dict[dataset_name, "train"] = df_train
    preprocessed_dataset_dict[dataset_name, "validation"] = df_test

# Model Training & Evaluation

In [12]:
import gc

# Choose the torch device: CUDA when available, otherwise fall back to the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Release cached CUDA memory, then run a garbage-collection pass; the cell's
# displayed value is the return value of gc.collect().
torch.cuda.empty_cache()
gc.collect()

There are 2 GPU(s) available.
We will use the GPU: Tesla T4


130

In [13]:
results_list = []

# Training/evaluation options shared by every run.
train_args = {"num_train_epochs": 3, "train_batch_size":8, "eval_batch_size":8, "reprocess_input_data": True, "overwrite_output_dir": True, "save_model_every_epoch": False, "save_eval_checkpoints": False, "max_seq_length": 512} #"weight_decay": 0.01, "learning_rate": 2e-5,

# Fine-tune and evaluate every (model, checkpoint) pair on every dataset.
for model_name, model_checkpoint in zip(model_names, checkpoint_names):
    for dataset_name in list(raw_datasets.keys()):

        print(model_checkpoint, dataset_name)

        # Only this checkpoint is given an explicit tokenizer name; every other
        # checkpoint is constructed without the tokenizer_name argument.
        model_kwargs = {}
        if model_checkpoint == "ViktorDo/EcoBERT-Pretrained":
            model_kwargs["tokenizer_name"] = "distilbert-base-uncased"

        model = ClassificationModel(
            model_name,
            model_checkpoint,
            num_labels = 10, #preprocessed_dataset_dict[dataset_name, "train"]["labels"].nunique(),
            args = dict(train_args),  # fresh copy per run so runs cannot share mutated options
            **model_kwargs,
        )

        # Train the model
        model.train_model(preprocessed_dataset_dict[dataset_name, "train"])

        # Evaluate the model; the predicted class is the arg-max over the model outputs.
        validation_df = preprocessed_dataset_dict[dataset_name, "validation"]
        result, model_outputs, wrong_predictions = model.eval_model(validation_df)
        # The prediction column is overwritten by each model evaluated on this dataset.
        validation_df["prediction"] = np.argmax(model_outputs, axis=1)
        results = calculate_scores(validation_df["labels"], validation_df["prediction"], average = "macro")
        # NOTE(review): "Model" stores the model type, so two checkpoints of the
        # same type (both "distilbert" runs) are only distinguishable by row order.
        results_list.append([dataset_name, "Family"] + results + [model_name])

        # Drop the last reference to the model BEFORE collecting and emptying the
        # CUDA cache; while `model` is still bound, its memory cannot be released.
        del model
        gc.collect()
        torch.cuda.empty_cache()


df_results = pd.DataFrame(results_list, columns=["Dataset", "Trait", "Accuracy", "Precision", "Recall", "F1-Score", "Model"])

ViktorDo/EcoBERT-Pretrained POWO


Downloading (…)lve/main/config.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

ViktorDo/EcoBERT-Pretrained WIKI


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

distilbert-base-uncased POWO


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

distilbert-base-uncased WIKI


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

microsoft/deberta-v3-base POWO


Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

microsoft/deberta-v3-base WIKI


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

google/electra-base-discriminator POWO


Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

google/electra-base-discriminator WIKI


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3750 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/157 [00:00<?, ?it/s]

In [14]:
# Tabulate the collected metric rows, one row per training/evaluation run.
result_columns = ["Dataset", "Trait", "Accuracy", "Precision", "Recall", "F1-Score", "Model"]
df_results = pd.DataFrame(results_list, columns=result_columns)

In [15]:
# Display the metrics table (one row per run).
df_results

Unnamed: 0,Dataset,Trait,Accuracy,Precision,Recall,F1-Score,Model
0,POWO,Family,0.9648,0.965422,0.965381,0.965209,distilbert
1,WIKI,Family,0.9712,0.97069,0.970147,0.97025,distilbert
2,POWO,Family,0.968,0.9684,0.968147,0.968233,distilbert
3,WIKI,Family,0.9712,0.970701,0.97007,0.970276,distilbert
4,POWO,Family,0.948,0.949728,0.948704,0.948944,debertav2
5,WIKI,Family,0.9656,0.965516,0.964464,0.964734,debertav2
6,POWO,Family,0.964,0.9651,0.964041,0.964369,electra
7,WIKI,Family,0.9712,0.970611,0.970563,0.970299,electra


In [16]:
# Write the metrics table to an Excel workbook in the working directory,
# omitting the DataFrame index column.
df_results.to_excel(
    excel_writer="FamilyClassification_Encoder_Results.xlsx",
    index=False,
)

In [17]:
# Show the metrics table again after the Excel export above.
df_results

Unnamed: 0,Dataset,Trait,Accuracy,Precision,Recall,F1-Score,Model
0,POWO,Family,0.9648,0.965422,0.965381,0.965209,distilbert
1,WIKI,Family,0.9712,0.97069,0.970147,0.97025,distilbert
2,POWO,Family,0.968,0.9684,0.968147,0.968233,distilbert
3,WIKI,Family,0.9712,0.970701,0.97007,0.970276,distilbert
4,POWO,Family,0.948,0.949728,0.948704,0.948944,debertav2
5,WIKI,Family,0.9656,0.965516,0.964464,0.964734,debertav2
6,POWO,Family,0.964,0.9651,0.964041,0.964369,electra
7,WIKI,Family,0.9712,0.970611,0.970563,0.970299,electra
