In [1]:
from datasets import load_dataset
    
# Pull the biotech event-classification corpus from the Hugging Face Hub.
dataset = load_dataset('knowledgator/events_classification_biotech')

# Label vocabulary: every non-empty name declared on the 'label 1' feature,
# plus lookup tables in both directions (name -> index, index -> name).
classes = list(filter(None, dataset['train'].features['label 1'].names))
class2id = {name: index for index, name in enumerate(classes)}
id2class = {index: name for name, index in class2id.items()}

In [2]:
# Show the DatasetDict summary: split names, feature columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5'],
        num_rows: 2759
    })
    test: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5'],
        num_rows: 381
    })
})

In [8]:
from transformers import AutoTokenizer

# Checkpoint name; it is used to load the tokenizer here and the model later on.
model_path = 'afmck/testing-llama-tiny'

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id= tokenizer.eos_token_id

In [9]:
def preprocess_function(example):
    """Tokenize one dataset row and attach its multi-hot label vector.

    Args:
        example: A single row (not a batch) providing the 'title', 'content'
            and 'all_labels' fields.

    Returns:
        The tokenizer output for the title, a period and a newline, followed
        by the content, with an added 'labels' entry: a list of
        ``len(classes)`` floats holding 1.0 at the index of every label named
        in 'all_labels' and 0.0 everywhere else.

    Raises:
        KeyError: If a label in 'all_labels' is missing from ``class2id``.
    """
    text = f"{example['title']}.\n{example['content']}"

    # Multi-hot target: one float slot per known class.
    labels = [0.0] * len(classes)
    for label in example['all_labels']:
        labels[class2id[label]] = 1.0

    # No padding is requested here: rows are encoded one at a time, so padding
    # to the longest sequence of the call would add nothing. Batches are padded
    # later by the data collator.
    encoding = tokenizer(text, truncation=True)
    encoding['labels'] = labels
    return encoding

# Apply the preprocessing to every split, one row per call (map is not batched here).
tokenized_dataset = dataset.map(preprocess_function)

Map:   0%|          | 0/2759 [00:00<?, ? examples/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

In [10]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; it is handed to the Trainer for batching.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
import evaluate
import numpy as np

# Bundle the four metrics so a single compute() call reports all of them.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The plain formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative inputs. Here the exponential is only
    ever taken of non-positive values, so it cannot overflow.

    Args:
        x: A scalar or array-like of real values (e.g. model logits).

    Returns:
        The element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1]; both branches below are algebraically equal
    # to 1 / (1 + exp(-x)) on their side of zero.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # proper arrays untouched, matching what the plain formula returned.
    return result[()]

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into the combined classification metrics.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predictions are
            passed through ``sigmoid`` and thresholded at 0.5.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened 0/1
        predictions and the flattened integer labels.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    # Every (example, class) cell is scored as its own binary decision.
    hard_predictions = (probabilities > 0.5).astype(int)
    return clf_metrics.compute(
        predictions=hard_predictions.reshape(-1),
        references=references.astype(int).reshape(-1),
    )


In [14]:
# Sanity check: print the vocabulary size before and after assigning the pad token.
print(tokenizer.vocab_size)
# Same assignment as in the tokenizer setup cell: the pad token reuses the EOS id.
tokenizer.pad_token_id = tokenizer.eos_token_id
print(tokenizer.eos_token_id)
print(tokenizer.vocab_size)

32000
2
32000


In [19]:
# Disabled alternative, kept only as a bare string literal (it is never executed):
# build a LlamaForSequenceClassification from a fresh LlamaConfig instead of
# loading a checkpoint.
# NOTE(review): `num_label` below looks like a typo for `num_labels` - confirm
# before re-enabling this code.
'''
from transformers import LlamaForSequenceClassification, LlamaConfig
config = LlamaConfig(
        vocab_size=tokenizer.vocab_size,  # Based on the number of unique tokens
        hidden_size = 256,
        intermediate_size = 256,
        num_hidden_layers = 4,
        num_attention_heads = 4,
        max_position_embeddings = 512,
        use_cache = True,
        rope_theta=256,
        num_label = 30,
        
        pad_token_id = tokenizer.eos_token_id,
        bos_token_id = tokenizer.bos_token_id,
        eos_token_id = tokenizer.eos_token_id,
        
    )
model = LlamaForSequenceClassification(config)
'''
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head configured for
# multi-label prediction.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    # One output per entry in `classes`, so the head always lines up with the
    # label vectors built in preprocess_function (previously hard-coded as 30).
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
    pad_token_id=tokenizer.eos_token_id,
)
# Size the embedding matrix to the tokenizer's length; the returned embedding
# module is the cell's displayed value.
model.resize_token_embeddings(len(tokenizer))

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at afmck/testing-llama-tiny and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(32000, 512)

In [20]:
# Report the parameter count of the loaded model.
print(f"Total parameters in the model: {model.num_parameters()}")

Total parameters in the model: 37376512


In [21]:
from transformers import Trainer, TrainingArguments
# Training configuration: evaluate and checkpoint once per epoch so that the
# best epoch can be restored when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by F1 instead of the default validation loss:
    # in the recorded run the loss is lowest at epoch 3 while F1 keeps
    # improving in later epochs, so loss-based selection restores a weaker
    # classifier.
    metric_for_best_model="f1",
)

# Wire the model, tokenized splits, padding collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Start fine-tuning; with the per-epoch evaluation setting, metrics are logged after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1599,0.138741,0.954331,0.462963,0.712025,0.342988
2,0.0983,0.121031,0.959318,0.579186,0.712695,0.487805
3,0.0464,0.118363,0.964392,0.647008,0.750503,0.568598
4,0.028,0.123717,0.965004,0.65338,0.757028,0.574695
5,0.0192,0.125284,0.965704,0.667233,0.752874,0.599085
6,0.0158,0.132015,0.966842,0.672429,0.776447,0.592988
7,0.0141,0.131792,0.966929,0.675815,0.772549,0.60061
8,0.0109,0.143265,0.966317,0.677284,0.752328,0.615854
9,0.0091,0.141085,0.967542,0.681545,0.779961,0.605183
10,0.0084,0.147862,0.966842,0.680708,0.760829,0.615854
