In [19]:
import pickle
from datasets import Dataset

def load_data_from_pickle(filepath):
    """Read a pickled mapping from ``filepath`` and return its texts and labels.

    The file is opened in binary mode and deserialized with ``pickle.load``.
    The loaded object must support lookup by the keys ``'texts'`` and
    ``'labels'``; a missing key propagates as the lookup's own error.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        A ``(texts, labels)`` tuple built from those two entries.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files from a trusted source.
    """
    with open(filepath, mode='rb') as handle:
        payload = pickle.load(handle)
    texts = payload['texts']
    labels = payload['labels']
    return texts, labels

def prepare_dataset(texts, labels, test_size=0.2, seed=None):
    """Wrap texts and labels in a ``Dataset`` and split it into train/test parts.

    Args:
        texts: Values stored under the ``'texts'`` column.
        labels: Values stored under the ``'labels'`` column.
        test_size: Forwarded to ``train_test_split``. Defaults to ``0.2``,
            the value that used to be hard-coded here.
        seed: Optional seed forwarded to ``train_test_split`` for the shuffle.
            ``None`` (the default) keeps the previous unseeded behavior; pass
            an integer to get the same split on every run.

    Returns:
        Whatever ``Dataset.train_test_split`` returns; the rest of this
        notebook indexes it with ``"train"`` and ``"test"``.
    """
    ds = Dataset.from_dict({
        'texts': texts,
        'labels': labels,
    })
    return ds.train_test_split(test_size=test_size, shuffle=True, seed=seed)

In [20]:
# Load the pickled texts/labels for the training dataset and split them
# into train/test parts with the helpers defined above.
dataset_id = 'dataset/addition_dataset_ct.pkl'
texts, labels = load_data_from_pickle(dataset_id)
dataset = prepare_dataset(texts, labels)

In [22]:
from transformers import AutoTokenizer

# Checkpoint identifier; reused further down to load the classification model.
model_path = 'afmck/testing-llama-tiny'

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token id as the padding token id.
# (presumably because this tokenizer has no pad token of its own — TODO confirm)
tokenizer.pad_token_id= tokenizer.eos_token_id

In [24]:
def preprocess_function(example):
    """Tokenize one dataset row and carry its labels through unchanged.

    Args:
        example: A mapping with a ``'texts'`` entry (passed to the
            module-level ``tokenizer``) and a ``'labels'`` entry.

    Returns:
        The tokenizer's output with the row's ``'labels'`` value added under
        the key ``'labels'``.

    Padding is deliberately not requested here. This function is mapped over
    the dataset one row at a time, where "pad to the longest sequence in the
    batch" has nothing to pad against, and the ``DataCollatorWithPadding``
    handed to the ``Trainer`` pads each batch when it is assembled.
    """
    encoded = tokenizer(example['texts'], truncation=True)
    encoded['labels'] = example['labels']
    return encoded

# Tokenize every split of the dataset; the recorded output shows one pass
# over 8000 examples and one over 2000.
tokenized_dataset = dataset.map(preprocess_function)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [25]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
import evaluate
import numpy as np

# One combined metric object so a single compute() call in compute_metrics
# reports accuracy, F1, precision and recall together.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` without overflow.

    The direct formula evaluates ``exp(-x)``, which overflows to ``inf`` (and
    emits a RuntimeWarning) for large negative inputs. This version only ever
    exponentiates ``-|x|`` and picks the algebraically equivalent branch for
    each sign, so extreme logits are handled silently.

    Args:
        x: A scalar or array-like of numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array of the same shape
        as ``x``. Floating-point input keeps its dtype.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in [0, 1], so neither branch below can overflow.
    z = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return out[()]

def compute_metrics(eval_pred):
    """Turn raw model outputs into thresholded predictions and score them.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            labels must support ``.astype`` and ``.reshape``; the predictions
            are passed through ``sigmoid`` first.

    Returns:
        The result of ``clf_metrics.compute`` on the flattened 0/1
        predictions and the flattened integer labels.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    # Threshold at 0.5, then flatten so every element is scored on its own.
    hard_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=hard_predictions,
        references=flat_references,
    )


In [27]:
# Sanity check: print the vocabulary size before and after assigning the pad
# token id, plus the eos token id that is reused for padding.
print(tokenizer.vocab_size)
# NOTE(review): the same assignment is already made right after the tokenizer
# is loaded, so this line looks redundant.
tokenizer.pad_token_id = tokenizer.eos_token_id
print(tokenizer.eos_token_id)
print(tokenizer.vocab_size)

32000
2
32000


In [33]:
'''
from transformers import LlamaForSequenceClassification, LlamaConfig
config = LlamaConfig(
        vocab_size=tokenizer.vocab_size,  # Based on the number of unique tokens
        hidden_size = 256,
        intermediate_size = 256,
        num_hidden_layers = 4,
        num_attention_heads = 4,
        max_position_embeddings = 512,
        use_cache = True,
        rope_theta=256,
        num_label = 30,
        
        pad_token_id = tokenizer.eos_token_id,
        bos_token_id = tokenizer.bos_token_id,
        eos_token_id = tokenizer.eos_token_id,
        
    )
model = LlamaForSequenceClassification(config)
'''
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head. The recorded
# output reports `score.weight` as newly initialized, i.e. the head is
# untrained until the Trainer below runs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, 
    # presumably the length of each label vector in the dataset — TODO confirm
    num_labels=220, 
    #id2label=id2class, 
    #label2id=class2id, 
    problem_type = "multi_label_classification",
    # Same pad id the tokenizer was configured with (its eos token id).
    pad_token_id = tokenizer.eos_token_id,
    
)
# Size the embedding matrix to the tokenizer; the cell displays the returned
# embedding module (Embedding(32000, 512) in the recorded output).
model.resize_token_embeddings(len(tokenizer))

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at afmck/testing-llama-tiny and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(32000, 512)

In [34]:
# Report the model's parameter count as returned by num_parameters().
print(f"Total parameters in the model: {model.num_parameters()}")

Total parameters in the model: 37473792


In [35]:
from transformers import Trainer, TrainingArguments
# Training configuration: evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
# NOTE(review): no metric_for_best_model is set here, so "best" is decided by
# the library default (presumably evaluation loss) — confirm.
training_args = TrainingArguments(

   output_dir="my_awesome_model",
   learning_rate=2e-5,
   per_device_train_batch_size=2,
   per_device_eval_batch_size=2,
   num_train_epochs=20,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

# Wire the model, the tokenized train/test splits, the padding collator and
# the metric callback into a single Trainer.
trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [36]:
# Start fine-tuning. In the recorded run this was stopped by hand
# (KeyboardInterrupt) with 8 rows logged, short of the configured 20 epochs.
# NOTE(review): the epoch-8 validation numbers in the saved table are
# identical to the "Class: 20" evaluation printed further down, so that row
# looks like it was overwritten by a later evaluate() call rather than showing
# a real collapse in training — confirm before drawing conclusions from it.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0613,0.061463,0.98145,0.898862,0.98823,0.824318
2,0.0554,0.055427,0.982493,0.904449,0.99563,0.828568
3,0.0523,0.052058,0.982936,0.907187,0.994552,0.833932
4,0.0502,0.050514,0.983357,0.910091,0.989693,0.842341
5,0.0475,0.049228,0.983784,0.912765,0.987748,0.848364
6,0.0458,0.048475,0.984309,0.915877,0.987208,0.854159
7,0.0435,0.047249,0.984575,0.917528,0.985873,0.858045
8,0.0411,1.650538,0.847266,0.155306,0.173739,0.140409


KeyboardInterrupt: 

In [38]:
# Score the current model on the test split of the training dataset; the
# returned metrics dict is the cell's displayed output.
trainer.evaluate(tokenized_dataset["test"])

{'eval_loss': 0.04566410183906555,
 'eval_accuracy': 0.9845227272727273,
 'eval_f1': 0.9172298118527882,
 'eval_precision': 0.9858135646358032,
 'eval_recall': 0.8575681818181818}

In [40]:
# Evaluate the trained model on the numbered datasets ct6 .. ct20.
# NOTE: this loop rebinds the notebook-level names `dataset_id`, `texts` and
# `labels` that were first set when the training dataset was loaded.
for i in range(6, 21):
    print('Class: ', i)
    dataset_id = f'dataset/addition_dataset_ct{i}.pkl'
    texts, labels = load_data_from_pickle(dataset_id)
    test_dataset = prepare_dataset(texts, labels)
    # Only the "test" split is evaluated, so tokenize just that split rather
    # than also mapping the train split that is never used here.
    tt_dataset = test_dataset["test"].map(preprocess_function)
    print(trainer.evaluate(tt_dataset))

Class:  6


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.22792428731918335, 'eval_accuracy': 0.9621181818181819, 'eval_f1': 0.7907922482176925, 'eval_precision': 0.8831015922852657, 'eval_recall': 0.7159545454545454}
Class:  7


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.3505258858203888, 'eval_accuracy': 0.9544022727272727, 'eval_f1': 0.7467656228306008, 'eval_precision': 0.8397535980923724, 'eval_recall': 0.6723181818181818}
Class:  8


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.4678576588630676, 'eval_accuracy': 0.9466454545454546, 'eval_f1': 0.7028655326042934, 'eval_precision': 0.7931329981718465, 'eval_recall': 0.6310454545454546}
Class:  9


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.579229474067688, 'eval_accuracy': 0.9383704545454545, 'eval_f1': 0.6567946235334321, 'eval_precision': 0.7411099368769816, 'eval_recall': 0.5897045454545454}
Class:  10


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.689910888671875, 'eval_accuracy': 0.9298454545454545, 'eval_f1': 0.609700586688246, 'eval_precision': 0.6871295029639762, 'eval_recall': 0.5479545454545455}
Class:  11


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.7915732860565186, 'eval_accuracy': 0.9222204545454545, 'eval_f1': 0.5674599663806069, 'eval_precision': 0.6391902280686769, 'eval_recall': 0.5102045454545454}
Class:  12


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.9031037092208862, 'eval_accuracy': 0.9140045454545455, 'eval_f1': 0.5219094309107449, 'eval_precision': 0.5876678807193262, 'eval_recall': 0.46938636363636366}
Class:  13


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.9993358254432678, 'eval_accuracy': 0.9058772727272727, 'eval_f1': 0.47732034227730524, 'eval_precision': 0.5366975080887779, 'eval_recall': 0.42977272727272725}
Class:  14


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.1136912107467651, 'eval_accuracy': 0.8974477272727273, 'eval_f1': 0.4304161775286856, 'eval_precision': 0.4840578064223049, 'eval_recall': 0.3874772727272727}
Class:  15


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.2094836235046387, 'eval_accuracy': 0.8885772727272727, 'eval_f1': 0.3815002649307395, 'eval_precision': 0.42874156411274317, 'eval_recall': 0.34363636363636363}
Class:  16


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.2928709983825684, 'eval_accuracy': 0.8805772727272727, 'eval_f1': 0.3379948093834253, 'eval_precision': 0.37920506586758634, 'eval_recall': 0.30486363636363634}
Class:  17


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.3807802200317383, 'eval_accuracy': 0.8722022727272727, 'eval_f1': 0.292576144526778, 'eval_precision': 0.32766928734466144, 'eval_recall': 0.26427272727272727}
Class:  18


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.4795019626617432, 'eval_accuracy': 0.8640613636363637, 'eval_f1': 0.24701957575376093, 'eval_precision': 0.27687314801749685, 'eval_recall': 0.22297727272727272}
Class:  19


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.5584899187088013, 'eval_accuracy': 0.8560818181818182, 'eval_f1': 0.20387226552677895, 'eval_precision': 0.22813731007315702, 'eval_recall': 0.18427272727272728}
Class:  20


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.6505376100540161, 'eval_accuracy': 0.847265909090909, 'eval_f1': 0.15530612501414043, 'eval_precision': 0.17373941899378498, 'eval_recall': 0.1404090909090909}
