In [1]:
# Enable IPython's autoreload extension in mode 2, so imported project modules
# are re-imported before each cell runs and edits to .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
from datasets import Dataset

def load_data_from_pickle(filepath):
    """Read a pickled dataset and return its texts and labels.

    The file must unpickle to a mapping that has ``'texts'`` and ``'labels'``
    entries; any other entries are ignored.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising, so
    only call this on files that come from a trusted source.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        tuple: ``(texts, labels)`` exactly as stored in the file.

    Raises:
        KeyError: If the unpickled mapping lacks ``'texts'`` or ``'labels'``.
    """
    with open(filepath, 'rb') as handle:
        payload = pickle.load(handle)
    texts, labels = payload['texts'], payload['labels']
    return texts, labels

def prepare_dataset(texts, labels, test_size=0.2, seed=None):
    """Build a Hugging Face ``Dataset`` and split it into train and test parts.

    Args:
        texts: Sequence of input strings; stored in the ``'texts'`` column.
        labels: Sequence of target vectors, one per text; stored in the
            ``'tg'`` column.
        test_size: Fraction (or absolute count) of rows placed in the
            ``'test'`` split. Defaults to 0.2, the previously hard-coded value.
        seed: Optional seed for the shuffle. ``None`` (the default) keeps the
            original behaviour of an unseeded split; pass an integer to get
            the same train/test membership on every run.

    Returns:
        The ``DatasetDict`` produced by ``Dataset.train_test_split``, with
        ``'train'`` and ``'test'`` splits.
    """
    ds = Dataset.from_dict({
        'texts': texts,
        'tg': labels,
    })
    # Rows are always shuffled before splitting; the seed only controls
    # whether that shuffle is repeatable.
    return ds.train_test_split(test_size=test_size, shuffle=True, seed=seed)

In [3]:
# Pickle file holding the training problems and their target vectors.
dataset_id = 'dataset/addition_dataset_ct.pkl'
texts, labels = load_data_from_pickle(dataset_id)
# Wrap the data in a Dataset and split it 80/20 into 'train' and 'test'.
dataset = prepare_dataset(texts, labels)

In [4]:
from transformers import AutoTokenizer

# Hub repository the tokenizer is loaded from. Only the tokenizer is taken from
# here; the model is constructed from a fresh config in a later cell.
model_path = 'afmck/testing-llama-tiny'

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the EOS token for padding (presumably this tokenizer ships without a
# dedicated pad token - TODO confirm).
tokenizer.pad_token_id= tokenizer.eos_token_id

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
import numpy as np
def preprocess_function(example):
    """Tokenise one dataset row and attach its target vector as ``labels``.

    Relies on the notebook-level ``tokenizer``.

    Args:
        example: A row with a ``'texts'`` entry (the input string) and a
            ``'tg'`` entry (the target vector).

    Returns:
        The tokenizer output for ``example['texts']`` with an extra
        ``'labels'`` entry holding the target vector as a float64 array.
    """
    target_vector = example['tg']
    encoded = tokenizer(example['texts'], padding = True, truncation=True)
    encoded['labels'] = np.array(target_vector, dtype = np.float64)
    return encoded

# Apply preprocess_function to every row of both splits. No batched=True is
# passed, so the function is called with one example at a time.
tokenized_dataset = dataset.map(preprocess_function)
# Show one processed training example as a sanity check.
tokenized_dataset['train'][0]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'texts': '126 + 479',
 'tg': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0

In [6]:
from transformers import DataCollatorWithPadding

# Collator that pads the tokenised examples of a batch to a common length
# using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
import evaluate
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``, computed stably.

    The naive formula evaluates ``exp(-x)``, which overflows for large
    negative ``x`` and emits a RuntimeWarning. This version only ever
    exponentiates ``-|x|``, so the intermediate value stays in ``(0, 1]``.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``x``, with values in ``[0, 1]``.
    """
    x = np.asarray(x)
    # exp(-|x|) can underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 the algebraically equal
    # exp(x) / (1 + exp(x)) is used so the exponent stays non-positive.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return result[()]


def multi_label_metrics(predictions, labels, threshold=0.5,
                        num_positions=22, classes_per_position=11):
    """Compute per-position and whole-sequence accuracy for grouped logits.

    Each row of ``predictions`` and ``labels`` is viewed as ``num_positions``
    consecutive groups of ``classes_per_position`` values. The predicted and
    true class of a position is the argmax within its group.

    The argmax is taken on the raw logits. Applying a sigmoid first cannot
    change the ranking, but it saturates to exactly 1.0 for large logits and
    then produces ties that argmax resolves to the wrong (first) class.

    Args:
        predictions: Array-like of shape
            ``(batch, num_positions * classes_per_position)`` with model logits.
        labels: Array-like of the same shape with the target vectors.
        threshold: Unused; kept so existing callers that pass it keep working.
        num_positions: Number of groups per row. Defaults to 22.
        classes_per_position: Number of values per group. Defaults to 11.

    Returns:
        dict: ``'accuracy'`` is the fraction of positions whose predicted
        class matches the target; ``'abs_accuracy'`` is the fraction of rows
        in which every position matches.
    """
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    grouped_shape = (predictions.shape[0], num_positions, classes_per_position)

    y_pred = np.argmax(predictions.reshape(grouped_shape), axis=2)
    y_true = np.argmax(labels.reshape(grouped_shape), axis=2)

    position_hits = y_pred == y_true
    metrics = {'accuracy': np.mean(position_hits),
               # A row counts only if all of its positions are correct.
               'abs_accuracy': np.mean(np.all(position_hits, axis=1))}
    return metrics



def compute_metrics(eval_pred):
    """Metric callback for the ``Trainer``.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``.

    Returns:
        dict: The metrics returned by ``multi_label_metrics`` for that pair.
    """
    logits, targets = eval_pred
    return multi_label_metrics(predictions=logits, labels=targets)
     

In [8]:
# Sanity check: vocabulary size before and after setting the pad token, plus
# the EOS id that is used for padding.
print(tokenizer.vocab_size)
# Repeats the pad-token assignment already made when the tokenizer was loaded.
tokenizer.pad_token_id = tokenizer.eos_token_id
print(tokenizer.eos_token_id)
print(tokenizer.vocab_size)

32000
2
32000


In [9]:
from transformers import LlamaConfig, LlamaForSequenceClassification
# Configuration for a small Llama-architecture sequence classifier. The model
# below is instantiated from this config alone, i.e. with freshly initialised
# weights rather than pretrained ones.
config = LlamaConfig(
        vocab_size=tokenizer.vocab_size,  # Based on the number of unique tokens
        hidden_size = 256,
        intermediate_size = 512,
        num_hidden_layers = 22,
        num_attention_heads = 8,
        max_position_embeddings = 512,
        use_cache = True,
        rope_theta=20,
        num_labels = 242,  # 22 * 11, the layout multi_label_metrics reshapes to
        problem_type = "multi_label_classification",  # every label is an independent binary target
        pad_token_id = tokenizer.eos_token_id,  # same id the tokenizer pads with
        
    )
model = LlamaForSequenceClassification(config)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [10]:
print(f"Total parameters in the model: {model.num_parameters()}")

Total parameters in the model: 22683392


In [11]:
from transformers import Trainer, TrainingArguments
# Training hyperparameters. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded into the model when training ends.
training_args = TrainingArguments(

   output_dir="my_awesome_model",
   learning_rate=5e-5,
   per_device_train_batch_size=64,
   per_device_eval_batch_size=64,
   num_train_epochs=100,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

# Trainer wired to the tokenised train/test splits, the padding collator and
# the accuracy metrics defined in compute_metrics.
trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [12]:
# Run the training loop with the arguments configured on the trainer.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33msmahmud[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Abs Accuracy
1,No log,0.220331,0.820205,0.0
2,No log,0.127969,0.82875,0.0085
3,No log,0.098574,0.825977,0.007
4,0.184300,0.084935,0.827,0.008
5,0.184300,0.077002,0.831614,0.007
6,0.184300,0.071484,0.846341,0.008
7,0.184300,0.06735,0.849364,0.0085
8,0.072900,0.064375,0.849159,0.011
9,0.072900,0.061901,0.850841,0.01
10,0.072900,0.059753,0.853977,0.0155


TrainOutput(global_step=12500, training_loss=0.023020541746616364, metrics={'train_runtime': 836.5271, 'train_samples_per_second': 956.335, 'train_steps_per_second': 14.943, 'total_flos': 973821542400000.0, 'train_loss': 0.023020541746616364, 'epoch': 100.0})

In [13]:
# Evaluate on the training split (not the held-out test split) to see how well
# the model fits the data it was trained on.
trainer.evaluate(tokenized_dataset["train"])

{'eval_loss': 0.01532689854502678,
 'eval_accuracy': 0.9998295454545455,
 'eval_abs_accuracy': 0.99625,
 'eval_runtime': 2.6476,
 'eval_samples_per_second': 3021.599,
 'eval_steps_per_second': 47.212,
 'epoch': 100.0}

In [14]:
# Evaluate the trained model on the datasets addition_dataset_ct6 .. ct20.
# NOTE(review): `i` is printed as "Class"; presumably it indexes a difficulty
# or length class of the addition problems - TODO confirm.
for i in range(6,21):
    print('Class: ', i)
    # NOTE(review): this reassigns the notebook-level dataset_id, texts and
    # labels that were set when the training data was loaded.
    dataset_id = f'dataset/addition_dataset_ct{i}.pkl'
    texts, labels = load_data_from_pickle(dataset_id)
    test_dataset = prepare_dataset(texts, labels)
    # NOTE(review): both splits are tokenised here, but only the 'test' split
    # is evaluated below.
    tt_dataset = test_dataset.map(preprocess_function)
    print(trainer.evaluate(tt_dataset["test"]))

Class:  6


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.14920403063297272, 'eval_accuracy': 0.7350454545454546, 'eval_abs_accuracy': 0.0, 'eval_runtime': 0.7432, 'eval_samples_per_second': 2691.131, 'eval_steps_per_second': 43.058, 'epoch': 100.0}
Class:  7


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.22493773698806763, 'eval_accuracy': 0.6848409090909091, 'eval_abs_accuracy': 0.0, 'eval_runtime': 0.8445, 'eval_samples_per_second': 2368.295, 'eval_steps_per_second': 37.893, 'epoch': 100.0}
Class:  8


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.2914382219314575, 'eval_accuracy': 0.6366136363636363, 'eval_abs_accuracy': 0.0, 'eval_runtime': 0.8346, 'eval_samples_per_second': 2396.256, 'eval_steps_per_second': 38.34, 'epoch': 100.0}
Class:  9


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.3523438274860382, 'eval_accuracy': 0.5906136363636364, 'eval_abs_accuracy': 0.0, 'eval_runtime': 0.8633, 'eval_samples_per_second': 2316.602, 'eval_steps_per_second': 37.066, 'epoch': 100.0}
Class:  10


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.42352843284606934, 'eval_accuracy': 0.545, 'eval_abs_accuracy': 0.0, 'eval_runtime': 0.8842, 'eval_samples_per_second': 2262.042, 'eval_steps_per_second': 36.193, 'epoch': 100.0}
Class:  11


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.48837733268737793, 'eval_accuracy': 0.5001136363636364, 'eval_abs_accuracy': 0.0, 'eval_runtime': 0.932, 'eval_samples_per_second': 2145.9, 'eval_steps_per_second': 34.334, 'epoch': 100.0}
Class:  12


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.5528785586357117, 'eval_accuracy': 0.4536363636363636, 'eval_abs_accuracy': 0.0, 'eval_runtime': 0.9619, 'eval_samples_per_second': 2079.222, 'eval_steps_per_second': 33.268, 'epoch': 100.0}
Class:  13


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.6217707991600037, 'eval_accuracy': 0.40925, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.0214, 'eval_samples_per_second': 1958.089, 'eval_steps_per_second': 31.329, 'epoch': 100.0}
Class:  14


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.6867889165878296, 'eval_accuracy': 0.36468181818181816, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.0475, 'eval_samples_per_second': 1909.281, 'eval_steps_per_second': 30.548, 'epoch': 100.0}
Class:  15


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.7504351735115051, 'eval_accuracy': 0.3196590909090909, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.0999, 'eval_samples_per_second': 1818.371, 'eval_steps_per_second': 29.094, 'epoch': 100.0}
Class:  16


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.8207371830940247, 'eval_accuracy': 0.2727045454545455, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.1632, 'eval_samples_per_second': 1719.349, 'eval_steps_per_second': 27.51, 'epoch': 100.0}
Class:  17


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.8846883177757263, 'eval_accuracy': 0.22638636363636364, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.1762, 'eval_samples_per_second': 1700.334, 'eval_steps_per_second': 27.205, 'epoch': 100.0}
Class:  18


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 0.9446865916252136, 'eval_accuracy': 0.18184090909090908, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.2192, 'eval_samples_per_second': 1640.474, 'eval_steps_per_second': 26.248, 'epoch': 100.0}
Class:  19


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.006532073020935, 'eval_accuracy': 0.13504545454545455, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.2491, 'eval_samples_per_second': 1601.098, 'eval_steps_per_second': 25.618, 'epoch': 100.0}
Class:  20


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'eval_loss': 1.0655534267425537, 'eval_accuracy': 0.09252272727272727, 'eval_abs_accuracy': 0.0, 'eval_runtime': 1.3223, 'eval_samples_per_second': 1512.525, 'eval_steps_per_second': 24.2, 'epoch': 100.0}
