In [None]:
!pip install datasets
!pip install transformers
!pip install einops accelerate bitsandbytes
!pip install sentence_transformers
!pip install git+https://github.com/huggingface/peft.git
!pip install accelerate
!pip install evaluate
!pip install wandb

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15
Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downl

In [None]:
# Upgrade peft and transformers to the newest releases available on PyPI.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases than the
# ones shown in the recorded output (transformers 4.35.2 -> 4.36.2) — consider pinning.
!pip install --upgrade peft
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2


In [None]:
import os

# Maximum number of tokens kept per example; the tokenizer truncates longer texts to this length.
MAX_LEN = 300
# Hub id of the PEFT adapter whose config names the base checkpoint used for the tokenizer and model.
llm_model = 'recogna-nlp/bode-7b-alpaca-pt-br'
# Hugging Face access token, read from the HF_TOKEN environment variable so that no secret is
# stored in the notebook. It is None when the variable is unset, in which case the
# `token=hf_auth` arguments below are passed None.
hf_auth = os.environ.get('HF_TOKEN')

## Data preparation

In [None]:
from datasets import load_dataset
# Download the 'binary' configuration of the TuPyE dataset from the Hugging Face Hub.
# The recorded output shows a train split and a test split being generated.
dataset = load_dataset("Silly-Machine/TuPyE-Dataset",'binary')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.28k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.99M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/989k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

### Target distribution in the train dataset

In [None]:
# Inverse-frequency class weights: n_samples / (2 * n_samples_in_class).
# The train split is converted to pandas once and reused, instead of once per lookup.
train_df = dataset['train'].to_pandas()
hate_counts = train_df.hate.value_counts()
pos_weights = len(train_df) / (2 * hate_counts[1])  # weight for label 1
neg_weights = len(train_df) / (2 * hate_counts[0])  # weight for label 0


### Compute the maximum length of the `text` column

In [None]:
# Convert the text column to pandas once; both statistics below are derived from it.
train_text = dataset['train'].to_pandas()['text']
# Largest number of characters in a single training text.
max_char = train_text.str.len().max()
# Largest number of whitespace-separated words in a single training text.
max_words = train_text.str.split().str.len().max()


### Let's take a look at one example row of the training data

In [None]:
# Display the first record of the train split to see which fields each example carries.
dataset['train'][0]

{'source': 'twitter',
 'text': 'rt @user quero mais aaaaaa ',
 'researcher': 'leite et al',
 'year': 2020,
 'aggressive': 0,
 'hate': 0}

## Data Processing

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from peft import PeftModel, PeftConfig

# Metadata columns dropped during tokenization; only the text and the `hate` target are kept.
col_to_delete = ['source','researcher','year','aggressive']

# The adapter config records which base checkpoint the adapter was built on.
config = PeftConfig.from_pretrained(llm_model)

# Load the tokenizer that belongs to that base checkpoint.
llama_tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    add_prefix_space=True,
    trust_remote_code=True,
    token=hf_auth,
)
# Reuse the end-of-sequence token for padding (both the id and the token string are set).
llama_tokenizer.pad_token_id = llama_tokenizer.eos_token_id
llama_tokenizer.pad_token = llama_tokenizer.eos_token

def llama_preprocessing_function(examples):
    """Tokenize the ``text`` field of a batch, truncating each entry to ``MAX_LEN`` tokens.

    ``examples`` is a mapping with a ``'text'`` entry; the tokenizer's output is returned
    unchanged. No padding is requested here.
    """
    texts = examples['text']
    encoded = llama_tokenizer(texts, truncation=True, max_length=MAX_LEN)
    return encoded

# Tokenize every split in batches, drop the metadata columns, and expose the target
# column under the name `label`.
llama_tokenized_datasets = (
    dataset
    .map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
    .rename_column("hate", "label")
)
# Return torch tensors when the datasets are indexed.
llama_tokenized_datasets.set_format("torch")

# Collator that pads each batch to the longest example it contains.
llama_data_collator = DataCollatorWithPadding(tokenizer=llama_tokenizer)

adapter_config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/34934 [00:00<?, ? examples/s]

Map:   0%|          | 0/8734 [00:00<?, ? examples/s]

## Models

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Load the base checkpoint named in the adapter config with a 2-class sequence-classification head.
# The recorded output reports `score.weight` as newly initialized.
llama_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=2,
    token=hf_auth,
    trust_remote_code=True,
    device_map="auto",
    offload_folder="offload",
)

# Use the end-of-sequence id as the padding id, matching the tokenizer setup.
llama_model.config.pad_token_id = llama_model.config.eos_token_id

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### LoRA setup for the Llama 2 classifier

In [None]:
from peft import get_peft_model, LoraConfig, TaskType
# LoRA configuration for sequence classification: rank-16 adapters on the attention
# query and value projections.
llama_peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Wrap the base model with the adapters and report how many parameters will be trained.
llama_model = get_peft_model(llama_model, llama_peft_config)
llama_model.print_trainable_parameters()

trainable params: 8,396,800 || all params: 6,615,748,608 || trainable%: 0.12692138860666938


## Setup the trainer

### Evaluation Metrics

In [None]:
# import evaluate
# import numpy as np

# def compute_metrics(eval_pred):
#     # All metrics are already predefined in the HF `evaluate` package
#     precision_metric = evaluate.load("precision")
#     recall_metric = evaluate.load("recall")
#     f1_metric= evaluate.load("f1")
#     accuracy_metric = evaluate.load("accuracy")

#     logits, labels = eval_pred # eval_pred is the tuple of predictions and labels returned by the model
#     predictions = np.argmax(logits, axis=-1)
#     precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
#     recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
#     f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
#     accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
#     # The trainer is expecting a dictionary where the keys are the metrics names and the values are the scores.
#     return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# Custom function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    ``pred`` must expose ``label_ids`` (reference labels) and ``predictions`` (scores whose
    last axis is arg-maxed to obtain the predicted class). Returns a dict with the keys
    ``accuracy``, ``f1_weighted``, ``precision_weighted`` and ``recall_weighted``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Precision, recall and F1 averaged with the 'weighted' setting; the fourth returned value is unused.
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    overall_accuracy = accuracy_score(y_true, y_pred)

    metrics = {}
    metrics['accuracy'] = overall_accuracy
    metrics['f1_weighted'] = weighted_f1
    metrics['precision_weighted'] = weighted_precision
    metrics['recall_weighted'] = weighted_recall
    return metrics

### Custom Trainer for Weighted Loss

In [None]:
from transformers import Trainer

class WeightedCELossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook's class weights.

    The weights are read from the module-level ``neg_weights`` (class 0) and
    ``pos_weights`` (class 1) computed from the training label counts.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping. Its ``"labels"`` entry is popped (the mapping is
                mutated) so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with transformers releases
                that pass this argument to ``compute_loss``; it is not used here.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        # Get model's predictions
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the logits' own device and dtype so the loss works
        # regardless of where the model output ends up.
        class_weights = torch.tensor(
            [neg_weights, pos_weights], device=logits.device, dtype=logits.dtype
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1).to(logits.device),
        )
        return (loss, outputs) if return_outputs else loss


In [None]:
from transformers import TrainingArguments, Trainer

# Move the model to the GPU before building the trainer.
# NOTE(review): the model was loaded with device_map="auto"; confirm this explicit move is still needed.
llama_model = llama_model.cuda()

lr = 1e-5
batch_size = 8
num_epochs = 20
training_args = TrainingArguments(
    output_dir="llama-lora-token-classification",
    learning_rate=lr,
    # The "constant" schedule keeps the learning rate fixed and applies no warmup, so
    # warmup_ratio / warmup_steps are intentionally not set: they have no effect with this
    # scheduler. Switch to "constant_with_warmup" and set one of them if warmup is wanted.
    lr_scheduler_type="constant",
    max_grad_norm=0.3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.001,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="wandb",
    fp16=True,
    gradient_checkpointing=True,
)

# Trainer with the class-weighted loss, dynamic padding collator and weighted metrics.
llama_trainer = WeightedCELossTrainer(
    model=llama_model,
    args=training_args,
    train_dataset=llama_tokenized_datasets['train'],
    eval_dataset=llama_tokenized_datasets["test"],
    data_collator=llama_data_collator,
    compute_metrics=compute_metrics
)


In [None]:
# Run fine-tuning with the trainer configured above.
llama_trainer.train()

# Save the fine-tuned model and its tokenizer to a local directory.
# The tokenizer call is the cell's last expression, so the saved file paths are displayed.
llama_model.save_pretrained('./fine_tuned_custom_sentiment_model')
llama_tokenizer.save_pretrained('./fine_tuned_custom_sentiment_model')


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,Precision Weighted,Recall Weighted
1,0.7633,0.791161,0.8049,0.810975,0.817558,0.8049
2,0.7141,0.696034,0.779711,0.801797,0.832187,0.779711
3,0.6488,0.700089,0.814174,0.823956,0.83561,0.814174
4,0.633,0.651622,0.772155,0.799864,0.84269,0.772155
5,0.629,0.652567,0.80158,0.818989,0.842807,0.80158
6,0.6544,0.67213,0.826883,0.833827,0.841866,0.826883
7,0.597,0.657039,0.819785,0.830292,0.843296,0.819785
8,0.6344,0.69313,0.840966,0.842275,0.843626,0.840966
9,0.6481,0.74068,0.857339,0.850309,0.844623,0.857339
10,0.6323,0.668333,0.836272,0.840225,0.844563,0.836272




('./fine_tuned_custom_sentiment_model/tokenizer_config.json',
 './fine_tuned_custom_sentiment_model/special_tokens_map.json',
 './fine_tuned_custom_sentiment_model/tokenizer.model',
 './fine_tuned_custom_sentiment_model/added_tokens.json',
 './fine_tuned_custom_sentiment_model/tokenizer.json')

In [None]:
# Evaluate on the trainer's eval dataset (the test split) and print the metric dict
# (loss, accuracy and the weighted precision/recall/F1 from compute_metrics).
results = llama_trainer.evaluate()
print(results)

{'eval_loss': 0.6418256163597107, 'eval_accuracy': 0.8219601557133043, 'eval_f1_weighted': 0.8334531410324304, 'eval_precision_weighted': 0.8482025966621025, 'eval_recall_weighted': 0.8219601557133043, 'eval_runtime': 111.1117, 'eval_samples_per_second': 78.606, 'eval_steps_per_second': 9.828, 'epoch': 20.0}


## Upload the fine-tuned model

In [None]:
# Interactive login to the Hugging Face Hub; prompts for an access token.
# The recorded run used a token with write permission, as required for the upload below.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'stor

In [None]:
# Target Hub repository for the fine-tuned classifier.
refined_model_name = "Silly-Machine/TuPy-Llama-Lora-Binary-Classifier_e20"
# Reload the model and tokenizer from the directory written after training.
# NOTE(review): the recorded output shows the base checkpoint shards being loaded again and
# `score.weight` reported as newly initialized during this reload — confirm the trained
# classification head is restored from the saved files before relying on the pushed model.
model = AutoModelForSequenceClassification.from_pretrained('./fine_tuned_custom_sentiment_model')
tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_custom_sentiment_model')

# Upload the model first, then the tokenizer, to the same Hub repository.
model.push_to_hub(refined_model_name)
tokenizer.push_to_hub(refined_model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Silly-Machine/TuPy-Llama-Lora-Binary-Classifier_e20/commit/b2e74d16ee455299d15c8ae8a02848fe25e085ba', commit_message='Upload tokenizer', commit_description='', oid='b2e74d16ee455299d15c8ae8a02848fe25e085ba', pr_url=None, pr_revision=None, pr_num=None)