In [1]:
# loading dependencies
!pip install accelerate==0.27.2 --progress-bar off
!pip install -q peft==0.4.0 --progress-bar off
!pip install -q bitsandbytes==0.41.3 --progress-bar off
!pip install -q transformers==4.38.2 --progress-bar off
!pip install -q trl==0.4.7 --progress-bar off

Collecting accelerate==0.27.2
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.27.2


In [2]:
import os
from random import randrange
from functools import partial
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoModelForSequenceClassification,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification, TaskType
from trl import SFTTrainer

2024-04-23 17:14:27.817325: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 17:14:27.817421: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 17:14:28.095712: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from imblearn.metrics import geometric_mean_score
import numpy as np
import pandas as pd

In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
model_name = "meta-llama/Llama-2-7b-hf"
output_dir = "/kaggle/working/results"

In [None]:
# uncomment to remove directories

# import shutil
# shutil.rmtree('/kaggle/working/news_classification_llama2_7b')

In [7]:
def create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
    Configures model quantization method using bitsandbytes to speed up training and inference

    :param load_in_4bit: Load model in 4-bit precision mode
    :param bnb_4bit_use_double_quant: Nested quantization for 4-bit model
    :param bnb_4bit_quant_type: Quantization data type for 4-bit model
    :param bnb_4bit_compute_dtype: Computation data type for 4-bit model
    """

    bnb_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
    )

    return bnb_config

In [8]:
# Activate 4-bit precision base model loading
load_in_4bit = True

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
bnb_4bit_compute_dtype = torch.bfloat16

bnb_config = create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype)

*loading model and tokenizer*

In [None]:
# loading model and tokenizer
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

label2id = { 'POLITICS': 0, 'SOCIAL': 1, 'HEALTH': 2, 'CRIME': 3, 'SCIENCE': 4 }
id2label = {0: 'POLITICS', 1: 'SOCIAL', 2: 'HEALTH', 3: 'CRIME', 4: 'SCIENCE'}

#load model
model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config = bnb_config,
        device_map = "auto", # dispatch the model efficiently on the available resources
        max_memory = {i: max_memory for i in range(n_gpus)},
        num_labels = 5,
        label2id = label2id,
        id2label = id2label
    )

#load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token = True)
tokenizer.pad_token = tokenizer.eos_token

model.config.pad_token_id = model.config.eos_token_id

In [10]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                  Param #
LlamaForSequenceClassification                          --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   131,072,000
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      101,195,776
│    │    └─LlamaDecoderLayer: 3-2                      101,195,776
│    │    └─LlamaDecoderLayer: 3-3                      101,195,776
│    │    └─LlamaDecoderLayer: 3-4                      101,195,776
│    │    └─LlamaDecoderLayer: 3-5                      101,195,776
│    │    └─LlamaDecoderLayer: 3-6                      101,195,776
│    │    └─LlamaDecoderLayer: 3-7                      101,195,776
│    │    └─LlamaDecoderLayer: 3-8                      101,195,776
│    │    └─LlamaDecoderLayer: 3-9                      101,195,776
│    │    └─LlamaDecoderLayer: 3-10                     101,195,776
│    │    └

In [11]:
# Loading train data [DOMAIN CLASSIFIER]

train_df = pd.read_csv("/kaggle/input/all-data/training_df.csv")
display(len(train_df))
display(train_df['domain'].value_counts())

target_map = { 'POLITICS': 0, 'SOCIAL': 1, 'HEALTH': 2, 'CRIME': 3, 'SCIENCE':4 }
train_df['target'] = train_df['domain'].map(target_map)

train_df = train_df.drop(['domain'], axis=1)
train_df = train_df.rename(columns={'target': 'labels'})

display(train_df.dtypes)
display(train_df.head())

80204

domain
POLITICS    52558
SOCIAL      17712
HEALTH       8565
SCIENCE       726
CRIME         643
Name: count, dtype: int64

Unnamed: 0.1     int64
Unnamed: 0       int64
text            object
label            int64
metadata        object
labels           int64
dtype: object

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,label,metadata,labels
0,30966,30966,Jennifer Aniston and Justin Theroux Double-Dat...,1,"[{'article': None, 'author': None, 'date': Non...",1
1,17410,17410,Kim Kardashian West on Her New Beauty Line and...,0,"[{'article': None, 'author': None, 'date': Non...",1
2,23715,23715,Ruby Rose Admits That Being Mean Doesn’t Suit ...,0,"[{'article': None, 'author': None, 'date': Non...",1
3,30383,30383,Kourtney Kardashian moves on from Younes Bendj...,1,"[{'article': None, 'author': None, 'date': Non...",1
4,57496,57496,(Reuters) - The United States is in an economi...,0,"[{'article': None, 'author': None, 'date': 'Au...",0


In [12]:
# Loading train data [DOMAIN CLASSIFIER]

valid_df = pd.read_csv("/kaggle/input/all-data/valid_df.csv")
display(len(valid_df))
display(valid_df['domain'].value_counts())

target_map = { 'POLITICS': 0, 'SOCIAL': 1, 'HEALTH': 2, 'CRIME': 3, 'SCIENCE':4 }
valid_df['target'] = valid_df['domain'].map(target_map)

valid_df = valid_df.drop(['domain'], axis=1)
valid_df = valid_df.rename(columns={'target': 'labels'})

display(valid_df.dtypes)
display(valid_df.head())

10026

domain
POLITICS    6570
SOCIAL      2214
HEALTH      1071
SCIENCE       91
CRIME         80
Name: count, dtype: int64

Unnamed: 0.1     int64
Unnamed: 0       int64
text            object
label            int64
metadata        object
labels           int64
dtype: object

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,label,metadata,labels
0,4669,4669,"Behind the scenes, a small team of FBI agents ...",1,"[{'article': None, 'author': None, 'date': Non...",2
1,14777,14777,Dream Kardashian Celebrates 1st Birthday With ...,0,"[{'article': None, 'author': None, 'date': Non...",1
2,65651,65651,"(In March 16 item, corrects timing of legisla...",0,"[{'article': None, 'author': None, 'date': 'Ma...",0
3,87314,87314,This is gonna be a tough pill for the left to ...,1,"[{'article': None, 'author': None, 'date': 'Ju...",0
4,78482,78482,Wow. The cruelty of Republicans knows no bound...,1,"[{'article': None, 'author': None, 'date': 'Ap...",0


In [None]:
display(len(train_df))
train_df = train_df.dropna(subset=['text'])
display(len(train_df))
display(len(valid_df))
valid_df = valid_df.dropna(subset=['text'])
display(len(valid_df))

In [13]:
# only run to limit training data

# subsets completed: 0-6000,6000-12000,12000-18000,18000-24000,24000-30000, 30000-36000, 36000-42000
train_df = train_df[42000:48000]
display(len(train_df))

6000

In [14]:
# convert to dataset
import datasets
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)

In [15]:
def get_max_length(model):
    """
    Extracts maximum token length from the model configuration

    :param model: Hugging Face model
    """

    # Pull model configuration
    conf = model.config
    # Initialize a "max_length" variable to store maximum sequence length as null
    max_length = None
    # Find maximum sequence length in the model configuration and save it in "max_length" if found
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        # max_length = 512 # uncomment to truncate input to 512 tokens
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    # Set "max_length" to 1024 (default value) if maximum sequence length is not found in the model configuration
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

In [16]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizes dataset batch

    :param batch: Dataset batch
    :param tokenizer: Model tokenizer
    :param max_length: Maximum number of tokens to emit from the tokenizer
    """
    
    encoding = tokenizer(
        batch["text"],
        max_length = max_length,
        truncation = True,
    )

    return encoding

In [17]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """
    Tokenizes dataset for fine-tuning

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed for reproducibility
    :param dataset (str): Instruction dataset
    """

    col_to_delete = ['Unnamed: 0.1', 'Unnamed: 0', 'text', 'metadata', 'label']
    
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    # Apply the preprocessing function and remove the undesired columns
    dataset_tokenized = dataset.map(_preprocessing_function, batched=True, remove_columns=col_to_delete)
    # Rename the target to label as for HugginFace standards
    dataset_tokenized = dataset_tokenized.rename_column("labels", "label")
    # Set to torch format
    dataset_tokenized.set_format("torch")
    # Shuffle dataset
    dataset_tokenized = dataset_tokenized.shuffle(seed = seed) 

    return dataset_tokenized

In [None]:
# preprocessing

# Random seed
seed = 33

max_length = get_max_length(model)
preprocessed_train_dataset = preprocess_dataset(tokenizer, max_length, seed, train_ds)
preprocessed_validate_dataset = preprocess_dataset(tokenizer, max_length, seed, valid_ds)


In [19]:
import numpy as np

np.object = object
display(preprocessed_train_dataset[0])
display(preprocessed_validate_dataset[0])

{'label': tensor(1),
 '__index_level_0__': tensor(43726),
 'input_ids': tensor([    1, 12001,  5357,   382,  6321,   870]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1])}

{'label': tensor(2),
 '__index_level_0__': tensor(2636),
 'input_ids': tensor([    1,   319,  4151, 29999,   264, 13273, 21614, 10849,   501, 29889,
         29903, 29889, 27436,   297,  8175,   363, 19937, 29899, 29896, 29929,
         14502, 29871,  2045,   597,   276,   329, 29889,  2288, 29914, 29941,
         29888, 29941, 29965,  1867, 29955, 30081, 11942, 29889, 24946, 29889,
           510, 29914, 29947, 29907,   582, 29967, 29947, 29963, 29945, 18091]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1])}

# Training model

In [20]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 64

# Dropout probability for LoRA layers
lora_dropout = 0.1

# Bias
bias = "none"

# Task type
# task_type = "CAUSAL_LM" # basic
# task_type = "SEQ_CLS" #class
task_type = TaskType.SEQ_CLS

################################################################################
# TrainingArguments parameters
################################################################################


# Batch size per GPU for training
per_device_train_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Optimizer to use
optim = "paged_adamw_32bit"

# Number of training steps (overrides num_train_epochs)
max_steps = -1
train_epochs = 20

# Linear warmup steps from 0 to learning_rate
warmup_steps = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
# Use fp32 by setting both to False
# fp16 = True
fp16 = False
bf16 = False

# Log every X updates steps
# need to be multiples of gradient_accumulation_steps
logging_steps = 16
eval_steps = 16

In [21]:
def find_all_linear_names(model):
    """
    Find modules to apply LoRA to.

    :param model: PEFT model
    """

    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    print(f"LoRA module names: {list(lora_module_names)}")
    return list(lora_module_names)

In [22]:
def create_peft_config(r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Creates Parameter-Efficient Fine-Tuning configuration for the model

    :param r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param modules: Names of the modules to apply LoRA to
    :param lora_dropout: Dropout Probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    """
    config = LoraConfig(
        r = r,
        lora_alpha = lora_alpha,
        target_modules = target_modules,
        lora_dropout = lora_dropout,
        bias = bias,
        task_type = task_type,
    )

    return config

In [23]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the number of trainable parameters in the model.

    :param model: PEFT model
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        trainable_params /= 2

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {100 * trainable_params / all_param}"
    )

In [24]:
def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  
  acc = np.mean(predictions == labels)
  precision = precision_score(labels, predictions, average='weighted')
  recall = recall_score(labels, predictions, average='weighted')
  f1 = f1_score(labels, predictions, average='weighted')
  gmean = geometric_mean_score(labels, predictions, average='weighted')
  return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1_score': f1, 'gmean': gmean}

In [25]:
def fine_tune(model,
          tokenizer,
          train_ds,
          valid_ds,
          lora_r,
          lora_alpha,
          lora_dropout,
          bias,
          task_type,
          per_device_train_batch_size,
          gradient_accumulation_steps,
          warmup_steps,
          max_steps,
          learning_rate,
          fp16,
          bf16,
          logging_steps,
          output_dir,
          optim):
    """
    Prepares and fine-tune the pre-trained model.

    :param model: Pre-trained Hugging Face model
    :param tokenizer: Model tokenizer
    :param dataset: Preprocessed training dataset
    """

    # Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # Prepare the model for training
    model = prepare_model_for_kbit_training(model)

    # Get LoRA module names
    target_modules = find_all_linear_names(model)

    # Create PEFT configuration for these modules and wrap the model to PEFT
    peft_config = create_peft_config(lora_r, lora_alpha, target_modules, lora_dropout, bias, task_type)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    trainer = Trainer(
        model = model,
        train_dataset = train_ds,
#         eval_dataset = valid_ds,
        eval_dataset = valid_ds.shuffle().select(range(100)),
        compute_metrics=compute_metrics,
        args = TrainingArguments(
            num_train_epochs=train_epochs,
            per_device_train_batch_size = per_device_train_batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps,
            warmup_steps = warmup_steps,
            # max_steps = max_steps,
            learning_rate = learning_rate,
            fp16 = fp16,
            bf16 = bf16,
            logging_steps = logging_steps,
            output_dir = output_dir,
            optim = optim,
            evaluation_strategy = "steps", # does evaluation during training
            # evaluation_strategy = "epoch", # does evaluation at the end of the epoch
            # save_strategy = "steps",
            eval_steps = eval_steps,
            # save_steps = max_steps,
            # report_to="tensorboard"
        ),
        data_collator = DataCollatorWithPadding(tokenizer)
    )

    model.config.use_cache = False

    model = torch.compile(model)

    do_train = True

    # Launch training and log metrics
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

#     compute evaluation results
    metrics = trainer.evaluate()
    # save evaluation results
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    print(metrics)

    # Save model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok = True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
#     del model
#     del trainer
#     torch.cuda.empty_cache()

In [26]:
fine_tune(model, tokenizer, preprocessed_train_dataset, preprocessed_validate_dataset, lora_r, lora_alpha, lora_dropout, bias, task_type, per_device_train_batch_size, gradient_accumulation_steps, warmup_steps, max_steps, learning_rate, fp16, bf16, logging_steps, output_dir, optim)

LoRA module names: ['o_proj', 'q_proj', 'gate_proj', 'down_proj', 'v_proj', 'up_proj', 'k_proj']
All Parameters: 3,409,358,848 || Trainable Parameters: 40,017,920 || Trainable Parameters %: 1.1737667339850522


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training...


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Score,Gmean
1,4.2603,2.996352,0.2,0.448222,0.2,0.247947,0.409871
2,3.6596,2.571281,0.28,0.503356,0.28,0.347937,0.482048
3,2.8261,2.014228,0.38,0.475202,0.38,0.415662,0.529108
4,1.0161,1.723536,0.56,0.552281,0.56,0.524699,0.580547
5,2.1269,1.474201,0.58,0.571138,0.58,0.523106,0.571333
6,0.3663,1.184089,0.64,0.645107,0.64,0.58032,0.615025
7,0.1121,1.070202,0.68,0.691264,0.68,0.619418,0.644764
8,0.0474,1.071423,0.76,0.777928,0.76,0.704994,0.729657
9,0.001,1.135467,0.76,0.785237,0.76,0.696054,0.724089
10,1.0729,1.103226,0.76,0.785237,0.76,0.696054,0.724089


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


***** train metrics *****
  epoch                    =        0.0
  total_flos               =   839310GF
  train_loss               =      0.943
  train_runtime            = 1:12:25.70
  train_samples_per_second =      0.018
  train_steps_per_second   =      0.005
{'train_runtime': 4345.707, 'train_samples_per_second': 0.018, 'train_steps_per_second': 0.005, 'total_flos': 901202842828800.0, 'train_loss': 0.9430344320251607, 'epoch': 0.0}


  _warn_prf(average, modifier, msg_start, len(result))


***** eval metrics *****
  epoch                   =        0.0
  eval_accuracy           =        0.9
  eval_f1_score           =     0.8805
  eval_gmean              =     0.9023
  eval_loss               =     0.5952
  eval_precision          =     0.8805
  eval_recall             =        0.9
  eval_runtime            = 0:03:16.16
  eval_samples_per_second =       0.51
  eval_steps_per_second   =      0.066
{'eval_loss': 0.5952011346817017, 'eval_accuracy': 0.9, 'eval_precision': 0.8804761904761904, 'eval_recall': 0.9, 'eval_f1_score': 0.880484742530197, 'eval_gmean': 0.9022528195444666, 'eval_runtime': 196.1678, 'eval_samples_per_second': 0.51, 'eval_steps_per_second': 0.066, 'epoch': 0.0}
Saving last checkpoint of the model...


# SAVING TO HF
*restart, import all dependencies*


In [27]:
# load adapter config saved in output_dir after training model

label2id = { 'POLITICS': 0, 'SOCIAL': 1, 'HEALTH': 2, 'CRIME': 3, 'SCIENCE': 4 }
id2label = {0: 'POLITICS', 1: 'SOCIAL', 2: 'HEALTH', 3: 'CRIME', 4: 'SCIENCE'}

# Load fine-tuned weights
model = AutoPeftModelForSequenceClassification.from_pretrained(
    output_dir, device_map = "auto", torch_dtype = torch.bfloat16, offload_folder='./', num_labels=5, label2id=label2id, id2label=id2label)

model.config.pad_token_id = model.config.eos_token_id

# Merge the LoRA layers with the base model
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# only to remove directories
import shutil
# shutil.rmtree('/kaggle/working/news_classification_llama2_7b/final_merged_checkpoint')

In [None]:
# Save fine-tuned model at a new location
output_merged_dir = "/kaggle/working/news_classification_llama2_7b/final_merged_checkpoint_1"
os.makedirs(output_merged_dir, exist_ok = True)
model.save_pretrained(output_merged_dir, safe_serialization = True)

In [28]:
merged_model = "FakeNewsLlama/CombinedDomainClassifier_E1_2" # assign new name
model.push_to_hub(merged_model, use_auth_token = True)



model-00003-of-00003.safetensors:   0%|          | 0.00/3.33G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/FakeNewsLlama/CombinedDomainClassifier_S20/commit/a100267b19637503496eacd62f30a5f8a43b0091', commit_message='Upload LlamaForSequenceClassification', commit_description='', oid='a100267b19637503496eacd62f30a5f8a43b0091', pr_url=None, pr_revision=None, pr_num=None)

In [29]:
# Save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(output_merged_dir)

In [30]:
tokenizer.push_to_hub(merged_model, use_auth_token = True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/FakeNewsLlama/CombinedDomainClassifier_S20/commit/0bcec72ce28dc01f946b816d4580f1585764dd95', commit_message='Upload tokenizer', commit_description='', oid='0bcec72ce28dc01f946b816d4580f1585764dd95', pr_url=None, pr_revision=None, pr_num=None)

# TESTING
*restart only*

In [1]:
!pip install accelerate==0.27.2 --progress-bar off
!pip install -q peft==0.4.0 --progress-bar off
!pip install -q bitsandbytes==0.41.3 --progress-bar off
!pip install -q transformers==4.38.2 --progress-bar off
!pip install -q trl==0.4.7 --progress-bar off

!pip install datasets==2.14.7

Collecting accelerate==0.27.2
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.27.2
Collecting datasets==2.14.7
  Downloading datasets-2.14.7-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets==2.14.7)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.7)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.14.7)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with 

In [2]:
import os
from random import randrange
from functools import partial
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoModelForSequenceClassification,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification, TaskType
from trl import SFTTrainer

2024-04-23 18:48:32.916328: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 18:48:32.916436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 18:48:33.085217: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from transformers import pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset

In [4]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
 # Get number of GPU device and set maximum memory
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

trained_model_name = "FakeNewsLlama/CombinedDomainClassifier_E1_2"

In [7]:
test_file_path = "/kaggle/input/all-data/valid_df.csv"
test_ds = load_dataset("csv", data_files=test_file_path)["train"]

test_ds_subset = test_ds[:200]
test_ds_subset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


{'Unnamed: 0.1': [4669,
  14777,
  65651,
  87314,
  78482,
  80092,
  25133,
  57804,
  88187,
  19170,
  15227,
  62042,
  46384,
  91669,
  33427,
  574,
  95025,
  98807,
  29626,
  39224,
  37639,
  39523,
  36217,
  42624,
  85700,
  67739,
  33815,
  24520,
  32331,
  5499,
  25960,
  6796,
  48205,
  40721,
  74642,
  73036,
  18555,
  21121,
  73984,
  5218,
  8127,
  22882,
  21951,
  3235,
  50946,
  35001,
  46773,
  45888,
  31570,
  43258,
  73345,
  50015,
  63605,
  72444,
  56848,
  39251,
  49887,
  96778,
  6424,
  29949,
  4354,
  51700,
  11688,
  77411,
  41593,
  18151,
  34205,
  72526,
  63542,
  48946,
  98537,
  54231,
  78492,
  68175,
  39868,
  92361,
  96273,
  37609,
  70840,
  67474,
  29994,
  61046,
  47374,
  30771,
  11419,
  57774,
  65368,
  77620,
  93886,
  59756,
  26959,
  26935,
  90452,
  20579,
  68188,
  67532,
  59621,
  55709,
  16832,
  32619,
  16077,
  20361,
  32476,
  69280,
  9315,
  14540,
  47576,
  65444,
  59367,
  27865,
  280

In [8]:
# prompting using validation set
results = pd.DataFrame(columns = ['id', 'text', 'expected_domain', 'predicted_domain'])

gen = pipeline('text-classification', model=trained_model_name, tokenizer=trained_model_name, device_map="auto")
# output in form: [{'label': 'TRUE', 'score': 0.6169524788856506},...]

config.json:   0%|          | 0.00/944 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.33G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [None]:
id_num = 0
results = pd.DataFrame(columns = ['id', 'text', 'expected_domain', 'predicted_domain'])

for predicted in gen(KeyDataset(test_ds, "text"), batch_size=8, truncation=True, max_length=512):
    print("finished", id_num)
    results.loc[id_num] = [str(id_num), test_ds['text'][id_num], test_ds_subset['domain'][id], predicted['label']]
    if id_num < 3:
        print("expected:",test_ds_subset['domain'][id],"; predicted:",predicted['label'])
    id_num += 1

display(results)

In [12]:
len(results)

200

In [None]:
# save results

os.makedirs("/kaggle/working/tests/", exist_ok = True)
results.to_csv("/kaggle/working/tests/combined_results_1.csv")

In [13]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from imblearn.metrics import geometric_mean_score
import numpy as np
import pandas as pd

In [23]:
def compute_metrics(results,labels,predictions):
  correct = 0
  for i in range(len(labels)):
    if labels[i] == predictions[i]:
      correct += 1
  print(f"Number of labels that were correctly labelled: {correct}")
  print(f"Accuracy: {correct/len(labels)}")

  precision = precision_score(labels, predictions, average='weighted')
  recall = recall_score(labels, predictions, average='weighted')
  f1 = f1_score(labels, predictions, average='weighted')
  gmean = geometric_mean_score(labels, predictions, average='weighted')

  return precision, recall, f1, gmean

In [25]:
# compute training metrics

precision, recall, f1, gmean = compute_metrics(results, results["expected_domain"], results["predicted_domain"])
print("PRECISION: " + str(precision))
print("RECALL: " + str(recall))
print("F1: " + str(f1))
print("Gmean: " + str(gmean))

Number of labels that were correctly labelled: 180
Accuracy: 0.9
PRECISION: 0.8921460750487156
RECALL: 0.9
F1: 0.8904849842306927
Gmean: 0.8966372273513976


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
