In [1]:
# loading dependencies
!pip install accelerate==0.27.2 --progress-bar off
!pip install -q peft==0.4.0 --progress-bar off
!pip install -q bitsandbytes==0.41.3 --progress-bar off
!pip install -q transformers==4.38.2 --progress-bar off
!pip install -q trl==0.4.7 --progress-bar off



In [2]:
import os
from random import randrange
from functools import partial
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForSequenceClassification, TaskType
from trl import SFTTrainer

2024-04-05 02:44:27.899718: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-05 02:44:27.899856: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-05 02:44:28.029673: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from imblearn.metrics import geometric_mean_score
import numpy as np
import pandas as pd

In [4]:
# Log in to the Hugging Face Hub; access to the Llama 2 models requires an authenticated account
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Hub id of the base model that gets fine-tuned
model_name = "meta-llama/Llama-2-7b-hf" # base model
# Directory that receives the training-run artifacts (metrics, trainer state, saved adapter)
output_dir = "/kaggle/working/model_results" # location where model run data is stored

In [None]:
# uncomment to remove directories

#import shutil
# shutil.rmtree('/kaggle/working/news_classification_llama2_7b')

In [6]:
def create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
    Build the bitsandbytes quantization configuration used when loading the model.

    :param load_in_4bit: Whether to load the model in 4-bit precision
    :param bnb_4bit_use_double_quant: Whether to use nested (double) quantization
    :param bnb_4bit_quant_type: Quantization data type for the 4-bit weights
    :param bnb_4bit_compute_dtype: Data type used for computation with the 4-bit weights
    :return: A BitsAndBytesConfig carrying exactly these four settings
    """
    quantization_settings = {
        "load_in_4bit": load_in_4bit,
        "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
        "bnb_4bit_quant_type": bnb_4bit_quant_type,
        "bnb_4bit_compute_dtype": bnb_4bit_compute_dtype,
    }
    return BitsAndBytesConfig(**quantization_settings)

In [7]:
# Quantization settings for loading the base model

load_in_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_use_double_quant = True        # nested (double) quantization
bnb_4bit_quant_type = "nf4"             # quantization type: fp4 or nf4
bnb_4bit_compute_dtype = torch.bfloat16 # compute data type for the 4-bit base model

bnb_config = create_bnb_config(
    load_in_4bit = load_in_4bit,
    bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
)

*loading model and tokenizer*

In [None]:
# Load the quantized base model with a two-class classification head, plus its tokenizer
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'  # per-GPU memory cap handed to the device-map dispatcher

label2id = {'TRUE': 0, 'FAKE': 1}
# Derived from label2id so the two mappings cannot drift apart
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config = bnb_config,
        device_map = "auto", # dispatch the model efficiently on the available resources
        max_memory = {i: max_memory for i in range(n_gpus)},
        num_labels = 2,
        label2id = label2id,
        id2label = id2label
    )

# Load tokenizer. `token=True` replaces the deprecated `use_auth_token=True`
# and uses the credentials stored by notebook_login().
tokenizer = AutoTokenizer.from_pretrained(model_name, token = True)
# Pad with the EOS token, on both the tokenizer and the model config
tokenizer.pad_token = tokenizer.eos_token

model.config.pad_token_id = model.config.eos_token_id
model.config.problem_type = "single_label_classification"

In [9]:
# Print a per-layer parameter summary of the loaded model
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                  Param #
LlamaForSequenceClassification                          --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   131,072,000
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      101,195,776
│    │    └─LlamaDecoderLayer: 3-2                      101,195,776
│    │    └─LlamaDecoderLayer: 3-3                      101,195,776
│    │    └─LlamaDecoderLayer: 3-4                      101,195,776
│    │    └─LlamaDecoderLayer: 3-5                      101,195,776
│    │    └─LlamaDecoderLayer: 3-6                      101,195,776
│    │    └─LlamaDecoderLayer: 3-7                      101,195,776
│    │    └─LlamaDecoderLayer: 3-8                      101,195,776
│    │    └─LlamaDecoderLayer: 3-9                      101,195,776
│    │    └─LlamaDecoderLayer: 3-10                     101,195,776
│    │    └

In [10]:
# Load the training split and show its class balance

train_df = pd.read_csv("/kaggle/input/social-fakenews-data/train_social_df.csv")
display(train_df['label'].value_counts())

# String label -> integer class id
target_map = { 'TRUE': 0, 'FAKE': 1 }
target_map_hot = { 'TRUE': [0], 'FAKE': [1] }

# Replace the string 'label' column with an integer 'labels' column in one chained step
train_df = (
    train_df
    .assign(target=train_df['label'].map(target_map))
    .drop(['label'], axis=1)
    .rename(columns={'target': 'labels'})
)

display(train_df.dtypes)
display(train_df.head())

label
TRUE    13454
FAKE     4258
Name: count, dtype: int64

Unnamed: 0     int64
text          object
metadata      object
domain        object
labels         int64
dtype: object

Unnamed: 0.1,Unnamed: 0,text,metadata,domain,labels
0,28049,18 Best Labor Day Sales for 2017,"[{'article': None, 'author': None, 'date': Non...",SOCIAL,0
1,30056,"Song Review: Rihanna and Drake's 'Work,' Off o...","[{'article': None, 'author': None, 'date': Non...",SOCIAL,1
2,31252,"Michael Strahan, Sara Haines Will Host ABCs Ne...","[{'article': None, 'author': None, 'date': Non...",SOCIAL,1
3,19084,Vanessa Marcil,"[{'article': None, 'author': None, 'date': Non...",SOCIAL,0
4,24595,RuPauls Drag Race winners list: Where are all ...,"[{'article': None, 'author': None, 'date': Non...",SOCIAL,0


In [11]:
# Load the validation split and show its class balance

valid_df = pd.read_csv("/kaggle/input/social-fakenews-data/valid_social_df.csv")
display(valid_df['label'].value_counts())

# Replace the string 'label' column with an integer 'labels' column in one chained step
valid_df = (
    valid_df
    .assign(target=valid_df['label'].map(target_map))
    .drop(['label'], axis=1)
    .rename(columns={'target': 'labels'})
)

display(valid_df.head())

label
TRUE    1681
FAKE     533
Name: count, dtype: int64

Unnamed: 0.1,Unnamed: 0,text,metadata,domain,labels
0,31932,Scott Disick looks miserable on day out in LA ...,"[{'article': None, 'author': None, 'date': Non...",SOCIAL,1
1,15501,Princess Diana documentary: Sons to speak abou...,"[{'article': None, 'author': None, 'date': Non...",SOCIAL,0
2,20631,"10 Best Dressed: Week of November 5, 2018","[{'article': None, 'author': None, 'date': Non...",SOCIAL,0
3,29337,Selena Gomez & Justin Theroux May Be Dating An...,"[{'article': None, 'author': None, 'date': Non...",SOCIAL,1
4,23240,Elizabeth Chambers is Suing an Impostor Over O...,"[{'article': None, 'author': None, 'date': Non...",SOCIAL,0


In [12]:
# Convert the pandas frames into Hugging Face Dataset objects
import datasets
from datasets import Dataset, DatasetDict

# One Dataset per split, built directly from the corresponding DataFrame
train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)

In [14]:
# Only run if need to generate class weight tensor
# Computes 'balanced' class weights from the training labels and wraps them in a torch tensor.

import numpy as np
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight

# Integer class ids of the training split
labels = train_ds["labels"]
display(np.unique(labels)) # e.g array([0, 1])
# One weight per distinct class, in the order returned by np.unique
weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
display(weights) # e.g. array([0.6582429 , 2.07984969])

# torch.from_numpy keeps the numpy dtype, hence the float64 tensor shown below
weights_tensor = torch.from_numpy(weights)
display(weights_tensor) # e.g. tensor([0.6582, 2.0798], dtype=torch.float64)

In [13]:
def get_max_length(model):
    """
    Extracts the maximum token length from the model configuration.

    The configuration is probed for "n_positions", "max_position_embeddings"
    and "seq_length", in that order; the first truthy value wins.

    :param model: Hugging Face model exposing a `config` attribute
    :return: The configured maximum sequence length, or 1024 when none of the
        probed settings is present (or all are falsy)
    """

    # Pull the model configuration once and probe it for each known setting name
    conf = model.config
    max_length = None
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    # Fall back to 1024 when the configuration does not declare a maximum length
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

In [14]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Run the tokenizer over the "text" entries of a dataset batch.

    :param batch: Dataset batch; its "text" entry is passed to the tokenizer
    :param tokenizer: Model tokenizer (called with the texts and the options below)
    :param max_length: Maximum number of tokens to emit from the tokenizer
    :return: Whatever the tokenizer returns for the batch
    """
    # Truncation is always on, so no sequence exceeds max_length
    tokenizer_options = {"max_length": max_length, "truncation": True}
    return tokenizer(batch["text"], **tokenizer_options)

In [15]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """
    Tokenizes a dataset for fine-tuning.

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed used for the final shuffle
    :param dataset: Dataset with a "text" column and a "labels" column
        (an object supporting map / rename_column / set_format / shuffle)
    :return: Tokenized, torch-formatted and shuffled dataset whose target column is named "label"
    """

    col_to_delete = ['Unnamed: 0', 'text', 'metadata', 'domain']
    # Only ask for columns that exist, so datasets lacking e.g. the CSV index column
    # or the metadata column can be preprocessed too.
    present_columns = [column for column in col_to_delete if column in dataset.column_names]

    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    # Apply the preprocessing function and remove the undesired columns
    dataset_tokenized = dataset.map(_preprocessing_function, batched=True, remove_columns=present_columns)
    # Rename the target column from "labels" to "label"
    dataset_tokenized = dataset_tokenized.rename_column("labels", "label")
    # Set to torch format
    dataset_tokenized.set_format("torch")

    dataset_tokenized = dataset_tokenized.shuffle(seed = seed)

    return dataset_tokenized

In [16]:
# Tokenize and shuffle the training and validation splits

# Random seed used for the dataset shuffle in preprocess_dataset
seed = 33

# Truncation limit taken from the model configuration
max_length = get_max_length(model)
preprocessed_train_dataset = preprocess_dataset(tokenizer, max_length, seed, train_ds)
preprocessed_validate_dataset = preprocess_dataset(tokenizer, max_length, seed, valid_ds)


Found max lenth: 4096


  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [17]:
import numpy as np

# NOTE(review): re-creates the `np.object` attribute on the numpy module as the builtin `object`.
# Presumably a workaround for a dependency that still references that alias; confirm it is still needed.
np.object = object
# Show the first tokenized example of each split as a sanity check
display(preprocessed_train_dataset[0])
display(preprocessed_validate_dataset[0])

{'label': tensor(1),
 'input_ids': tensor([    1,   341,  2735,   341,   423, 29991,  2266,  1334,  2921, 11454,
         15944,  7137, 24239,   373,  4834,   292, 23816, 29901,   306,  9897,
          3880,   304,  2866,   331,  2341, 26407, 15785]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1])}

{'label': tensor(0),
 'input_ids': tensor([    1,  1085,  1255,   472,  3824,   317,  5861,   405,   403,  5334,
         27305, 29892,  2296,  4233,  9943, 29879, 26178]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

# Training model

In [18]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 64

# Dropout probability for LoRA layers
lora_dropout = 0.1

# Bias
bias = "none"

# Task type
# task_type = "CAUSAL_LM" # for prompt-based classification
task_type = TaskType.SEQ_CLS

################################################################################
# TrainingArguments parameters
################################################################################


# Batch size per GPU for training
per_device_train_batch_size = 32

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4

# Initial learning rate (AdamW optimizer)
# NOTE(review): the recorded training output in this notebook prints learning_rate 0.0005,
# not 2e-5; confirm which value produced the reported metrics.
learning_rate = 2e-5

# Optimizer to use
optim = "paged_adamw_32bit"

# Number of training steps (overrides num_train_epochs); -1 means "use the epoch count"
max_steps = -1
# Number of training epochs
train_epochs = 2

# Linear warmup steps from 0 to learning_rate
warmup_steps = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
# Use fp32 by setting both to False
# fp16 = True
fp16 = False
bf16 = False

# Log and evaluate every X update steps
# (should be a multiple of gradient_accumulation_steps)
logging_steps = 16
eval_steps = 16

In [19]:
def find_all_linear_names(model):
    """
    Collect the names of the 4-bit linear modules that LoRA should be applied to.

    :param model: Model whose named modules are scanned
    :return: List of unique leaf module names (last dotted component) of every
        bitsandbytes Linear4bit module, with 'lm_head' excluded
    """

    quantized_linear = bnb.nn.Linear4bit
    # Keep only the last dotted component, e.g. "...self_attn.q_proj" -> "q_proj"
    lora_module_names = {
        qualified_name.split('.')[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, quantized_linear)
    }

    # The output head is never a LoRA target
    lora_module_names.discard('lm_head')
    print(f"LoRA module names: {list(lora_module_names)}")
    return list(lora_module_names)

In [20]:
def create_peft_config(r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Build the LoRA (Parameter-Efficient Fine-Tuning) configuration for the model.

    :param r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param target_modules: Names of the modules to apply LoRA to
    :param lora_dropout: Dropout probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: PEFT task type passed through to the configuration
    :return: A LoraConfig carrying exactly these settings
    """
    lora_settings = dict(
        r = r,
        lora_alpha = lora_alpha,
        target_modules = target_modules,
        lora_dropout = lora_dropout,
        bias = bias,
        task_type = task_type,
    )
    return LoraConfig(**lora_settings)

In [21]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the total and trainable parameter counts of the model.

    :param model: Model exposing `named_parameters()`
    :param use_4bit: When True, the trainable count is halved before printing
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # Parameters that report zero elements may carry their real size in `ds_numel`
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int; a float would make the
        # ":,d" format below raise ValueError.
        trainable_params //= 2

    # Guard against a model without parameters instead of dividing by zero
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_percent}"
    )

In [22]:
def compute_metrics(logits_and_labels):
    """
    Turn a (logits, labels) pair into classification metrics.

    The predicted class is the index of the highest logit. Precision, recall,
    F1 and geometric mean are all computed with weighted averaging.

    :param logits_and_labels: Pair of (logits, labels)
    :return: Dict with keys 'accuracy', 'precision', 'recall', 'f1_score' and 'gmean'
    """
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)

    averaging = {'average': 'weighted'}
    return {
        'accuracy': np.mean(predictions == labels),
        'precision': precision_score(labels, predictions, **averaging),
        'recall': recall_score(labels, predictions, **averaging),
        'f1_score': f1_score(labels, predictions, **averaging),
        'gmean': geometric_mean_score(labels, predictions, **averaging),
    }

In [25]:
# Only use if setting custom weights
# + update Trainer in fine_tune function to use CustomTrainer

class CustomTrainer(Trainer):
    """
    Trainer that weights each class in the cross-entropy loss.

    Override `class_weights` (on the class or on an instance) to use other weights.
    """

    # Per-class loss weights indexed by label id; the values are the 'balanced'
    # weights acquired earlier in the notebook.
    class_weights = (0.6582, 2.0798)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the class-weighted cross-entropy loss for one batch.

        :param model: Model to run the forward pass on
        :param inputs: Batch dict; must contain "labels"
        :param return_outputs: When True, return (loss, outputs) instead of the loss alone
        :param kwargs: Ignored. Accepted so the override also works with Trainer versions
            that pass extra keyword arguments (e.g. `num_items_in_batch`).
        :return: The loss tensor, or a (loss, outputs) tuple
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Compute in float32 on the logits' device. `labels.get_device()` returns -1
        # for CPU tensors, and labels and logits may sit on different devices.
        logits = logits.view(-1, self.model.config.num_labels).float()
        weights_tensor = torch.tensor(self.class_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights_tensor)
        loss = loss_fct(logits, labels.view(-1).to(logits.device))
        return (loss, outputs) if return_outputs else loss

In [23]:
def fine_tune(model,
          tokenizer,
          train_ds,
          valid_ds,
          lora_r,
          lora_alpha,
          lora_dropout,
          bias,
          task_type,
          per_device_train_batch_size,
          gradient_accumulation_steps,
          warmup_steps,
          max_steps,
          learning_rate,
          fp16,
          bf16,
          logging_steps,
          output_dir,
          optim,
          num_train_epochs = None,
          eval_every_steps = None,
          eval_subset_size = 100,
          eval_subset_seed = None):
    """
    Prepares the pre-trained model for LoRA fine-tuning, trains it, evaluates it
    and saves the resulting model to `output_dir`.

    :param model: Pre-trained (quantized) Hugging Face model
    :param tokenizer: Model tokenizer, used to pad batches
    :param train_ds: Preprocessed training dataset
    :param valid_ds: Preprocessed validation dataset; a random subset is used for evaluation
    :param lora_r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param lora_dropout: Dropout probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: PEFT task type
    :param per_device_train_batch_size: Batch size per device for training
    :param gradient_accumulation_steps: Number of update steps to accumulate gradients for
    :param warmup_steps: Linear warmup steps from 0 to learning_rate
    :param max_steps: Number of training steps; -1 (or None) trains for the epoch count instead
    :param learning_rate: Initial learning rate
    :param fp16: Enable fp16 training
    :param bf16: Enable bf16 training
    :param logging_steps: Log every this many update steps
    :param output_dir: Directory for metrics, trainer state and the saved model
    :param optim: Optimizer name
    :param num_train_epochs: Number of epochs; defaults to the notebook-level `train_epochs`
    :param eval_every_steps: Evaluate every this many steps; defaults to the notebook-level `eval_steps`
    :param eval_subset_size: Maximum number of validation rows used for evaluation
    :param eval_subset_seed: Seed for picking the evaluation subset; None keeps it unseeded
    """

    # Fall back to the notebook-level settings when the caller does not override them
    if num_train_epochs is None:
        num_train_epochs = train_epochs
    if eval_every_steps is None:
        eval_every_steps = eval_steps
    if max_steps is None:
        max_steps = -1

    # Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # Prepare the model for training
    model = prepare_model_for_kbit_training(model)

    # Get LoRA module names
    target_modules = find_all_linear_names(model)

    # Create PEFT configuration for these modules and wrap the model to PEFT
    peft_config = create_peft_config(lora_r, lora_alpha, target_modules, lora_dropout, bias, task_type)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    model.config.use_cache = False

    # Evaluate on a random subset during training to keep each evaluation short.
    # The subset size is capped so validation sets smaller than the requested size still work.
    eval_subset = valid_ds.shuffle(seed = eval_subset_seed).select(
        range(min(eval_subset_size, len(valid_ds)))
    )

    # Training parameters
    trainer = Trainer(
        model = model,
        train_dataset = train_ds,
        eval_dataset = eval_subset,
        compute_metrics=compute_metrics,
        args = TrainingArguments(
            num_train_epochs = num_train_epochs,
            per_device_train_batch_size = per_device_train_batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps,
            warmup_steps = warmup_steps,
            max_steps = max_steps, # -1 leaves the epoch count in charge
            learning_rate = learning_rate,
            fp16 = fp16,
            bf16 = bf16,
            logging_steps = logging_steps,
            output_dir = output_dir,
            optim = optim,
            evaluation_strategy = "steps", # does evaluation during training
            eval_steps = eval_every_steps,
        ),
        data_collator = DataCollatorWithPadding(tokenizer)
    )

    # Flip to False to skip training and only evaluate and save
    do_train = True

    # Launch training and log metrics
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Compute and save evaluation results
    metrics = trainer.evaluate()
    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    print(metrics)

    # Save model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok = True)
    trainer.model.save_pretrained(output_dir)

In [24]:
# Echo the key settings, then launch the fine-tuning run
print(
    "Training with parameters:\n\tlearning_rate:", learning_rate,
    "\n\tepochs:", train_epochs,
    "\n\tbatch_size:", per_device_train_batch_size,
    "\n\tlog_steps:", logging_steps,
    "\n\tfp16:", fp16,
)
fine_tune(
    model = model,
    tokenizer = tokenizer,
    train_ds = preprocessed_train_dataset,
    valid_ds = preprocessed_validate_dataset,
    lora_r = lora_r,
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    bias = bias,
    task_type = task_type,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    warmup_steps = warmup_steps,
    max_steps = max_steps,
    learning_rate = learning_rate,
    fp16 = fp16,
    bf16 = bf16,
    logging_steps = logging_steps,
    output_dir = output_dir,
    optim = optim,
)

Training with parameters:
	learning_rate: 0.0005 
	epochs: 2 
	batch_size: 32 
	log_steps: 16 
	fp16: False
LoRA module names: ['v_proj', 'q_proj', 'k_proj', 'gate_proj', 'down_proj', 'up_proj', 'o_proj']
All Parameters: 3,409,334,272 || Trainable Parameters: 39,993,344 || Trainable Parameters %: 1.1730543504770188


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training...


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Score,Gmean
16,1.1029,0.715524,0.78,0.799191,0.78,0.786731,0.751021
32,0.5743,0.817054,0.75,0.813131,0.75,0.652323,0.46513
48,0.4883,0.364748,0.86,0.855524,0.86,0.856009,0.790316
64,0.4093,0.364516,0.83,0.826118,0.83,0.813076,0.700377
80,0.3746,0.324239,0.86,0.86,0.86,0.86,0.817013
96,0.4111,0.416213,0.81,0.831307,0.81,0.816555,0.796668
112,0.3703,0.392089,0.85,0.875281,0.85,0.826496,0.697937
128,0.3491,0.328481,0.85,0.852004,0.85,0.850905,0.810409
144,0.4409,0.426454,0.77,0.819344,0.77,0.781926,0.782129
160,0.3306,0.307904,0.85,0.869154,0.85,0.855175,0.848752


***** train metrics *****
  epoch                    =       1.99
  total_flos               = 48378478GF
  train_loss               =     0.3782
  train_runtime            = 8:13:43.53
  train_samples_per_second =      1.196
  train_steps_per_second   =      0.009
{'train_runtime': 29623.536, 'train_samples_per_second': 1.196, 'train_steps_per_second': 0.009, 'total_flos': 5.194599609729024e+16, 'train_loss': 0.3781633778758671, 'epoch': 1.99}


***** eval metrics *****
  epoch                   =       1.99
  eval_accuracy           =       0.89
  eval_f1_score           =     0.8893
  eval_gmean              =     0.8499
  eval_loss               =     0.3067
  eval_precision          =     0.8888
  eval_recall             =       0.89
  eval_runtime            = 0:00:36.13
  eval_samples_per_second =      2.767
  eval_steps_per_second   =       0.36
{'eval_loss': 0.3067086935043335, 'eval_accuracy': 0.89, 'eval_precision': 0.8887999999999999, 'eval_recall': 0.89, 'eval_f1_score': 0.8892906961442294, 'eval_gmean': 0.849929555598727, 'eval_runtime': 36.1372, 'eval_samples_per_second': 2.767, 'eval_steps_per_second': 0.36, 'epoch': 1.99}
***** train metrics *****
  epoch                    =       1.99
  total_flos               = 48378478GF
  train_loss               =     0.3782
  train_runtime            = 8:13:43.53
  train_samples_per_second =      1.196
  train_steps_per_second   =      0.009
{'train_runtime': 29623.536,

# SAVING TO HF
*Restart the kernel, then re-run the dependency install and import cells before continuing.*


In [34]:
# Load the adapter saved in output_dir after training, then merge it into the base model

# Same label mappings as used for training
label2id = {'TRUE': 0, 'FAKE': 1}
id2label = {0: 'TRUE', 1: 'FAKE'}

# Load fine-tuned weights
# NOTE(review): the recorded output of this cell reports 'score.weight' as newly initialized
# while the base checkpoint loads. Confirm that the trained classification head from the
# adapter is restored and survives merge_and_unload() before publishing the merged model.
model = AutoPeftModelForSequenceClassification.from_pretrained(
    output_dir, device_map = "auto", torch_dtype = torch.bfloat16, offload_folder='./', num_labels=2, label2id=label2id, id2label=id2label)

# Mirror the padding and problem-type settings used at training time
model.config.pad_token_id = model.config.eos_token_id
model.config.problem_type = "single_label_classification"

# Merge the LoRA layers with the base model
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Save fine-tuned model at a new location
output_merged_dir = "/kaggle/working/model_checkpoint"
os.makedirs(output_merged_dir, exist_ok = True)
# Write the merged model in safetensors format
model.save_pretrained(output_merged_dir, safe_serialization = True)

In [None]:
# Push model to HuggingFace

merged_model = "FakeNewsLlama/CombinedDomainClassifier_E1_2" # Hub repo id to publish under
# `token=True` replaces the deprecated `use_auth_token=True` and uses the stored login token
model.push_to_hub(merged_model, token = True)

In [37]:
# Get tokenizer for easy inference
# Loaded from the base model so it can be saved and pushed alongside the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [38]:
# Save tokenizer

# Pad with the EOS token, matching the training-time tokenizer setup
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the recorded output of this cell lists files under
# '/kaggle/working/social_e2_weighted_checkpoint', which differs from output_merged_dir
# as set in this notebook; the output looks stale. Re-run to confirm.
tokenizer.save_pretrained(output_merged_dir)

('/kaggle/working/social_e2_weighted_checkpoint/tokenizer_config.json',
 '/kaggle/working/social_e2_weighted_checkpoint/special_tokens_map.json',
 '/kaggle/working/social_e2_weighted_checkpoint/tokenizer.model',
 '/kaggle/working/social_e2_weighted_checkpoint/added_tokens.json',
 '/kaggle/working/social_e2_weighted_checkpoint/tokenizer.json')

In [None]:
# Push tokenizer to HuggingFace

# `token=True` replaces the deprecated `use_auth_token=True` and uses the stored login token
tokenizer.push_to_hub(merged_model, token = True)

# TESTING
*Restart the kernel before running this section; the cells below reinstall and re-import everything they need.*

In [1]:
!pip install accelerate==0.27.2 --progress-bar off
!pip install -q peft==0.4.0 --progress-bar off
!pip install -q bitsandbytes==0.41.3 --progress-bar off
!pip install -q transformers==4.38.2 --progress-bar off
!pip install -q trl==0.4.7 --progress-bar off
!pip install datasets==2.18.0



In [3]:
import os
from random import randrange
from functools import partial
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoModelForSequenceClassification,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification, TaskType
from trl import SFTTrainer

2024-04-09 00:09:33.423339: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-09 00:09:33.423455: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-09 00:09:33.547158: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
from transformers import pipeline
from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset

In [5]:
# Log in to the Hugging Face Hub before downloading the trained model
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Get number of GPU device and set maximum memory
# NOTE(review): n_gpus and max_memory are not referenced by the testing cells visible here;
# confirm whether they are still needed.
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

# Hub repo id of the fine-tuned classifier to evaluate
# NOTE(review): this differs from the repo id used in the push cells above; confirm it is the intended model.
trained_model_name = "FakeNewsLlama/TrueFakeBaseline"

In [7]:
# CSV file holding the test set
test_file_path = "/kaggle/input/pheme-data-csv/pheme.csv"

In [8]:
# Load the CSV as a Dataset; the rows are read from the "train" key of the loaded result
test_ds = load_dataset("csv", data_files=test_file_path)["train"]
# Number of test rows
len(test_ds)

Generating train split: 0 examples [00:00, ? examples/s]

6424

In [9]:
# Build a text-classification pipeline from the published model and its tokenizer
gen = pipeline('text-classification', model=trained_model_name, tokenizer=trained_model_name, device_map="auto")
# output in form: [{'label': 'TRUE', 'score': 0.6169524788856506},...]

config.json:   0%|          | 0.00/873 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.33G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [10]:
# Classify every test text and collect expected vs. predicted labels
tf_map = {0: 'TRUE', 1:'FAKE'}

# Read each column once. Indexing test_ds['text'] / test_ds['label'] inside the loop
# would rebuild the whole column for every sample.
texts = test_ds['text']
expected_labels = [tf_map.get(label) for label in test_ds['label']]

# Collect plain rows and build the DataFrame once at the end instead of growing it row by row
records = []
id_num = 0
for predicted in gen(KeyDataset(test_ds, "text"), batch_size=8, truncation=True, max_length=512):
    records.append([str(id_num), texts[id_num], expected_labels[id_num], predicted['label']])
    # Show the first few predictions as a sanity check
    if id_num < 3:
        print("expected:",expected_labels[id_num],"; predicted:",predicted['label'])
    id_num += 1
    # Periodic progress message instead of one line per sample
    if id_num % 500 == 0:
        print("finished", id_num)

results = pd.DataFrame(records, columns = ['id', 'text', 'expected_label', 'predicted_label'])

len(results)

finished 0
expected: TRUE ; predicted: TRUE
finished 1
expected: TRUE ; predicted: TRUE
finished 2
expected: TRUE ; predicted: TRUE
finished 3
finished 4
finished 5
finished 6
finished 7
finished 8
finished 9
finished 10
finished 11
finished 12
finished 13
finished 14
finished 15
finished 16
finished 17
finished 18
finished 19
finished 20
finished 21
finished 22
finished 23
finished 24
finished 25
finished 26
finished 27
finished 28
finished 29
finished 30
finished 31
finished 32
finished 33
finished 34
finished 35
finished 36
finished 37
finished 38
finished 39
finished 40
finished 41
finished 42
finished 43
finished 44
finished 45
finished 46
finished 47
finished 48
finished 49
finished 50
finished 51
finished 52
finished 53
finished 54
finished 55
finished 56
finished 57
finished 58
finished 59
finished 60
finished 61
finished 62
finished 63
finished 64
finished 65
finished 66
finished 67
finished 68
finished 69
finished 70
finished 71
finished 72
finished 73
finished 74
finished 75

6424

In [11]:
# Number of collected result rows
len(results)

6424

In [12]:
# Show the results table (expected vs. predicted label per text)
display(results)

Unnamed: 0,id,text,expected_label,predicted_label
0,0,"Breaking: At least 10 dead, 5 injured after tO...",TRUE,TRUE
1,1,France: 10 people dead after shooting at HQ of...,TRUE,TRUE
2,2,Ten killed in shooting at headquarters of Fren...,TRUE,TRUE
3,3,BREAKING: 10 dead in shooting at headquarters ...,TRUE,FAKE
4,4,Reuters: 10 people shot dead at headquarters o...,TRUE,TRUE
...,...,...,...,...
6419,6419,Sydney siege ends as police storm Lindt Cafe a...,TRUE,TRUE
6420,6420,"Breaking News: #SydneySiege is over, according...",TRUE,TRUE
6421,6421,Watch gunfire erupt and hostages flee chocolat...,TRUE,TRUE
6422,6422,Authorities have confirmed that #sydneysiege i...,TRUE,TRUE


In [22]:
# only uncomment and use if wrong expected label

# tfmap = {0: 'TRUE', 1: 'FAKE'}

# results["expected_label"] = [tfmap.get(i)
#                               for i in results["expected_label"].values]
# # results["expected_label"] = test_ds['label']
# display(results)

Unnamed: 0,id,text,expected_label,predicted_label
0,0,COX S BAZAR (Reuters) - Grieving Rohingya Musl...,TRUE,TRUE
1,1,DAR ES SALAAM (Reuters) - Tanzania shut down a...,TRUE,TRUE
2,2,MADRID (Reuters) - Catalonia s High Court on F...,TRUE,TRUE
3,3,"ERBIL, Iraq (Reuters) - An ban on internationa...",TRUE,TRUE
4,4,TALLINN (Reuters) - French President Emmanuel ...,TRUE,TRUE
...,...,...,...,...
23831,23831,Says the large trade deficit with Japan stems ...,FAKE,TRUE
23832,23832,"""Tens of thousands"" of people leave New York e...",FAKE,TRUE
23833,23833,"""I have fought for our shared values without b...",FAKE,TRUE
23834,23834,"""Germany halts all Covid-19 vaccines, says the...",FAKE,TRUE


In [13]:
# save results

os.makedirs("/kaggle/working/tests/", exist_ok = True)
# The DataFrame index is written as the first, unnamed CSV column
results.to_csv("/kaggle/working/tests/baseline_test_results_part3_pheme.csv")

In [14]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from imblearn.metrics import geometric_mean_score
import numpy as np
import pandas as pd

In [15]:
def compute_metrics(results,labels,predictions):
    """
    Print the accuracy and return weighted precision, recall, F1 and geometric mean.

    :param results: Unused; kept so existing calls keep working
    :param labels: Expected labels (list, array or pandas Series)
    :param predictions: Predicted labels, same length and order as `labels`
    :return: Tuple (precision, recall, f1, gmean), each computed with weighted averaging
    :raises ValueError: If the inputs are empty or differ in length
    """
    # Compare by position. `labels[i]` on a pandas Series is a label-based lookup,
    # which fails or misaligns when the index is not 0..n-1.
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)

    if len(labels) != len(predictions):
        raise ValueError("labels and predictions must have the same length")
    if len(labels) == 0:
        raise ValueError("compute_metrics needs at least one label")

    correct = int(np.sum(labels == predictions))
    print(f"Number of labels that were correctly labelled: {correct}")
    print(f"Accuracy: {correct/len(labels)}")

    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')
    gmean = geometric_mean_score(labels, predictions, average='weighted')

    return precision, recall, f1, gmean


In [16]:
# Score the predictions against the expected labels and print each metric
precision, recall, f1, gmean = compute_metrics(
    results,
    results["expected_label"],
    results["predicted_label"],
)
print(f"PRECISION: {precision}")
print(f"RECALL: {recall}")
print(f"F1: {f1}")
print(f"Gmean: {gmean}")

Number of labels that were correctly labelled: 4603
Accuracy: 0.7165317559153176
PRECISION: 0.691865197109522
RECALL: 0.7165317559153176
F1: 0.7029192058899121
Gmean: 0.4943249392704716
