In [None]:
!pip install datasets
!pip install evaluate
!pip install accelerate -U
!pip install peft
!pip install --upgrade pyarrow
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [None]:
import torch
import evaluate
import numpy as np
import pandas as pd
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig)
from datasets import (load_dataset, Dataset)
from torch.utils.data import DataLoader
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model
from google.colab import drive

In [None]:
# Seed the NumPy and PyTorch global random number generators so runs are repeatable.
SEED = 42
np.random.seed(SEED)
# Kept as the last expression so the cell still displays the returned torch Generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7ed5c9326950>

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Mount Google Drive at /content/drive so the CSV files and outputs under MyDrive are reachable.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Open the interactive Hugging Face Hub login widget.
# NOTE(review): presumably needed to download the meta-llama checkpoint loaded later — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
import gc
import os

def clear_cache():
  """Free unreferenced Python objects and PyTorch's cached CUDA memory.

  Sets the ``CUDA_LAUNCH_BLOCKING`` environment variable to ``'1'`` and then
  runs five rounds of garbage collection, each followed by
  ``torch.cuda.empty_cache()`` when CUDA is available.

  Returns:
    None
  """
  # Loop-invariant, so it is written once instead of on every pass.
  # NOTE(review): this variable is normally read when CUDA initialises, so setting
  # it here may not affect an already-initialised context — confirm it is still wanted.
  os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

  # Skip the CUDA call entirely on CPU-only runtimes.
  cuda_available = torch.cuda.is_available()
  for _ in range(5):
    gc.collect()
    if cuda_available:
      torch.cuda.empty_cache()

In [None]:
# Google Drive folder (under the /content/drive mount) that holds the CSV splits.
data_dir = "/content/drive/MyDrive/"
# Disabled alternative input file, currently unused:
# data_csv = data_dir + "Subset2.csv"

In [None]:
# Read the two CSV splits from Drive, then wrap each DataFrame as a Hugging Face Dataset.
# Note that the validation split is read from "test.csv".
train_df, validation_df = (
    pd.read_csv(f"{data_dir}{csv_name}") for csv_name in ("train.csv", "test.csv")
)
train_dataset, validation_dataset = map(Dataset.from_pandas, (train_df, validation_df))


In [None]:
# Report the class balance (label == 1 vs. label == 0) of the train and validation splits.
# The label column is read once per split and counted in memory, instead of building
# two filtered copies of the dataset only to take their length.
for name, dataset in (("Train", train_dataset), ("Validation", validation_dataset)):
  labels = dataset["label"]
  num_label_1 = sum(1 for label in labels if label == 1)
  num_label_0 = sum(1 for label in labels if label == 0)
  print(f"{name} Dataset:")
  print(f"Number of posts with label 1: {num_label_1}")
  print(f"Number of posts with label 0: {num_label_0}")
  print("----")


Filter:   0%|          | 0/36640 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36640 [00:00<?, ? examples/s]

Train Dataset:
Number of posts with label 1: 18012
Number of posts with label 0: 18628
----


Filter:   0%|          | 0/12214 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12214 [00:00<?, ? examples/s]

Validation Dataset:
Number of posts with label 1: 5965
Number of posts with label 0: 6249
----


In [None]:
# Hugging Face Hub repository id of the base Llama 3 8B checkpoint.
LLAMA = "meta-llama/Meta-Llama-3-8B"

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the base checkpoint; the repo id comes from the LLAMA
# constant so the checkpoint name is defined in a single place.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# Reuse the end-of-sequence token as the padding token; the padded tokenization
# below and the model's pad_token_id both rely on a pad token being defined.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example, max_length=512):
    """Tokenize the ``body`` text of ``example`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.

    Args:
        example: Mapping with a ``'body'`` entry holding the text(s) to tokenize.
        max_length: Fixed sequence length to pad/truncate to. Defaults to 512,
            so single-argument callers such as ``Dataset.map`` are unaffected.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given text(s).
    """
    return tokenizer(
        example['body'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Tokenize both splits in batched mode.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset)
)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/36640 [00:00<?, ? examples/s]

Map:   0%|          | 0/12214 [00:00<?, ? examples/s]

In [None]:
def format_labels(example):
    """Copy the ``label`` entry of ``example`` into a ``labels`` entry.

    The mapping is modified in place and the same object is returned.
    """
    example.update(labels=example['label'])
    return example

# Add the `labels` column to both tokenized splits.
tokenized_train_dataset = tokenized_train_dataset.map(format_labels)
tokenized_val_dataset = tokenized_val_dataset.map(format_labels)

# Return only the model inputs and labels, formatted as torch tensors.
for _split in (tokenized_train_dataset, tokenized_val_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/36640 [00:00<?, ? examples/s]

Map:   0%|          | 0/12214 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig

# Load the base checkpoint with 4-bit bitsandbytes quantisation and a 2-label
# sequence-classification head.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForSequenceClassification.from_pretrained(
    LLAMA,  # single source of truth for the checkpoint id
    num_labels=2,
    quantization_config=quantization_config,
    device_map="auto",
    pad_token_id=tokenizer.pad_token_id,
    # Folder for weights offloaded to disk. Built from data_dir so it is an absolute
    # path on the mounted Drive; a relative "content/drive/..." path would resolve
    # against the current working directory instead.
    offload_folder=data_dir + "offload",
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# LoRA adapter settings for sequence classification: adapters are attached to the
# q_proj and v_proj modules, and the `score` module is listed in modules_to_save.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    modules_to_save=["score"],
)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Compute accuracy, precision and recall from an evaluation result.

    Args:
        eval_pred: Object exposing ``predictions`` (class scores, or a tuple whose
            first element holds them) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'`` and ``'recall'``.
    """
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    # Some models return several outputs; the class scores are taken to be the first one.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # The predicted class is the index of the highest score along the last axis.
    preds = np.argmax(predictions, axis=-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        # zero_division=0 keeps the score at 0 (without a warning) when the
        # positive class is never predicted / never present.
        'precision': precision_score(labels, preds, zero_division=0),
        'recall': recall_score(labels, preds, zero_division=0),
    }

In [None]:
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32

In [None]:
# Free garbage-collected objects and cached CUDA memory before building the Trainer.
clear_cache()

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch, keeping at most three checkpoints.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    # The datasets already expose only input_ids / attention_mask / labels.
    label_names=["labels"],
    remove_unused_columns=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args,

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.3071,0.350293,0.835271,0.856447,0.796144


  return fn(*args, **kwargs)
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.3071,0.350293,0.835271,0.856447,0.796144
2,0.2969,0.342187,0.846488,0.931253,0.740319
3,0.2792,0.329353,0.849844,0.877812,0.804526


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=3435, training_loss=0.37458488885665736, metrics={'train_runtime': 21538.5567, 'train_samples_per_second': 5.103, 'train_steps_per_second': 0.159, 'total_flos': 2.3614356406350643e+18, 'train_loss': 0.37458488885665736, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model to Drive. The Trainer was not given the tokenizer, so it
# is saved explicitly into the same folder (it carries the customised pad token).
fine_tuned_dir = data_dir + "fine_tuned_llama"
trainer.save_model(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)