In [None]:
!pip install --upgrade datasets
!pip install --upgrade transformers
!pip install bitsandbytes
!pip install accelerate
!pip install jsonlines
!pip install --upgrade peft
!pip install --upgrade trl
!pip install tensorboard

Collecting bitsandbytes
  Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->bitsandbytes)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3

In [2]:
# import required packages
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset, Dataset
import jsonlines as jl
from peft import LoraConfig
from trl import SFTTrainer
import random

In [3]:
# hugging_face authentication
hf_token = "write_your_huggibg_face_token"
from huggingface_hub import login
login(token="write_your_hugging_face_token", add_to_git_credential=True)

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/malarvis/.cache/huggingface/token
Login successful


**OPIONIONS MODEL**

In [4]:
# read data
opinion_train_data = pd.read_csv("opinion_train.csv")
opinion_val_data = pd.read_csv("opinion_val.csv")

In [5]:
# remove missing values if any
opinion_train_data = opinion_train_data.dropna()
opinion_val_data = opinion_val_data.dropna()

In [6]:
opinion_train = Dataset.from_pandas(opinion_train_data)
opinions_train = []
for __, data in enumerate(opinion_train):
    opinions_train.append({"text": data['sentence']})
opinion_val = Dataset.from_pandas(opinion_val_data)
opinions_val = []
for __, data in enumerate(opinion_val):
    opinions_val.append({"text": data['sentence']})

In [7]:
# write the constructed list into json
with jl.open('opinion-train.jsonl', 'w') as writer:
    writer.write_all(opinions_train[0:])
with jl.open('opinion-val.jsonl', 'w') as writer:
    writer.write_all(opinions_val[0:])

In [8]:
# convert json to huggingface dataset
opinion_dataset_train = load_dataset('json', data_files='opinion-train.jsonl', split="train")
print(opinion_dataset_train)
opinion_dataset_val = load_dataset('json', data_files='opinion-val.jsonl', split="train")
print(opinion_dataset_val)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 890115
})


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 1556245
})


In [9]:
if len(opinion_dataset_val) > 50000:
    opinion_dataset_val = opinion_dataset_val.shuffle(seed=42).select(range(50000))
else:
    opinion_dataset_val = opinion_dataset_val

print("Sampled Validation Dataset:")
print(opinion_dataset_val)

Sampled Validation Dataset:
Dataset({
    features: ['text'],
    num_rows: 50000
})


**DEFINING PARAMETERS**

In [10]:
# define some variables - model names
model_name2 = "google/gemma-2b"
new_model2 = "gemma-ft-opinion"

################################################################################
# LoRA parameters
################################################################################
# LoRA attention dimension
# lora_r = 64
lora_r2 = 8
# Alpha parameter for LoRA scaling
lora_alpha2 = 8
# Dropout probability for LoRA layers
lora_dropout2 = 0.2

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit2 = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype2 = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type2 = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant2 = False

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir2 = "results2"
# Number of training epochs
num_train_epochs2 = 1
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp162 = False
bf162 = False
# Batch size per GPU for training
per_device_train_batch_size2 = 4
# Batch size per GPU for evaluation
per_device_eval_batch_size2 = 4
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps2 = 1
# Enable gradient checkpointing
gradient_checkpointing2 = True
# Maximum gradient normal (gradient clipping)
max_grad_norm2 = 0.5
# Initial learning rate (AdamW optimizer)
learning_rate2 = 3e-5
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay2 = 0.001
# Optimizer to use
optim2 = "paged_adamw_32bit"
# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type2 = "constant"
# Number of training steps (overrides num_train_epochs)
max_steps2 = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio2 = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length2 = True
# Save checkpoint every X updates steps
save_steps2 = 10000
# Log every X updates steps
logging_steps2 = 5000

################################################################################
# SFT parameters
################################################################################
# Maximum sequence length to use
max_seq_length2 = 128
# Pack multiple short examples in the same input sequence to increase efficiency
packing2 = True
# Load the entire model on the GPU 0
device_map = {"": 0}

In [11]:
# Load QLoRA configuration
compute_dtype2 = getattr(torch, bnb_4bit_compute_dtype2)

bnb_config2 = BitsAndBytesConfig(
    load_in_4bit=use_4bit2, # Activates 4-bit precision loading
    bnb_4bit_quant_type=bnb_4bit_quant_type2, # nf4
    bnb_4bit_compute_dtype=compute_dtype2, # float16
    bnb_4bit_use_double_quant=use_nested_quant2, # False
)

In [12]:
# Check GPU compatibility with bfloat16
if compute_dtype2 == torch.float16 and use_4bit2:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("Setting BF16 to True")
        bf162 = True
    else:
        bf162 = False

**FINE-TUNING AND TRAINING**

In [13]:
# Load base model
model2 = AutoModelForCausalLM.from_pretrained(
    model_name2,
    token=hf_token,
    quantization_config=bnb_config2,
    device_map=device_map
)
model2.config.use_cache = False
model2.config.pretraining_tp = 1

tokenizer2 = AutoTokenizer.from_pretrained(model_name2,
                                          token=hf_token,
                                          trust_remote_code=True)
tokenizer2.pad_token = tokenizer2.eos_token
tokenizer2.padding_side = "right" # Fix weird overflow issue with fp16 training


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [14]:
# Load LoRA configuration
peft_config2 = LoraConfig(
    lora_alpha=lora_alpha2,
    lora_dropout=lora_dropout2,
    r=lora_r2,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj"]
)

In [15]:
# Set training parameters
training_arguments2 = TrainingArguments(
    output_dir=output_dir2,
    num_train_epochs=num_train_epochs2,
    per_device_train_batch_size=per_device_train_batch_size2,
    per_device_eval_batch_size=per_device_eval_batch_size2,
    gradient_accumulation_steps=gradient_accumulation_steps2,
    optim=optim2,
    save_steps=save_steps2,
    logging_steps=logging_steps2,
    eval_strategy="steps",
    eval_steps=5000,
    learning_rate=learning_rate2,
    weight_decay=weight_decay2,
    fp16=fp162,
    bf16=bf162,
    max_grad_norm=max_grad_norm2,
    max_steps=max_steps2,
    warmup_ratio=warmup_ratio2,
    group_by_length=group_by_length2,
    lr_scheduler_type=lr_scheduler_type2,
    report_to="tensorboard",
)
training_arguments2

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=5000,
eval_strategy=steps,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp

In [16]:
# Set supervised fine-tuning parameters
trainer2 = SFTTrainer(
    model=model2,
    train_dataset=opinion_dataset_train,
    eval_dataset=opinion_dataset_val,
    peft_config=peft_config2,
    dataset_text_field="text",
    max_seq_length=max_seq_length2,
    tokenizer=tokenizer2,
    args=training_arguments2,
    packing=packing2,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [17]:
# Train model
trainer2.train()

Step,Training Loss,Validation Loss
5000,3.2916,3.20648
10000,3.2008,3.196189
15000,3.1906,3.182414
20000,3.1835,3.173559
25000,3.1727,3.173165


TrainOutput(global_step=25948, training_loss=3.2064292628977906, metrics={'train_runtime': 19796.6556, 'train_samples_per_second': 5.243, 'train_steps_per_second': 1.311, 'total_flos': 1.585490217963356e+17, 'train_loss': 3.2064292628977906, 'epoch': 1.0})

In [18]:
# Save trained model
trainer2.model.save_pretrained(new_model2)

In [None]:
%load_ext tensorboard
%tensorboard --logdir results2/runs