In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

import wandb
from huggingface_hub import login, HfApi, create_repo
from pathlib import Path
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#Wandb & Huggingface keys
hf_token = "**REMOVED***"
wandb_api_key = "***REMOVED***"

In [3]:
# Intitialize Weights & Biases
if wandb_api_key:
    wandb.login(key=wandb_api_key)
    print("Successfully logged in to WANDB!")
else:
    print("No wandb key provided. Skipping wandb login.")

if hf_token:
    
    # Log in to Hugging Face
    login(token=hf_token)
    print("Successfully logged in to Hugging Face!")
else:
    print("Hugging Face token not found in notebook secrets.")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/012/r/rx/rxh210037/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mreyhaneh-rhp7[0m ([33mreyhaneh-rhp7-university-of-texas-at-dallas[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Successfully logged in to WANDB!
Successfully logged in to Hugging Face!


In [4]:
wandb.init(project="Jigsaw_PartB_training_Language_Head", name="Jigsaw_PartB_training_Language_Head")  # you can omit name if not needed


In [None]:
#Loead Data
data_folder = Path ("***/PartB/jigsaw-agile-community-rules")
train_path = data_folder / "train.csv"
test_path = data_folder / "test.csv"
df = pd.read_csv(train_path)
print(f"Dataframe shape: {df.shape}")
df.head()

df["label"] = df["rule_violation"].map({0: "complies", 1: "violates"})

stack_dataset = Dataset.from_pandas(df) # Convert pandas DataFrame to Hugging Face Dataset

selected_columns = {
    'text': stack_dataset['body'],
    'label': stack_dataset['label']
}

# Create a new dataset with the selected columns
stack_selected_columns = Dataset.from_dict(selected_columns)

# Set the format to Pandas
stack_selected_columns.set_format(type='pandas')
df =stack_selected_columns[:]
df.head()



Dataframe shape: (2029, 9)


Unnamed: 0,text,label
0,Banks don't want you to know this! Click here ...,complies
1,SD Stream [ ENG Link 1] (http://www.sportsstre...,complies
2,Lol. Try appealing the ban and say you won't d...,violates
3,she will come your home open her legs with an...,violates
4,code free tyrande --->>> [Imgur](http://i.imgu...,violates


In [6]:
class_names = ["complies", "violates"]
stack_selected_columns_final = Dataset.from_pandas(df)

# Split into Train / Validation / Test
# Split the test set into test and validation sets
test_val_splits = stack_selected_columns_final.train_test_split(test_size=0.2, seed=42)
train_split= test_val_splits['train']
test_val_splits = test_val_splits['test'].train_test_split(test_size=0.5, seed=42,)
val_split = test_val_splits['train']
test_split = test_val_splits['test']

dataset = DatasetDict({"train": train_split, "valid": val_split, "test": test_split})

dataset['train'][0]

{'text': "unethical but... make a SS# but state name and address and all perfectly.  If the IRS asks, just say that's the number you got, let the IRS sort if out for you.",
 'label': 'violates'}

# <font color = 'indianred'>**Load pre-trained Tokenizer**</font>


In [7]:
# Tokenization
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig, AutoModelForCausalLM

model_name = "Qwen/Qwen1.5-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

#<font color = 'indianred'> **Create Completion Dataset**


In [8]:
class_names = ["complies", "violates"]

def format_prompt_completion(example):
    prompt = f"Classify the TEXT by selecting label from the following list: {class_names}. ### TEXT: {example['text'].strip()} ### LABEL:"
    completion = f" {example['label'].strip()}"
    return {"prompt": prompt, "completion": completion}

dataset_completion = dataset.map(format_prompt_completion, remove_columns=["text", "label"])

dataset_completion

dataset_completion['train'][0]


Map: 100%|██████████| 1623/1623 [00:00<00:00, 25182.86 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 22905.51 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 23822.60 examples/s]


{'prompt': "Classify the TEXT by selecting label from the following list: ['complies', 'violates']. ### TEXT: unethical but... make a SS# but state name and address and all perfectly.  If the IRS asks, just say that's the number you got, let the IRS sort if out for you. ### LABEL:",
 'completion': ' violates'}

In [9]:
dataset_completion.push_to_hub(
    "reyhanehrhp7/Jigsaw_partB_language_head",
    private=False  # Set to True if you want it private
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 154.32ba/s]
Processing Files (1 / 1): 100%|██████████|  215kB /  215kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.17 shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 625.08ba/s]
Processing Files (1 / 1): 100%|██████████| 28.3kB / 28.3kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.40 shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 556.13ba/s]
Processing Files (1 / 1): 100%|██████████| 28.6kB / 28.6kB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.40 shards/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/reyhanehrhp7/Jigsaw_partB_language_head/commit/bd5b3e05c188bb259aa9e3f7debde07a36919c50', commit_message='Upload dataset', commit_description='', oid='bd5b3e05c188bb259aa9e3f7debde07a36919c50', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/reyhanehrhp7/Jigsaw_partB_language_head', endpoint='https://huggingface.co', repo_type='dataset', repo_id='reyhanehrhp7/Jigsaw_partB_language_head'), pr_revision=None, pr_num=None)

In [10]:
train_filltered = dataset_completion['train']
valid_filltered = dataset_completion['valid']

In [11]:
# Tokenize and truncate prompts to a fixed maximum length (in tokens)
max_length = 128  # shorten sequence length further to reduce memory

def tokenize_and_truncate(example):
    tok = tokenizer(example['prompt'], truncation=True, max_length=max_length, padding=False)
    return {'input_ids': tok['input_ids'], 'attention_mask': tok['attention_mask']}

dataset_completion = dataset_completion.map(tokenize_and_truncate, remove_columns=['prompt', 'completion'], batched=False)
dataset_completion

Map: 100%|██████████| 1623/1623 [00:00<00:00, 5924.17 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 4657.43 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 4258.65 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1623
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 203
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 203
    })
})

In [12]:
def get_appropriate_dtype():
    if torch.cuda.is_available() and torch.cuda.get_device_capability(0) >= (8, 0):
        return torch.bfloat16
    return torch.float16
torch_data_type = get_appropriate_dtype()

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_data_type,
    bnb_4bit_quant_storage=torch_data_type,
)

## Load quantized base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Apply 4-bit quantization settings
    device_map="auto",
    torch_dtype=torch_data_type,
    trust_remote_code=True,
)

# If a single CUDA device is available, move the whole model to that device
if torch.cuda.is_available():
    base_model.to('cuda')

# Prepare model for k-bit training - ESSENTIAL for QLoRA
model = prepare_model_for_kbit_training(base_model)

# LoRA config with smaller rank for memory efficiency
lora_config = LoraConfig(
    r=8, 
    lora_alpha=16, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Create PEFT model
model = get_peft_model(model, lora_config)

# Freeze everything except LoRA layers ---
for name, param in model.named_parameters():
    if "lora" not in name.lower():
        param.requires_grad = False


model.print_trainable_parameters()

# Free the base model and clear cache
del base_model
if torch.cuda.is_available():
    torch.cuda.empty_cache()

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.21s/it]


trainable params: 8,388,608 || all params: 7,729,713,152 || trainable%: 0.1085


In [13]:
from trl import SFTConfig
#Metrics and Training Arguments
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Ignore padding (-100)
    mask = labels != -100
    correct = (preds == labels) & mask
    token_acc = correct.sum() / mask.sum()

    return {"mean_token_accuracy": token_acc}


training_args = SFTConfig(
    output_dir="./results_qwen_lora",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=32,
    num_train_epochs=3,
    learning_rate=2e-4,
    # Use fp16 for mixed precision (set bf16=False to avoid bf16-only GPUs issues)
    bf16=True,
    fp16=False,
    # Reduce sequence length if memory is tight
    max_length = 128,
    # Enable gradient checkpointing to trade compute for memory
    gradient_checkpointing=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to=["wandb"],
    load_best_model_at_end=True,
    optim="paged_adamw_32bit"
)

# If gradient checkpointing is enabled, configure relevant settings
if training_args.gradient_checkpointing:
    model.config.use_cache = False  # Disable caching for compatibility



In [14]:
from trl import SFTTrainer 
# Ensure model cache is disabled for gradient checkpointing and training memory savings
model.config.use_cache = False

# Create trainer with the PEFT model instead of base_model
trainer = SFTTrainer(
    model=model,  # Use the PEFT-wrapped model
    args=training_args,
    train_dataset=train_filltered,
    eval_dataset=valid_filltered,
    peft_config=lora_config,
    compute_metrics=compute_metrics,
)

Adding EOS to train dataset: 100%|██████████| 1623/1623 [00:00<00:00, 59277.90 examples/s]
Tokenizing train dataset: 100%|██████████| 1623/1623 [00:00<00:00, 3468.52 examples/s]
Truncating train dataset: 100%|██████████| 1623/1623 [00:00<00:00, 530663.81 examples/s]
Adding EOS to eval dataset: 100%|██████████| 203/203 [00:00<00:00, 43074.00 examples/s]
Tokenizing eval dataset: 100%|██████████| 203/203 [00:00<00:00, 3370.05 examples/s]
Truncating eval dataset: 100%|██████████| 203/203 [00:00<00:00, 115591.06 examples/s]


In [15]:
%env WANDB_PROJECT = Jigsaw_PartB_training_Language_Head


env: WANDB_PROJECT=Jigsaw_PartB_training_Language_Head


## Start training 

In [16]:
def free_gpu_memory():
    """
    Frees up GPU memory by clearing cache and garbage collecting.
    """
    import torch
    import gc
    
    # Empty CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        
        # Get initial GPU memory info
        initial_mem = torch.cuda.memory_allocated()
        
        # Force garbage collection
        gc.collect()
        
        # Get final GPU memory info
        final_mem = torch.cuda.memory_allocated()
        
        print(f"GPU memory freed: {(initial_mem - final_mem) / 1024**2:.2f} MB")
    else:
        print("No GPU available")

In [17]:
import torch

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU name:", torch.cuda.get_device_name(0))


Torch version: 2.8.0+cu128
CUDA available: True
CUDA version: 12.8
GPU name: NVIDIA RTX A6000


In [18]:
free_gpu_memory()


GPU memory freed: 0.00 MB


In [19]:
import torch
torch.cuda.empty_cache()

del model
torch.cuda.empty_cache()


In [20]:
print(valid_filltered.column_names)


['prompt', 'completion']


In [21]:
# Print GPU memory stats and clear cache before training to reduce OOM occurrences
import torch
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
    print("Memory allocated before cleaning (MiB):", torch.cuda.memory_allocated() / 1024**2)
    print("Max memory reserved (MiB):", torch.cuda.max_memory_reserved() / 1024**2)
    # Ensure cache cleared and garbage collected
    free_gpu_memory()

# Finally start training inside try/except to catch OOM and provide a graceful message
try:
    trainer.train()
except RuntimeError as e:
    if 'CUDA out of memory' in str(e):
        print("CUDA out of memory error detected during trainer.train().")
        free_gpu_memory()
        # Suggest next steps to the user instead of retrying automatically
        print("Training OOM. Consider: lower per_device_train_batch_size, shorten max_length, increase gradient_accumulation_steps, enable/offload.")
    else:
        raise e



GPU name: NVIDIA RTX A6000
Memory allocated before cleaning (MiB): 5607.61083984375
Max memory reserved (MiB): 11656.0
GPU memory freed: 0.00 MB

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.





  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Mean Token Accuracy,Entropy,Num Tokens
1,0.3306,0.227301,0.88317,2.63787,118248.0
2,0.2082,0.203367,0.90915,2.600972,236496.0
3,0.1559,0.215159,0.899346,2.433299,354744.0


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [None]:
#evaluation on validaton set
metrics = trainer.evaluate()
print(metrics)



{'eval_loss': 0.2033674120903015, 'eval_mean_token_accuracy': 0.9091503292906518, 'eval_runtime': 17.9678, 'eval_samples_per_second': 11.298, 'eval_steps_per_second': 5.677, 'eval_entropy': 2.600971795764624, 'eval_num_tokens': 354744.0, 'epoch': 3.0}


In [23]:
#Evaluation on test set
test_filltered = dataset_completion['test']

In [24]:
test_metrics = trainer.evaluate(eval_dataset=test_filltered)
print(test_metrics)

{'eval_loss': 4.292723178863525, 'eval_mean_token_accuracy': 0.3289932938767414, 'eval_runtime': 18.0345, 'eval_samples_per_second': 11.256, 'eval_steps_per_second': 5.656, 'eval_entropy': 2.6436598476241615, 'eval_num_tokens': 354744.0, 'epoch': 3.0}


In [25]:
print("Best checkpoint:", trainer.state.best_model_checkpoint)
print("Best metric:", trainer.state.best_metric)


Best checkpoint: ./results_qwen_lora/checkpoint-52
Best metric: 0.2033674120903015


In [None]:
wandb.finish()