In [44]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

import wandb
from huggingface_hub import login, HfApi, create_repo
from pathlib import Path
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


In [None]:
#Wandb & Huggingface keys
hf_token = "*****"
wandb_api_key = "******"

In [46]:
# Intitialize Weights & Biases
if wandb_api_key:
    wandb.login(key=wandb_api_key)
    print("Successfully logged in to WANDB!")
else:
    print("No wandb key provided. Skipping wandb login.")



Successfully logged in to WANDB!


In [47]:
if hf_token:
    # Log in to Hugging Face
    login(token=hf_token)
    print("Successfully logged in to Hugging Face!")
else:
    print("Hugging Face token not found in notebook secrets.")


Successfully logged in to Hugging Face!


In [None]:
#Loead Data
data_folder = Path ("****/jigsaw-agile-community-rules")
train_path = data_folder / "train.csv"
test_path = data_folder / "test.csv"
df = pd.read_csv(train_path)
print(f"Dataframe shape: {df.shape}")
df.head()

Dataframe shape: (2029, 9)


Unnamed: 0,row_id,body,rule,subreddit,positive_example_1,positive_example_2,negative_example_1,negative_example_2,rule_violation
0,0,Banks don't want you to know this! Click here ...,"No Advertising: Spam, referral links, unsolici...",Futurology,If you could tell your younger self something ...,hunt for lady for jack off in neighbourhood ht...,Watch Golden Globe Awards 2017 Live Online in ...,"DOUBLE CEE x BANDS EPPS - ""BIRDS""\n\nDOWNLOAD/...",0
1,1,SD Stream [ ENG Link 1] (http://www.sportsstre...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,[I wanna kiss you all over! Stunning!](http://...,LOLGA.COM is One of the First Professional Onl...,#Rapper \n🚨Straight Outta Cross Keys SC 🚨YouTu...,[15 Amazing Hidden Features Of Google Search Y...,0
2,2,Lol. Try appealing the ban and say you won't d...,No legal advice: Do not offer or request legal...,pcmasterrace,Don't break up with him or call the cops. If ...,It'll be dismissed: https://en.wikipedia.org/w...,Where is there a site that still works where y...,Because this statement of his is true. It isn'...,1
3,3,she will come your home open her legs with an...,"No Advertising: Spam, referral links, unsolici...",sex,Selling Tyrande codes for 3€ to paypal. PM. \n...,tight pussy watch for your cock get her at thi...,NSFW(obviously) http://spankbang.com/iy3u/vide...,Good News ::Download WhatsApp 2.16.230 APK for...,1
4,4,code free tyrande --->>> [Imgur](http://i.imgu...,"No Advertising: Spam, referral links, unsolici...",hearthstone,wow!! amazing reminds me of the old days.Well...,seek for lady for sex in around http://p77.pl/...,must be watch movie https://sites.google.com/s...,We're streaming Pokemon Veitnamese Crystal RIG...,1


In [49]:
df = df[["body", "rule_violation"]].rename(columns={"body": "text", "rule_violation": "label"})


In [50]:
# Split into Train / Validation / Test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"])

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

Train: 1623 | Val: 203 | Test: 203


In [51]:
def get_appropriate_dtype():
    if torch.cuda.is_available() and torch.cuda.get_device_capability(0) >= (8, 0):
        return torch.bfloat16
    return torch.float16

torch_data_type = get_appropriate_dtype()
torch_data_type

torch.bfloat16

In [52]:
#Tokenizer + Model (Qwen-2-7B)
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

model_name = "Qwen/Qwen1.5-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
  bnb_4bit_compute_dtype=torch_data_type,
  bnb_4bit_quant_storage=torch_data_type,
)
## Load quantized base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # Binary classification
    problem_type="single_label_classification",
    quantization_config=bnb_config,  # Apply 8-bit quantization
    device_map="auto",
    torch_dtype=torch_data_type,
    trust_remote_code=True)
  # Automatically map model to available devices


Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen1.5-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
base_model

Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 4096)
    (layers): ModuleList(
      (0-31): 32 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((4096,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((4096,), eps=1e-06)
      )
    )
    (norm): Qwen2R

In [53]:
#Attach a LoRA adapter (PEFT)

# Prepare for LoRA fine-tuning
model = prepare_model_for_kbit_training(base_model)

# Qwen attention modules use slightly different names than LLaMA/Mistral
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)




In [54]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 16,785,408 || all params: 7,115,788,288 || trainable%: 0.2359


In [14]:
# def free_gpu_memory():
#     """
#     Frees up GPU memory after CUDA out-of-memory error in Colab.

#     This function performs the following steps:
#     1. Deletes all PyTorch objects to clear references.
#     2. Calls garbage collection to remove unreferenced objects from memory.
#     3. Uses torch.cuda.empty_cache() to release cached GPU memory.
#     4. Waits for a moment to ensure memory is fully released.
#     """
#     try:
#         # Delete all torch tensors to free up memory
#         for obj in list(locals().values()):
#             if torch.is_tensor(obj):
#                 del obj

#         # Collect garbage to release any remaining unused memory
#         gc.collect()

#         # Empty the CUDA cache to release GPU memory
#         torch.cuda.empty_cache()

#         # Adding a small delay to allow memory to be fully released
#         time.sleep(2)

#         print("GPU memory has been freed.")
#     except Exception as e:
#         print(f"Error while freeing GPU memory: {e}")


In [55]:
# Tokenize Datasets

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dfataset = Dataset.from_pandas(val_df)

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

train_dataset = train_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)
val_dataset = val_dfataset.map(tokenize_fn, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 1623/1623 [00:00<00:00, 15145.53 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 12510.01 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 13035.56 examples/s]


In [56]:
#Metrics and Training Arguments
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    acc = accuracy_score(labels, preds)
    f1  = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


training_args = TrainingArguments(
    output_dir="./results_qwen_lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=True,
    gradient_checkpointing=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to=["wandb"],
    load_best_model_at_end=True,
)



In [17]:
# Train
trainer = Trainer(
    model=model,  # now a PEFT-wrapped model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,22.229,1.479141,0.615764,0.458333
2,6.2782,0.658259,0.768473,0.763819
3,2.1972,0.666508,0.778325,0.794521


TrainOutput(global_step=306, training_loss=10.234817654478784, metrics={'train_runtime': 1636.6808, 'train_samples_per_second': 2.975, 'train_steps_per_second': 0.187, 'total_flos': 4.856317302590669e+16, 'train_loss': 10.234817654478784, 'epoch': 3.0})

In [18]:
trainer.evaluate(val_dataset)

{'eval_loss': 0.6582591533660889,
 'eval_accuracy': 0.7684729064039408,
 'eval_f1': 0.7638190954773869,
 'eval_runtime': 11.5148,
 'eval_samples_per_second': 17.629,
 'eval_steps_per_second': 2.258,
 'epoch': 3.0}

In [19]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.7552276253700256,
 'eval_accuracy': 0.7438423645320197,
 'eval_f1': 0.7425742574257426,
 'eval_runtime': 11.4942,
 'eval_samples_per_second': 17.661,
 'eval_steps_per_second': 2.262,
 'epoch': 3.0}

In [57]:
# Find the best checkpoint saved by Trainer
import os

best_model_dir = trainer.state.best_model_checkpoint
print("Best model checkpoint:", best_model_dir)


Best model checkpoint: ./results_qwen_lora/checkpoint-306


## Inference


In [58]:
# Load that checkpoint
from transformers import AutoModelForSequenceClassification
model_name = "Qwen/Qwen1.5-7B" 
best_lora_checkpoint = best_model_dir


In [59]:

# Quantization setup (same as training)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [60]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


In [61]:
# Load base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    problem_type="single_label_classification",
    quantization_config=bnb_config,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen1.5-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
from peft import PeftModel

# Load LoRA adapter checkpoint
model = PeftModel.from_pretrained(base_model, best_lora_checkpoint)
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Qwen2ForSequenceClassification(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 4096)
        (layers): ModuleList(
          (0-31): 32 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )


In [64]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()
print(" GPU cache cleared.")


 GPU cache cleared.


In [65]:
from tqdm import tqdm
df_test = pd.read_csv(test_path)

# --- Run inference on your test data ---
preds, probs = [], []

for text in tqdm(df_test["body"], desc="Running inference on test set"):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    ).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=-1).item()
        prob = torch.softmax(logits, dim=-1)[0, 1].item()

    preds.append(pred)
    probs.append(prob)

Running inference on test set: 100%|██████████| 10/10 [00:00<00:00, 15.10it/s]


In [66]:

output_df = pd.DataFrame({
    "row_id": df_test["row_id"],       # first column (from test_df)
    "rule_violation": preds            # second column (model predictions)
})

In [67]:
# Save to CSV
output_path = "submission_partA_best.csv"
output_df.to_csv(output_path, index=False)