In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

def setup_model_and_tokenizer(model_name, device):
    tokenizer =  AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False


    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                # Preferred: Another phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                # Rejected: A ham email
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                # Preferred: Another ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                # Rejected: A phishing email
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        
        pair = self.pairs[idx]


        message_text = f"Sender: {pair['message']['sender']} [SEP] Subject: {pair['message']['subject']} [SEP] {pair['message']['body']}"
     
        preferred_response = f"Sender: {pair['preferred']['sender']} [SEP] Subject: {pair['preferred']['subject']} [SEP] {pair['preferred']['body']}"
        rejected_response = f"Sender: {pair['rejected']['sender']} [SEP] Subject: {pair['rejected']['subject']} [SEP] {pair['rejected']['body']}"
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=5, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

  
    device = setup_environment()
    model_name = 'meta-llama/Meta-Llama-3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_8b_dpo_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": s tr(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/500] - Loss: 6.0408
[Epoch 1/8 | Step 10/500] - Loss: 3.5454
[Epoch 1/8 | Step 20/500] - Loss: 3.1262
[Epoch 1/8 | Step 30/500] - Loss: 3.4185
[Epoch 1/8 | Step 40/500] - Loss: 3.4728
[Epoch 1/8 | Step 50/500] - Loss: 3.5754
[Epoch 1/8 | Step 60/500] - Loss: 3.7399
[Epoch 1/8 | Step 70/500] - Loss: 3.7017
[Epoch 1/8 | Step 80/500] - Loss: 3.6941
[Epoch 1/8 | Step 90/500] - Loss: 3.6940
[Epoch 1/8 | Step 100/500] - Loss: 3.7085
[Epoch 1/8 | Step 110/500] - Loss: 3.6996
[Epoch 1/8 | Step 120/500] - Loss: 3.6417
[Epoch 1/8 | Step 130/500] - Loss: 3.5847
[Epoch 1/8 | Step 140/500] - Loss: 3.6472
[Epoch 1/8 | Step 150/500] - Loss: 3.6390
[Epoch 1/8 | Step 160/500] - Loss: 3.6050
[Epoch 1/8 | Step 170/500] - Loss: 3.5473
[Epoch 1/8 | Step 180/500] - Loss: 3.4963
[Epoch 1/8 | Step 190/500] - Loss: 3.4660
[Epoch 1/8 | Step 200/500] - Loss: 3.4016
[Epoch 1/8 | Step 210/500] - Loss: 3.4156
[Epoch 1/8 | Step 220/500] - Loss: 3.4077
[Epoch 1/8 | Step 230/500] - Loss: 3.3576
[Ep

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

def setup_model_and_tokenizer(model_name, device):
    tokenizer =  AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False


    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=5, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

  
    device = setup_environment()
    model_name = 'meta-llama/Meta-Llama-3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_8b_dpo123_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/500] - Loss: 1.0263
[Epoch 1/8 | Step 10/500] - Loss: 2.2608
[Epoch 1/8 | Step 20/500] - Loss: 2.6308
[Epoch 1/8 | Step 30/500] - Loss: 3.0243
[Epoch 1/8 | Step 40/500] - Loss: 3.1960
[Epoch 1/8 | Step 50/500] - Loss: 3.4414
[Epoch 1/8 | Step 60/500] - Loss: 3.3087
[Epoch 1/8 | Step 70/500] - Loss: 3.2294
[Epoch 1/8 | Step 80/500] - Loss: 3.2480
[Epoch 1/8 | Step 90/500] - Loss: 3.3849
[Epoch 1/8 | Step 100/500] - Loss: 3.3588
[Epoch 1/8 | Step 110/500] - Loss: 3.5123
[Epoch 1/8 | Step 120/500] - Loss: 3.5350
[Epoch 1/8 | Step 130/500] - Loss: 3.5167
[Epoch 1/8 | Step 140/500] - Loss: 3.4866
[Epoch 1/8 | Step 150/500] - Loss: 3.4178
[Epoch 1/8 | Step 160/500] - Loss: 3.3666
[Epoch 1/8 | Step 170/500] - Loss: 3.4325
[Epoch 1/8 | Step 180/500] - Loss: 3.4171
[Epoch 1/8 | Step 190/500] - Loss: 3.4409
[Epoch 1/8 | Step 200/500] - Loss: 3.3828
[Epoch 1/8 | Step 210/500] - Loss: 3.3968
[Epoch 1/8 | Step 220/500] - Loss: 3.4264
[Epoch 1/8 | Step 230/500] - Loss: 3.3990
[Ep

In [3]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
# Keep/Add these imports
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login

import re
import traceback




from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

def setup_model_and_tokenizer(model_name, device):
    """Setup BERT model and tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config
    )
    
    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        # Construct the prompt for phishing or ham detection
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=5, beta=0.2, gradient_accumulation_steps=2):
    """
    Train the model using DPO (Direct Preference Optimization)
    
    Args:
        policy_model: The model being trained
        reference_model: The reference model for DPO
        train_loader: DataLoader for training data
        val_loader: DataLoader for validation data
        optimizer: Optimizer for training
        scheduler: Learning rate scheduler
        device: Device to train on (cuda/cpu)
        num_epochs: Number of training epochs
        beta: Temperature parameter for DPO
        gradient_accumulation_steps: Number of steps to accumulate gradients
    """
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device)
    reference_model = reference_model.to(device)
    reference_model.eval()
    
    # Initialize GradScaler for mixed precision training
    scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                if scaler is not None:
                    # Mixed precision training path
                    with torch.cuda.amp.autocast():
                        # Forward pass
                        policy_chosen_outputs = policy_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        policy_rejected_outputs = policy_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                        
                        with torch.no_grad():
                            ref_chosen_outputs = reference_model(
                                input_ids=batch['preferred_input_ids'],
                                attention_mask=batch['preferred_attention_mask']
                            )
                            ref_rejected_outputs = reference_model(
                                input_ids=batch['rejected_input_ids'],
                                attention_mask=batch['rejected_attention_mask']
                            )
                        
                        # Compute loss
                        loss = compute_dpo_loss(
                            policy_chosen_outputs.logits,
                            policy_rejected_outputs.logits,
                            ref_chosen_outputs.logits,
                            ref_rejected_outputs.logits,
                            beta=beta
                        )
                        
                        if not torch.isnan(loss) and not torch.isinf(loss):
                            loss = loss / gradient_accumulation_steps
                            # Mixed precision backward pass
                            scaler.scale(loss).backward()
                            
                            if (step + 1) % gradient_accumulation_steps == 0:
                                scaler.unscale_(optimizer)
                                torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                                scaler.step(optimizer)
                                scaler.update()
                                scheduler.step()
                                optimizer.zero_grad()
                            
                            total_loss += loss.item() * gradient_accumulation_steps
                            valid_steps += 1
                else:
                    # Standard precision training path
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        loss = loss / gradient_accumulation_steps
                        loss.backward()
                        
                        if (step + 1) % gradient_accumulation_steps == 0:
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            optimizer.step()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item() * gradient_accumulation_steps
                        valid_steps += 1
                
                if step % 10 == 0:
                    avg_loss = total_loss / max(valid_steps, 1)
                    print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
            
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            # Memory management
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items()}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
    """
    Evaluate the model using DPO loss
    
    Args:
        policy_model: The model being evaluated
        reference_model: The reference model for DPO
        val_loader: DataLoader for validation data
        device: Device to evaluate on (cuda/cpu)
        beta: Temperature parameter for DPO
    """
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            if torch.cuda.is_available():
                with torch.cuda.amp.autocast():
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    ref_chosen_outputs = reference_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    ref_rejected_outputs = reference_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
            else:
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
            
            loss = compute_dpo_loss(
                policy_chosen_outputs.logits,
                policy_rejected_outputs.logits,
                ref_chosen_outputs.logits,
                ref_rejected_outputs.logits,
                beta=beta
            )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

# Helper function for computing DPO loss
def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
    """
    Compute the DPO loss between policy and reference models
    
    Args:
        policy_chosen_logits: Logits from policy model for chosen examples
        policy_rejected_logits: Logits from policy model for rejected examples
        reference_chosen_logits: Logits from reference model for chosen examples
        reference_rejected_logits: Logits from reference model for rejected examples
        beta: Temperature parameter
    """
    epsilon = 1e-8
    
    # Get probabilities
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
    # Compute rewards
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    # Clamp rewards
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    # Compute loss
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def main():

    
    device = setup_environment()
    model_name = 'bert-base-uncased'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/bert_dpo123_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/500] - Loss: 0.8203
[Epoch 1/8 | Step 10/500] - Loss: 0.7087
[Epoch 1/8 | Step 20/500] - Loss: 0.7226
[Epoch 1/8 | Step 30/500] - Loss: 0.7019
[Epoch 1/8 | Step 40/500] - Loss: 0.7047
[Epoch 1/8 | Step 50/500] - Loss: 0.6942
[Epoch 1/8 | Step 60/500] - Loss: 0.6977
[Epoch 1/8 | Step 70/500] - Loss: 0.6917
[Epoch 1/8 | Step 80/500] - Loss: 0.6937
[Epoch 1/8 | Step 90/500] - Loss: 0.6947
[Epoch 1/8 | Step 100/500] - Loss: 0.6892
[Epoch 1/8 | Step 110/500] - Loss: 0.6869
[Epoch 1/8 | Step 120/500] - Loss: 0.6846
[Epoch 1/8 | Step 130/500] - Loss: 0.6834
[Epoch 1/8 | Step 140/500] - Loss: 0.6790
[Epoch 1/8 | Step 150/500] - Loss: 0.6726
[Epoch 1/8 | Step 160/500] - Loss: 0.6572
[Epoch 1/8 | Step 170/500] - Loss: 0.6463
[Epoch 1/8 | Step 180/500] - Loss: 0.6308
[Epoch 1/8 | Step 190/500] - Loss: 0.6082
[Epoch 1/8 | Step 200/500] - Loss: 0.5822
[Epoch 1/8 | Step 210/500] - Loss: 0.5622
[Epoch 1/8 | Step 220/500] - Loss: 0.5386
[Epoch 1/8 | Step 230/500] - Loss: 0.5172
[Ep

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
# Keep/Add these imports
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login

import re
import traceback




from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

def setup_model_and_tokenizer(model_name, device):
    """Setup BERT model and tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config
    )
    
    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        # Construct the prompt for phishing or ham detection
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=5, beta=0.2, gradient_accumulation_steps=2):
    """
    Train the model using DPO (Direct Preference Optimization)
    
    Args:
        policy_model: The model being trained
        reference_model: The reference model for DPO
        train_loader: DataLoader for training data
        val_loader: DataLoader for validation data
        optimizer: Optimizer for training
        scheduler: Learning rate scheduler
        device: Device to train on (cuda/cpu)
        num_epochs: Number of training epochs
        beta: Temperature parameter for DPO
        gradient_accumulation_steps: Number of steps to accumulate gradients
    """
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device)
    reference_model = reference_model.to(device)
    reference_model.eval()
    
    # Initialize GradScaler for mixed precision training
    scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                if scaler is not None:
                    # Mixed precision training path
                    with torch.cuda.amp.autocast():
                        # Forward pass
                        policy_chosen_outputs = policy_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        policy_rejected_outputs = policy_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                        
                        with torch.no_grad():
                            ref_chosen_outputs = reference_model(
                                input_ids=batch['preferred_input_ids'],
                                attention_mask=batch['preferred_attention_mask']
                            )
                            ref_rejected_outputs = reference_model(
                                input_ids=batch['rejected_input_ids'],
                                attention_mask=batch['rejected_attention_mask']
                            )
                        
                        # Compute loss
                        loss = compute_dpo_loss(
                            policy_chosen_outputs.logits,
                            policy_rejected_outputs.logits,
                            ref_chosen_outputs.logits,
                            ref_rejected_outputs.logits,
                            beta=beta
                        )
                        
                        if not torch.isnan(loss) and not torch.isinf(loss):
                            loss = loss / gradient_accumulation_steps
                            # Mixed precision backward pass
                            scaler.scale(loss).backward()
                            
                            if (step + 1) % gradient_accumulation_steps == 0:
                                scaler.unscale_(optimizer)
                                torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                                scaler.step(optimizer)
                                scaler.update()
                                scheduler.step()
                                optimizer.zero_grad()
                            
                            total_loss += loss.item() * gradient_accumulation_steps
                            valid_steps += 1
                else:
                    # Standard precision training path
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        loss = loss / gradient_accumulation_steps
                        loss.backward()
                        
                        if (step + 1) % gradient_accumulation_steps == 0:
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            optimizer.step()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item() * gradient_accumulation_steps
                        valid_steps += 1
                
                if step % 10 == 0:
                    avg_loss = total_loss / max(valid_steps, 1)
                    print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
            
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            # Memory management
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items()}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
    
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            if torch.cuda.is_available():
                with torch.cuda.amp.autocast():
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    ref_chosen_outputs = reference_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    ref_rejected_outputs = reference_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
            else:
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
            
            loss = compute_dpo_loss(
                policy_chosen_outputs.logits,
                policy_rejected_outputs.logits,
                ref_chosen_outputs.logits,
                ref_rejected_outputs.logits,
                beta=beta
            )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

# Helper function for computing DPO loss
def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
    """
    Compute the DPO loss between policy and reference models
    
    Args:
        policy_chosen_logits: Logits from policy model for chosen examples
        policy_rejected_logits: Logits from policy model for rejected examples
        reference_chosen_logits: Logits from reference model for chosen examples
        reference_rejected_logits: Logits from reference model for rejected examples
        beta: Temperature parameter
    """
    epsilon = 1e-8
    
    # Get probabilities
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
    # Compute rewards
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    # Clamp rewards
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    # Compute loss
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def main():

   
    device = setup_environment()
    model_name = 'bert-base-uncased'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/bert_dpo123_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/500] - Loss: 0.8203
[Epoch 1/8 | Step 10/500] - Loss: 0.7087
[Epoch 1/8 | Step 20/500] - Loss: 0.7226
[Epoch 1/8 | Step 30/500] - Loss: 0.7019
[Epoch 1/8 | Step 40/500] - Loss: 0.7047
[Epoch 1/8 | Step 50/500] - Loss: 0.6942
[Epoch 1/8 | Step 60/500] - Loss: 0.6977
[Epoch 1/8 | Step 70/500] - Loss: 0.6917
[Epoch 1/8 | Step 80/500] - Loss: 0.6937
[Epoch 1/8 | Step 90/500] - Loss: 0.6947
[Epoch 1/8 | Step 100/500] - Loss: 0.6892
[Epoch 1/8 | Step 110/500] - Loss: 0.6869
[Epoch 1/8 | Step 120/500] - Loss: 0.6846
[Epoch 1/8 | Step 130/500] - Loss: 0.6834
[Epoch 1/8 | Step 140/500] - Loss: 0.6790
[Epoch 1/8 | Step 150/500] - Loss: 0.6726
[Epoch 1/8 | Step 160/500] - Loss: 0.6572
[Epoch 1/8 | Step 170/500] - Loss: 0.6463
[Epoch 1/8 | Step 180/500] - Loss: 0.6308
[Epoch 1/8 | Step 190/500] - Loss: 0.6082
[Epoch 1/8 | Step 200/500] - Loss: 0.5822
[Epoch 1/8 | Step 210/500] - Loss: 0.5622
[Epoch 1/8 | Step 220/500] - Loss: 0.5386
[Epoch 1/8 | Step 230/500] - Loss: 0.5172
[Ep