In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,LlamaModel,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

def setup_model_and_tokenizer(model_name, device):
    tokenizer =  AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False


    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=8, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

    login(token="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")
    device = setup_environment()
    model_name = 'meta-llama/Meta-Llama-3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_8B")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/600] - Loss: 7.1610
[Epoch 1/8 | Step 10/600] - Loss: 4.7485
[Epoch 1/8 | Step 20/600] - Loss: 4.7732
[Epoch 1/8 | Step 30/600] - Loss: 4.3574
[Epoch 1/8 | Step 40/600] - Loss: 3.9310
[Epoch 1/8 | Step 50/600] - Loss: 4.1771
[Epoch 1/8 | Step 60/600] - Loss: 4.3604
[Epoch 1/8 | Step 70/600] - Loss: 4.3687
[Epoch 1/8 | Step 80/600] - Loss: 4.3960
[Epoch 1/8 | Step 90/600] - Loss: 4.4058
[Epoch 1/8 | Step 100/600] - Loss: 4.4079
[Epoch 1/8 | Step 110/600] - Loss: 4.4454
[Epoch 1/8 | Step 120/600] - Loss: 4.3950
[Epoch 1/8 | Step 130/600] - Loss: 4.4077
[Epoch 1/8 | Step 140/600] - Loss: 4.2805
[Epoch 1/8 | Step 150/600] - Loss: 4.2940
[Epoch 1/8 | Step 160/600] - Loss: 4.3175
[Epoch 1/8 | Step 170/600] - Loss: 4.2830
[Epoch 1/8 | Step 180/600] - Loss: 4.1601
[Epoch 1/8 | Step 190/600] - Loss: 4.1684
[Epoch 1/8 | Step 200/600] - Loss: 4.0842
[Epoch 1/8 | Step 210/600] - Loss: 3.9931
[Epoch 1/8 | Step 220/600] - Loss: 4.0245
[Epoch 1/8 | Step 230/600] - Loss: 4.0657
[Ep

[Epoch 4/8 | Step 130/600] - Loss: 2.2164
[Epoch 4/8 | Step 140/600] - Loss: 2.2823
[Epoch 4/8 | Step 150/600] - Loss: 2.3189
[Epoch 4/8 | Step 160/600] - Loss: 2.3333
[Epoch 4/8 | Step 170/600] - Loss: 2.2912
[Epoch 4/8 | Step 180/600] - Loss: 2.3598
[Epoch 4/8 | Step 190/600] - Loss: 2.4061
[Epoch 4/8 | Step 200/600] - Loss: 2.3701
[Epoch 4/8 | Step 210/600] - Loss: 2.3815
[Epoch 4/8 | Step 220/600] - Loss: 2.3756
[Epoch 4/8 | Step 230/600] - Loss: 2.3394
[Epoch 4/8 | Step 240/600] - Loss: 2.3399
[Epoch 4/8 | Step 250/600] - Loss: 2.3257
[Epoch 4/8 | Step 260/600] - Loss: 2.3078
[Epoch 4/8 | Step 270/600] - Loss: 2.3147
[Epoch 4/8 | Step 280/600] - Loss: 2.3165
[Epoch 4/8 | Step 290/600] - Loss: 2.3023
[Epoch 4/8 | Step 300/600] - Loss: 2.2862
[Epoch 4/8 | Step 310/600] - Loss: 2.2724
[Epoch 4/8 | Step 320/600] - Loss: 2.2588
[Epoch 4/8 | Step 330/600] - Loss: 2.2219
[Epoch 4/8 | Step 340/600] - Loss: 2.2035
[Epoch 4/8 | Step 350/600] - Loss: 2.1889
[Epoch 4/8 | Step 360/600] - Loss:

[Epoch 7/8 | Step 260/600] - Loss: 1.9099
[Epoch 7/8 | Step 270/600] - Loss: 1.9065
[Epoch 7/8 | Step 280/600] - Loss: 1.9100
[Epoch 7/8 | Step 290/600] - Loss: 1.9284
[Epoch 7/8 | Step 300/600] - Loss: 1.9345
[Epoch 7/8 | Step 310/600] - Loss: 1.9246
[Epoch 7/8 | Step 320/600] - Loss: 1.9375
[Epoch 7/8 | Step 330/600] - Loss: 1.9442
[Epoch 7/8 | Step 340/600] - Loss: 1.9410
[Epoch 7/8 | Step 350/600] - Loss: 1.9550
[Epoch 7/8 | Step 360/600] - Loss: 1.9500
[Epoch 7/8 | Step 370/600] - Loss: 1.9653
[Epoch 7/8 | Step 380/600] - Loss: 1.9542
[Epoch 7/8 | Step 390/600] - Loss: 1.9595
[Epoch 7/8 | Step 400/600] - Loss: 1.9346
[Epoch 7/8 | Step 410/600] - Loss: 1.9277
[Epoch 7/8 | Step 420/600] - Loss: 1.9251
[Epoch 7/8 | Step 430/600] - Loss: 1.9455
[Epoch 7/8 | Step 440/600] - Loss: 1.9610
[Epoch 7/8 | Step 450/600] - Loss: 1.9624
[Epoch 7/8 | Step 460/600] - Loss: 1.9748
[Epoch 7/8 | Step 470/600] - Loss: 1.9754
[Epoch 7/8 | Step 480/600] - Loss: 1.9762
[Epoch 7/8 | Step 490/600] - Loss:

In [2]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,LlamaModel,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

def setup_model_and_tokenizer(model_name, device):
    tokenizer =  AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False


    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=8, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

    login(token="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")
    device = setup_environment()
    model_name =  'meta-llama/Llama-2-7b-chat-hf'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_7B")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/600] - Loss: 6.6057
[Epoch 1/8 | Step 10/600] - Loss: 4.7892
[Epoch 1/8 | Step 20/600] - Loss: 4.2138
[Epoch 1/8 | Step 30/600] - Loss: 3.7690
[Epoch 1/8 | Step 40/600] - Loss: 3.5323
[Epoch 1/8 | Step 50/600] - Loss: 3.4411
[Epoch 1/8 | Step 60/600] - Loss: 3.4594
[Epoch 1/8 | Step 70/600] - Loss: 3.4469
[Epoch 1/8 | Step 80/600] - Loss: 3.4203
[Epoch 1/8 | Step 90/600] - Loss: 3.3946
[Epoch 1/8 | Step 100/600] - Loss: 3.3460
[Epoch 1/8 | Step 110/600] - Loss: 3.3276
[Epoch 1/8 | Step 120/600] - Loss: 3.2951
[Epoch 1/8 | Step 130/600] - Loss: 3.3332
[Epoch 1/8 | Step 140/600] - Loss: 3.3563
[Epoch 1/8 | Step 150/600] - Loss: 3.3373
[Epoch 1/8 | Step 160/600] - Loss: 3.3860
[Epoch 1/8 | Step 170/600] - Loss: 3.4165
[Epoch 1/8 | Step 180/600] - Loss: 3.3524
[Epoch 1/8 | Step 190/600] - Loss: 3.3358
[Epoch 1/8 | Step 200/600] - Loss: 3.3656
[Epoch 1/8 | Step 210/600] - Loss: 3.3729
[Epoch 1/8 | Step 220/600] - Loss: 3.3451
[Epoch 1/8 | Step 230/600] - Loss: 3.3427
[Ep

[Epoch 4/8 | Step 130/600] - Loss: 2.1675
[Epoch 4/8 | Step 140/600] - Loss: 2.1429
[Epoch 4/8 | Step 150/600] - Loss: 2.1392
[Epoch 4/8 | Step 160/600] - Loss: 2.1490
[Epoch 4/8 | Step 170/600] - Loss: 2.1589
[Epoch 4/8 | Step 180/600] - Loss: 2.2471
[Epoch 4/8 | Step 190/600] - Loss: 2.2743
[Epoch 4/8 | Step 200/600] - Loss: 2.2375
[Epoch 4/8 | Step 210/600] - Loss: 2.2367
[Epoch 4/8 | Step 220/600] - Loss: 2.1950
[Epoch 4/8 | Step 230/600] - Loss: 2.1940
[Epoch 4/8 | Step 240/600] - Loss: 2.1836
[Epoch 4/8 | Step 250/600] - Loss: 2.1952
[Epoch 4/8 | Step 260/600] - Loss: 2.1980
[Epoch 4/8 | Step 270/600] - Loss: 2.2081
[Epoch 4/8 | Step 280/600] - Loss: 2.1927
[Epoch 4/8 | Step 290/600] - Loss: 2.1841
[Epoch 4/8 | Step 300/600] - Loss: 2.1527
[Epoch 4/8 | Step 310/600] - Loss: 2.1414
[Epoch 4/8 | Step 320/600] - Loss: 2.1488
[Epoch 4/8 | Step 330/600] - Loss: 2.1321
[Epoch 4/8 | Step 340/600] - Loss: 2.1533
[Epoch 4/8 | Step 350/600] - Loss: 2.1466
[Epoch 4/8 | Step 360/600] - Loss:

[Epoch 7/8 | Step 260/600] - Loss: 1.9143
[Epoch 7/8 | Step 270/600] - Loss: 1.9063
[Epoch 7/8 | Step 280/600] - Loss: 1.9072
[Epoch 7/8 | Step 290/600] - Loss: 1.9237
[Epoch 7/8 | Step 300/600] - Loss: 1.9186
[Epoch 7/8 | Step 310/600] - Loss: 1.9316
[Epoch 7/8 | Step 320/600] - Loss: 1.9439
[Epoch 7/8 | Step 330/600] - Loss: 1.9432
[Epoch 7/8 | Step 340/600] - Loss: 1.9490
[Epoch 7/8 | Step 350/600] - Loss: 1.9423
[Epoch 7/8 | Step 360/600] - Loss: 1.9697
[Epoch 7/8 | Step 370/600] - Loss: 1.9886
[Epoch 7/8 | Step 380/600] - Loss: 1.9776
[Epoch 7/8 | Step 390/600] - Loss: 1.9746
[Epoch 7/8 | Step 400/600] - Loss: 1.9893
[Epoch 7/8 | Step 410/600] - Loss: 2.0007
[Epoch 7/8 | Step 420/600] - Loss: 1.9953
[Epoch 7/8 | Step 430/600] - Loss: 2.0069
[Epoch 7/8 | Step 440/600] - Loss: 1.9991
[Epoch 7/8 | Step 450/600] - Loss: 1.9922
[Epoch 7/8 | Step 460/600] - Loss: 1.9908
[Epoch 7/8 | Step 470/600] - Loss: 2.0038
[Epoch 7/8 | Step 480/600] - Loss: 2.0011
[Epoch 7/8 | Step 490/600] - Loss:

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,LlamaModel,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

def setup_model_and_tokenizer(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False

   
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()

    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=8, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

    login(token="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")
    device = setup_environment()
    model_name =   'dreamgen/WizardLM-2-7B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_7B_Wizard")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/600] - Loss: 28.3284
[Epoch 1/8 | Step 10/600] - Loss: 21.6973
[Epoch 1/8 | Step 20/600] - Loss: 17.3904
[Epoch 1/8 | Step 30/600] - Loss: 16.5516
[Epoch 1/8 | Step 40/600] - Loss: 14.3947
[Epoch 1/8 | Step 50/600] - Loss: 13.9590
[Epoch 1/8 | Step 60/600] - Loss: 13.7488
[Epoch 1/8 | Step 70/600] - Loss: 14.6531
[Epoch 1/8 | Step 80/600] - Loss: 14.5961
[Epoch 1/8 | Step 90/600] - Loss: 14.3592
[Epoch 1/8 | Step 100/600] - Loss: 13.8593
[Epoch 1/8 | Step 110/600] - Loss: 13.9053
[Epoch 1/8 | Step 120/600] - Loss: 13.8606
[Epoch 1/8 | Step 130/600] - Loss: 13.7220
[Epoch 1/8 | Step 140/600] - Loss: 13.5348
[Epoch 1/8 | Step 150/600] - Loss: 13.5671
[Epoch 1/8 | Step 160/600] - Loss: 13.1902
[Epoch 1/8 | Step 170/600] - Loss: 13.4731
[Epoch 1/8 | Step 180/600] - Loss: 13.5168
[Epoch 1/8 | Step 190/600] - Loss: 13.6146
[Epoch 1/8 | Step 200/600] - Loss: 13.5607
[Epoch 1/8 | Step 210/600] - Loss: 13.6931
[Epoch 1/8 | Step 220/600] - Loss: 13.9038
[Epoch 1/8 | Step 230/

[Epoch 4/8 | Step 80/600] - Loss: 10.9356
[Epoch 4/8 | Step 90/600] - Loss: 11.3245
[Epoch 4/8 | Step 100/600] - Loss: 10.9931
[Epoch 4/8 | Step 110/600] - Loss: 11.1235
[Epoch 4/8 | Step 120/600] - Loss: 11.0677
[Epoch 4/8 | Step 130/600] - Loss: 10.9019
[Epoch 4/8 | Step 140/600] - Loss: 11.0965
[Epoch 4/8 | Step 150/600] - Loss: 11.0333
[Epoch 4/8 | Step 160/600] - Loss: 10.7755
[Epoch 4/8 | Step 170/600] - Loss: 10.8889
[Epoch 4/8 | Step 180/600] - Loss: 10.9961
[Epoch 4/8 | Step 190/600] - Loss: 11.2042
[Epoch 4/8 | Step 200/600] - Loss: 11.7233
[Epoch 4/8 | Step 210/600] - Loss: 11.6215
[Epoch 4/8 | Step 220/600] - Loss: 11.6170
[Epoch 4/8 | Step 230/600] - Loss: 11.5105
[Epoch 4/8 | Step 240/600] - Loss: 11.4006
[Epoch 4/8 | Step 250/600] - Loss: 11.3751
[Epoch 4/8 | Step 260/600] - Loss: 11.4267
[Epoch 4/8 | Step 270/600] - Loss: 11.3620
[Epoch 4/8 | Step 280/600] - Loss: 11.3083
[Epoch 4/8 | Step 290/600] - Loss: 11.1863
[Epoch 4/8 | Step 300/600] - Loss: 11.1607
[Epoch 4/8 | 

[Epoch 7/8 | Step 160/600] - Loss: 10.7205
[Epoch 7/8 | Step 170/600] - Loss: 10.6518
[Epoch 7/8 | Step 180/600] - Loss: 10.4557
[Epoch 7/8 | Step 190/600] - Loss: 10.5396
[Epoch 7/8 | Step 200/600] - Loss: 10.5080
[Epoch 7/8 | Step 210/600] - Loss: 10.5458
[Epoch 7/8 | Step 220/600] - Loss: 10.4530
[Epoch 7/8 | Step 230/600] - Loss: 10.5861
[Epoch 7/8 | Step 240/600] - Loss: 10.4968
[Epoch 7/8 | Step 250/600] - Loss: 10.4275
[Epoch 7/8 | Step 260/600] - Loss: 10.3452
[Epoch 7/8 | Step 270/600] - Loss: 10.4414
[Epoch 7/8 | Step 280/600] - Loss: 10.4256
[Epoch 7/8 | Step 290/600] - Loss: 10.5070
[Epoch 7/8 | Step 300/600] - Loss: 10.3550
[Epoch 7/8 | Step 310/600] - Loss: 10.3921
[Epoch 7/8 | Step 320/600] - Loss: 10.4178
[Epoch 7/8 | Step 330/600] - Loss: 10.4925
[Epoch 7/8 | Step 340/600] - Loss: 10.5791
[Epoch 7/8 | Step 350/600] - Loss: 10.5651
[Epoch 7/8 | Step 360/600] - Loss: 10.5216
[Epoch 7/8 | Step 370/600] - Loss: 10.4972
[Epoch 7/8 | Step 380/600] - Loss: 10.4062
[Epoch 7/8 

In [2]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,LlamaModel,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

def setup_model_and_tokenizer(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False

   
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()

    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=8, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

    login(token="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")
    device = setup_environment()
    model_name = 'Qwen/Qwen3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_Qwen")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/600] - Loss: 5.9304
[Epoch 1/8 | Step 10/600] - Loss: 3.0329
[Epoch 1/8 | Step 20/600] - Loss: 2.3218
[Epoch 1/8 | Step 30/600] - Loss: 2.3063
[Epoch 1/8 | Step 40/600] - Loss: 2.3573
[Epoch 1/8 | Step 50/600] - Loss: 2.3593
[Epoch 1/8 | Step 60/600] - Loss: 2.3306
[Epoch 1/8 | Step 70/600] - Loss: 2.2590
[Epoch 1/8 | Step 80/600] - Loss: 2.2863
[Epoch 1/8 | Step 90/600] - Loss: 2.2815
[Epoch 1/8 | Step 100/600] - Loss: 2.2803
[Epoch 1/8 | Step 110/600] - Loss: 2.2747
[Epoch 1/8 | Step 120/600] - Loss: 2.2263
[Epoch 1/8 | Step 130/600] - Loss: 2.2265
[Epoch 1/8 | Step 140/600] - Loss: 2.2109
[Epoch 1/8 | Step 150/600] - Loss: 2.1348
[Epoch 1/8 | Step 160/600] - Loss: 2.1920
[Epoch 1/8 | Step 170/600] - Loss: 2.2194
[Epoch 1/8 | Step 180/600] - Loss: 2.2029
[Epoch 1/8 | Step 190/600] - Loss: 2.1642
[Epoch 1/8 | Step 200/600] - Loss: 2.1566
[Epoch 1/8 | Step 210/600] - Loss: 2.1663
[Epoch 1/8 | Step 220/600] - Loss: 2.1831
[Epoch 1/8 | Step 230/600] - Loss: 2.1866
[Ep

[Epoch 4/8 | Step 130/600] - Loss: 0.9724
[Epoch 4/8 | Step 140/600] - Loss: 1.0226
[Epoch 4/8 | Step 150/600] - Loss: 1.0120
[Epoch 4/8 | Step 160/600] - Loss: 0.9889
[Epoch 4/8 | Step 170/600] - Loss: 0.9860
[Epoch 4/8 | Step 180/600] - Loss: 0.9755
[Epoch 4/8 | Step 190/600] - Loss: 0.9811
[Epoch 4/8 | Step 200/600] - Loss: 0.9684
[Epoch 4/8 | Step 210/600] - Loss: 0.9457
[Epoch 4/8 | Step 220/600] - Loss: 0.9333
[Epoch 4/8 | Step 230/600] - Loss: 0.9288
[Epoch 4/8 | Step 240/600] - Loss: 0.9455
[Epoch 4/8 | Step 250/600] - Loss: 0.9327
[Epoch 4/8 | Step 260/600] - Loss: 0.9565
[Epoch 4/8 | Step 270/600] - Loss: 0.9415
[Epoch 4/8 | Step 280/600] - Loss: 0.9294
[Epoch 4/8 | Step 290/600] - Loss: 0.9316
[Epoch 4/8 | Step 300/600] - Loss: 0.9449
[Epoch 4/8 | Step 310/600] - Loss: 0.9489
[Epoch 4/8 | Step 320/600] - Loss: 0.9358
[Epoch 4/8 | Step 330/600] - Loss: 0.9417
[Epoch 4/8 | Step 340/600] - Loss: 0.9286
[Epoch 4/8 | Step 350/600] - Loss: 0.9347
[Epoch 4/8 | Step 360/600] - Loss:

[Epoch 7/8 | Step 260/600] - Loss: 0.7218
[Epoch 7/8 | Step 270/600] - Loss: 0.7169
[Epoch 7/8 | Step 280/600] - Loss: 0.7073
[Epoch 7/8 | Step 290/600] - Loss: 0.7014
[Epoch 7/8 | Step 300/600] - Loss: 0.6977
[Epoch 7/8 | Step 310/600] - Loss: 0.7218
[Epoch 7/8 | Step 320/600] - Loss: 0.7158
[Epoch 7/8 | Step 330/600] - Loss: 0.7160
[Epoch 7/8 | Step 340/600] - Loss: 0.7093
[Epoch 7/8 | Step 350/600] - Loss: 0.7091
[Epoch 7/8 | Step 360/600] - Loss: 0.7374
[Epoch 7/8 | Step 370/600] - Loss: 0.7418
[Epoch 7/8 | Step 380/600] - Loss: 0.7449
[Epoch 7/8 | Step 390/600] - Loss: 0.7386
[Epoch 7/8 | Step 400/600] - Loss: 0.7340
[Epoch 7/8 | Step 410/600] - Loss: 0.7479
[Epoch 7/8 | Step 420/600] - Loss: 0.7658
[Epoch 7/8 | Step 430/600] - Loss: 0.7604
[Epoch 7/8 | Step 440/600] - Loss: 0.7537
[Epoch 7/8 | Step 450/600] - Loss: 0.7491
[Epoch 7/8 | Step 460/600] - Loss: 0.7460
[Epoch 7/8 | Step 470/600] - Loss: 0.7500
[Epoch 7/8 | Step 480/600] - Loss: 0.7517
[Epoch 7/8 | Step 490/600] - Loss:

In [4]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,LlamaModel,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

def setup_model_and_tokenizer(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False

   
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()

    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }



def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=8, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

    login(token="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")
    device = setup_environment()
    model_name =  "mistralai/Mistral-7B-v0.1"
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_Mistral")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/600] - Loss: 27.1476
[Epoch 1/8 | Step 10/600] - Loss: 18.4292
[Epoch 1/8 | Step 20/600] - Loss: 15.6888
[Epoch 1/8 | Step 30/600] - Loss: 15.2943
[Epoch 1/8 | Step 40/600] - Loss: 13.4584
[Epoch 1/8 | Step 50/600] - Loss: 12.8521
[Epoch 1/8 | Step 60/600] - Loss: 12.7043
[Epoch 1/8 | Step 70/600] - Loss: 13.4286
[Epoch 1/8 | Step 80/600] - Loss: 13.3973
[Epoch 1/8 | Step 90/600] - Loss: 12.9480
[Epoch 1/8 | Step 100/600] - Loss: 12.9068
[Epoch 1/8 | Step 110/600] - Loss: 13.0340
[Epoch 1/8 | Step 120/600] - Loss: 12.8922
[Epoch 1/8 | Step 130/600] - Loss: 13.0055
[Epoch 1/8 | Step 140/600] - Loss: 12.6378
[Epoch 1/8 | Step 150/600] - Loss: 12.9118
[Epoch 1/8 | Step 160/600] - Loss: 12.6558
[Epoch 1/8 | Step 170/600] - Loss: 12.8433
[Epoch 1/8 | Step 180/600] - Loss: 12.7878
[Epoch 1/8 | Step 190/600] - Loss: 12.5438
[Epoch 1/8 | Step 200/600] - Loss: 12.5616
[Epoch 1/8 | Step 210/600] - Loss: 12.7078
[Epoch 1/8 | Step 220/600] - Loss: 12.8436
[Epoch 1/8 | Step 230/

[Epoch 4/8 | Step 110/600] - Loss: 9.3907
[Epoch 4/8 | Step 120/600] - Loss: 9.2223
[Epoch 4/8 | Step 130/600] - Loss: 9.0087
[Epoch 4/8 | Step 140/600] - Loss: 9.2886
[Epoch 4/8 | Step 150/600] - Loss: 9.1104
[Epoch 4/8 | Step 160/600] - Loss: 9.0967
[Epoch 4/8 | Step 170/600] - Loss: 9.1297
[Epoch 4/8 | Step 180/600] - Loss: 9.1980
[Epoch 4/8 | Step 190/600] - Loss: 9.2380
[Epoch 4/8 | Step 200/600] - Loss: 9.3561
[Epoch 4/8 | Step 210/600] - Loss: 9.2624
[Epoch 4/8 | Step 220/600] - Loss: 9.2286
[Epoch 4/8 | Step 230/600] - Loss: 9.3336
[Epoch 4/8 | Step 240/600] - Loss: 9.1521
[Epoch 4/8 | Step 250/600] - Loss: 9.2610
[Epoch 4/8 | Step 260/600] - Loss: 9.2316
[Epoch 4/8 | Step 270/600] - Loss: 9.1387
[Epoch 4/8 | Step 280/600] - Loss: 9.0575
[Epoch 4/8 | Step 290/600] - Loss: 8.9446
[Epoch 4/8 | Step 300/600] - Loss: 8.9596
[Epoch 4/8 | Step 310/600] - Loss: 8.9836
[Epoch 4/8 | Step 320/600] - Loss: 8.9811
[Epoch 4/8 | Step 330/600] - Loss: 8.9783
[Epoch 4/8 | Step 340/600] - Loss:

[Epoch 7/8 | Step 240/600] - Loss: 8.9197
[Epoch 7/8 | Step 250/600] - Loss: 8.9007
[Epoch 7/8 | Step 260/600] - Loss: 8.7709
[Epoch 7/8 | Step 270/600] - Loss: 8.8475
[Epoch 7/8 | Step 280/600] - Loss: 8.7217
[Epoch 7/8 | Step 290/600] - Loss: 8.6705
[Epoch 7/8 | Step 300/600] - Loss: 8.5550
[Epoch 7/8 | Step 310/600] - Loss: 8.5340
[Epoch 7/8 | Step 320/600] - Loss: 8.5365
[Epoch 7/8 | Step 330/600] - Loss: 8.4717
[Epoch 7/8 | Step 340/600] - Loss: 8.5785
[Epoch 7/8 | Step 350/600] - Loss: 8.5440
[Epoch 7/8 | Step 360/600] - Loss: 8.5185
[Epoch 7/8 | Step 370/600] - Loss: 8.5181
[Epoch 7/8 | Step 380/600] - Loss: 8.4169
[Epoch 7/8 | Step 390/600] - Loss: 8.4687
[Epoch 7/8 | Step 400/600] - Loss: 8.4265
[Epoch 7/8 | Step 410/600] - Loss: 8.4188
[Epoch 7/8 | Step 420/600] - Loss: 8.3729
[Epoch 7/8 | Step 430/600] - Loss: 8.4128
[Epoch 7/8 | Step 440/600] - Loss: 8.4537
[Epoch 7/8 | Step 450/600] - Loss: 8.4449
[Epoch 7/8 | Step 460/600] - Loss: 8.4603
[Epoch 7/8 | Step 470/600] - Loss:

In [None]:
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,LlamaModel,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

def setup_model_and_tokenizer(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False

   
    quantization_config = BitsAndBytesConfig(load_in_4bit=True)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()

    return model, tokenizer

class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        
        
        if pair['message']['label'] == 1:
            message_text = (
                "This email is flagged as a phishing email. "
                "Carefully examine the sender's address, subject line, and content of the email. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        else:
            message_text = (
                "This email is flagged as a legitimate email. "
                "Look for consistent and clear sender details, subject relevance, and authentic body content. "
                f"Sender: {pair['message']['sender']} [SEP] "
                f"Subject: {pair['message']['subject']} [SEP] "
                f"Body: {pair['message']['body']}"
            )
        
        
        preferred_response = (
            "This is a similar email example to the one above. "
            f"Sender: {pair['preferred']['sender']} [SEP] "
            f"Subject: {pair['preferred']['subject']} [SEP] "
            f"Body: {pair['preferred']['body']}"
        )
        rejected_response = (
            "This email is different in intent. Notice the sender's address, subject, and content mismatch. "
            f"Sender: {pair['rejected']['sender']} [SEP] "
            f"Subject: {pair['rejected']['subject']} [SEP] "
            f"Body: {pair['rejected']['body']}"
        )
        
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }



def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    #text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=8, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state

def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
   
    policy_model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

def main():

    login(token="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")
    device = setup_environment()
    model_name =  "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_DistillDeepSeek")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
  

Using GPU: NVIDIA RTX A5000


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/600] - Loss: 11.1326
[Epoch 1/8 | Step 10/600] - Loss: 7.2902
[Epoch 1/8 | Step 20/600] - Loss: 8.3438
[Epoch 1/8 | Step 30/600] - Loss: 8.4150
[Epoch 1/8 | Step 40/600] - Loss: 7.8513
[Epoch 1/8 | Step 50/600] - Loss: 7.8507
[Epoch 1/8 | Step 60/600] - Loss: 8.4181
[Epoch 1/8 | Step 70/600] - Loss: 8.1368
[Epoch 1/8 | Step 80/600] - Loss: 7.8838
[Epoch 1/8 | Step 90/600] - Loss: 7.9143
[Epoch 1/8 | Step 100/600] - Loss: 7.7941
[Epoch 1/8 | Step 110/600] - Loss: 8.0048
[Epoch 1/8 | Step 120/600] - Loss: 8.1687
[Epoch 1/8 | Step 130/600] - Loss: 7.9420
[Epoch 1/8 | Step 140/600] - Loss: 7.9366
[Epoch 1/8 | Step 150/600] - Loss: 7.8189
[Epoch 1/8 | Step 160/600] - Loss: 7.9781
[Epoch 1/8 | Step 170/600] - Loss: 7.9747
[Epoch 1/8 | Step 180/600] - Loss: 7.8100
[Epoch 1/8 | Step 190/600] - Loss: 7.7166
[Epoch 1/8 | Step 200/600] - Loss: 7.6652
[Epoch 1/8 | Step 210/600] - Loss: 7.5877
[Epoch 1/8 | Step 220/600] - Loss: 7.4935
[Epoch 1/8 | Step 230/600] - Loss: 7.6036
[E

[Epoch 4/8 | Step 130/600] - Loss: 4.5390
[Epoch 4/8 | Step 140/600] - Loss: 4.4785
[Epoch 4/8 | Step 150/600] - Loss: 4.5305
[Epoch 4/8 | Step 160/600] - Loss: 4.5443
[Epoch 4/8 | Step 170/600] - Loss: 4.5967
[Epoch 4/8 | Step 180/600] - Loss: 4.5678
[Epoch 4/8 | Step 190/600] - Loss: 4.5037
[Epoch 4/8 | Step 200/600] - Loss: 4.5059
[Epoch 4/8 | Step 210/600] - Loss: 4.4589
[Epoch 4/8 | Step 220/600] - Loss: 4.4713
[Epoch 4/8 | Step 230/600] - Loss: 4.4564
[Epoch 4/8 | Step 240/600] - Loss: 4.4210
[Epoch 4/8 | Step 250/600] - Loss: 4.3624
[Epoch 4/8 | Step 260/600] - Loss: 4.3544
[Epoch 4/8 | Step 270/600] - Loss: 4.3536
[Epoch 4/8 | Step 280/600] - Loss: 4.3268
[Epoch 4/8 | Step 290/600] - Loss: 4.3061
[Epoch 4/8 | Step 300/600] - Loss: 4.3422
[Epoch 4/8 | Step 310/600] - Loss: 4.4253
[Epoch 4/8 | Step 320/600] - Loss: 4.4725
[Epoch 4/8 | Step 330/600] - Loss: 4.4494
[Epoch 4/8 | Step 340/600] - Loss: 4.5038
[Epoch 4/8 | Step 350/600] - Loss: 4.5431
[Epoch 4/8 | Step 360/600] - Loss:

[Epoch 7/8 | Step 260/600] - Loss: 3.3792
[Epoch 7/8 | Step 270/600] - Loss: 3.4037
[Epoch 7/8 | Step 280/600] - Loss: 3.4710
[Epoch 7/8 | Step 290/600] - Loss: 3.4164
[Epoch 7/8 | Step 300/600] - Loss: 3.4348
[Epoch 7/8 | Step 310/600] - Loss: 3.4249
[Epoch 7/8 | Step 320/600] - Loss: 3.3941
[Epoch 7/8 | Step 330/600] - Loss: 3.3928
[Epoch 7/8 | Step 340/600] - Loss: 3.3557
[Epoch 7/8 | Step 350/600] - Loss: 3.3390
[Epoch 7/8 | Step 360/600] - Loss: 3.3762
[Epoch 7/8 | Step 370/600] - Loss: 3.3593
[Epoch 7/8 | Step 380/600] - Loss: 3.3653
[Epoch 7/8 | Step 390/600] - Loss: 3.3405
[Epoch 7/8 | Step 400/600] - Loss: 3.3533
[Epoch 7/8 | Step 410/600] - Loss: 3.3973
[Epoch 7/8 | Step 420/600] - Loss: 3.3937
[Epoch 7/8 | Step 430/600] - Loss: 3.4050
[Epoch 7/8 | Step 440/600] - Loss: 3.4006
[Epoch 7/8 | Step 450/600] - Loss: 3.4020
[Epoch 7/8 | Step 460/600] - Loss: 3.4112
[Epoch 7/8 | Step 470/600] - Loss: 3.4525
[Epoch 7/8 | Step 480/600] - Loss: 3.4092
[Epoch 7/8 | Step 490/600] - Loss: