In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

def setup_model_and_tokenizer(model_name, device):
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False

   
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer



def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    login(token="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")
    device = setup_environment()
    model_name = 'meta-llama/Llama-2-7b-chat-hf'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_7B")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.8101
Epoch 1, Step 10: Loss = 0.6461
Epoch 1, Step 20: Loss = 0.5980
Epoch 1, Step 30: Loss = 0.7918
Epoch 1, Step 40: Loss = 0.8680
Epoch 1, Step 50: Loss = 0.5674
Epoch 1, Step 60: Loss = 0.5396
Epoch 1, Step 70: Loss = 0.7503
Epoch 1, Step 80: Loss = 0.7560
Epoch 1, Step 90: Loss = 0.5677
Epoch 1, Step 100: Loss = 0.8650
Epoch 1, Step 110: Loss = 0.8832
Epoch 1, Step 120: Loss = 0.5864
Epoch 1, Step 130: Loss = 0.9844
Epoch 1, Step 140: Loss = 0.4921
Epoch 1, Step 150: Loss = 1.0096
Epoch 1, Step 160: Loss = 0.5531
Epoch 1, Step 170: Loss = 0.5686
Epoch 1, Step 180: Loss = 0.9962
Epoch 1, Step 190: Loss = 0.9772
Epoch 1, Step 200: Loss = 0.7517
Epoch 1, Step 210: Loss = 0.5499
Epoch 1, Step 220: Loss = 0.5828
Epoch 1, Step 230: Loss = 0.6110
Epoch 1, Step 240: Loss = 0.5795
Epoch 1, Step 250: Loss = 0.8249
Epoch 1, Step 260: Loss = 0.5340
Epoch 1, Step 270: Loss = 0.6975
Epoch 1, Step 280: Loss = 0.4896
Epoch 1, Step 290: Loss = 0.6943
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.2649
Epoch 5, Step 10: Loss = 0.1631
Epoch 5, Step 20: Loss = 0.0628
Epoch 5, Step 30: Loss = 0.0623
Epoch 5, Step 40: Loss = 0.3664
Epoch 5, Step 50: Loss = 0.3920
Epoch 5, Step 60: Loss = 0.0342
Epoch 5, Step 70: Loss = 0.0493
Epoch 5, Step 80: Loss = 0.1032
Epoch 5, Step 90: Loss = 0.1389
Epoch 5, Step 100: Loss = 0.1382
Epoch 5, Step 110: Loss = 0.2513
Epoch 5, Step 120: Loss = 0.4962
Epoch 5, Step 130: Loss = 0.3531
Epoch 5, Step 140: Loss = 0.2061
Epoch 5, Step 150: Loss = 0.1699
Epoch 5, Step 160: Loss = 0.1876
Epoch 5, Step 170: Loss = 0.1907
Epoch 5, Step 180: Loss = 0.1449
Epoch 5, Step 190: Loss = 0.2178
Epoch 5, Step 200: Loss = 0.0777
Epoch 5, Step 210: Loss = 0.2252
Epoch 5, Step 220: Loss = 0.3895
Epoch 5, Step 230: Loss = 0.3966
Epoch 5, Step 240: Loss = 0.2991
Epoch 5, Step 250: Loss = 0.0000
Epoch 5, Step 260: Loss = 0.3903
Epoch 5, Step 270: Loss = 0.1708
Epoch 5, Step 280: Loss = 0.5249
Epoch 5, Step 290: Loss = 0.2350
Epoch 5, Step 300: Lo

Epoch 9, Step 0: Loss = 0.0148
Epoch 9, Step 10: Loss = 0.3380
Epoch 9, Step 20: Loss = 0.0865
Epoch 9, Step 30: Loss = 0.0017
Epoch 9, Step 40: Loss = 0.0353
Epoch 9, Step 50: Loss = 0.1241
Epoch 9, Step 60: Loss = 0.5079
Epoch 9, Step 70: Loss = 0.1415
Epoch 9, Step 80: Loss = 0.1370
Epoch 9, Step 90: Loss = 0.1156
Epoch 9, Step 100: Loss = 0.3721
Epoch 9, Step 110: Loss = 0.0000
Epoch 9, Step 120: Loss = 0.0958
Epoch 9, Step 130: Loss = 0.0447
Epoch 9, Step 140: Loss = 0.3230
Epoch 9, Step 150: Loss = 0.1635
Epoch 9, Step 160: Loss = 0.1678
Epoch 9, Step 170: Loss = 0.0000
Epoch 9, Step 180: Loss = 0.0177
Epoch 9, Step 190: Loss = 0.1063
Epoch 9, Step 200: Loss = 0.1825
Epoch 9, Step 210: Loss = 0.0155
Epoch 9, Step 220: Loss = 0.0597
Epoch 9, Step 230: Loss = 0.3630
Epoch 9, Step 240: Loss = 0.3226
Epoch 9, Step 250: Loss = 0.1469
Epoch 9, Step 260: Loss = 0.0143
Epoch 9, Step 270: Loss = 0.0395
Epoch 9, Step 280: Loss = 0.4286
Epoch 9, Step 290: Loss = 0.3904
Epoch 9, Step 300: Lo

In [3]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

from transformers import AutoTokenizer, AutoConfig, LlamaForSequenceClassification
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

def setup_model_and_tokenizer(model_name, device):
    # Use the fast tokenizer to avoid the slow/legacy mismatch
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load model config
    model_config = AutoConfig.from_pretrained(model_name, num_labels=2, pad_token_id=tokenizer.pad_token_id, use_cache=False)

    # Quantize base model
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    base = LlamaForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,             # if needed for this checkpoint
    )

    # Attach LoRA
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(base, lora_cfg)
    model.gradient_checkpointing_enable()

    return model, tokenizer




def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    login(token="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")
    device = setup_environment()
    model_name = 'meta-llama/Meta-Llama-3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_8B")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,s
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 1.0064
Epoch 1, Step 10: Loss = 0.6280
Epoch 1, Step 20: Loss = 0.7360
Epoch 1, Step 30: Loss = 0.8420
Epoch 1, Step 40: Loss = 0.5209
Epoch 1, Step 50: Loss = 0.6332
Epoch 1, Step 60: Loss = 0.6008
Epoch 1, Step 70: Loss = 1.0036
Epoch 1, Step 80: Loss = 0.5906
Epoch 1, Step 90: Loss = 0.8547
Epoch 1, Step 100: Loss = 1.0852
Epoch 1, Step 110: Loss = 0.9466
Epoch 1, Step 120: Loss = 0.5576
Epoch 1, Step 130: Loss = 0.7647
Epoch 1, Step 140: Loss = 0.8686
Epoch 1, Step 150: Loss = 1.3381
Epoch 1, Step 160: Loss = 0.5882
Epoch 1, Step 170: Loss = 0.7980
Epoch 1, Step 180: Loss = 0.6350
Epoch 1, Step 190: Loss = 0.7847
Epoch 1, Step 200: Loss = 0.5892
Epoch 1, Step 210: Loss = 0.7243
Epoch 1, Step 220: Loss = 1.1059
Epoch 1, Step 230: Loss = 0.8742
Epoch 1, Step 240: Loss = 0.6977
Epoch 1, Step 250: Loss = 0.6698
Epoch 1, Step 260: Loss = 0.5186
Epoch 1, Step 270: Loss = 0.8657
Epoch 1, Step 280: Loss = 0.9431
Epoch 1, Step 290: Loss = 0.4577
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.2186
Epoch 5, Step 10: Loss = 0.1178
Epoch 5, Step 20: Loss = 0.0007
Epoch 5, Step 30: Loss = 0.1030
Epoch 5, Step 40: Loss = 0.0000
Epoch 5, Step 50: Loss = 0.0885
Epoch 5, Step 60: Loss = 0.0000
Epoch 5, Step 70: Loss = 0.3810
Epoch 5, Step 80: Loss = 0.6118
Epoch 5, Step 90: Loss = 0.0936
Epoch 5, Step 100: Loss = 0.0000
Epoch 5, Step 110: Loss = 0.3295
Epoch 5, Step 120: Loss = 0.0522
Epoch 5, Step 130: Loss = 0.2232
Epoch 5, Step 140: Loss = 0.0556
Epoch 5, Step 150: Loss = 0.1454
Epoch 5, Step 160: Loss = 0.0000
Epoch 5, Step 170: Loss = 0.0923
Epoch 5, Step 180: Loss = 0.0719
Epoch 5, Step 190: Loss = 0.0530
Epoch 5, Step 200: Loss = 0.3196
Epoch 5, Step 210: Loss = 0.0000
Epoch 5, Step 220: Loss = 0.0411
Epoch 5, Step 230: Loss = 0.1253
Epoch 5, Step 240: Loss = 0.0050
Epoch 5, Step 250: Loss = 0.0454
Epoch 5, Step 260: Loss = 0.1058
Epoch 5, Step 270: Loss = 0.0665
Epoch 5, Step 280: Loss = 0.3438
Epoch 5, Step 290: Loss = 0.0649
Epoch 5, Step 300: Lo

Epoch 9, Step 0: Loss = 0.2613
Epoch 9, Step 10: Loss = 0.1708
Epoch 9, Step 20: Loss = 0.0594
Epoch 9, Step 30: Loss = 0.0000
Epoch 9, Step 40: Loss = 0.0039
Epoch 9, Step 50: Loss = 0.1382
Epoch 9, Step 60: Loss = 0.0000
Epoch 9, Step 70: Loss = 0.0000
Epoch 9, Step 80: Loss = 0.0246
Epoch 9, Step 90: Loss = 0.1358
Epoch 9, Step 100: Loss = 0.0447
Epoch 9, Step 110: Loss = 0.0265
Epoch 9, Step 120: Loss = 0.0244
Epoch 9, Step 130: Loss = 0.6715
Epoch 9, Step 140: Loss = 0.0178
Epoch 9, Step 150: Loss = 0.1008
Epoch 9, Step 160: Loss = 0.1228
Epoch 9, Step 170: Loss = 0.0000
Epoch 9, Step 180: Loss = 0.0002
Epoch 9, Step 190: Loss = 0.0000
Epoch 9, Step 200: Loss = 0.0406
Epoch 9, Step 210: Loss = 0.1436
Epoch 9, Step 220: Loss = 0.3209
Epoch 9, Step 230: Loss = 0.1013
Epoch 9, Step 240: Loss = 0.1140
Epoch 9, Step 250: Loss = 0.1041
Epoch 9, Step 260: Loss = 0.1476
Epoch 9, Step 270: Loss = 0.0227
Epoch 9, Step 280: Loss = 0.5331
Epoch 9, Step 290: Loss = 0.0172
Epoch 9, Step 300: Lo

In [4]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

from transformers import AutoTokenizer, AutoConfig, LlamaForSequenceClassification,AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

def setup_model_and_tokenizer(model_name, device):
    # Use the fast tokenizer to avoid the slow/legacy mismatch
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load model config
    model_config = AutoConfig.from_pretrained(model_name, num_labels=2, pad_token_id=tokenizer.pad_token_id, use_cache=False)

    # Quantize base model
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    base = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,             # if needed for this checkpoint
    )

    # Attach LoRA
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(base, lora_cfg)
    model.gradient_checkpointing_enable()

    return model, tokenizer




def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    login(token="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")
    device = setup_environment()
    model_name = 'dreamgen/WizardLM-2-7B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_7B_wiz")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 1.3188
Epoch 1, Step 10: Loss = 1.6830
Epoch 1, Step 20: Loss = 1.6642
Epoch 1, Step 30: Loss = 1.7365
Epoch 1, Step 40: Loss = 1.0800
Epoch 1, Step 50: Loss = 2.0796
Epoch 1, Step 60: Loss = 0.8108
Epoch 1, Step 70: Loss = 1.3426
Epoch 1, Step 80: Loss = 0.7914
Epoch 1, Step 90: Loss = 2.4679
Epoch 1, Step 100: Loss = 0.9290
Epoch 1, Step 110: Loss = 2.1301
Epoch 1, Step 120: Loss = 1.8296
Epoch 1, Step 130: Loss = 1.4450
Epoch 1, Step 140: Loss = 2.0599
Epoch 1, Step 150: Loss = 2.2248
Epoch 1, Step 160: Loss = 1.5988
Epoch 1, Step 170: Loss = 0.8125
Epoch 1, Step 180: Loss = 1.5921
Epoch 1, Step 190: Loss = 1.4128
Epoch 1, Step 200: Loss = 1.1420
Epoch 1, Step 210: Loss = 1.5280
Epoch 1, Step 220: Loss = 0.9515
Epoch 1, Step 230: Loss = 2.5399
Epoch 1, Step 240: Loss = 1.7433
Epoch 1, Step 250: Loss = 1.1936
Epoch 1, Step 260: Loss = 1.2059
Epoch 1, Step 270: Loss = 1.9741
Epoch 1, Step 280: Loss = 1.2583
Epoch 1, Step 290: Loss = 1.0895
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.9105
Epoch 5, Step 10: Loss = 0.1370
Epoch 5, Step 20: Loss = 0.4942
Epoch 5, Step 30: Loss = 0.3818
Epoch 5, Step 40: Loss = 0.1322
Epoch 5, Step 50: Loss = 0.2818
Epoch 5, Step 60: Loss = 0.7809
Epoch 5, Step 70: Loss = 0.4391
Epoch 5, Step 80: Loss = 0.4273
Epoch 5, Step 90: Loss = 0.2813
Epoch 5, Step 100: Loss = 0.2222
Epoch 5, Step 110: Loss = 0.3155
Epoch 5, Step 120: Loss = 0.7569
Epoch 5, Step 130: Loss = 0.5390
Epoch 5, Step 140: Loss = 1.0921
Epoch 5, Step 150: Loss = 0.1243
Epoch 5, Step 160: Loss = 0.1373
Epoch 5, Step 170: Loss = 0.2413
Epoch 5, Step 180: Loss = 0.1682
Epoch 5, Step 190: Loss = 1.9631
Epoch 5, Step 200: Loss = 1.0903
Epoch 5, Step 210: Loss = 0.6169
Epoch 5, Step 220: Loss = 0.7336
Epoch 5, Step 230: Loss = 0.7288
Epoch 5, Step 240: Loss = 1.0659
Epoch 5, Step 250: Loss = 0.8512
Epoch 5, Step 260: Loss = 0.7342
Epoch 5, Step 270: Loss = 1.9058
Epoch 5, Step 280: Loss = 0.0959
Epoch 5, Step 290: Loss = 1.3089
Epoch 5, Step 300: Lo

Epoch 9, Step 0: Loss = 1.3125
Epoch 9, Step 10: Loss = 1.0837
Epoch 9, Step 20: Loss = 0.3677
Epoch 9, Step 30: Loss = 0.0000
Epoch 9, Step 40: Loss = 0.1115
Epoch 9, Step 50: Loss = 0.3065
Epoch 9, Step 60: Loss = 0.2711
Epoch 9, Step 70: Loss = 0.9611
Epoch 9, Step 80: Loss = 0.7760
Epoch 9, Step 90: Loss = 0.4306
Epoch 9, Step 100: Loss = 0.0343
Epoch 9, Step 110: Loss = 0.1955
Epoch 9, Step 120: Loss = 0.0612
Epoch 9, Step 130: Loss = 0.6497
Epoch 9, Step 140: Loss = 0.2490
Epoch 9, Step 150: Loss = 0.6360
Epoch 9, Step 160: Loss = 0.2944
Epoch 9, Step 170: Loss = 0.4716
Epoch 9, Step 180: Loss = 0.1511
Epoch 9, Step 190: Loss = 0.6909
Epoch 9, Step 200: Loss = 0.0822
Epoch 9, Step 210: Loss = 0.1519
Epoch 9, Step 220: Loss = 0.0369
Epoch 9, Step 230: Loss = 1.2739
Epoch 9, Step 240: Loss = 0.6786
Epoch 9, Step 250: Loss = 0.1609
Epoch 9, Step 260: Loss = 0.2369
Epoch 9, Step 270: Loss = 0.4917
Epoch 9, Step 280: Loss = 0.3892
Epoch 9, Step 290: Loss = 0.2310
Epoch 9, Step 300: Lo

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

from transformers import AutoTokenizer, AutoConfig, LlamaForSequenceClassification,AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

def setup_model_and_tokenizer(model_name, device):
    # Use the fast tokenizer to avoid the slow/legacy mismatch
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load model config
    model_config = AutoConfig.from_pretrained(model_name, num_labels=2, pad_token_id=tokenizer.pad_token_id, use_cache=False)

    # Quantize base model
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    base = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,             # if needed for this checkpoint
    )

    # Attach LoRA
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(base, lora_cfg)
    model.gradient_checkpointing_enable()

    return model, tokenizer




def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    login(token="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")
    device = setup_environment()
    model_name = "mistralai/Mistral-7B-v0.1"
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_7B_Mistral")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 1.4422
Epoch 1, Step 10: Loss = 1.1817
Epoch 1, Step 20: Loss = 2.0573
Epoch 1, Step 30: Loss = 2.1412
Epoch 1, Step 40: Loss = 1.9548
Epoch 1, Step 50: Loss = 1.0802
Epoch 1, Step 60: Loss = 1.6772
Epoch 1, Step 70: Loss = 0.9746
Epoch 1, Step 80: Loss = 0.9204
Epoch 1, Step 90: Loss = 1.4151
Epoch 1, Step 100: Loss = 0.9177
Epoch 1, Step 110: Loss = 1.4063
Epoch 1, Step 120: Loss = 1.3937
Epoch 1, Step 130: Loss = 1.1164
Epoch 1, Step 140: Loss = 1.7856
Epoch 1, Step 150: Loss = 1.6946
Epoch 1, Step 160: Loss = 1.1235
Epoch 1, Step 170: Loss = 1.4161
Epoch 1, Step 180: Loss = 1.5961
Epoch 1, Step 190: Loss = 1.7093
Epoch 1, Step 200: Loss = 2.7760
Epoch 1, Step 210: Loss = 2.1215
Epoch 1, Step 220: Loss = 1.0533
Epoch 1, Step 230: Loss = 1.7038
Epoch 1, Step 240: Loss = 0.8380
Epoch 1, Step 250: Loss = 0.8123
Epoch 1, Step 260: Loss = 1.9257
Epoch 1, Step 270: Loss = 1.6220
Epoch 1, Step 280: Loss = 1.6277
Epoch 1, Step 290: Loss = 1.1407
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.1285
Epoch 5, Step 10: Loss = 0.0773
Epoch 5, Step 20: Loss = 0.5819
Epoch 5, Step 30: Loss = 0.0733
Epoch 5, Step 40: Loss = 0.1115
Epoch 5, Step 50: Loss = 0.5313
Epoch 5, Step 60: Loss = 0.8941
Epoch 5, Step 70: Loss = 0.8116
Epoch 5, Step 80: Loss = 0.3450
Epoch 5, Step 90: Loss = 0.0555
Epoch 5, Step 100: Loss = 0.1123
Epoch 5, Step 110: Loss = 0.1539
Epoch 5, Step 120: Loss = 0.3463
Epoch 5, Step 130: Loss = 0.0611
Epoch 5, Step 140: Loss = 0.0116
Epoch 5, Step 150: Loss = 0.6575
Epoch 5, Step 160: Loss = 0.8190
Epoch 5, Step 170: Loss = 0.3793
Epoch 5, Step 180: Loss = 0.0110
Epoch 5, Step 190: Loss = 1.1273
Epoch 5, Step 200: Loss = 0.0000
Epoch 5, Step 210: Loss = 0.0000
Epoch 5, Step 220: Loss = 0.2307
Epoch 5, Step 230: Loss = 0.1032
Epoch 5, Step 240: Loss = 0.3041
Epoch 5, Step 250: Loss = 0.0000
Epoch 5, Step 260: Loss = 1.2399
Epoch 5, Step 270: Loss = 0.6720
Epoch 5, Step 280: Loss = 1.0245
Epoch 5, Step 290: Loss = 0.4175
Epoch 5, Step 300: Lo

Epoch 9, Step 0: Loss = 0.6492
Epoch 9, Step 10: Loss = 0.0000
Epoch 9, Step 20: Loss = 0.3952
Epoch 9, Step 30: Loss = 0.5238
Epoch 9, Step 40: Loss = 0.5307
Epoch 9, Step 50: Loss = 0.4734
Epoch 9, Step 60: Loss = 0.4899
Epoch 9, Step 70: Loss = 0.0751
Epoch 9, Step 80: Loss = 0.2115
Epoch 9, Step 90: Loss = 0.0000
Epoch 9, Step 100: Loss = 0.0000
Epoch 9, Step 110: Loss = 0.0858
Epoch 9, Step 120: Loss = 0.2784
Epoch 9, Step 130: Loss = 0.1853
Epoch 9, Step 140: Loss = 0.0447
Epoch 9, Step 150: Loss = 0.0000
Epoch 9, Step 160: Loss = 0.0303
Epoch 9, Step 170: Loss = 0.0987
Epoch 9, Step 180: Loss = 0.0676
Epoch 9, Step 190: Loss = 0.2087
Epoch 9, Step 200: Loss = 0.1384
Epoch 9, Step 210: Loss = 0.4276
Epoch 9, Step 220: Loss = 1.7658
Epoch 9, Step 230: Loss = 0.0000
Epoch 9, Step 240: Loss = 0.2245
Epoch 9, Step 250: Loss = 0.0700
Epoch 9, Step 260: Loss = 0.0093
Epoch 9, Step 270: Loss = 0.0519
Epoch 9, Step 280: Loss = 0.0000
Epoch 9, Step 290: Loss = 0.0000
Epoch 9, Step 300: Lo

In [2]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

from transformers import AutoTokenizer, AutoConfig, LlamaForSequenceClassification,AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

def setup_model_and_tokenizer(model_name, device):
    # Use the fast tokenizer to avoid the slow/legacy mismatch
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load model config
    model_config = AutoConfig.from_pretrained(model_name, num_labels=2, pad_token_id=tokenizer.pad_token_id, use_cache=False)

    # Quantize base model
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    base = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,             # if needed for this checkpoint
    )

    # Attach LoRA
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(base, lora_cfg)
    model.gradient_checkpointing_enable()

    return model, tokenizer




def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    login(token="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")
    device = setup_environment()
    model_name ='Qwen/Qwen3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_8B_Qwen")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.6146
Epoch 1, Step 10: Loss = 0.6401
Epoch 1, Step 20: Loss = 0.6859
Epoch 1, Step 30: Loss = 0.5828
Epoch 1, Step 40: Loss = 0.5446
Epoch 1, Step 50: Loss = 0.4075
Epoch 1, Step 60: Loss = 0.5262
Epoch 1, Step 70: Loss = 0.6076
Epoch 1, Step 80: Loss = 0.5714
Epoch 1, Step 90: Loss = 0.7249
Epoch 1, Step 100: Loss = 0.5793
Epoch 1, Step 110: Loss = 0.4526
Epoch 1, Step 120: Loss = 0.5871
Epoch 1, Step 130: Loss = 0.6251
Epoch 1, Step 140: Loss = 0.4585
Epoch 1, Step 150: Loss = 0.6158
Epoch 1, Step 160: Loss = 0.6849
Epoch 1, Step 170: Loss = 0.5393
Epoch 1, Step 180: Loss = 0.5804
Epoch 1, Step 190: Loss = 0.5369
Epoch 1, Step 200: Loss = 0.7754
Epoch 1, Step 210: Loss = 0.7759
Epoch 1, Step 220: Loss = 0.4564
Epoch 1, Step 230: Loss = 0.4169
Epoch 1, Step 240: Loss = 0.4820
Epoch 1, Step 250: Loss = 0.4652
Epoch 1, Step 260: Loss = 0.6481
Epoch 1, Step 270: Loss = 0.4827
Epoch 1, Step 280: Loss = 0.5719
Epoch 1, Step 290: Loss = 0.4441
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.1508
Epoch 5, Step 10: Loss = 0.1403
Epoch 5, Step 20: Loss = 0.0471
Epoch 5, Step 30: Loss = 0.6500
Epoch 5, Step 40: Loss = 0.0494
Epoch 5, Step 50: Loss = 0.3249
Epoch 5, Step 60: Loss = 0.2425
Epoch 5, Step 70: Loss = 0.1295
Epoch 5, Step 80: Loss = 0.1309
Epoch 5, Step 90: Loss = 0.2391
Epoch 5, Step 100: Loss = 0.0538
Epoch 5, Step 110: Loss = 0.0595
Epoch 5, Step 120: Loss = 0.0888
Epoch 5, Step 130: Loss = 0.2564
Epoch 5, Step 140: Loss = 0.0168
Epoch 5, Step 150: Loss = 0.0542
Epoch 5, Step 160: Loss = 0.1780
Epoch 5, Step 170: Loss = 0.0925
Epoch 5, Step 180: Loss = 0.1510
Epoch 5, Step 190: Loss = 0.1251
Epoch 5, Step 200: Loss = 0.0153
Epoch 5, Step 210: Loss = 0.0743
Epoch 5, Step 220: Loss = 0.1483
Epoch 5, Step 230: Loss = 0.2424
Epoch 5, Step 240: Loss = 0.1060
Epoch 5, Step 250: Loss = 0.1605
Epoch 5, Step 260: Loss = 0.0524
Epoch 5, Step 270: Loss = 0.0539
Epoch 5, Step 280: Loss = 0.1283
Epoch 5, Step 290: Loss = 0.0063
Epoch 5, Step 300: Lo

Epoch 9, Step 0: Loss = 0.0018
Epoch 9, Step 10: Loss = 0.1107
Epoch 9, Step 20: Loss = 0.3005
Epoch 9, Step 30: Loss = 0.1307
Epoch 9, Step 40: Loss = 0.0015
Epoch 9, Step 50: Loss = 0.0343
Epoch 9, Step 60: Loss = 0.0708
Epoch 9, Step 70: Loss = 0.0000
Epoch 9, Step 80: Loss = 0.0298
Epoch 9, Step 90: Loss = 0.0117
Epoch 9, Step 100: Loss = 0.0142
Epoch 9, Step 110: Loss = 0.1360
Epoch 9, Step 120: Loss = 0.0306
Epoch 9, Step 130: Loss = 0.0000
Epoch 9, Step 140: Loss = 0.1032
Epoch 9, Step 150: Loss = 0.0254
Epoch 9, Step 160: Loss = 0.0333
Epoch 9, Step 170: Loss = 0.0448
Epoch 9, Step 180: Loss = 0.1480
Epoch 9, Step 190: Loss = 0.0290
Epoch 9, Step 200: Loss = 0.0245
Epoch 9, Step 210: Loss = 0.0378
Epoch 9, Step 220: Loss = 0.0228
Epoch 9, Step 230: Loss = 0.0840
Epoch 9, Step 240: Loss = 0.0000
Epoch 9, Step 250: Loss = 0.1462
Epoch 9, Step 260: Loss = 0.0549
Epoch 9, Step 270: Loss = 0.0306
Epoch 9, Step 280: Loss = 0.0000
Epoch 9, Step 290: Loss = 0.0000
Epoch 9, Step 300: Lo

In [3]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import re
import traceback

# Set environment to use GPU
def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '2' 
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
    return device

# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }
# Load the model and tokenizer
def setup_model_and_tokenizer(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Set a default padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else tokenizer.unk_token
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id

    # Load the BERT model for sequence classification
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.float16
    ).to(device)
    
    return model, tokenizer


def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=5, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float32)  # Use full precision initially to debug

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            # Remove the autocast context to stabilize loss calculation
            anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
            positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
            negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

            positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
            negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

           
            epsilon = 1e-6
            loss = F.triplet_margin_loss(
                anchor_embeddings.unsqueeze(1).expand(-1, 3, -1) + epsilon,
                positive_embeddings + epsilon,
                negative_embeddings + epsilon,
                margin=margin
            )
            loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}

        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state


def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}
def print_metrics(epoch, total_loss, num_batches, val_metrics):
    """
    Prints the training and validation metrics for each epoch.
    
    Parameters:
    - epoch (int): Current epoch number.
    - total_loss (float): Total training loss for the epoch.
    - num_batches (int): Number of batches in training data.
    - val_metrics (dict): Dictionary containing validation metrics such as 'val_loss'.
    """
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def clean_text(text):
    if not isinstance(text, str):  # Ensure text is a string
        text = ""
 
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def main():
    login(token="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") 
    device = setup_environment()
    model_name = 'bert-base-uncased' 
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    num_epochs = 5
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_bertuncased")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.4716
Epoch 1, Step 10: Loss = 0.4720
Epoch 1, Step 20: Loss = 0.5014
Epoch 1, Step 30: Loss = 0.5029
Epoch 1, Step 40: Loss = 0.4967
Epoch 1, Step 50: Loss = 0.5201
Epoch 1, Step 60: Loss = 0.4843
Epoch 1, Step 70: Loss = 0.4999
Epoch 1, Step 80: Loss = 0.4959
Epoch 1, Step 90: Loss = 0.4868
Epoch 1, Step 100: Loss = 0.4495
Epoch 1, Step 110: Loss = 0.4924
Epoch 1, Step 120: Loss = 0.4389
Epoch 1, Step 130: Loss = 0.3904
Epoch 1, Step 140: Loss = 0.2097
Epoch 1, Step 150: Loss = 0.1661
Epoch 1, Step 160: Loss = 0.1652
Epoch 1, Step 170: Loss = 0.0013
Epoch 1, Step 180: Loss = 0.1500
Epoch 1, Step 190: Loss = 0.0000
Epoch 1, Step 200: Loss = 0.0165
Epoch 1, Step 210: Loss = 0.0905
Epoch 1, Step 220: Loss = 0.0150
Epoch 1, Step 230: Loss = 0.0016
Epoch 1, Step 240: Loss = 0.1572
Epoch 1, Step 250: Loss = 0.0127
Epoch 1, Step 260: Loss = 0.0000
Epoch 1, Step 270: Loss = 0.0000
Epoch 1, Step 280: Loss = 0.0097
Epoch 1, Step 290: Loss = 0.0000
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.0000
Epoch 5, Step 10: Loss = 0.0000
Epoch 5, Step 20: Loss = 0.0000
Epoch 5, Step 30: Loss = 0.0000
Epoch 5, Step 40: Loss = 0.0000
Epoch 5, Step 50: Loss = 0.0000
Epoch 5, Step 60: Loss = 0.0000
Epoch 5, Step 70: Loss = 0.0203
Epoch 5, Step 80: Loss = 0.0000
Epoch 5, Step 90: Loss = 0.0000
Epoch 5, Step 100: Loss = 0.0000
Epoch 5, Step 110: Loss = 0.0000
Epoch 5, Step 120: Loss = 0.0000
Epoch 5, Step 130: Loss = 0.0000
Epoch 5, Step 140: Loss = 0.0000
Epoch 5, Step 150: Loss = 0.0000
Epoch 5, Step 160: Loss = 0.0000
Epoch 5, Step 170: Loss = 0.0000
Epoch 5, Step 180: Loss = 0.0000
Epoch 5, Step 190: Loss = 0.0000
Epoch 5, Step 200: Loss = 0.0000
Epoch 5, Step 210: Loss = 0.0000
Epoch 5, Step 220: Loss = 0.0000
Epoch 5, Step 230: Loss = 0.0000
Epoch 5, Step 240: Loss = 0.0000
Epoch 5, Step 250: Loss = 0.0000
Epoch 5, Step 260: Loss = 0.0000
Epoch 5, Step 270: Loss = 0.0000
Epoch 5, Step 280: Loss = 0.0000
Epoch 5, Step 290: Loss = 0.0000
Epoch 5, Step 300: Lo

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

from transformers import AutoTokenizer, AutoConfig, LlamaForSequenceClassification,AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

def setup_model_and_tokenizer(model_name, device):
    # Use the fast tokenizer to avoid the slow/legacy mismatch
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load model config
    model_config = AutoConfig.from_pretrained(model_name, num_labels=2, pad_token_id=tokenizer.pad_token_id, use_cache=False)

    # Quantize base model
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    base = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,             # if needed for this checkpoint
    )

    # Attach LoRA
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(base, lora_cfg)
    model.gradient_checkpointing_enable()

    return model, tokenizer




def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    login(token="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")
    device = setup_environment()
    model_name = 'meta-llama/Meta-Llama-3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_8B_Llama")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.6385
Epoch 1, Step 10: Loss = 0.9131
Epoch 1, Step 20: Loss = 0.6719
Epoch 1, Step 30: Loss = 0.5644
Epoch 1, Step 40: Loss = 0.9722
Epoch 1, Step 50: Loss = 1.0099
Epoch 1, Step 60: Loss = 0.7462
Epoch 1, Step 70: Loss = 0.4694
Epoch 1, Step 80: Loss = 0.6595
Epoch 1, Step 90: Loss = 0.4440
Epoch 1, Step 100: Loss = 0.5117
Epoch 1, Step 110: Loss = 0.8599
Epoch 1, Step 120: Loss = 0.4787
Epoch 1, Step 130: Loss = 0.5803
Epoch 1, Step 140: Loss = 1.0303
Epoch 1, Step 150: Loss = 0.6111
Epoch 1, Step 160: Loss = 0.8293
Epoch 1, Step 170: Loss = 0.7493
Epoch 1, Step 180: Loss = 0.2285
Epoch 1, Step 190: Loss = 0.7167
Epoch 1, Step 200: Loss = 0.7388
Epoch 1, Step 210: Loss = 0.6272
Epoch 1, Step 220: Loss = 0.6067
Epoch 1, Step 230: Loss = 0.6489
Epoch 1, Step 240: Loss = 0.3009
Epoch 1, Step 250: Loss = 0.6325
Epoch 1, Step 260: Loss = 0.6023
Epoch 1, Step 270: Loss = 0.8579
Epoch 1, Step 280: Loss = 0.5738
Epoch 1, Step 290: Loss = 0.3541
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.0934
Epoch 5, Step 10: Loss = 0.4894
Epoch 5, Step 20: Loss = 0.1160
Epoch 5, Step 30: Loss = 0.2615
Epoch 5, Step 40: Loss = 0.0000
Epoch 5, Step 50: Loss = 0.0510
Epoch 5, Step 60: Loss = 0.0264
Epoch 5, Step 70: Loss = 0.0639
Epoch 5, Step 80: Loss = 0.0746
Epoch 5, Step 90: Loss = 0.0824
Epoch 5, Step 100: Loss = 0.1135
Epoch 5, Step 110: Loss = 0.0000
Epoch 5, Step 120: Loss = 0.0672
Epoch 5, Step 130: Loss = 0.2503
Epoch 5, Step 140: Loss = 0.1025
Epoch 5, Step 150: Loss = 0.1044
Epoch 5, Step 160: Loss = 0.3419
Epoch 5, Step 170: Loss = 0.0367
Epoch 5, Step 180: Loss = 0.0329
Epoch 5, Step 190: Loss = 0.0945
Epoch 5, Step 200: Loss = 0.4274
Epoch 5, Step 210: Loss = 0.0000
Epoch 5, Step 220: Loss = 0.0000
Epoch 5, Step 230: Loss = 0.0299
Epoch 5, Step 240: Loss = 0.0000
Epoch 5, Step 250: Loss = 0.0648
Epoch 5, Step 260: Loss = 0.0033
Epoch 5, Step 270: Loss = 0.1724
Epoch 5, Step 280: Loss = 0.0720
Epoch 5, Step 290: Loss = 0.0000
Epoch 5, Step 300: Lo

Epoch 9, Step 0: Loss = 0.1083
Epoch 9, Step 10: Loss = 0.1633
Epoch 9, Step 20: Loss = 0.0363
Epoch 9, Step 30: Loss = 0.2780
Epoch 9, Step 40: Loss = 0.0840
Epoch 9, Step 50: Loss = 0.0000
Epoch 9, Step 60: Loss = 0.0172
Epoch 9, Step 70: Loss = 0.2872
Epoch 9, Step 80: Loss = 0.0733
Epoch 9, Step 90: Loss = 0.0207
Epoch 9, Step 100: Loss = 0.4088
Epoch 9, Step 110: Loss = 0.0570
Epoch 9, Step 120: Loss = 0.1450
Epoch 9, Step 130: Loss = 0.0031
Epoch 9, Step 140: Loss = 0.0000
Epoch 9, Step 150: Loss = 0.1433
Epoch 9, Step 160: Loss = 0.0643
Epoch 9, Step 170: Loss = 0.0000
Epoch 9, Step 180: Loss = 0.1204
Epoch 9, Step 190: Loss = 0.0876
Epoch 9, Step 200: Loss = 0.0206
Epoch 9, Step 210: Loss = 0.0000
Epoch 9, Step 220: Loss = 0.0000
Epoch 9, Step 230: Loss = 0.0000
Epoch 9, Step 240: Loss = 0.0000
Epoch 9, Step 250: Loss = 0.0132
Epoch 9, Step 260: Loss = 0.1766
Epoch 9, Step 270: Loss = 0.1162
Epoch 9, Step 280: Loss = 0.0949
Epoch 9, Step 290: Loss = 0.9205
Epoch 9, Step 300: Lo

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

from transformers import AutoTokenizer, AutoConfig, LlamaForSequenceClassification,AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig

def setup_model_and_tokenizer(model_name, device):
    # Use the fast tokenizer to avoid the slow/legacy mismatch
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # Load model config
    model_config = AutoConfig.from_pretrained(model_name, num_labels=2, pad_token_id=tokenizer.pad_token_id, use_cache=False)

    # Quantize base model
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    base = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,             # if needed for this checkpoint
    )

    # Attach LoRA
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
    )
    model = get_peft_model(base, lora_cfg)
    model.gradient_checkpointing_enable()

    return model, tokenizer




def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    login(token="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")
    device = setup_environment()
    model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/newdata_cleaned.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/contrastive_7B_DistillDeepSeek")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.8028
Epoch 1, Step 10: Loss = 0.6896
Epoch 1, Step 20: Loss = 1.2010
Epoch 1, Step 30: Loss = 0.6916
Epoch 1, Step 40: Loss = 0.6402
Epoch 1, Step 50: Loss = 0.9477
Epoch 1, Step 60: Loss = 1.1248
Epoch 1, Step 70: Loss = 0.9071
Epoch 1, Step 80: Loss = 0.5568
Epoch 1, Step 90: Loss = 1.3045
Epoch 1, Step 100: Loss = 1.2882
Epoch 1, Step 110: Loss = 0.6885
Epoch 1, Step 120: Loss = 0.7105
Epoch 1, Step 130: Loss = 0.7471
Epoch 1, Step 140: Loss = 1.1494
Epoch 1, Step 150: Loss = 0.6830
Epoch 1, Step 160: Loss = 1.1418
Epoch 1, Step 170: Loss = 1.0892
Epoch 1, Step 180: Loss = 0.8536
Epoch 1, Step 190: Loss = 1.0699
Epoch 1, Step 200: Loss = 0.7271
Epoch 1, Step 210: Loss = 1.3153
Epoch 1, Step 220: Loss = 0.8392
Epoch 1, Step 230: Loss = 0.7340
Epoch 1, Step 240: Loss = 0.5714
Epoch 1, Step 250: Loss = 0.6092
Epoch 1, Step 260: Loss = 0.4712
Epoch 1, Step 270: Loss = 0.6679
Epoch 1, Step 280: Loss = 0.9576
Epoch 1, Step 290: Loss = 0.9810
Epoch 1, Step 300: Lo

Epoch 5, Step 0: Loss = 0.5847
Epoch 5, Step 10: Loss = 0.2536
Epoch 5, Step 20: Loss = 0.1347
Epoch 5, Step 30: Loss = 0.0988
Epoch 5, Step 40: Loss = 0.1152
Epoch 5, Step 50: Loss = 0.4080
Epoch 5, Step 60: Loss = 0.1958
Epoch 5, Step 70: Loss = 0.4006
Epoch 5, Step 80: Loss = 0.2764
Epoch 5, Step 90: Loss = 0.2463
Epoch 5, Step 100: Loss = 0.4503
Epoch 5, Step 110: Loss = 0.4649
Epoch 5, Step 120: Loss = 0.3792
Epoch 5, Step 130: Loss = 0.0900
Epoch 5, Step 140: Loss = 0.1605
Epoch 5, Step 150: Loss = 0.2922
Epoch 5, Step 160: Loss = 0.1506
Epoch 5, Step 170: Loss = 0.2528
Epoch 5, Step 180: Loss = 0.1301
Epoch 5, Step 190: Loss = 0.1761
Epoch 5, Step 200: Loss = 0.2238
Epoch 5, Step 210: Loss = 0.2018
Epoch 5, Step 220: Loss = 0.0098
Epoch 5, Step 230: Loss = 0.3342
Epoch 5, Step 240: Loss = 0.4101
Epoch 5, Step 250: Loss = 0.0187
Epoch 5, Step 260: Loss = 0.3135
Epoch 5, Step 270: Loss = 0.2022
Epoch 5, Step 280: Loss = 0.3618
Epoch 5, Step 290: Loss = 0.3848
Epoch 5, Step 300: Lo

Epoch 9, Step 0: Loss = 0.5386
Epoch 9, Step 10: Loss = 0.3143
Epoch 9, Step 20: Loss = 0.5745
Epoch 9, Step 30: Loss = 0.0539
Epoch 9, Step 40: Loss = 0.0669
Epoch 9, Step 50: Loss = 0.1700
Epoch 9, Step 60: Loss = 0.1260
Epoch 9, Step 70: Loss = 0.3125
Epoch 9, Step 80: Loss = 0.1614
Epoch 9, Step 90: Loss = 0.4260
Epoch 9, Step 100: Loss = 0.5738
Epoch 9, Step 110: Loss = 0.0813
Epoch 9, Step 120: Loss = 0.3561
Epoch 9, Step 130: Loss = 0.2631
Epoch 9, Step 140: Loss = 0.0845
Epoch 9, Step 150: Loss = 0.1350
Epoch 9, Step 160: Loss = 0.1224
Epoch 9, Step 170: Loss = 0.2205
Epoch 9, Step 180: Loss = 0.2210
Epoch 9, Step 190: Loss = 0.0603
Epoch 9, Step 200: Loss = 0.0312
Epoch 9, Step 210: Loss = 0.4600
Epoch 9, Step 220: Loss = 0.0000
Epoch 9, Step 230: Loss = 0.1804
Epoch 9, Step 240: Loss = 0.2589
Epoch 9, Step 250: Loss = 0.1767
Epoch 9, Step 260: Loss = 0.0672
Epoch 9, Step 270: Loss = 0.1853
Epoch 9, Step 280: Loss = 0.1249
Epoch 9, Step 290: Loss = 0.3505
Epoch 9, Step 300: Lo