In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

def setup_model_and_tokenizer(model_name, device):
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False

   
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer



def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    
    device = setup_environment()
    model_name = 'meta-llama/Llama-2-7b-hf'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_7B_contrastive_classification_model_lora")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.7082
Epoch 1, Step 10: Loss = 0.4468
Epoch 1, Step 20: Loss = 0.7221
Epoch 1, Step 30: Loss = 0.6960
Epoch 1, Step 40: Loss = 0.5017
Epoch 1, Step 50: Loss = 0.5816
Epoch 1, Step 60: Loss = 0.7640
Epoch 1, Step 70: Loss = 0.8839
Epoch 1, Step 80: Loss = 0.6196
Epoch 1, Step 90: Loss = 0.7089
Epoch 1, Step 100: Loss = 0.5917
Epoch 1, Step 110: Loss = 0.3528
Epoch 1, Step 120: Loss = 0.8158
Epoch 1, Step 130: Loss = 1.0465
Epoch 1, Step 140: Loss = 0.6372
Epoch 1, Step 150: Loss = 0.7199
Epoch 1, Step 160: Loss = 0.4135
Epoch 1, Step 170: Loss = 0.5287
Epoch 1, Step 180: Loss = 0.8346
Epoch 1, Step 190: Loss = 0.5695
Epoch 1, Step 200: Loss = 0.6521
Epoch 1, Step 210: Loss = 0.2859
Epoch 1, Step 220: Loss = 0.4780
Epoch 1, Step 230: Loss = 0.7337
Epoch 1, Step 240: Loss = 0.6236
Epoch 1, Step 250: Loss = 0.7992
Epoch 1, Step 260: Loss = 0.7037
Epoch 1, Step 270: Loss = 0.6601
Epoch 1, Step 280: Loss = 0.5299
Epoch 1, Step 290: Loss = 0.7128
Epoch 1, Step 300: Lo

In [2]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

# Set environment to use GPU 2 explicitly
def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
    return device



class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_model_and_tokenizer(model_name, device):
    tokenizer =  AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False


    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer


def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}

           # best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}

        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")


def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  # Ensure text is a string
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Main function modification for data cleaning
def main():
   
    device = setup_environment()
    model_name = 'meta-llama/Meta-Llama-3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

    # Ensure all relevant columns are strings to avoid errors
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)   
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
   
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

        #best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama8BBBBBB_contrastive_classification_model_lora")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.4848
Epoch 1, Step 10: Loss = 0.3876
Epoch 1, Step 20: Loss = 0.4532
Epoch 1, Step 30: Loss = 0.4432
Epoch 1, Step 40: Loss = 0.4758
Epoch 1, Step 50: Loss = 1.0357
Epoch 1, Step 60: Loss = 0.6788
Epoch 1, Step 70: Loss = 0.5073
Epoch 1, Step 80: Loss = 0.7307
Epoch 1, Step 90: Loss = 0.9666
Epoch 1, Step 100: Loss = 0.7170
Epoch 1, Step 110: Loss = 1.2472
Epoch 1, Step 120: Loss = 0.6567
Epoch 1, Step 130: Loss = 0.4748
Epoch 1, Step 140: Loss = 0.4824
Epoch 1, Step 150: Loss = 0.7874
Epoch 1, Step 160: Loss = 0.9680
Epoch 1, Step 170: Loss = 0.2350
Epoch 1, Step 180: Loss = 0.4143
Epoch 1, Step 190: Loss = 0.5848
Epoch 1, Step 200: Loss = 0.8162
Epoch 1, Step 210: Loss = 0.6087
Epoch 1, Step 220: Loss = 0.2629
Epoch 1, Step 230: Loss = 0.4542
Epoch 1, Step 240: Loss = 0.5835
Epoch 1, Step 250: Loss = 0.3362
Epoch 1, Step 260: Loss = 0.5486
Epoch 1, Step 270: Loss = 0.7951
Epoch 1, Step 280: Loss = 0.6721
Epoch 1, Step 290: Loss = 0.0943
Epoch 1, Step 300: Lo

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback
from transformers import AutoModelForSequenceClassification

# Replace LlamaForSequenceClassification with AutoModelForSequenceClassification

# Set environment to use GPU 2 explicitly
def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
    return device



class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

def setup_model_and_tokenizer(model_name, device):
    tokenizer =  AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False


    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=model_config,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config
)


    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer


def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}

           # best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}

        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")


def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  # Ensure text is a string
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Main function modification for data cleaning
def main():
  
    device = setup_environment()
    model_name = 'dreamgen/WizardLM-2-7B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

    # Ensure all relevant columns are strings to avoid errors
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)   
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
   
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

        #best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/wiz_contrastive_classification_model_lora")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 1.4031
Epoch 1, Step 10: Loss = 1.0666
Epoch 1, Step 20: Loss = 1.8342
Epoch 1, Step 30: Loss = 2.4281
Epoch 1, Step 40: Loss = 1.7325
Epoch 1, Step 50: Loss = 2.1193
Epoch 1, Step 60: Loss = 0.7690
Epoch 1, Step 70: Loss = 1.4506
Epoch 1, Step 80: Loss = 1.3137
Epoch 1, Step 90: Loss = 1.5301
Epoch 1, Step 100: Loss = 0.8923
Epoch 1, Step 110: Loss = 1.3825
Epoch 1, Step 120: Loss = 0.4773
Epoch 1, Step 130: Loss = 1.4295
Epoch 1, Step 140: Loss = 1.4777
Epoch 1, Step 150: Loss = 1.1634
Epoch 1, Step 160: Loss = 1.3524
Epoch 1, Step 170: Loss = 1.7564
Epoch 1, Step 180: Loss = 0.8237
Epoch 1, Step 190: Loss = 1.4811
Epoch 1, Step 200: Loss = 1.8455
Epoch 1, Step 210: Loss = 2.0182
Epoch 1, Step 220: Loss = 1.7140
Epoch 1, Step 230: Loss = 1.4466
Epoch 1, Step 240: Loss = 1.2749
Epoch 1, Step 250: Loss = 1.7117
Epoch 1, Step 260: Loss = 1.3477
Epoch 1, Step 270: Loss = 1.0726
Epoch 1, Step 280: Loss = 1.4518
Epoch 1, Step 290: Loss = 1.7797
Epoch 1, Step 300: Lo

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    torch.cuda.set_device(0)  
    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")




# Dataset class with data cleaning
class ContrastiveEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        # Clean each text field
        emails_df['sender'] = emails_df['sender'].apply(clean_text)
        emails_df['subject'] = emails_df['subject'].apply(clean_text)
        emails_df['body'] = emails_df['body'].apply(clean_text)
        
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.ham_indices = self.emails_df[self.emails_df['label'] == 0].index.tolist()
        self.phish_indices = self.emails_df[self.emails_df['label'] == 1].index.tolist()
        if not (self.ham_indices and self.phish_indices):
            raise ValueError("Dataset must contain examples of both classes")

    def __len__(self):
        return len(self.emails_df)

    def _get_random_email_idx(self, label):
        indices = self.ham_indices if label == 0 else self.phish_indices
        return np.random.choice(indices)

    def _prepare_email_input(self, email):
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        tokenized_output = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return tokenized_output

    def __getitem__(self, idx):
        anchor_email = self.emails_df.iloc[idx]
        anchor_label = anchor_email['label']
        positives, negatives = [], []

        for _ in range(3):
            pos_idx = self._get_random_email_idx(anchor_label)
            neg_idx = self._get_random_email_idx(1 - anchor_label)
            positives.append(self._prepare_email_input(self.emails_df.iloc[pos_idx]))
            negatives.append(self._prepare_email_input(self.emails_df.iloc[neg_idx]))

        anchor_inputs = self._prepare_email_input(anchor_email)
        return {
            'anchor_input_ids': anchor_inputs['input_ids'].squeeze(),
            'anchor_attention_mask': anchor_inputs['attention_mask'].squeeze(),
            'positive_input_ids': torch.stack([p['input_ids'].squeeze() for p in positives]),
            'positive_attention_mask': torch.stack([p['attention_mask'].squeeze() for p in positives]),
            'negative_input_ids': torch.stack([n['input_ids'].squeeze() for n in negatives]),
            'negative_attention_mask': torch.stack([n['attention_mask'].squeeze() for n in negatives])
        }

from transformers import BitsAndBytesConfig

from transformers import BitsAndBytesConfig

def setup_model_and_tokenizer(model_name, device):
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False

   
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer



def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=6, margin=1.0, accumulation_steps=2):
    best_val_loss = float('inf')
    best_model_state = None
    model = model.to(device).to(torch.float16)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        optimizer.zero_grad()

        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                # Compute embeddings for anchor, positive, and negative samples in parallel
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                # Reshape positive and negative embeddings to match the batch structure (batch_size, 3, embedding_dim)
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

                # Calculate the triplet loss for each triplet in the batch
                loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )
                loss /= accumulation_steps

            loss.backward()

            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")

            torch.cuda.empty_cache()
            gc.collect()

        val_metrics = evaluate_model(model, val_loader, device, margin)
        print_metrics(epoch, total_loss, len(train_loader), val_metrics)

        if val_metrics['val_loss'] < best_val_loss:
            best_val_loss = val_metrics['val_loss']
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items() if isinstance(v, torch.Tensor)}


        torch.cuda.empty_cache()
        gc.collect()

    return best_model_state

def evaluate_model(model, val_loader, device, margin=1.0):
    model.eval()
    total_val_loss = 0
    val_steps = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

            with torch.autocast(device_type='cuda', dtype=torch.float16):
               
                anchor_embeddings = model(input_ids=batch['anchor_input_ids'], attention_mask=batch['anchor_attention_mask']).logits
                positive_embeddings = model(input_ids=batch['positive_input_ids'].view(-1, batch['positive_input_ids'].size(-1)), attention_mask=batch['positive_attention_mask'].view(-1, batch['positive_attention_mask'].size(-1))).logits
                negative_embeddings = model(input_ids=batch['negative_input_ids'].view(-1, batch['negative_input_ids'].size(-1)), attention_mask=batch['negative_attention_mask'].view(-1, batch['negative_attention_mask'].size(-1))).logits

                
                positive_embeddings = positive_embeddings.view(anchor_embeddings.size(0), 3, -1)
                negative_embeddings = negative_embeddings.view(anchor_embeddings.size(0), 3, -1)

               
                val_loss = F.triplet_margin_loss(
                    anchor_embeddings.unsqueeze(1).expand(-1, 3, -1),
                    positive_embeddings,
                    negative_embeddings,
                    margin=margin
                )

            total_val_loss += val_loss.item()
            val_steps += 1

    avg_val_loss = total_val_loss / val_steps
    return {'val_loss': avg_val_loss}


# Main function
# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):  
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def print_metrics(epoch, total_loss, num_batches, val_metrics):
    
    avg_train_loss = total_loss / num_batches
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print("Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

def main():
    
    device = setup_environment()
    model_name = 'meta-llama/Llama-2-7b-chat-hf'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)

   
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

    train_dataset = ContrastiveEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = ContrastiveEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 6
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_model_state = train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=num_epochs)
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_7B_contrastive_classification_model_lora_chat")
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device)
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.5966
Epoch 1, Step 10: Loss = 0.4484
Epoch 1, Step 20: Loss = 0.4604
Epoch 1, Step 30: Loss = 0.5192
Epoch 1, Step 40: Loss = 0.2952
Epoch 1, Step 50: Loss = 0.2056
Epoch 1, Step 60: Loss = 0.3994
Epoch 1, Step 70: Loss = 0.4703
Epoch 1, Step 80: Loss = 0.1773
Epoch 1, Step 90: Loss = 0.5351
Epoch 1, Step 100: Loss = 0.4803
Epoch 1, Step 110: Loss = 0.3882
Epoch 1, Step 120: Loss = 0.8287
Epoch 1, Step 130: Loss = 0.4020
Epoch 1, Step 140: Loss = 0.3895
Epoch 1, Step 150: Loss = 0.3187
Epoch 1, Step 160: Loss = 0.2593
Epoch 1, Step 170: Loss = 0.4572
Epoch 1, Step 180: Loss = 0.5183
Epoch 1, Step 190: Loss = 0.5409
Epoch 1, Step 200: Loss = 0.2555
Epoch 1, Step 210: Loss = 0.4181
Epoch 1, Step 220: Loss = 0.5730
Epoch 1, Step 230: Loss = 0.4346
Epoch 1, Step 240: Loss = 0.1880
Epoch 1, Step 250: Loss = 0.5279
Epoch 1, Step 260: Loss = 0.3217
Epoch 1, Step 270: Loss = 0.2363
Epoch 1, Step 280: Loss = 0.0875
Epoch 1, Step 290: Loss = 0.5618
Epoch 1, Step 300: Lo

In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import gc
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import traceback
from transformers import BitsAndBytesConfig


from transformers import AutoTokenizer

from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig,
)
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from huggingface_hub import login
import re

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '2'
    
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
  
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)
    
    return device


def setup_model_and_tokenizer(model_name, device):
    tokenizer =  AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False


    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name, 
        config=model_config, 
        torch_dtype=torch.bfloat16, 
        quantization_config=quantization_config
    )

    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="SEQ_CLS")
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer



class PreferenceEmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        """
        Dataset to create pairs of message, preferred response, and rejected response for DPO training.
        """
        self.emails_df = emails_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pairs = self._create_preference_pairs()

    def _create_preference_pairs(self):
        """
        Create pairs using emails from the dataset based on their labels.
        """
        pairs = []
        for _, selected_email in self.emails_df.iterrows():
            selected_label = selected_email['label']
            ham_emails = self.emails_df[self.emails_df['label'] == 0]
            phish_emails = self.emails_df[self.emails_df['label'] == 1]

            if selected_label == 1:  # Phishing email
                # Preferred: Another phishing email
                preferred_email = phish_emails[phish_emails.index != selected_email.name].sample(n=1).iloc[0]
                # Rejected: A ham email
                rejected_email = ham_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })
            elif selected_label == 0:  # Ham email
                # Preferred: Another ham email
                preferred_email = ham_emails[ham_emails.index != selected_email.name].sample(n=1).iloc[0]
                # Rejected: A phishing email
                rejected_email = phish_emails.sample(n=1).iloc[0]
                pairs.append({
                    'message': selected_email,
                    'preferred': preferred_email,
                    'rejected': rejected_email
                })

        return pairs

    def _prepare_email_input(self, message, response):
        """
        Prepare the input text with formatted message and response for tokenization.
        """
        formatted_input = f"<s>[INST] {message} [/INST] {response}</s>"
        return self.tokenizer(
            formatted_input,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        
        pair = self.pairs[idx]


        message_text = f"Sender: {pair['message']['sender']} [SEP] Subject: {pair['message']['subject']} [SEP] {pair['message']['body']}"
     
        preferred_response = f"Sender: {pair['preferred']['sender']} [SEP] Subject: {pair['preferred']['subject']} [SEP] {pair['preferred']['body']}"
        rejected_response = f"Sender: {pair['rejected']['sender']} [SEP] Subject: {pair['rejected']['subject']} [SEP] {pair['rejected']['body']}"
        message_inputs = self._prepare_email_input(message_text, "")
        preferred_inputs = self._prepare_email_input(message_text, preferred_response)
        rejected_inputs = self._prepare_email_input(message_text, rejected_response)

        return {
            'message_input_ids': message_inputs['input_ids'].squeeze(),
            'message_attention_mask': message_inputs['attention_mask'].squeeze(),
            'preferred_input_ids': preferred_inputs['input_ids'].squeeze(),
            'preferred_attention_mask': preferred_inputs['attention_mask'].squeeze(),
            'rejected_input_ids': rejected_inputs['input_ids'].squeeze(),
            'rejected_attention_mask': rejected_inputs['attention_mask'].squeeze(),
        }


def clean_text(text):
   
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def compute_dpo_loss(policy_chosen_logits, policy_rejected_logits, 
                    reference_chosen_logits, reference_rejected_logits, 
                    beta=0.2):
   
    epsilon = 1e-8
    
   
    policy_chosen_probs = F.softmax(policy_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    policy_rejected_probs = F.softmax(policy_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_chosen_probs = F.softmax(reference_chosen_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    ref_rejected_probs = F.softmax(reference_rejected_logits, dim=-1)[:, 0].clamp(epsilon, 1-epsilon)
    
  
    chosen_rewards = (torch.log(policy_chosen_probs + epsilon) - 
                     torch.log(ref_chosen_probs + epsilon))
    rejected_rewards = (torch.log(policy_rejected_probs + epsilon) - 
                       torch.log(ref_rejected_probs + epsilon))
    
    
    max_reward = 50.0
    chosen_rewards = torch.clamp(chosen_rewards, -max_reward, max_reward)
    rejected_rewards = torch.clamp(rejected_rewards, -max_reward, max_reward)
    
    
    logits_diff = (chosen_rewards - rejected_rewards) / beta
    
    valid_mask = ~torch.isnan(logits_diff)
    if valid_mask.any():
        loss = -F.logsigmoid(logits_diff[valid_mask]).mean()
    else:
        loss = torch.tensor(0.0, device=logits_diff.device)
    
    return loss

def train_model_dpo(policy_model, reference_model, train_loader, val_loader, 
                   optimizer, scheduler, device, num_epochs=5, beta=0.2, gradient_accumulation_steps=2):
    
    best_val_loss = float('inf')
    best_model_state = None
    
    policy_model = policy_model.to(device).float()
    reference_model = reference_model.to(device).float()
    reference_model.eval()  # Ensure reference model does not get updated during training
    
    scaler = torch.amp.GradScaler('cuda')
    
    for epoch in range(num_epochs):
        policy_model.train()
        total_loss = 0
        valid_steps = 0
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            try:
                with torch.amp.autocast('cuda', dtype=torch.float32):
                    policy_chosen_outputs = policy_model(
                        input_ids=batch['preferred_input_ids'],
                        attention_mask=batch['preferred_attention_mask']
                    )
                    policy_rejected_outputs = policy_model(
                        input_ids=batch['rejected_input_ids'],
                        attention_mask=batch['rejected_attention_mask']
                    )
                    
                    with torch.no_grad():
                        ref_chosen_outputs = reference_model(
                            input_ids=batch['preferred_input_ids'],
                            attention_mask=batch['preferred_attention_mask']
                        )
                        ref_rejected_outputs = reference_model(
                            input_ids=batch['rejected_input_ids'],
                            attention_mask=batch['rejected_attention_mask']
                        )
                    
                    loss = compute_dpo_loss(
                        policy_chosen_outputs.logits,
                        policy_rejected_outputs.logits,
                        ref_chosen_outputs.logits,
                        ref_rejected_outputs.logits,
                        beta=beta
                    )
                    
                    if not torch.isnan(loss) and not torch.isinf(loss):
                        scaler.scale(loss).backward()
                        
                        # Gradient accumulation logic
                        if (step + 1) % gradient_accumulation_steps == 0:
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(policy_model.parameters(), max_norm=1.0)
                            scaler.step(optimizer)
                            scaler.update()
                            scheduler.step()
                            optimizer.zero_grad()
                        
                        total_loss += loss.item()
                        valid_steps += 1
                    
                    if step % 10 == 0:
                        avg_loss = total_loss / max(valid_steps, 1)
                        print(f"[Epoch {epoch+1}/{num_epochs} | Step {step}/{len(train_loader)}] - Loss: {avg_loss:.4f}")
                
            except RuntimeError as e:
                print(f"Error in batch {step}: {str(e)}")
                traceback.print_exc()
                continue
            
            torch.cuda.empty_cache()
            gc.collect()
        
        if valid_steps > 0:
            avg_train_loss = total_loss / valid_steps
            val_loss = evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta)
            print(f"Epoch {epoch+1}/{num_epochs} - Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = {k: v.cpu() for k, v in policy_model.state_dict().items() if isinstance(v, torch.Tensor)}
    
    return best_model_state



def evaluate_model_dpo(policy_model, reference_model, val_loader, device, beta):
    policy_model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.amp.autocast('cuda', dtype=torch.float32):
                policy_chosen_outputs = policy_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                policy_rejected_outputs = policy_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                ref_chosen_outputs = reference_model(
                    input_ids=batch['preferred_input_ids'],
                    attention_mask=batch['preferred_attention_mask']
                )
                ref_rejected_outputs = reference_model(
                    input_ids=batch['rejected_input_ids'],
                    attention_mask=batch['rejected_attention_mask']
                )
                
                loss = compute_dpo_loss(
                    policy_chosen_outputs.logits,
                    policy_rejected_outputs.logits,
                    ref_chosen_outputs.logits,
                    ref_rejected_outputs.logits,
                    beta=beta
                )
            
            total_loss += loss.item()

    return total_loss / len(val_loader)


def main():

   
    device = setup_environment()
    model_name = 'meta-llama/Llama-2-7b-chat-hf'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")

    policy_model, tokenizer = setup_model_and_tokenizer(model_name, device)
    reference_model, _ = setup_model_and_tokenizer(model_name, device)
    

    emails_df = pd.read_csv(data_path)
    emails_df['sender'] = emails_df['sender'].astype(str).apply(clean_text)
    emails_df['subject'] = emails_df['subject'].astype(str).apply(clean_text)
    emails_df['body'] = emails_df['body'].astype(str).apply(clean_text)

    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)

  
    train_dataset = PreferenceEmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = PreferenceEmailDataset(val_df, tokenizer, max_length=512)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)

    # Setup optimization
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5, weight_decay=0.01)
    num_epochs = 8
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_model_state = train_model_dpo(
        policy_model,
        reference_model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs,
        beta=0.2
    )

   
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_7b_dpo_classification_model_chat")
    os.makedirs(output_dir, exist_ok=True)
    policy_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 1e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "beta": 0.2
    }
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()
    

Using GPU: NVIDIA RTX A5000


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1/8 | Step 0/500] - Loss: 4.2860
[Epoch 1/8 | Step 10/500] - Loss: 4.0605
[Epoch 1/8 | Step 20/500] - Loss: 4.1956
[Epoch 1/8 | Step 30/500] - Loss: 4.6349
[Epoch 1/8 | Step 40/500] - Loss: 4.7998
[Epoch 1/8 | Step 50/500] - Loss: 4.4766
[Epoch 1/8 | Step 60/500] - Loss: 4.5783
[Epoch 1/8 | Step 70/500] - Loss: 4.6406
[Epoch 1/8 | Step 80/500] - Loss: 4.5046
[Epoch 1/8 | Step 90/500] - Loss: 4.3593
[Epoch 1/8 | Step 100/500] - Loss: 4.4299
[Epoch 1/8 | Step 110/500] - Loss: 4.5442
[Epoch 1/8 | Step 120/500] - Loss: 4.5273
[Epoch 1/8 | Step 130/500] - Loss: 4.5207
[Epoch 1/8 | Step 140/500] - Loss: 4.4689
[Epoch 1/8 | Step 150/500] - Loss: 4.3957
[Epoch 1/8 | Step 160/500] - Loss: 4.5157
[Epoch 1/8 | Step 170/500] - Loss: 4.4796
[Epoch 1/8 | Step 180/500] - Loss: 4.4771
[Epoch 1/8 | Step 190/500] - Loss: 4.3955
[Epoch 1/8 | Step 200/500] - Loss: 4.3488
[Epoch 1/8 | Step 210/500] - Loss: 4.3235
[Epoch 1/8 | Step 220/500] - Loss: 4.3200
[Epoch 1/8 | Step 230/500] - Loss: 4.2350
[Ep

In [3]:
import os
import json
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig, BitsAndBytesConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import gc
import seaborn as sns
import matplotlib.pyplot as plt

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

class EmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        self.emails_df = emails_df.copy()
        self.emails_df['sender'] = self.emails_df['sender'].apply(clean_text)
        self.emails_df['subject'] = self.emails_df['subject'].apply(clean_text)
        self.emails_df['body'] = self.emails_df['body'].apply(clean_text)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.emails_df)

    def __getitem__(self, idx):
        email = self.emails_df.iloc[idx]
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        
        encoding = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(email['label'], dtype=torch.long)
        }

def clean_text(text):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def setup_model_and_tokenizer(model_name, device):
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False
    
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )
    
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

def compute_metrics(preds, labels):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    conf_matrix = confusion_matrix(labels, preds)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

def plot_confusion_matrix(conf_matrix, output_dir):
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['label'].cpu().numpy())
    
    metrics = compute_metrics(all_preds, all_labels)
    metrics['loss'] = total_loss / len(data_loader)
    return metrics

def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=6):
    best_val_metrics = {'f1': 0}
    best_model_state = None
    model = model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_preds = []
        train_labels = []
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            loss.backward()
            
            if (step + 1) % 2 == 0:  # Gradient accumulation steps = 2
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(batch['label'].cpu().numpy())
            
            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")
            
            torch.cuda.empty_cache()
  
        train_metrics = compute_metrics(train_preds, train_labels)
        train_metrics['loss'] = total_loss / len(train_loader)
      
        val_metrics = evaluate_model(model, val_loader, device)
      
        print(f"\nEpoch {epoch + 1} Summary:")
        print("Training Metrics:")
        for metric, value in train_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        print("\nValidation Metrics:")
        for metric, value in val_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        # Save best model
        if val_metrics['f1'] > best_val_metrics['f1']:
            best_val_metrics = val_metrics
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}
        
        torch.cuda.empty_cache()
        gc.collect()
    
    return best_model_state, best_val_metrics

def main():
    
    device = setup_environment()
    model_name = 'meta-llama/Llama-2-7b-chat-hf'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")
    
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")
    
    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)
    
    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)
    
    train_dataset = EmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = EmailDataset(val_df, tokenizer, max_length=512)
    
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)
    
    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 6
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    best_model_state, best_metrics = train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs
    )
    
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_binary_classification_model_chat")
    os.makedirs(output_dir, exist_ok=True)
    
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    plot_confusion_matrix(best_metrics['confusion_matrix'], output_dir)
    
  
    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "best_metrics": {k: float(v) if k != 'confusion_matrix' else v.tolist() 
                        for k, v in best_metrics.items()}
    }
    
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()

Using GPU: NVIDIA RTX A5000
GPU Memory: 23.68 GB


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.4272
Epoch 1, Step 10: Loss = 1.6470
Epoch 1, Step 20: Loss = 0.7630
Epoch 1, Step 30: Loss = 2.1678
Epoch 1, Step 40: Loss = 1.5892
Epoch 1, Step 50: Loss = 2.3011
Epoch 1, Step 60: Loss = 0.5395
Epoch 1, Step 70: Loss = 1.2031
Epoch 1, Step 80: Loss = 1.8276
Epoch 1, Step 90: Loss = 1.1809
Epoch 1, Step 100: Loss = 1.4345
Epoch 1, Step 110: Loss = 0.8915
Epoch 1, Step 120: Loss = 1.1646
Epoch 1, Step 130: Loss = 1.5051
Epoch 1, Step 140: Loss = 1.9505
Epoch 1, Step 150: Loss = 1.2951
Epoch 1, Step 160: Loss = 2.5265
Epoch 1, Step 170: Loss = 0.2750
Epoch 1, Step 180: Loss = 0.3918
Epoch 1, Step 190: Loss = 1.0108
Epoch 1, Step 200: Loss = 0.4703
Epoch 1, Step 210: Loss = 2.1217
Epoch 1, Step 220: Loss = 1.1946
Epoch 1, Step 230: Loss = 0.7917
Epoch 1, Step 240: Loss = 1.4148
Epoch 1, Step 250: Loss = 1.3443
Epoch 1, Step 260: Loss = 1.8523
Epoch 1, Step 270: Loss = 1.0013
Epoch 1, Step 280: Loss = 0.5420
Epoch 1, Step 290: Loss = 1.0984
Epoch 1, Step 300: Lo




Epoch 1 Summary:
Training Metrics:
accuracy: 0.6092
precision: 0.5673
recall: 0.9210
f1: 0.7021
loss: 1.1232

Validation Metrics:
accuracy: 0.7880
precision: 0.7491
recall: 0.8660
f1: 0.8033
loss: 0.5239




Epoch 2, Step 0: Loss = 0.7248
Epoch 2, Step 10: Loss = 0.6782
Epoch 2, Step 20: Loss = 0.0501
Epoch 2, Step 30: Loss = 0.1776
Epoch 2, Step 40: Loss = 0.8486
Epoch 2, Step 50: Loss = 0.1535
Epoch 2, Step 60: Loss = 0.9027
Epoch 2, Step 70: Loss = 0.1491
Epoch 2, Step 80: Loss = 0.5188
Epoch 2, Step 90: Loss = 0.2357
Epoch 2, Step 100: Loss = 1.1729
Epoch 2, Step 110: Loss = 0.6698
Epoch 2, Step 120: Loss = 0.1304
Epoch 2, Step 130: Loss = 0.4738
Epoch 2, Step 140: Loss = 0.7800
Epoch 2, Step 150: Loss = 0.7193
Epoch 2, Step 160: Loss = 0.3190
Epoch 2, Step 170: Loss = 0.5726
Epoch 2, Step 180: Loss = 0.4613
Epoch 2, Step 190: Loss = 0.0764
Epoch 2, Step 200: Loss = 0.1840
Epoch 2, Step 210: Loss = 0.0874
Epoch 2, Step 220: Loss = 0.2503
Epoch 2, Step 230: Loss = 0.5001
Epoch 2, Step 240: Loss = 0.2976
Epoch 2, Step 250: Loss = 0.4904
Epoch 2, Step 260: Loss = 0.5987
Epoch 2, Step 270: Loss = 0.2482
Epoch 2, Step 280: Loss = 0.2561
Epoch 2, Step 290: Loss = 0.1637
Epoch 2, Step 300: Lo




Epoch 2 Summary:
Training Metrics:
accuracy: 0.8293
precision: 0.7984
recall: 0.8810
f1: 0.8377
loss: 0.4229

Validation Metrics:
accuracy: 0.8590
precision: 0.8406
recall: 0.8860
f1: 0.8627
loss: 0.3375




Epoch 3, Step 0: Loss = 0.1100
Epoch 3, Step 10: Loss = 0.0630
Epoch 3, Step 20: Loss = 0.1822
Epoch 3, Step 30: Loss = 0.2325
Epoch 3, Step 40: Loss = 0.3266
Epoch 3, Step 50: Loss = 0.3994
Epoch 3, Step 60: Loss = 0.0473
Epoch 3, Step 70: Loss = 0.0595
Epoch 3, Step 80: Loss = 0.4212
Epoch 3, Step 90: Loss = 0.0516
Epoch 3, Step 100: Loss = 0.5670
Epoch 3, Step 110: Loss = 0.7838
Epoch 3, Step 120: Loss = 0.3184
Epoch 3, Step 130: Loss = 0.3589
Epoch 3, Step 140: Loss = 0.1838
Epoch 3, Step 150: Loss = 0.0455
Epoch 3, Step 160: Loss = 0.3019
Epoch 3, Step 170: Loss = 0.1569
Epoch 3, Step 180: Loss = 0.2494
Epoch 3, Step 190: Loss = 0.4196
Epoch 3, Step 200: Loss = 0.4416
Epoch 3, Step 210: Loss = 0.0932
Epoch 3, Step 220: Loss = 0.2014
Epoch 3, Step 230: Loss = 0.2580
Epoch 3, Step 240: Loss = 0.1099
Epoch 3, Step 250: Loss = 0.3512
Epoch 3, Step 260: Loss = 0.0346
Epoch 3, Step 270: Loss = 0.6410
Epoch 3, Step 280: Loss = 0.3313
Epoch 3, Step 290: Loss = 1.2524
Epoch 3, Step 300: Lo




Epoch 3 Summary:
Training Metrics:
accuracy: 0.8755
precision: 0.8573
recall: 0.9010
f1: 0.8786
loss: 0.3140

Validation Metrics:
accuracy: 0.8890
precision: 0.8806
recall: 0.9000
f1: 0.8902
loss: 0.2861




Epoch 4, Step 0: Loss = 1.0407
Epoch 4, Step 10: Loss = 0.3867
Epoch 4, Step 20: Loss = 0.2628
Epoch 4, Step 30: Loss = 0.1276
Epoch 4, Step 40: Loss = 0.0725
Epoch 4, Step 50: Loss = 0.3154
Epoch 4, Step 60: Loss = 0.2460
Epoch 4, Step 70: Loss = 0.2625
Epoch 4, Step 80: Loss = 0.1105
Epoch 4, Step 90: Loss = 0.5443
Epoch 4, Step 100: Loss = 0.3665
Epoch 4, Step 110: Loss = 0.2582
Epoch 4, Step 120: Loss = 0.2831
Epoch 4, Step 130: Loss = 0.3791
Epoch 4, Step 140: Loss = 0.2010
Epoch 4, Step 150: Loss = 0.1029
Epoch 4, Step 160: Loss = 0.2065
Epoch 4, Step 170: Loss = 0.4754
Epoch 4, Step 180: Loss = 0.0299
Epoch 4, Step 190: Loss = 0.3958
Epoch 4, Step 200: Loss = 0.1270
Epoch 4, Step 210: Loss = 0.1072
Epoch 4, Step 220: Loss = 0.1097
Epoch 4, Step 230: Loss = 0.5458
Epoch 4, Step 240: Loss = 0.1787
Epoch 4, Step 250: Loss = 0.4305
Epoch 4, Step 260: Loss = 0.2355
Epoch 4, Step 270: Loss = 0.2236
Epoch 4, Step 280: Loss = 0.1132
Epoch 4, Step 290: Loss = 0.2852
Epoch 4, Step 300: Lo




Epoch 4 Summary:
Training Metrics:
accuracy: 0.8910
precision: 0.8749
recall: 0.9125
f1: 0.8933
loss: 0.2749

Validation Metrics:
accuracy: 0.9040
precision: 0.8945
recall: 0.9160
f1: 0.9051
loss: 0.2648




Epoch 5, Step 0: Loss = 0.0657
Epoch 5, Step 10: Loss = 0.2040
Epoch 5, Step 20: Loss = 0.2207
Epoch 5, Step 30: Loss = 0.0299
Epoch 5, Step 40: Loss = 0.2215
Epoch 5, Step 50: Loss = 0.1889
Epoch 5, Step 60: Loss = 0.0978
Epoch 5, Step 70: Loss = 0.0882
Epoch 5, Step 80: Loss = 0.1888
Epoch 5, Step 90: Loss = 0.9026
Epoch 5, Step 100: Loss = 0.8007
Epoch 5, Step 110: Loss = 0.2342
Epoch 5, Step 120: Loss = 0.1604
Epoch 5, Step 130: Loss = 0.1269
Epoch 5, Step 140: Loss = 0.0861
Epoch 5, Step 150: Loss = 0.6603
Epoch 5, Step 160: Loss = 0.0764
Epoch 5, Step 170: Loss = 0.2565
Epoch 5, Step 180: Loss = 0.4057
Epoch 5, Step 190: Loss = 0.3256
Epoch 5, Step 200: Loss = 0.0426
Epoch 5, Step 210: Loss = 0.7952
Epoch 5, Step 220: Loss = 0.0568
Epoch 5, Step 230: Loss = 0.2645
Epoch 5, Step 240: Loss = 0.4267
Epoch 5, Step 250: Loss = 0.0793
Epoch 5, Step 260: Loss = 0.3919
Epoch 5, Step 270: Loss = 0.1660
Epoch 5, Step 280: Loss = 0.2850
Epoch 5, Step 290: Loss = 0.1087
Epoch 5, Step 300: Lo




Epoch 5 Summary:
Training Metrics:
accuracy: 0.8970
precision: 0.8825
recall: 0.9160
f1: 0.8989
loss: 0.2616

Validation Metrics:
accuracy: 0.9060
precision: 0.8965
recall: 0.9180
f1: 0.9071
loss: 0.2568




Epoch 6, Step 0: Loss = 0.2733
Epoch 6, Step 10: Loss = 0.1311
Epoch 6, Step 20: Loss = 0.0605
Epoch 6, Step 30: Loss = 0.4768
Epoch 6, Step 40: Loss = 0.5181
Epoch 6, Step 50: Loss = 0.3234
Epoch 6, Step 60: Loss = 0.2090
Epoch 6, Step 70: Loss = 0.4782
Epoch 6, Step 80: Loss = 0.2583
Epoch 6, Step 90: Loss = 0.0413
Epoch 6, Step 100: Loss = 0.4889
Epoch 6, Step 110: Loss = 0.0198
Epoch 6, Step 120: Loss = 0.1177
Epoch 6, Step 130: Loss = 0.1281
Epoch 6, Step 140: Loss = 0.3601
Epoch 6, Step 150: Loss = 0.3155
Epoch 6, Step 160: Loss = 0.5489
Epoch 6, Step 170: Loss = 0.0185
Epoch 6, Step 180: Loss = 0.8911
Epoch 6, Step 190: Loss = 0.3537
Epoch 6, Step 200: Loss = 0.0821
Epoch 6, Step 210: Loss = 0.9456
Epoch 6, Step 220: Loss = 0.2114
Epoch 6, Step 230: Loss = 0.0300
Epoch 6, Step 240: Loss = 0.0390
Epoch 6, Step 250: Loss = 0.0582
Epoch 6, Step 260: Loss = 0.1660
Epoch 6, Step 270: Loss = 0.4701
Epoch 6, Step 280: Loss = 0.6595
Epoch 6, Step 290: Loss = 0.7898
Epoch 6, Step 300: Lo




Epoch 6 Summary:
Training Metrics:
accuracy: 0.9018
precision: 0.8846
recall: 0.9240
f1: 0.9039
loss: 0.2545

Validation Metrics:
accuracy: 0.9070
precision: 0.8982
recall: 0.9180
f1: 0.9080
loss: 0.2532
