In [6]:
import os
import json
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import gc
import seaborn as sns
import matplotlib.pyplot as plt

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

class EmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        self.emails_df = emails_df.copy()
        self.emails_df['sender'] = self.emails_df['sender'].apply(clean_text)
        self.emails_df['subject'] = self.emails_df['subject'].apply(clean_text)
        self.emails_df['body'] = self.emails_df['body'].apply(clean_text)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.emails_df)

    def __getitem__(self, idx):
        email = self.emails_df.iloc[idx]
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        
        encoding = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(email['label'], dtype=torch.long)
        }

def clean_text(text):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def setup_model_and_tokenizer(model_name, device):
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False
    
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )
    
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

def compute_metrics(preds, labels):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    conf_matrix = confusion_matrix(labels, preds)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

def plot_confusion_matrix(conf_matrix, output_dir):
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['label'].cpu().numpy())
    
    metrics = compute_metrics(all_preds, all_labels)
    metrics['loss'] = total_loss / len(data_loader)
    return metrics

def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9):
    best_val_metrics = {'f1': 0}
    best_model_state = None
    model = model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_preds = []
        train_labels = []
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            loss.backward()
            
            if (step + 1) % 2 == 0:  # Gradient accumulation steps = 2
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(batch['label'].cpu().numpy())
            
            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")
            
            torch.cuda.empty_cache()
  
        train_metrics = compute_metrics(train_preds, train_labels)
        train_metrics['loss'] = total_loss / len(train_loader)
      
        val_metrics = evaluate_model(model, val_loader, device)
      
        print(f"\nEpoch {epoch + 1} Summary:")
        print("Training Metrics:")
        for metric, value in train_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        print("\nValidation Metrics:")
        for metric, value in val_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        # Save best model
        if val_metrics['f1'] > best_val_metrics['f1']:
            best_val_metrics = val_metrics
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}
        
        torch.cuda.empty_cache()
        gc.collect()
    
    return best_model_state, best_val_metrics

def main():
    login(token=
    device = setup_environment()
    model_name = 'meta-llama/Llama-2-7b-hf'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")
    
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")
    
    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)
    
    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)
    
    train_dataset = EmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = EmailDataset(val_df, tokenizer, max_length=512)
    
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)
    
    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    best_model_state, best_metrics = train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs
    )
    
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_binary_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    plot_confusion_matrix(best_metrics['confusion_matrix'], output_dir)
    
  
    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "best_metrics": {k: float(v) if k != 'confusion_matrix' else v.tolist() 
                        for k, v in best_metrics.items()}
    }
    
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()

Using GPU: NVIDIA RTX A5000
GPU Memory: 23.68 GB


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 1.8862
Epoch 1, Step 10: Loss = 1.7530
Epoch 1, Step 20: Loss = 1.3627
Epoch 1, Step 30: Loss = 1.5234
Epoch 1, Step 40: Loss = 0.6137
Epoch 1, Step 50: Loss = 1.3821
Epoch 1, Step 60: Loss = 0.7046
Epoch 1, Step 70: Loss = 2.1004
Epoch 1, Step 80: Loss = 2.2951
Epoch 1, Step 90: Loss = 1.0264
Epoch 1, Step 100: Loss = 0.9422
Epoch 1, Step 110: Loss = 0.4088
Epoch 1, Step 120: Loss = 0.7887
Epoch 1, Step 130: Loss = 2.2298
Epoch 1, Step 140: Loss = 1.0574
Epoch 1, Step 150: Loss = 1.8968
Epoch 1, Step 160: Loss = 1.3741
Epoch 1, Step 170: Loss = 1.4226
Epoch 1, Step 180: Loss = 1.8554
Epoch 1, Step 190: Loss = 0.6717
Epoch 1, Step 200: Loss = 1.0808
Epoch 1, Step 210: Loss = 1.3347
Epoch 1, Step 220: Loss = 0.4885
Epoch 1, Step 230: Loss = 1.3909
Epoch 1, Step 240: Loss = 0.9509
Epoch 1, Step 250: Loss = 0.8483
Epoch 1, Step 260: Loss = 0.8913
Epoch 1, Step 270: Loss = 0.8954
Epoch 1, Step 280: Loss = 0.9386
Epoch 1, Step 290: Loss = 0.7468
Epoch 1, Step 300: Lo




Epoch 1 Summary:
Training Metrics:
accuracy: 0.4750
precision: 0.4501
recall: 0.2255
f1: 0.3005
loss: 1.1589

Validation Metrics:
accuracy: 0.6790
precision: 0.6908
recall: 0.6480
f1: 0.6687
loss: 0.6683




Epoch 2, Step 0: Loss = 0.7116
Epoch 2, Step 10: Loss = 1.2045
Epoch 2, Step 20: Loss = 0.7348
Epoch 2, Step 30: Loss = 0.6645
Epoch 2, Step 40: Loss = 0.5472
Epoch 2, Step 50: Loss = 0.7364
Epoch 2, Step 60: Loss = 0.8564
Epoch 2, Step 70: Loss = 0.2349
Epoch 2, Step 80: Loss = 0.4479
Epoch 2, Step 90: Loss = 0.6749
Epoch 2, Step 100: Loss = 0.2712
Epoch 2, Step 110: Loss = 0.6367
Epoch 2, Step 120: Loss = 1.0097
Epoch 2, Step 130: Loss = 0.1786
Epoch 2, Step 140: Loss = 0.2758
Epoch 2, Step 150: Loss = 0.1934
Epoch 2, Step 160: Loss = 0.5079
Epoch 2, Step 170: Loss = 0.4402
Epoch 2, Step 180: Loss = 0.8845
Epoch 2, Step 190: Loss = 0.4442
Epoch 2, Step 200: Loss = 0.6309
Epoch 2, Step 210: Loss = 0.4963
Epoch 2, Step 220: Loss = 0.9293
Epoch 2, Step 230: Loss = 0.3344
Epoch 2, Step 240: Loss = 0.5259
Epoch 2, Step 250: Loss = 0.3631
Epoch 2, Step 260: Loss = 0.3413
Epoch 2, Step 270: Loss = 0.2123
Epoch 2, Step 280: Loss = 0.3799
Epoch 2, Step 290: Loss = 0.8429
Epoch 2, Step 300: Lo




Epoch 2 Summary:
Training Metrics:
accuracy: 0.8075
precision: 0.8251
recall: 0.7805
f1: 0.8022
loss: 0.4246

Validation Metrics:
accuracy: 0.8670
precision: 0.8536
recall: 0.8860
f1: 0.8695
loss: 0.3116




Epoch 3, Step 0: Loss = 0.4618
Epoch 3, Step 10: Loss = 0.9066
Epoch 3, Step 20: Loss = 0.3491
Epoch 3, Step 30: Loss = 0.3149
Epoch 3, Step 40: Loss = 0.5431
Epoch 3, Step 50: Loss = 0.1569
Epoch 3, Step 60: Loss = 0.5555
Epoch 3, Step 70: Loss = 0.4737
Epoch 3, Step 80: Loss = 0.2687
Epoch 3, Step 90: Loss = 0.1112
Epoch 3, Step 100: Loss = 0.1680
Epoch 3, Step 110: Loss = 0.0765
Epoch 3, Step 120: Loss = 0.8623
Epoch 3, Step 130: Loss = 0.3594
Epoch 3, Step 140: Loss = 0.5808
Epoch 3, Step 150: Loss = 0.4015
Epoch 3, Step 160: Loss = 0.0359
Epoch 3, Step 170: Loss = 0.0941
Epoch 3, Step 180: Loss = 0.7391
Epoch 3, Step 190: Loss = 0.3301
Epoch 3, Step 200: Loss = 0.2222
Epoch 3, Step 210: Loss = 0.2294
Epoch 3, Step 220: Loss = 0.2410
Epoch 3, Step 230: Loss = 0.1176
Epoch 3, Step 240: Loss = 0.0437
Epoch 3, Step 250: Loss = 0.2377
Epoch 3, Step 260: Loss = 0.4675
Epoch 3, Step 270: Loss = 0.2277
Epoch 3, Step 280: Loss = 0.4567
Epoch 3, Step 290: Loss = 0.1460
Epoch 3, Step 300: Lo




Epoch 3 Summary:
Training Metrics:
accuracy: 0.8858
precision: 0.8958
recall: 0.8730
f1: 0.8843
loss: 0.2671

Validation Metrics:
accuracy: 0.8920
precision: 0.8784
recall: 0.9100
f1: 0.8939
loss: 0.2459




Epoch 4, Step 0: Loss = 0.4992
Epoch 4, Step 10: Loss = 0.1418
Epoch 4, Step 20: Loss = 0.4480
Epoch 4, Step 30: Loss = 0.3826
Epoch 4, Step 40: Loss = 0.0542
Epoch 4, Step 50: Loss = 0.3463
Epoch 4, Step 60: Loss = 0.9047
Epoch 4, Step 70: Loss = 0.2004
Epoch 4, Step 80: Loss = 0.3021
Epoch 4, Step 90: Loss = 0.4775
Epoch 4, Step 100: Loss = 0.1293
Epoch 4, Step 110: Loss = 0.3278
Epoch 4, Step 120: Loss = 0.5180
Epoch 4, Step 130: Loss = 0.0804
Epoch 4, Step 140: Loss = 0.1579
Epoch 4, Step 150: Loss = 0.1266
Epoch 4, Step 160: Loss = 0.1312
Epoch 4, Step 170: Loss = 0.5710
Epoch 4, Step 180: Loss = 0.5844
Epoch 4, Step 190: Loss = 0.3246
Epoch 4, Step 200: Loss = 0.4837
Epoch 4, Step 210: Loss = 0.2505
Epoch 4, Step 220: Loss = 0.3108
Epoch 4, Step 230: Loss = 0.2367
Epoch 4, Step 240: Loss = 0.0424
Epoch 4, Step 250: Loss = 0.1142
Epoch 4, Step 260: Loss = 0.5294
Epoch 4, Step 270: Loss = 0.1120
Epoch 4, Step 280: Loss = 0.0589
Epoch 4, Step 290: Loss = 0.2893
Epoch 4, Step 300: Lo




Epoch 4 Summary:
Training Metrics:
accuracy: 0.9048
precision: 0.9120
recall: 0.8960
f1: 0.9039
loss: 0.2250

Validation Metrics:
accuracy: 0.9000
precision: 0.8876
recall: 0.9160
f1: 0.9016
loss: 0.2185




Epoch 5, Step 0: Loss = 0.2840
Epoch 5, Step 10: Loss = 0.1175
Epoch 5, Step 20: Loss = 0.1477
Epoch 5, Step 30: Loss = 0.0236
Epoch 5, Step 40: Loss = 0.2171
Epoch 5, Step 50: Loss = 0.0347
Epoch 5, Step 60: Loss = 0.2353
Epoch 5, Step 70: Loss = 0.1558
Epoch 5, Step 80: Loss = 0.1263
Epoch 5, Step 90: Loss = 0.1659
Epoch 5, Step 100: Loss = 0.0431
Epoch 5, Step 110: Loss = 0.2071
Epoch 5, Step 120: Loss = 0.2685
Epoch 5, Step 130: Loss = 0.1310
Epoch 5, Step 140: Loss = 0.1153
Epoch 5, Step 150: Loss = 0.1445
Epoch 5, Step 160: Loss = 0.0619
Epoch 5, Step 170: Loss = 0.3447
Epoch 5, Step 180: Loss = 0.4257
Epoch 5, Step 190: Loss = 0.1868
Epoch 5, Step 200: Loss = 0.4359
Epoch 5, Step 210: Loss = 0.8335
Epoch 5, Step 220: Loss = 0.4176
Epoch 5, Step 230: Loss = 0.1357
Epoch 5, Step 240: Loss = 0.0614
Epoch 5, Step 250: Loss = 0.1743
Epoch 5, Step 260: Loss = 0.0349
Epoch 5, Step 270: Loss = 0.4068
Epoch 5, Step 280: Loss = 0.3461
Epoch 5, Step 290: Loss = 0.0990
Epoch 5, Step 300: Lo




Epoch 5 Summary:
Training Metrics:
accuracy: 0.9125
precision: 0.9213
recall: 0.9020
f1: 0.9116
loss: 0.2041

Validation Metrics:
accuracy: 0.9060
precision: 0.8980
recall: 0.9160
f1: 0.9069
loss: 0.2061




Epoch 6, Step 0: Loss = 0.0177
Epoch 6, Step 10: Loss = 0.2638
Epoch 6, Step 20: Loss = 0.0607
Epoch 6, Step 30: Loss = 0.0715
Epoch 6, Step 40: Loss = 0.0275
Epoch 6, Step 50: Loss = 0.1655
Epoch 6, Step 60: Loss = 0.0284
Epoch 6, Step 70: Loss = 0.1170
Epoch 6, Step 80: Loss = 0.2197
Epoch 6, Step 90: Loss = 0.0507
Epoch 6, Step 100: Loss = 0.3275
Epoch 6, Step 110: Loss = 0.5148
Epoch 6, Step 120: Loss = 0.0695
Epoch 6, Step 130: Loss = 0.4821
Epoch 6, Step 140: Loss = 0.0750
Epoch 6, Step 150: Loss = 0.0805
Epoch 6, Step 160: Loss = 0.0391
Epoch 6, Step 170: Loss = 0.2336
Epoch 6, Step 180: Loss = 0.3582
Epoch 6, Step 190: Loss = 0.1672
Epoch 6, Step 200: Loss = 0.1362
Epoch 6, Step 210: Loss = 0.1321
Epoch 6, Step 220: Loss = 0.5296
Epoch 6, Step 230: Loss = 0.0876
Epoch 6, Step 240: Loss = 0.2686
Epoch 6, Step 250: Loss = 0.2292
Epoch 6, Step 260: Loss = 0.0997
Epoch 6, Step 270: Loss = 0.3179
Epoch 6, Step 280: Loss = 0.0407
Epoch 6, Step 290: Loss = 0.1414
Epoch 6, Step 300: Lo




Epoch 6 Summary:
Training Metrics:
accuracy: 0.9185
precision: 0.9262
recall: 0.9095
f1: 0.9178
loss: 0.1957

Validation Metrics:
accuracy: 0.9070
precision: 0.8967
recall: 0.9200
f1: 0.9082
loss: 0.2006




Epoch 7, Step 0: Loss = 0.1646
Epoch 7, Step 10: Loss = 0.0346
Epoch 7, Step 20: Loss = 0.1163
Epoch 7, Step 30: Loss = 0.4327
Epoch 7, Step 40: Loss = 0.0602
Epoch 7, Step 50: Loss = 0.2500
Epoch 7, Step 60: Loss = 0.1210
Epoch 7, Step 70: Loss = 0.1555
Epoch 7, Step 80: Loss = 0.0701
Epoch 7, Step 90: Loss = 0.1577
Epoch 7, Step 100: Loss = 0.1935
Epoch 7, Step 110: Loss = 0.2684
Epoch 7, Step 120: Loss = 0.0288
Epoch 7, Step 130: Loss = 0.0467
Epoch 7, Step 140: Loss = 0.0763
Epoch 7, Step 150: Loss = 0.3544
Epoch 7, Step 160: Loss = 0.3598
Epoch 7, Step 170: Loss = 0.0865
Epoch 7, Step 180: Loss = 0.1754
Epoch 7, Step 190: Loss = 0.0677
Epoch 7, Step 200: Loss = 0.0425
Epoch 7, Step 210: Loss = 0.1569
Epoch 7, Step 220: Loss = 0.3503
Epoch 7, Step 230: Loss = 0.0912
Epoch 7, Step 240: Loss = 0.1234
Epoch 7, Step 250: Loss = 0.1022
Epoch 7, Step 260: Loss = 0.2016
Epoch 7, Step 270: Loss = 0.1400
Epoch 7, Step 280: Loss = 0.2066
Epoch 7, Step 290: Loss = 0.1196
Epoch 7, Step 300: Lo




Epoch 7 Summary:
Training Metrics:
accuracy: 0.9207
precision: 0.9300
recall: 0.9100
f1: 0.9199
loss: 0.1914

Validation Metrics:
accuracy: 0.9080
precision: 0.8984
recall: 0.9200
f1: 0.9091
loss: 0.1976




Epoch 8, Step 0: Loss = 0.0652
Epoch 8, Step 10: Loss = 0.2255
Epoch 8, Step 20: Loss = 0.5462
Epoch 8, Step 30: Loss = 0.0218
Epoch 8, Step 40: Loss = 0.1834
Epoch 8, Step 50: Loss = 0.1525
Epoch 8, Step 60: Loss = 0.1369
Epoch 8, Step 70: Loss = 0.1811
Epoch 8, Step 80: Loss = 0.0855
Epoch 8, Step 90: Loss = 0.2218
Epoch 8, Step 100: Loss = 0.0649
Epoch 8, Step 110: Loss = 0.0747
Epoch 8, Step 120: Loss = 0.3312
Epoch 8, Step 130: Loss = 0.1783
Epoch 8, Step 140: Loss = 0.2651
Epoch 8, Step 150: Loss = 0.0732
Epoch 8, Step 160: Loss = 0.2659
Epoch 8, Step 170: Loss = 0.0296
Epoch 8, Step 180: Loss = 0.1694
Epoch 8, Step 190: Loss = 0.0925
Epoch 8, Step 200: Loss = 0.0815
Epoch 8, Step 210: Loss = 0.1976
Epoch 8, Step 220: Loss = 0.1838
Epoch 8, Step 230: Loss = 0.2269
Epoch 8, Step 240: Loss = 0.5805
Epoch 8, Step 250: Loss = 0.1232
Epoch 8, Step 260: Loss = 0.0215
Epoch 8, Step 270: Loss = 0.2170
Epoch 8, Step 280: Loss = 0.1435
Epoch 8, Step 290: Loss = 0.4980
Epoch 8, Step 300: Lo




Epoch 8 Summary:
Training Metrics:
accuracy: 0.9205
precision: 0.9278
recall: 0.9120
f1: 0.9198
loss: 0.1902

Validation Metrics:
accuracy: 0.9090
precision: 0.8986
recall: 0.9220
f1: 0.9102
loss: 0.1962




Epoch 9, Step 0: Loss = 0.5244
Epoch 9, Step 10: Loss = 0.3127
Epoch 9, Step 20: Loss = 0.2639
Epoch 9, Step 30: Loss = 0.2064
Epoch 9, Step 40: Loss = 0.0112
Epoch 9, Step 50: Loss = 0.1878
Epoch 9, Step 60: Loss = 0.0116
Epoch 9, Step 70: Loss = 0.2564
Epoch 9, Step 80: Loss = 0.2378
Epoch 9, Step 90: Loss = 0.3180
Epoch 9, Step 100: Loss = 0.2747
Epoch 9, Step 110: Loss = 0.2552
Epoch 9, Step 120: Loss = 0.3418
Epoch 9, Step 130: Loss = 0.0859
Epoch 9, Step 140: Loss = 0.1040
Epoch 9, Step 150: Loss = 0.2279
Epoch 9, Step 160: Loss = 0.3075
Epoch 9, Step 170: Loss = 0.2711
Epoch 9, Step 180: Loss = 0.0137
Epoch 9, Step 190: Loss = 0.0370
Epoch 9, Step 200: Loss = 0.0512
Epoch 9, Step 210: Loss = 0.2675
Epoch 9, Step 220: Loss = 0.0800
Epoch 9, Step 230: Loss = 0.1220
Epoch 9, Step 240: Loss = 0.2889
Epoch 9, Step 250: Loss = 0.0189
Epoch 9, Step 260: Loss = 0.0091
Epoch 9, Step 270: Loss = 0.0385
Epoch 9, Step 280: Loss = 0.2621
Epoch 9, Step 290: Loss = 0.1430
Epoch 9, Step 300: Lo




Epoch 9 Summary:
Training Metrics:
accuracy: 0.9215
precision: 0.9297
recall: 0.9120
f1: 0.9207
loss: 0.1886

Validation Metrics:
accuracy: 0.9090
precision: 0.8986
recall: 0.9220
f1: 0.9102
loss: 0.1953


In [9]:
import os
import json
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (AutoTokenizer,
    LlamaForSequenceClassification,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import gc
import seaborn as sns
import matplotlib.pyplot as plt

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

class EmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        self.emails_df = emails_df.copy()
        self.emails_df['sender'] = self.emails_df['sender'].apply(clean_text)
        self.emails_df['subject'] = self.emails_df['subject'].apply(clean_text)
        self.emails_df['body'] = self.emails_df['body'].apply(clean_text)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.emails_df)

    def __getitem__(self, idx):
        email = self.emails_df.iloc[idx]
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        
        encoding = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(email['label'], dtype=torch.long)
        }

def clean_text(text):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def setup_model_and_tokenizer(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False
    
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = LlamaForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )
    
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

def compute_metrics(preds, labels):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    conf_matrix = confusion_matrix(labels, preds)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

def plot_confusion_matrix(conf_matrix, output_dir):
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['label'].cpu().numpy())
    
    metrics = compute_metrics(all_preds, all_labels)
    metrics['loss'] = total_loss / len(data_loader)
    return metrics

def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9):
    best_val_metrics = {'f1': 0}
    best_model_state = None
    model = model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_preds = []
        train_labels = []
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            loss.backward()
            
            if (step + 1) % 2 == 0:  # Gradient accumulation steps = 2
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(batch['label'].cpu().numpy())
            
            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")
            
            torch.cuda.empty_cache()
  
        train_metrics = compute_metrics(train_preds, train_labels)
        train_metrics['loss'] = total_loss / len(train_loader)
      
        val_metrics = evaluate_model(model, val_loader, device)
      
        print(f"\nEpoch {epoch + 1} Summary:")
        print("Training Metrics:")
        for metric, value in train_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        print("\nValidation Metrics:")
        for metric, value in val_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        # Save best model
        if val_metrics['f1'] > best_val_metrics['f1']:
            best_val_metrics = val_metrics
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}
        
        torch.cuda.empty_cache()
        gc.collect()
    
    return best_model_state, best_val_metrics

def main():
    login(token=
    device = setup_environment()
    model_name = 'meta-llama/Meta-Llama-3-8B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")
    
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")
    
    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)
    
    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)
    
    train_dataset = EmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = EmailDataset(val_df, tokenizer, max_length=512)
    
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)
    
    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    best_model_state, best_metrics = train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs
    )
    
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama8b_binary_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    plot_confusion_matrix(best_metrics['confusion_matrix'], output_dir)
    
  
    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "best_metrics": {k: float(v) if k != 'confusion_matrix' else v.tolist() 
                        for k, v in best_metrics.items()}
    }
    
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()

Using GPU: NVIDIA RTX A5000
GPU Memory: 23.68 GB


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 1.7160
Epoch 1, Step 10: Loss = 2.9442
Epoch 1, Step 20: Loss = 1.6117
Epoch 1, Step 30: Loss = 1.9146
Epoch 1, Step 40: Loss = 1.2296
Epoch 1, Step 50: Loss = 1.8982
Epoch 1, Step 60: Loss = 1.7840
Epoch 1, Step 70: Loss = 1.1333
Epoch 1, Step 80: Loss = 1.9108
Epoch 1, Step 90: Loss = 1.0153
Epoch 1, Step 100: Loss = 1.2603
Epoch 1, Step 110: Loss = 2.0296
Epoch 1, Step 120: Loss = 0.8160
Epoch 1, Step 130: Loss = 1.6099
Epoch 1, Step 140: Loss = 1.7204
Epoch 1, Step 150: Loss = 1.5328
Epoch 1, Step 160: Loss = 0.8729
Epoch 1, Step 170: Loss = 1.7121
Epoch 1, Step 180: Loss = 1.2583
Epoch 1, Step 190: Loss = 0.6022
Epoch 1, Step 200: Loss = 0.3489
Epoch 1, Step 210: Loss = 3.0040
Epoch 1, Step 220: Loss = 1.1964
Epoch 1, Step 230: Loss = 1.3043
Epoch 1, Step 240: Loss = 1.1198
Epoch 1, Step 250: Loss = 0.8775
Epoch 1, Step 260: Loss = 0.9502
Epoch 1, Step 270: Loss = 0.6216
Epoch 1, Step 280: Loss = 1.0937
Epoch 1, Step 290: Loss = 0.7937
Epoch 1, Step 300: Lo




Epoch 1 Summary:
Training Metrics:
accuracy: 0.5235
precision: 0.5497
recall: 0.2600
f1: 0.3530
loss: 1.3460

Validation Metrics:
accuracy: 0.6640
precision: 0.6934
recall: 0.5880
f1: 0.6364
loss: 0.7628




Epoch 2, Step 0: Loss = 0.7560
Epoch 2, Step 10: Loss = 0.7693
Epoch 2, Step 20: Loss = 0.4761
Epoch 2, Step 30: Loss = 1.1525
Epoch 2, Step 40: Loss = 0.6480
Epoch 2, Step 50: Loss = 0.5539
Epoch 2, Step 60: Loss = 0.3797
Epoch 2, Step 70: Loss = 0.3815
Epoch 2, Step 80: Loss = 0.4731
Epoch 2, Step 90: Loss = 0.1660
Epoch 2, Step 100: Loss = 0.4897
Epoch 2, Step 110: Loss = 0.4512
Epoch 2, Step 120: Loss = 0.2502
Epoch 2, Step 130: Loss = 0.3446
Epoch 2, Step 140: Loss = 0.5971
Epoch 2, Step 150: Loss = 0.4476
Epoch 2, Step 160: Loss = 0.3314
Epoch 2, Step 170: Loss = 0.5730
Epoch 2, Step 180: Loss = 0.2981
Epoch 2, Step 190: Loss = 0.2552
Epoch 2, Step 200: Loss = 0.4009
Epoch 2, Step 210: Loss = 0.4329
Epoch 2, Step 220: Loss = 1.4023
Epoch 2, Step 230: Loss = 0.1422
Epoch 2, Step 240: Loss = 0.1375
Epoch 2, Step 250: Loss = 0.2196
Epoch 2, Step 260: Loss = 0.6730
Epoch 2, Step 270: Loss = 0.1488
Epoch 2, Step 280: Loss = 0.1732
Epoch 2, Step 290: Loss = 0.3078
Epoch 2, Step 300: Lo




Epoch 2 Summary:
Training Metrics:
accuracy: 0.8227
precision: 0.8236
recall: 0.8215
f1: 0.8225
loss: 0.4243

Validation Metrics:
accuracy: 0.8850
precision: 0.8723
recall: 0.9020
f1: 0.8869
loss: 0.3260




Epoch 3, Step 0: Loss = 0.0296
Epoch 3, Step 10: Loss = 0.0589
Epoch 3, Step 20: Loss = 0.0784
Epoch 3, Step 30: Loss = 0.2739
Epoch 3, Step 40: Loss = 0.0501
Epoch 3, Step 50: Loss = 0.2014
Epoch 3, Step 60: Loss = 0.4901
Epoch 3, Step 70: Loss = 1.2105
Epoch 3, Step 80: Loss = 0.1604
Epoch 3, Step 90: Loss = 0.0785
Epoch 3, Step 100: Loss = 0.1149
Epoch 3, Step 110: Loss = 0.2818
Epoch 3, Step 120: Loss = 0.0878
Epoch 3, Step 130: Loss = 0.1841
Epoch 3, Step 140: Loss = 0.1401
Epoch 3, Step 150: Loss = 0.0999
Epoch 3, Step 160: Loss = 0.0363
Epoch 3, Step 170: Loss = 0.8047
Epoch 3, Step 180: Loss = 0.1576
Epoch 3, Step 190: Loss = 0.2243
Epoch 3, Step 200: Loss = 0.3306
Epoch 3, Step 210: Loss = 0.2267
Epoch 3, Step 220: Loss = 0.1665
Epoch 3, Step 230: Loss = 0.0937
Epoch 3, Step 240: Loss = 0.0504
Epoch 3, Step 250: Loss = 0.1032
Epoch 3, Step 260: Loss = 0.0239
Epoch 3, Step 270: Loss = 0.2993
Epoch 3, Step 280: Loss = 0.4538
Epoch 3, Step 290: Loss = 0.0481
Epoch 3, Step 300: Lo




Epoch 3 Summary:
Training Metrics:
accuracy: 0.9097
precision: 0.9047
recall: 0.9160
f1: 0.9103
loss: 0.2375

Validation Metrics:
accuracy: 0.9210
precision: 0.9087
recall: 0.9360
f1: 0.9222
loss: 0.2456




Epoch 4, Step 0: Loss = 0.0875
Epoch 4, Step 10: Loss = 0.0210
Epoch 4, Step 20: Loss = 0.2401
Epoch 4, Step 30: Loss = 0.0605
Epoch 4, Step 40: Loss = 0.2529
Epoch 4, Step 50: Loss = 0.1607
Epoch 4, Step 60: Loss = 0.2264
Epoch 4, Step 70: Loss = 0.2493
Epoch 4, Step 80: Loss = 0.1734
Epoch 4, Step 90: Loss = 0.0605
Epoch 4, Step 100: Loss = 0.1698
Epoch 4, Step 110: Loss = 0.1759
Epoch 4, Step 120: Loss = 0.3050
Epoch 4, Step 130: Loss = 0.2141
Epoch 4, Step 140: Loss = 0.3136
Epoch 4, Step 150: Loss = 0.0253
Epoch 4, Step 160: Loss = 0.3635
Epoch 4, Step 170: Loss = 0.4912
Epoch 4, Step 180: Loss = 0.2475
Epoch 4, Step 190: Loss = 0.0374
Epoch 4, Step 200: Loss = 0.2764
Epoch 4, Step 210: Loss = 0.0247
Epoch 4, Step 220: Loss = 0.3401
Epoch 4, Step 230: Loss = 0.0887
Epoch 4, Step 240: Loss = 0.0229
Epoch 4, Step 250: Loss = 0.0481
Epoch 4, Step 260: Loss = 0.1089
Epoch 4, Step 270: Loss = 0.0139
Epoch 4, Step 280: Loss = 0.0949
Epoch 4, Step 290: Loss = 0.1055
Epoch 4, Step 300: Lo




Epoch 4 Summary:
Training Metrics:
accuracy: 0.9325
precision: 0.9240
recall: 0.9425
f1: 0.9332
loss: 0.1894

Validation Metrics:
accuracy: 0.9290
precision: 0.9133
recall: 0.9480
f1: 0.9303
loss: 0.2145




Epoch 5, Step 0: Loss = 0.4616
Epoch 5, Step 10: Loss = 0.0521
Epoch 5, Step 20: Loss = 0.0421
Epoch 5, Step 30: Loss = 0.8226
Epoch 5, Step 40: Loss = 0.6059
Epoch 5, Step 50: Loss = 0.0190
Epoch 5, Step 60: Loss = 0.1420
Epoch 5, Step 70: Loss = 0.0461
Epoch 5, Step 80: Loss = 0.0615
Epoch 5, Step 90: Loss = 0.0140
Epoch 5, Step 100: Loss = 0.0299
Epoch 5, Step 110: Loss = 0.0342
Epoch 5, Step 120: Loss = 0.0391
Epoch 5, Step 130: Loss = 0.5346
Epoch 5, Step 140: Loss = 0.3167
Epoch 5, Step 150: Loss = 0.0245
Epoch 5, Step 160: Loss = 0.0417
Epoch 5, Step 170: Loss = 0.0164
Epoch 5, Step 180: Loss = 0.0846
Epoch 5, Step 190: Loss = 1.1824
Epoch 5, Step 200: Loss = 0.0761
Epoch 5, Step 210: Loss = 0.0715
Epoch 5, Step 220: Loss = 0.0133
Epoch 5, Step 230: Loss = 0.0256
Epoch 5, Step 240: Loss = 0.1237
Epoch 5, Step 250: Loss = 0.1202
Epoch 5, Step 260: Loss = 0.4167
Epoch 5, Step 270: Loss = 0.0409
Epoch 5, Step 280: Loss = 0.2201
Epoch 5, Step 290: Loss = 0.0160
Epoch 5, Step 300: Lo




Epoch 5 Summary:
Training Metrics:
accuracy: 0.9385
precision: 0.9312
recall: 0.9470
f1: 0.9390
loss: 0.1720

Validation Metrics:
accuracy: 0.9300
precision: 0.9151
recall: 0.9480
f1: 0.9312
loss: 0.2035




Epoch 6, Step 0: Loss = 0.0603
Epoch 6, Step 10: Loss = 0.0830
Epoch 6, Step 20: Loss = 0.0285
Epoch 6, Step 30: Loss = 0.0256
Epoch 6, Step 40: Loss = 0.0521
Epoch 6, Step 50: Loss = 0.2630
Epoch 6, Step 60: Loss = 0.1048
Epoch 6, Step 70: Loss = 0.1136
Epoch 6, Step 80: Loss = 0.0752
Epoch 6, Step 90: Loss = 0.0093
Epoch 6, Step 100: Loss = 0.3118
Epoch 6, Step 110: Loss = 0.1549
Epoch 6, Step 120: Loss = 0.3602
Epoch 6, Step 130: Loss = 0.0214
Epoch 6, Step 140: Loss = 0.0355
Epoch 6, Step 150: Loss = 0.0323
Epoch 6, Step 160: Loss = 0.1068
Epoch 6, Step 170: Loss = 0.0415
Epoch 6, Step 180: Loss = 0.0162
Epoch 6, Step 190: Loss = 0.0177
Epoch 6, Step 200: Loss = 0.2701
Epoch 6, Step 210: Loss = 0.3086
Epoch 6, Step 220: Loss = 0.3561
Epoch 6, Step 230: Loss = 0.5186
Epoch 6, Step 240: Loss = 0.0574
Epoch 6, Step 250: Loss = 0.0512
Epoch 6, Step 260: Loss = 0.0367
Epoch 6, Step 270: Loss = 0.1612
Epoch 6, Step 280: Loss = 0.6101
Epoch 6, Step 290: Loss = 0.6731
Epoch 6, Step 300: Lo




Epoch 6 Summary:
Training Metrics:
accuracy: 0.9417
precision: 0.9372
recall: 0.9470
f1: 0.9421
loss: 0.1644

Validation Metrics:
accuracy: 0.9330
precision: 0.9188
recall: 0.9500
f1: 0.9341
loss: 0.1986




Epoch 7, Step 0: Loss = 0.3467
Epoch 7, Step 10: Loss = 0.6382
Epoch 7, Step 20: Loss = 0.0456
Epoch 7, Step 30: Loss = 0.0727
Epoch 7, Step 40: Loss = 0.2743
Epoch 7, Step 50: Loss = 0.2281
Epoch 7, Step 60: Loss = 0.0490
Epoch 7, Step 70: Loss = 0.0299
Epoch 7, Step 80: Loss = 0.0896
Epoch 7, Step 90: Loss = 0.0379
Epoch 7, Step 100: Loss = 0.1629
Epoch 7, Step 110: Loss = 0.1489
Epoch 7, Step 120: Loss = 0.0159
Epoch 7, Step 130: Loss = 0.3015
Epoch 7, Step 140: Loss = 0.0601
Epoch 7, Step 150: Loss = 0.0585
Epoch 7, Step 160: Loss = 0.4882
Epoch 7, Step 170: Loss = 0.0245
Epoch 7, Step 180: Loss = 0.4731
Epoch 7, Step 190: Loss = 0.0462
Epoch 7, Step 200: Loss = 0.5094
Epoch 7, Step 210: Loss = 0.0592
Epoch 7, Step 220: Loss = 0.0113
Epoch 7, Step 230: Loss = 0.8364
Epoch 7, Step 240: Loss = 0.0385
Epoch 7, Step 250: Loss = 0.5789
Epoch 7, Step 260: Loss = 0.0605
Epoch 7, Step 270: Loss = 0.0926
Epoch 7, Step 280: Loss = 0.0730
Epoch 7, Step 290: Loss = 0.0913
Epoch 7, Step 300: Lo




Epoch 7 Summary:
Training Metrics:
accuracy: 0.9437
precision: 0.9404
recall: 0.9475
f1: 0.9440
loss: 0.1594

Validation Metrics:
accuracy: 0.9340
precision: 0.9205
recall: 0.9500
f1: 0.9350
loss: 0.1958




Epoch 8, Step 0: Loss = 0.2702
Epoch 8, Step 10: Loss = 0.0108
Epoch 8, Step 20: Loss = 0.0666
Epoch 8, Step 30: Loss = 0.0223
Epoch 8, Step 40: Loss = 0.0295
Epoch 8, Step 50: Loss = 0.1812
Epoch 8, Step 60: Loss = 0.3145
Epoch 8, Step 70: Loss = 0.0597
Epoch 8, Step 80: Loss = 0.0207
Epoch 8, Step 90: Loss = 0.0042
Epoch 8, Step 100: Loss = 0.1102
Epoch 8, Step 110: Loss = 0.0725
Epoch 8, Step 120: Loss = 0.5240
Epoch 8, Step 130: Loss = 0.0180
Epoch 8, Step 140: Loss = 0.1036
Epoch 8, Step 150: Loss = 0.1882
Epoch 8, Step 160: Loss = 0.3888
Epoch 8, Step 170: Loss = 0.0287
Epoch 8, Step 180: Loss = 0.2654
Epoch 8, Step 190: Loss = 0.0583
Epoch 8, Step 200: Loss = 0.0428
Epoch 8, Step 210: Loss = 0.0184
Epoch 8, Step 220: Loss = 0.0221
Epoch 8, Step 230: Loss = 0.0024
Epoch 8, Step 240: Loss = 0.0189
Epoch 8, Step 250: Loss = 0.0878
Epoch 8, Step 260: Loss = 0.3123
Epoch 8, Step 270: Loss = 0.2578
Epoch 8, Step 280: Loss = 0.0055
Epoch 8, Step 290: Loss = 0.0778
Epoch 8, Step 300: Lo




Epoch 8 Summary:
Training Metrics:
accuracy: 0.9450
precision: 0.9419
recall: 0.9485
f1: 0.9452
loss: 0.1583

Validation Metrics:
accuracy: 0.9320
precision: 0.9170
recall: 0.9500
f1: 0.9332
loss: 0.1945




Epoch 9, Step 0: Loss = 0.1577
Epoch 9, Step 10: Loss = 0.4113
Epoch 9, Step 20: Loss = 0.8369
Epoch 9, Step 30: Loss = 0.0295
Epoch 9, Step 40: Loss = 0.3583
Epoch 9, Step 50: Loss = 0.0446
Epoch 9, Step 60: Loss = 0.2088
Epoch 9, Step 70: Loss = 0.0130
Epoch 9, Step 80: Loss = 0.1815
Epoch 9, Step 90: Loss = 0.0147
Epoch 9, Step 100: Loss = 0.3995
Epoch 9, Step 110: Loss = 0.0241
Epoch 9, Step 120: Loss = 0.0483
Epoch 9, Step 130: Loss = 0.0666
Epoch 9, Step 140: Loss = 0.4397
Epoch 9, Step 150: Loss = 0.0398
Epoch 9, Step 160: Loss = 0.7025
Epoch 9, Step 170: Loss = 0.4561
Epoch 9, Step 180: Loss = 0.2175
Epoch 9, Step 190: Loss = 0.0099
Epoch 9, Step 200: Loss = 0.0454
Epoch 9, Step 210: Loss = 0.0854
Epoch 9, Step 220: Loss = 0.2663
Epoch 9, Step 230: Loss = 0.1689
Epoch 9, Step 240: Loss = 0.0602
Epoch 9, Step 250: Loss = 0.0813
Epoch 9, Step 260: Loss = 0.0792
Epoch 9, Step 270: Loss = 0.0653
Epoch 9, Step 280: Loss = 0.0273
Epoch 9, Step 290: Loss = 0.1192
Epoch 9, Step 300: Lo




Epoch 9 Summary:
Training Metrics:
accuracy: 0.9437
precision: 0.9413
recall: 0.9465
f1: 0.9439
loss: 0.1564

Validation Metrics:
accuracy: 0.9330
precision: 0.9188
recall: 0.9500
f1: 0.9341
loss: 0.1939


In [3]:
import os
import json
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (AutoTokenizer,AutoModelForSequenceClassification,
    LlamaForSequenceClassification,
    LlamaTokenizer,
                           BitsAndBytesConfig,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import gc
import seaborn as sns
import matplotlib.pyplot as plt

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

class EmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        self.emails_df = emails_df.copy()
        self.emails_df['sender'] = self.emails_df['sender'].apply(clean_text)
        self.emails_df['subject'] = self.emails_df['subject'].apply(clean_text)
        self.emails_df['body'] = self.emails_df['body'].apply(clean_text)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.emails_df)

    def __getitem__(self, idx):
        email = self.emails_df.iloc[idx]
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        
        encoding = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(email['label'], dtype=torch.long)
        }

def clean_text(text):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def setup_model_and_tokenizer(model_name, device):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    model_config.pad_token_id = tokenizer.pad_token_id
    model_config.use_cache = False
    
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=model_config,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config
    )
    
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(base_model, lora_config)
    model.gradient_checkpointing_enable()
    return model, tokenizer

def compute_metrics(preds, labels):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    conf_matrix = confusion_matrix(labels, preds)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

def plot_confusion_matrix(conf_matrix, output_dir):
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['label'].cpu().numpy())
    
    metrics = compute_metrics(all_preds, all_labels)
    metrics['loss'] = total_loss / len(data_loader)
    return metrics

def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=9):
    best_val_metrics = {'f1': 0}
    best_model_state = None
    model = model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_preds = []
        train_labels = []
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
            
            loss = outputs.loss
            loss.backward()
            
            if (step + 1) % 2 == 0:  # Gradient accumulation steps = 2
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(batch['label'].cpu().numpy())
            
            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")
            
            torch.cuda.empty_cache()
  
        train_metrics = compute_metrics(train_preds, train_labels)
        train_metrics['loss'] = total_loss / len(train_loader)
      
        val_metrics = evaluate_model(model, val_loader, device)
      
        print(f"\nEpoch {epoch + 1} Summary:")
        print("Training Metrics:")
        for metric, value in train_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        print("\nValidation Metrics:")
        for metric, value in val_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        # Save best model
        if val_metrics['f1'] > best_val_metrics['f1']:
            best_val_metrics = val_metrics
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}
        
        torch.cuda.empty_cache()
        gc.collect()
    
    return best_model_state, best_val_metrics

def main():
    login(token=
    device = setup_environment()
    model_name = 'dreamgen/WizardLM-2-7B'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")
    
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")
    
    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)
    
    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)
    
    train_dataset = EmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = EmailDataset(val_df, tokenizer, max_length=512)
    
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, num_workers=2, pin_memory=True)
    
    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
    num_epochs = 9
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    best_model_state, best_metrics = train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs
    )
    
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/wizard8b_binary_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    plot_confusion_matrix(best_metrics['confusion_matrix'], output_dir)
    
  
    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 8,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "best_metrics": {k: float(v) if k != 'confusion_matrix' else v.tolist() 
                        for k, v in best_metrics.items()}
    }
    
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()

Using GPU: NVIDIA RTX A5000
GPU Memory: 23.68 GB


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 3.1648
Epoch 1, Step 10: Loss = 1.1832
Epoch 1, Step 20: Loss = 0.0461
Epoch 1, Step 30: Loss = 0.6317
Epoch 1, Step 40: Loss = 0.6844
Epoch 1, Step 50: Loss = 4.5570
Epoch 1, Step 60: Loss = 0.5165
Epoch 1, Step 70: Loss = 3.0345
Epoch 1, Step 80: Loss = 0.6065
Epoch 1, Step 90: Loss = 1.2399
Epoch 1, Step 100: Loss = 1.1913
Epoch 1, Step 110: Loss = 0.7395
Epoch 1, Step 120: Loss = 0.0087
Epoch 1, Step 130: Loss = 2.8056
Epoch 1, Step 140: Loss = 0.3464
Epoch 1, Step 150: Loss = 0.2242
Epoch 1, Step 160: Loss = 0.6680
Epoch 1, Step 170: Loss = 1.2313
Epoch 1, Step 180: Loss = 1.0059
Epoch 1, Step 190: Loss = 0.5259
Epoch 1, Step 200: Loss = 1.1587
Epoch 1, Step 210: Loss = 2.7515
Epoch 1, Step 220: Loss = 4.0594
Epoch 1, Step 230: Loss = 2.3415
Epoch 1, Step 240: Loss = 1.2338
Epoch 1, Step 250: Loss = 2.5145
Epoch 1, Step 260: Loss = 0.7798
Epoch 1, Step 270: Loss = 0.1291
Epoch 1, Step 280: Loss = 0.3329
Epoch 1, Step 290: Loss = 5.1360
Epoch 1, Step 300: Lo




Epoch 1 Summary:
Training Metrics:
accuracy: 0.7282
precision: 0.7359
recall: 0.7120
f1: 0.7238
loss: 1.6450

Validation Metrics:
accuracy: 0.7520
precision: 0.7614
recall: 0.7340
f1: 0.7475
loss: 1.5600




Epoch 2, Step 0: Loss = 0.6680
Epoch 2, Step 10: Loss = 5.3790
Epoch 2, Step 20: Loss = 2.0410
Epoch 2, Step 30: Loss = 0.3160
Epoch 2, Step 40: Loss = 0.7476
Epoch 2, Step 50: Loss = 1.2208
Epoch 2, Step 60: Loss = 2.3000
Epoch 2, Step 70: Loss = 0.1875
Epoch 2, Step 80: Loss = 0.0956
Epoch 2, Step 90: Loss = 1.4404
Epoch 2, Step 100: Loss = 0.5797
Epoch 2, Step 110: Loss = 0.9452
Epoch 2, Step 120: Loss = 0.0022
Epoch 2, Step 130: Loss = 1.6293
Epoch 2, Step 140: Loss = 1.2620
Epoch 2, Step 150: Loss = 0.0062
Epoch 2, Step 160: Loss = 0.7824
Epoch 2, Step 170: Loss = 2.5513
Epoch 2, Step 180: Loss = 1.5934
Epoch 2, Step 190: Loss = 0.2229
Epoch 2, Step 200: Loss = 1.0765
Epoch 2, Step 210: Loss = 0.9237
Epoch 2, Step 220: Loss = 0.3269
Epoch 2, Step 230: Loss = 0.3030
Epoch 2, Step 240: Loss = 0.7473
Epoch 2, Step 250: Loss = 1.0516
Epoch 2, Step 260: Loss = 0.0087
Epoch 2, Step 270: Loss = 0.9324
Epoch 2, Step 280: Loss = 2.4138
Epoch 2, Step 290: Loss = 1.5012
Epoch 2, Step 300: Lo




Epoch 2 Summary:
Training Metrics:
accuracy: 0.7725
precision: 0.7806
recall: 0.7580
f1: 0.7692
loss: 1.3109

Validation Metrics:
accuracy: 0.7950
precision: 0.8004
recall: 0.7860
f1: 0.7931
loss: 1.2643




Epoch 3, Step 0: Loss = 1.9813
Epoch 3, Step 10: Loss = 1.4896
Epoch 3, Step 20: Loss = 1.3373
Epoch 3, Step 30: Loss = 0.4250
Epoch 3, Step 40: Loss = 3.3058
Epoch 3, Step 50: Loss = 0.5395
Epoch 3, Step 60: Loss = 0.0563
Epoch 3, Step 70: Loss = 0.2794
Epoch 3, Step 80: Loss = 1.1111
Epoch 3, Step 90: Loss = 0.6751
Epoch 3, Step 100: Loss = 0.4981
Epoch 3, Step 110: Loss = 3.0836
Epoch 3, Step 120: Loss = 0.0008
Epoch 3, Step 130: Loss = 0.0072
Epoch 3, Step 140: Loss = 1.8848
Epoch 3, Step 150: Loss = 0.2924
Epoch 3, Step 160: Loss = 1.2520
Epoch 3, Step 170: Loss = 0.6967
Epoch 3, Step 180: Loss = 0.0687
Epoch 3, Step 190: Loss = 1.2162
Epoch 3, Step 200: Loss = 0.9944
Epoch 3, Step 210: Loss = 1.4511
Epoch 3, Step 220: Loss = 0.4858
Epoch 3, Step 230: Loss = 0.7678
Epoch 3, Step 240: Loss = 3.5716
Epoch 3, Step 250: Loss = 0.0038
Epoch 3, Step 260: Loss = 0.7577
Epoch 3, Step 270: Loss = 0.3862
Epoch 3, Step 280: Loss = 0.3920
Epoch 3, Step 290: Loss = 0.2247
Epoch 3, Step 300: Lo




Epoch 3 Summary:
Training Metrics:
accuracy: 0.8090
precision: 0.8150
recall: 0.7995
f1: 0.8072
loss: 1.0917

Validation Metrics:
accuracy: 0.8150
precision: 0.8182
recall: 0.8100
f1: 0.8141
loss: 1.1312




Epoch 4, Step 0: Loss = 0.0235
Epoch 4, Step 10: Loss = 0.0675
Epoch 4, Step 20: Loss = 1.7205
Epoch 4, Step 30: Loss = 0.0070
Epoch 4, Step 40: Loss = 1.5523
Epoch 4, Step 50: Loss = 0.0353
Epoch 4, Step 60: Loss = 1.0944
Epoch 4, Step 70: Loss = 0.1157
Epoch 4, Step 80: Loss = 0.0001
Epoch 4, Step 90: Loss = 3.7725
Epoch 4, Step 100: Loss = 0.1539
Epoch 4, Step 110: Loss = 1.3683
Epoch 4, Step 120: Loss = 0.2842
Epoch 4, Step 130: Loss = 0.5060
Epoch 4, Step 140: Loss = 1.1001
Epoch 4, Step 150: Loss = 1.8131
Epoch 4, Step 160: Loss = 0.0000
Epoch 4, Step 170: Loss = 0.2027
Epoch 4, Step 180: Loss = 0.8934
Epoch 4, Step 190: Loss = 0.0274
Epoch 4, Step 200: Loss = 2.2252
Epoch 4, Step 210: Loss = 2.7705
Epoch 4, Step 220: Loss = 0.0204
Epoch 4, Step 230: Loss = 0.7386
Epoch 4, Step 240: Loss = 1.1641
Epoch 4, Step 250: Loss = 0.6545
Epoch 4, Step 260: Loss = 1.7471
Epoch 4, Step 270: Loss = 0.9409
Epoch 4, Step 280: Loss = 0.5092
Epoch 4, Step 290: Loss = 3.3073
Epoch 4, Step 300: Lo




Epoch 4 Summary:
Training Metrics:
accuracy: 0.8277
precision: 0.8329
recall: 0.8200
f1: 0.8264
loss: 0.9861

Validation Metrics:
accuracy: 0.8250
precision: 0.8283
recall: 0.8200
f1: 0.8241
loss: 1.0580




Epoch 5, Step 0: Loss = 0.0252
Epoch 5, Step 10: Loss = 0.9990
Epoch 5, Step 20: Loss = 0.2506
Epoch 5, Step 30: Loss = 0.2359
Epoch 5, Step 40: Loss = 0.5984
Epoch 5, Step 50: Loss = 2.0826
Epoch 5, Step 60: Loss = 0.2964
Epoch 5, Step 70: Loss = 2.6683
Epoch 5, Step 80: Loss = 1.2632
Epoch 5, Step 90: Loss = 0.0204
Epoch 5, Step 100: Loss = 0.4017
Epoch 5, Step 110: Loss = 0.3554
Epoch 5, Step 120: Loss = 0.0049
Epoch 5, Step 130: Loss = 0.2204
Epoch 5, Step 140: Loss = 0.7600
Epoch 5, Step 150: Loss = 4.6856
Epoch 5, Step 160: Loss = 0.7785
Epoch 5, Step 170: Loss = 2.0710
Epoch 5, Step 180: Loss = 0.7506
Epoch 5, Step 190: Loss = 0.0309
Epoch 5, Step 200: Loss = 0.0668
Epoch 5, Step 210: Loss = 1.4275
Epoch 5, Step 220: Loss = 3.5564
Epoch 5, Step 230: Loss = 0.0254
Epoch 5, Step 240: Loss = 2.2915
Epoch 5, Step 250: Loss = 3.4798
Epoch 5, Step 260: Loss = 2.8224
Epoch 5, Step 270: Loss = 4.6205
Epoch 5, Step 280: Loss = 5.9182
Epoch 5, Step 290: Loss = 0.4326
Epoch 5, Step 300: Lo




Epoch 5 Summary:
Training Metrics:
accuracy: 0.8327
precision: 0.8363
recall: 0.8275
f1: 0.8319
loss: 0.9378

Validation Metrics:
accuracy: 0.8310
precision: 0.8330
recall: 0.8280
f1: 0.8305
loss: 1.0209




Epoch 6, Step 0: Loss = 0.0105
Epoch 6, Step 10: Loss = 1.4917
Epoch 6, Step 20: Loss = 1.4822
Epoch 6, Step 30: Loss = 0.0328
Epoch 6, Step 40: Loss = 1.8999
Epoch 6, Step 50: Loss = 2.8694
Epoch 6, Step 60: Loss = 1.4722
Epoch 6, Step 70: Loss = 0.0027
Epoch 6, Step 80: Loss = 0.1290
Epoch 6, Step 90: Loss = 0.1732
Epoch 6, Step 100: Loss = 0.4670
Epoch 6, Step 110: Loss = 0.0830
Epoch 6, Step 120: Loss = 2.3553
Epoch 6, Step 130: Loss = 0.4628
Epoch 6, Step 140: Loss = 1.5506
Epoch 6, Step 150: Loss = 0.7670
Epoch 6, Step 160: Loss = 0.5367
Epoch 6, Step 170: Loss = 0.8829
Epoch 6, Step 180: Loss = 0.0512
Epoch 6, Step 190: Loss = 0.0002
Epoch 6, Step 200: Loss = 0.3914
Epoch 6, Step 210: Loss = 0.9976
Epoch 6, Step 220: Loss = 0.9878
Epoch 6, Step 230: Loss = 0.5310
Epoch 6, Step 240: Loss = 3.6658
Epoch 6, Step 250: Loss = 4.5252
Epoch 6, Step 260: Loss = 0.7195
Epoch 6, Step 270: Loss = 1.6892
Epoch 6, Step 280: Loss = 0.9117
Epoch 6, Step 290: Loss = 1.9796
Epoch 6, Step 300: Lo




Epoch 6 Summary:
Training Metrics:
accuracy: 0.8373
precision: 0.8412
recall: 0.8315
f1: 0.8363
loss: 0.9063

Validation Metrics:
accuracy: 0.8350
precision: 0.8370
recall: 0.8320
f1: 0.8345
loss: 1.0031




Epoch 7, Step 0: Loss = 1.4759
Epoch 7, Step 10: Loss = 1.0453
Epoch 7, Step 20: Loss = 1.0154
Epoch 7, Step 30: Loss = 0.1179
Epoch 7, Step 40: Loss = 2.3122
Epoch 7, Step 50: Loss = 0.4518
Epoch 7, Step 60: Loss = 0.7367
Epoch 7, Step 70: Loss = 0.0109
Epoch 7, Step 80: Loss = 0.0075
Epoch 7, Step 90: Loss = 0.0212
Epoch 7, Step 100: Loss = 0.0852
Epoch 7, Step 110: Loss = 0.1691
Epoch 7, Step 120: Loss = 0.0352
Epoch 7, Step 130: Loss = 2.2136
Epoch 7, Step 140: Loss = 0.8398
Epoch 7, Step 150: Loss = 2.2052
Epoch 7, Step 160: Loss = 0.0096
Epoch 7, Step 170: Loss = 0.0016
Epoch 7, Step 180: Loss = 0.5589
Epoch 7, Step 190: Loss = 1.2883
Epoch 7, Step 200: Loss = 2.7291
Epoch 7, Step 210: Loss = 0.0200
Epoch 7, Step 220: Loss = 0.7748
Epoch 7, Step 230: Loss = 0.0274
Epoch 7, Step 240: Loss = 0.0118
Epoch 7, Step 250: Loss = 0.4291
Epoch 7, Step 260: Loss = 3.4669
Epoch 7, Step 270: Loss = 2.6309
Epoch 7, Step 280: Loss = 0.1900
Epoch 7, Step 290: Loss = 1.2725
Epoch 7, Step 300: Lo




Epoch 7 Summary:
Training Metrics:
accuracy: 0.8445
precision: 0.8480
recall: 0.8395
f1: 0.8437
loss: 0.9052

Validation Metrics:
accuracy: 0.8370
precision: 0.8377
recall: 0.8360
f1: 0.8368
loss: 0.9929




Epoch 8, Step 0: Loss = 0.0014
Epoch 8, Step 10: Loss = 0.0007
Epoch 8, Step 20: Loss = 0.4336
Epoch 8, Step 30: Loss = 0.2674
Epoch 8, Step 40: Loss = 0.0563
Epoch 8, Step 50: Loss = 1.7358
Epoch 8, Step 60: Loss = 0.1541
Epoch 8, Step 70: Loss = 0.2670
Epoch 8, Step 80: Loss = 4.8683
Epoch 8, Step 90: Loss = 0.7329
Epoch 8, Step 100: Loss = 1.0300
Epoch 8, Step 110: Loss = 0.0068
Epoch 8, Step 120: Loss = 1.6105
Epoch 8, Step 130: Loss = 2.3194
Epoch 8, Step 140: Loss = 0.2140
Epoch 8, Step 150: Loss = 1.3234
Epoch 8, Step 160: Loss = 0.0667
Epoch 8, Step 170: Loss = 2.4458
Epoch 8, Step 180: Loss = 0.8809
Epoch 8, Step 190: Loss = 0.7185
Epoch 8, Step 200: Loss = 0.2951
Epoch 8, Step 210: Loss = 0.9569
Epoch 8, Step 220: Loss = 0.0390
Epoch 8, Step 230: Loss = 0.4283
Epoch 8, Step 240: Loss = 2.7035
Epoch 8, Step 250: Loss = 1.0798
Epoch 8, Step 260: Loss = 0.0638
Epoch 8, Step 270: Loss = 0.4754
Epoch 8, Step 280: Loss = 1.8602
Epoch 8, Step 290: Loss = 1.2391
Epoch 8, Step 300: Lo




Epoch 8 Summary:
Training Metrics:
accuracy: 0.8430
precision: 0.8458
recall: 0.8390
f1: 0.8424
loss: 0.8889

Validation Metrics:
accuracy: 0.8370
precision: 0.8377
recall: 0.8360
f1: 0.8368
loss: 0.9870




Epoch 9, Step 0: Loss = 2.5176
Epoch 9, Step 10: Loss = 0.3947
Epoch 9, Step 20: Loss = 0.0217
Epoch 9, Step 30: Loss = 0.8879
Epoch 9, Step 40: Loss = 1.2829
Epoch 9, Step 50: Loss = 0.0131
Epoch 9, Step 60: Loss = 3.5580
Epoch 9, Step 70: Loss = 0.0556
Epoch 9, Step 80: Loss = 0.0010
Epoch 9, Step 90: Loss = 0.6217
Epoch 9, Step 100: Loss = 0.2274
Epoch 9, Step 110: Loss = 1.0076
Epoch 9, Step 120: Loss = 0.3086
Epoch 9, Step 130: Loss = 0.2957
Epoch 9, Step 140: Loss = 2.1750
Epoch 9, Step 150: Loss = 1.9072
Epoch 9, Step 160: Loss = 1.6052
Epoch 9, Step 170: Loss = 0.0998
Epoch 9, Step 180: Loss = 1.3741
Epoch 9, Step 190: Loss = 0.0251
Epoch 9, Step 200: Loss = 1.7213
Epoch 9, Step 210: Loss = 1.2618
Epoch 9, Step 220: Loss = 0.0000
Epoch 9, Step 230: Loss = 0.1913
Epoch 9, Step 240: Loss = 0.9177
Epoch 9, Step 250: Loss = 0.4012
Epoch 9, Step 260: Loss = 1.3685
Epoch 9, Step 270: Loss = 0.0023
Epoch 9, Step 280: Loss = 0.9741
Epoch 9, Step 290: Loss = 0.0424
Epoch 9, Step 300: Lo




Epoch 9 Summary:
Training Metrics:
accuracy: 0.8468
precision: 0.8508
recall: 0.8410
f1: 0.8459
loss: 0.8753

Validation Metrics:
accuracy: 0.8370
precision: 0.8377
recall: 0.8360
f1: 0.8368
loss: 0.9835


In [3]:
import os
import json
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
    AutoConfig
)
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch.nn.functional as F
from huggingface_hub import login
import bitsandbytes as bnb
import re
import gc


def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

class EmailDataset(Dataset):
    def __init__(self, emails_df, tokenizer, max_length=512):
        self.emails_df = emails_df.copy()
        self.emails_df['sender'] = self.emails_df['sender'].apply(clean_text)
        self.emails_df['subject'] = self.emails_df['subject'].apply(clean_text)
        self.emails_df['body'] = self.emails_df['body'].apply(clean_text)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.emails_df)

    def __getitem__(self, idx):
        email = self.emails_df.iloc[idx]
        input_text = f"Sender: {email['sender']} [SEP] Subject: {email['subject']} [SEP] {email['body']}"
        
        encoding = self.tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(email['label'], dtype=torch.long)
        }

def clean_text(text):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def setup_model_and_tokenizer(model_name, device):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    
    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 2
    
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        config=model_config
    )
    

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["query", "value"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(model, lora_config)
    
    return model, tokenizer

def compute_metrics(preds, labels):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    conf_matrix = confusion_matrix(labels, preds)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['label']
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['label'].cpu().numpy())
    
    metrics = compute_metrics(all_preds, all_labels)
    metrics['loss'] = total_loss / len(data_loader)
    return metrics

def train_model(model, train_loader, val_loader, optimizer, scheduler, device, num_epochs=5):
    best_val_metrics = {'f1': 0}
    best_model_state = None
    model = model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_preds = []
        train_labels = []
        
        for step, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['label']
            )
            
            loss = outputs.loss
            loss.backward()
            
            if (step + 1) % 2 == 0:  # Gradient accumulation steps = 2
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(batch['label'].cpu().numpy())
            
            if step % 10 == 0:
                print(f"Epoch {epoch+1}, Step {step}: Loss = {loss.item():.4f}")
            
            torch.cuda.empty_cache()
  
        train_metrics = compute_metrics(train_preds, train_labels)
        train_metrics['loss'] = total_loss / len(train_loader)
      
        val_metrics = evaluate_model(model, val_loader, device)
      
        print(f"\nEpoch {epoch + 1} Summary:")
        print("Training Metrics:")
        for metric, value in train_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        print("\nValidation Metrics:")
        for metric, value in val_metrics.items():
            if metric != 'confusion_matrix':
                print(f"{metric}: {value:.4f}")
        
        if val_metrics['f1'] > best_val_metrics['f1']:
            best_val_metrics = val_metrics
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}
        
        torch.cuda.empty_cache()
        gc.collect()
    
    return best_model_state, best_val_metrics

def main():
    login(token=
    device = setup_environment()
    model_name = 'bert-base-uncased'
    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")
    
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")
    
    model, tokenizer = setup_model_and_tokenizer(model_name, device)
    emails_df = pd.read_csv(data_path)
    
    train_df, val_df = train_test_split(emails_df, test_size=0.2, stratify=emails_df['label'], random_state=42)
    
    train_dataset = EmailDataset(train_df, tokenizer, max_length=512)
    val_dataset = EmailDataset(val_df, tokenizer, max_length=512)
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=16, num_workers=2, pin_memory=True)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)  
    num_epochs = 5
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = num_training_steps // 20
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    best_model_state, best_metrics = train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs
    )
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/bert_binary_classification_model")
    os.makedirs(output_dir, exist_ok=True)
    
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    
    config = {
        "model_name": model_name,
        "num_epochs": num_epochs,
        "learning_rate": 2e-5,
        "batch_size": 16,
        "max_length": 512,
        "warmup_steps": num_warmup_steps,
        "total_steps": num_training_steps,
        "device": str(device),
        "best_metrics": {k: float(v) if k != 'confusion_matrix' else v.tolist() 
                        for k, v in best_metrics.items()}
    }
    
    with open(os.path.join(output_dir, "training_config.json"), "w") as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    main()

Using GPU: NVIDIA RTX A5000
GPU Memory: 23.68 GB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Step 0: Loss = 0.7362
Epoch 1, Step 10: Loss = 0.8049
Epoch 1, Step 20: Loss = 0.7581
Epoch 1, Step 30: Loss = 0.7583
Epoch 1, Step 40: Loss = 0.6847
Epoch 1, Step 50: Loss = 0.7262
Epoch 1, Step 60: Loss = 0.7710
Epoch 1, Step 70: Loss = 0.6936
Epoch 1, Step 80: Loss = 0.6389
Epoch 1, Step 90: Loss = 0.6876
Epoch 1, Step 100: Loss = 0.7184
Epoch 1, Step 110: Loss = 0.7079
Epoch 1, Step 120: Loss = 0.6588
Epoch 1, Step 130: Loss = 0.7551
Epoch 1, Step 140: Loss = 0.6582
Epoch 1, Step 150: Loss = 0.7095
Epoch 1, Step 160: Loss = 0.6176
Epoch 1, Step 170: Loss = 0.6758
Epoch 1, Step 180: Loss = 0.7016
Epoch 1, Step 190: Loss = 0.6051
Epoch 1, Step 200: Loss = 0.6323
Epoch 1, Step 210: Loss = 0.6035
Epoch 1, Step 220: Loss = 0.6091
Epoch 1, Step 230: Loss = 0.5587
Epoch 1, Step 240: Loss = 0.5516

Epoch 1 Summary:
Training Metrics:
accuracy: 0.5992
precision: 0.6195
recall: 0.5145
f1: 0.5621
loss: 0.6784

Validation Metrics:
accuracy: 0.8950
precision: 0.8791
recall: 0.9160
f1: 0

In [4]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import LlamaForSequenceClassification, LlamaTokenizer
from huggingface_hub import login
import re

def setup_environment():
    """Set up GPU/CPU environment and optimize CUDA settings."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'  # Set GPU device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if torch.cuda.is_available():
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        print("Using CPU")

    return device

def clean_text(text):
    """Clean and normalize text data."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  # Convert to lowercase and remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

def load_model(model_dir, device):
    """Load fine-tuned LLaMA model and tokenizer from local directory."""
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = LlamaForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = LlamaTokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token
    print("Model loaded successfully!")
    return model, tokenizer

def predict_email(model, tokenizer, email_df, device, max_length=512):
    """Predict whether an email is Ham (0) or Phishing (1) with confidence scores."""
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

    
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token="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") 
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_binary_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject', 'Email'}
    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "lama7b_predictions.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/lama7b_binary_classification_model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           1   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           1   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           1   
4  <p><strong>The account number associated with ...   Phishing           1   

   confidence_score  
0          

In [6]:
 file = pd.read_csv("lama7b_predictions/lama7b_predictions.csv")

In [7]:
file

Unnamed: 0,Sender,Subject,Email,Email_type,prediction,confidence_score
0,noreply@powerballs.com,You Have Won!,<p>*********PLEASE DO NOT RESPOND TO THIS EMAI...,Phishing,1,0.795860
1,noreply@paypalceo.com,PayPal Breach,<p>********* RESPONES TO THIS EMAIL WILL NOT B...,Phishing,1,0.913134
2,support@credit.chase.com,URGENT: Fraudulent activity detected,"<p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...",Phishing,1,0.805710
3,mary@yahoo.com,Donations needed for Mark,"<p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...",Phishing,1,0.886202
4,support@security.amazon.com,Your Amazon Account,<p><strong>The account number associated with ...,Phishing,1,0.834758
...,...,...,...,...,...,...
236,no-reply@yahoo.com,Password change for your Yahoo account,"Hi Ethan,<br><br>The password for your Yahoo a...",ham,1,0.887091
237,communications@em.aetna.com,Protect your health records on your Aetna memb...,"<h2 align=""center"">Protecting your personal in...",ham,1,0.895285
238,no-reply@dropboxmail.com,jacab invited you to check out Dropbox,"Hi there,<br><p>Jacob (jacob14@gmail.com) thin...",ham,1,0.793260
239,help-check@human.resource.com,An important email to respond,<p>Hello!</p><p>This email is used for <b>atte...,ham,1,0.847476


In [1]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoTokenizer
from huggingface_hub import login
import re

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

def clean_text(text):
    """Clean and normalize text data."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  
    text = re.sub(r'\s+', ' ', text)  
    return text

def load_model(model_dir, device):
    
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = LlamaForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token
    print("Model loaded successfully!")
    return model, tokenizer

def predict_email(model, tokenizer, email_df, device, max_length=512):
    
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

    # Ensure required columns are included
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token= 
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama8b_binary_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject', 'Email'}
    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "lama8b_predictions.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/llama8b_binary_classification_model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           1   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           1   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           1   
4  <p><strong>The account number associated with ...   Phishing           1   

   confidence_score  
0          

In [2]:
 file = pd.read_csv("lama7b_predictions/lama8b_predictions.csv")
    

In [3]:
import pandas as pd


file = pd.read_csv("lama7b_predictions/lama8b_predictions.csv")

pred_counts = file["prediction"].value_counts()


print(f"Number of Ham emails: {pred_counts.get(0, 0)}")
print(f"Number of Phishing emails: {pred_counts.get(1, 0)}")


Number of Ham emails: 1
Number of Phishing emails: 240


In [5]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoTokenizer
from huggingface_hub import login
import re

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

def clean_text(text):
    
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  
    text = re.sub(r'\s+', ' ', text)  
    return text

def load_model(model_dir, device):
    
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = LlamaForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token
    print("Model loaded successfully!")
    return model, tokenizer

def predict_email(model, tokenizer, email_df, device, max_length=512):
    
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

    # Ensure required columns are included
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token= 
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/wizard8b_binary_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject', 'Email'}
    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "wizard8b_predictions.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/wizard8b_binary_classification_model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           1   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           0   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           0   
4  <p><strong>The account number associated with ...   Phishing           1   

   confidence_score  
0          

In [6]:
import pandas as pd


file = pd.read_csv("lama7b_predictions/wizard8b_predictions.csv")

pred_counts = file["prediction"].value_counts()


print(f"Number of Ham emails: {pred_counts.get(0, 0)}")
print(f"Number of Phishing emails: {pred_counts.get(1, 0)}")


Number of Ham emails: 121
Number of Phishing emails: 120


In [7]:
file = pd.read_csv("lama7b_predictions/wizard8b_predictions.csv")

In [11]:
file.head(17)

Unnamed: 0,Sender,Subject,Email,Email_type,prediction,confidence_score
0,noreply@powerballs.com,You Have Won!,<p>*********PLEASE DO NOT RESPOND TO THIS EMAI...,Phishing,1,0.577586
1,noreply@paypalceo.com,PayPal Breach,<p>********* RESPONES TO THIS EMAIL WILL NOT B...,Phishing,0,0.996301
2,support@credit.chase.com,URGENT: Fraudulent activity detected,"<p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...",Phishing,1,0.778545
3,mary@yahoo.com,Donations needed for Mark,"<p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...",Phishing,0,0.652719
4,support@security.amazon.com,Your Amazon Account,<p><strong>The account number associated with ...,Phishing,1,0.781194
5,account@micrsoft.com,RE: Help Desk,"<p>Dear user,</p><p>Regarding your account, we...",Phishing,1,0.935164
6,safety@privacy.chase.com,Update for you Account,<p>Dear Valued Chase Memeber:&nbsp;</p><p>&nbs...,Phishing,1,0.909046
7,customer@fed.ex.com,RE: FED EX TRACKING NUMBER,"<p>Dear user,</p><p>Unfortunately, we missed y...",Phishing,1,0.900277
8,jeremyp@gmail.com,Summer Internship Application,<p>To Whom It May Concern:</p><p>&nbsp;</p><p>...,Phishing,0,0.695772
9,offer@coupons.walmart.com,Walmart Reward Coupons,<p>Account No: 108-455294-800125-MN</p><p>&nbs...,Phishing,0,0.928215


In [16]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import BertForSequenceClassification, BertTokenizer, AutoTokenizer
from huggingface_hub import login
import re

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

def clean_text(text):
    """Clean and normalize text data."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  
    text = re.sub(r'\s+', ' ', text)  
    return text

def load_model(model_dir, device):
    """Load fine-tuned BERT model and tokenizer from local directory."""
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = BertForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    
  
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    print("Model loaded successfully!")
    return model, tokenizer


def predict_email(model, tokenizer, email_df, device, max_length=512):
    
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

   
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token= 
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/bert_binary_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject','Email_ID' 'Email'}
    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "bert_predictions.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/bert_binary_classification_model...
Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           1   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           1   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           0   
4  <

In [17]:
import pandas as pd


file = pd.read_csv("lama7b_predictions/bert_predictions.csv")

pred_counts = file["prediction"].value_counts()


print(f"Number of Ham emails: {pred_counts.get(0, 0)}")
print(f"Number of Phishing emails: {pred_counts.get(1, 0)}")


Number of Ham emails: 100
Number of Phishing emails: 141


In [18]:
file = pd.read_csv("lama7b_predictions/bert_predictions.csv")

In [20]:
file.head(10)

Unnamed: 0,Sender,Subject,Email,Email_type,prediction,confidence_score
0,noreply@powerballs.com,You Have Won!,<p>*********PLEASE DO NOT RESPOND TO THIS EMAI...,Phishing,1,0.729289
1,noreply@paypalceo.com,PayPal Breach,<p>********* RESPONES TO THIS EMAIL WILL NOT B...,Phishing,1,0.576073
2,support@credit.chase.com,URGENT: Fraudulent activity detected,"<p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...",Phishing,1,0.680934
3,mary@yahoo.com,Donations needed for Mark,"<p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...",Phishing,0,0.640179
4,support@security.amazon.com,Your Amazon Account,<p><strong>The account number associated with ...,Phishing,1,0.886676
5,account@micrsoft.com,RE: Help Desk,"<p>Dear user,</p><p>Regarding your account, we...",Phishing,1,0.906669
6,safety@privacy.chase.com,Update for you Account,<p>Dear Valued Chase Memeber:&nbsp;</p><p>&nbs...,Phishing,1,0.902423
7,customer@fed.ex.com,RE: FED EX TRACKING NUMBER,"<p>Dear user,</p><p>Unfortunately, we missed y...",Phishing,1,0.985623
8,jeremyp@gmail.com,Summer Internship Application,<p>To Whom It May Concern:</p><p>&nbsp;</p><p>...,Phishing,0,0.990661
9,offer@coupons.walmart.com,Walmart Reward Coupons,<p>Account No: 108-455294-800125-MN</p><p>&nbs...,Phishing,0,0.784708


In [2]:
import os
import json
import pandas as pd
import numpy as np
import re
import gc
import traceback

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.pipeline import Pipeline

# Data cleaning function
def clean_text(text):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    #text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def main():

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/final_data.csv")
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"File not found: {data_path}")
    
    # Load dataset
    df = pd.read_csv(data_path)
    

    df['sender'] = df['sender'].astype(str).apply(clean_text)
    df['subject'] = df['subject'].astype(str).apply(clean_text)
    df['body'] = df['body'].astype(str).apply(clean_text)
    df['text'] = "Sender: " + df['sender'] + " Subject: " + df['subject'] + " " + df['body']
    
    
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
    
  
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('lsa', TruncatedSVD(n_components=300, random_state=42)),
        ('clf', LogisticRegression(max_iter=1000, random_state=42))
    ])
    
    
    pipeline.fit(train_df['text'], train_df['label'])
    
  
    train_preds = pipeline.predict(train_df['text'])
    train_proba = pipeline.predict_proba(train_df['text'])
    train_accuracy = accuracy_score(train_df['label'], train_preds)
    train_loss = log_loss(train_df['label'], train_proba)
 
    val_preds = pipeline.predict(val_df['text'])
    val_proba = pipeline.predict_proba(val_df['text'])
    val_accuracy = accuracy_score(val_df['label'], val_preds)
    val_loss = log_loss(val_df['label'], val_proba)
    
    print("Training Accuracy:", train_accuracy)
    print("Training Loss:", train_loss)
    print("Validation Accuracy:", val_accuracy)
    print("Validation Loss:", val_loss)
    
   
    df['prediction'] = pipeline.predict(df['text'])
    df['prediction_probability'] = pipeline.predict_proba(df['text']).max(axis=1)
    
   
    output_path = os.path.expanduser("~/Downloads/Tune/FineTune/lsa_predictions.csv")
    df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    main()


Training Accuracy: 0.9805
Training Loss: 0.1227047289093097
Validation Accuracy: 0.984
Validation Loss: 0.12054307129765304
Predictions saved to /home/users/skuikel/Downloads/Tune/FineTune/lsa_predictions.csv


In [1]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import LlamaForSequenceClassification, LlamaTokenizer
from huggingface_hub import login
import re

def setup_environment():
    """Set up GPU/CPU environment and optimize CUDA settings."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'  # Set GPU device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if torch.cuda.is_available():
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        print("Using CPU")

    return device

def clean_text(text):
    """Clean and normalize text data."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  # Convert to lowercase and remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text

def load_model(model_dir, device):
    """Load fine-tuned LLaMA model and tokenizer from local directory."""
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = LlamaForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = LlamaTokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token
    print("Model loaded successfully!")
    return model, tokenizer

def predict_email(model, tokenizer, email_df, device, max_length=512):
    """Predict whether an email is Ham (0) or Phishing (1) with confidence scores."""
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

    
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token=
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_7b_dpo123_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject', 'Email'}
    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "lama7b_predictions_dpo.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/llama_7b_dpo123_classification_model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           1   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           1   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           1   
4  <p><strong>The account number associated with ...   Phishing           1   

   confidence_score  
0          

In [4]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoTokenizer
from huggingface_hub import login
import re

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

def clean_text(text):
    """Clean and normalize text data."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  
    text = re.sub(r'\s+', ' ', text)  
    return text

def load_model(model_dir, device):
    
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = LlamaForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token
    print("Model loaded successfully!")
    return model, tokenizer

def predict_email(model, tokenizer, email_df, device, max_length=512):
    
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

    # Ensure required columns are included
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token= 
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_8b_dpo123_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject', 'Email'}
    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "lama8b_predictions_dpo.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/llama_8b_dpo123_classification_model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           1   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           1   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           1   
4  <p><strong>The account number associated with ...   Phishing           1   

   confidence_score  
0          

In [5]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import LlamaForSequenceClassification, LlamaTokenizer
from huggingface_hub import login
import re

def setup_environment():
    """Set up GPU/CPU environment and optimize CUDA settings."""
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'  
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if torch.cuda.is_available():
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        print("Using CPU")

    return device

def clean_text(text):
    """Clean and normalize text data."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  
    text = re.sub(r'\s+', ' ', text) 
    return text

def load_model(model_dir, device):
    """Load fine-tuned LLaMA model and tokenizer from local directory."""
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = LlamaForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = LlamaTokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token
    print("Model loaded successfully!")
    return model, tokenizer

def predict_email(model, tokenizer, email_df, device, max_length=512):
    """Predict whether an email is Ham (0) or Phishing (1) with confidence scores."""
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

    
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token=
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/wizard_7b_dpo_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject', 'Email'}
    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "wizard7b_predictions_dpo.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/wizard_7b_dpo_classification_model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           1   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           1   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           1   
4  <p><strong>The account number associated with ...   Phishing           1   

   confidence_score  
0          

In [11]:
import os
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import BertForSequenceClassification, BertTokenizer, AutoTokenizer
from huggingface_hub import login
import re

def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

def clean_text(text):
    """Clean and normalize text data."""
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  
    text = re.sub(r'\s+', ' ', text)  
    return text

def load_model(model_dir, device):
    """Load fine-tuned BERT model and tokenizer from local directory."""
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"Trained model directory not found: {model_dir}")

    print(f"Loading model from {model_dir}...")
    model = BertForSequenceClassification.from_pretrained(model_dir).to(device)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    
  
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    print("Model loaded successfully!")
    return model, tokenizer


def predict_email(model, tokenizer, email_df, device, max_length=512):
    
    model.eval()

    predictions = []
    confidence_scores = []

    for _, email in email_df.iterrows():
        input_text = f"Sender: {clean_text(email['Sender'])} [SEP] Subject: {clean_text(email['Subject'])} [SEP] {clean_text(email['Email'])}"
        
        encoding = tokenizer(
            input_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoding)
        
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)  
        confidence, pred_label = torch.max(probs, dim=-1)

        predictions.append(pred_label.item())
        confidence_scores.append(confidence.item())

    email_df['prediction'] = predictions  
    email_df['confidence_score'] = confidence_scores

   
    required_columns = ['Sender', 'Subject', 'Email']
    if 'Email_type' in email_df.columns:
        required_columns.append('Email_type')

    final_df = email_df[required_columns + ['prediction', 'confidence_score']]
    return final_df

def main():
    
    try:
        login(token= 
    except Exception as e:
        print(f"Login failed: {e}")
        return

    device = setup_environment()

    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/bert_dpo123_classification_model")
    model, tokenizer = load_model(model_dir, device)

    data_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    if not os.path.exists(data_path):
        print(f"Data file not found: {data_path}")
        return

    new_emails_df = pd.read_excel(data_path)

    required_columns = {'Sender', 'Subject', 'Email_ID', 'Email'}

    if not required_columns.issubset(new_emails_df.columns):
        raise ValueError(f"Dataset must contain the following columns: {required_columns}")

    predictions_df = predict_email(model, tokenizer, new_emails_df, device)
    
    output_dir = os.path.expanduser("~/Downloads/Tune/FineTune/lama7b_predictions")
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, "bert_predictions_dpo.csv"), index=False)
    
    print("Predictions saved successfully!")
    print(predictions_df.head())

if __name__ == "__main__":
    main()


Using CPU
Loading model from /home/users/skuikel/Downloads/Tune/FineTune/bert_dpo123_classification_model...
Model loaded successfully!
Predictions saved successfully!
                        Sender                               Subject  \
0       noreply@powerballs.com                         You Have Won!   
1        noreply@paypalceo.com                         PayPal Breach   
2     support@credit.chase.com  URGENT: Fraudulent activity detected   
3               mary@yahoo.com             Donations needed for Mark   
4  support@security.amazon.com                  Your Amazon Account    

                                               Email Email_type  prediction  \
0  <p>*********PLEASE DO NOT RESPOND TO THIS EMAI...   Phishing           0   
1  <p>********* RESPONES TO THIS EMAIL WILL NOT B...   Phishing           0   
2  <p>Hello,&nbsp;</p><p>&nbsp;</p><p>We are writ...   Phishing           1   
3  <p>Hello,</p><p>&nbsp;</p><p>I&#39;m contactin...   Phishing           0   
4  <