In [2]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSequenceClassification,LlamaForSequenceClassification
import gc

def setup_environment(gpu_id='3'):
    """Pick the compute device and apply CUDA tuning flags.

    Args:
        gpu_id: Value written to ``CUDA_VISIBLE_DEVICES`` before CUDA is
            queried. Defaults to ``'3'`` (the previously hard-coded value).
            Pass ``None`` to leave the environment variable untouched.

    Returns:
        ``torch.device("cuda:0")`` (the first *visible* GPU) when CUDA is
        available, otherwise ``torch.device("cpu")``.
    """
    if gpu_id is not None:
        # Must be set before the first CUDA query to take effect.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
    return device
def load_fine_tuned_model(model_dir, device):
    """Load the tokenizer and model from ``model_dir`` for inference.

    The model is loaded with ``output_hidden_states=True``, moved to
    ``device`` and switched to eval mode.

    Returns:
        ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Padding is requested downstream; tokenizers that ship without a pad
    # token would raise there, so fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer

def get_embeddings(model, tokenizer, text, device, max_length=512):
    """Embed one text as the mean of its last-layer hidden states.

    The text is tokenized (padded/truncated to ``max_length``), run through
    ``model`` without gradients, and the final hidden layer is averaged over
    the sequence axis. Padding positions are excluded from the average via
    the tokenizer's attention mask; without that, short texts would be
    dominated by padding vectors.

    Returns:
        A NumPy array with the batch axis removed.
    """
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1].float()
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean.
            pooled = hidden_states.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-masked row.
            pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        # squeeze(0) drops only the batch axis.
        embedding = pooled.squeeze(0).cpu().numpy()
    return embedding

def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    """Embed the Sender, Subject and Email columns of a spreadsheet.

    Despite the name, ``csv_path`` is read with ``pd.read_excel``. Each cell
    is converted with ``str()`` and embedded via ``get_embeddings``.

    Returns:
        Three lists (sender, subject, body embeddings), one entry per row.
    """
    frame = pd.read_excel(csv_path)
    columns = ('Sender', 'Subject', 'Email')
    collected = {column: [] for column in columns}

    with torch.no_grad():
        for _, record in frame.iterrows():
            # Read all three cells first, then embed them in column order.
            texts = [str(record[column]) for column in columns]
            for column, text in zip(columns, texts):
                collected[column].append(get_embeddings(model, tokenizer, text, device, max_length))

    return collected['Sender'], collected['Subject'], collected['Email']


def calculate_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``."""
    similarity = cosine_similarity(embeddings)
    return similarity


def save_matrix_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV (no index column) and log it."""
    pd.DataFrame(matrix).to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    """Embed the email dataset with the DPO-tuned Llama 7B model and save
    sender/subject/body cosine-similarity matrices as CSV files.

    Memory cleanup runs in a ``finally`` block so it happens after the work
    is done (and also on failure), instead of at module level before
    ``main()`` is even called.
    """
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_7B")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

    model = tokenizer = None
    try:
        model, tokenizer = load_fine_tuned_model(model_dir, device)

        sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)

        sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
        subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
        body_similarity_matrix = calculate_similarity_matrix(body_embeddings)

        save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_llama7b_ft_dpo.csv")
        save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_llama7b_ft_dpo.csv")
        save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_llama7b_ft_dpo.csv")
    finally:
        # Drop the model references first so the collector can reclaim them,
        # then release cached CUDA blocks.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved similarity matrix to sender_similarity_matrix_llama7b_ft_dpo.csv
Saved similarity matrix to subject_similarity_matrix_llama7b_ft_dpo.csv
Saved similarity matrix to body_similarity_matrix_llama7b_ft_dpo.csv


In [3]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import gc



def setup_environment(preferred_gpu=None):
    """Select the compute device, apply CUDA tuning flags and seed the RNGs.

    Args:
        preferred_gpu: Optional index of the GPU to use. If it is ``None`` or
            not a valid index, GPU 0 is used.

    Returns:
        A ``torch.device``: ``cuda:<index>`` when CUDA is available,
        otherwise ``cpu``.

    The device is addressed by its index directly. Writing
    ``CUDA_VISIBLE_DEVICES`` at this point (after ``torch.cuda`` has already
    been queried) is not a reliable way to switch GPUs, and it made the
    logged device name refer to GPU 0 regardless of the request.
    """
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()

        # Reject negative indices as well as indices past the last GPU.
        if preferred_gpu is not None and 0 <= preferred_gpu < gpu_count:
            device = torch.device(f"cuda:{preferred_gpu}")
            print(f"Using preferred GPU {preferred_gpu}: {torch.cuda.get_device_name(preferred_gpu)}")
        else:
            if preferred_gpu is not None:
                print(f"Preferred GPU {preferred_gpu} not found. Using GPU 0 instead.")
            device = torch.device("cuda:0")
            print(f"Using GPU 0: {torch.cuda.get_device_name(0)}")

        # CUDA optimizations
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    else:
        device = torch.device("cpu")
        print("No GPU found. Using CPU.")

    # Seed for reproducibility
    torch.manual_seed(42)
    if device.type == "cuda":
        torch.cuda.manual_seed_all(42)
    np.random.seed(42)

    return device




def load_fine_tuned_model(model_dir, device):
    """Load the tokenizer and model from ``model_dir`` for inference.

    The model is loaded with ``output_hidden_states=True``, moved to
    ``device`` and switched to eval mode.

    Returns:
        ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Padding is requested downstream; tokenizers that ship without a pad
    # token would raise there, so fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer



def get_embeddings(model, tokenizer, text, device, max_length=512):
    """Embed one text as the mean of its last-layer hidden states.

    The text is tokenized (padded/truncated to ``max_length``), run through
    ``model`` without gradients, and the final hidden layer is averaged over
    the sequence axis. Padding positions are excluded from the average via
    the tokenizer's attention mask; without that, short texts would be
    dominated by padding vectors.

    Returns:
        A NumPy array with the batch axis removed.
    """
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        # Request hidden states explicitly instead of relying on load-time config.
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1].float()
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean.
            pooled = hidden_states.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-masked row.
            pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        # squeeze(0) drops only the batch axis.
        embedding = pooled.squeeze(0).cpu().numpy()
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    """Embed the Sender, Subject and Email columns of a spreadsheet.

    Despite the name, ``csv_path`` is read with ``pd.read_excel``. Each cell
    is converted with ``str()`` and embedded via ``get_embeddings``.

    Returns:
        Three lists (sender, subject, body embeddings), one entry per row.
    """
    frame = pd.read_excel(csv_path)
    columns = ('Sender', 'Subject', 'Email')
    collected = {column: [] for column in columns}

    with torch.no_grad():
        for _, record in frame.iterrows():
            # Read all three cells first, then embed them in column order.
            texts = [str(record[column]) for column in columns]
            for column, text in zip(columns, texts):
                collected[column].append(get_embeddings(model, tokenizer, text, device, max_length))

    return collected['Sender'], collected['Subject'], collected['Email']


def calculate_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``."""
    similarity = cosine_similarity(embeddings)
    return similarity


def save_matrix_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV (no index column) and log it."""
    pd.DataFrame(matrix).to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    """Embed the email dataset with the DPO-tuned Llama 8B model and save
    sender/subject/body cosine-similarity matrices as CSV files.

    Memory cleanup runs in a ``finally`` block so it happens after the work
    is done (and also on failure), instead of at module level before
    ``main()`` is even called.
    """
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_8B")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

    model = tokenizer = None
    try:
        model, tokenizer = load_fine_tuned_model(model_dir, device)

        sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)

        sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
        subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
        body_similarity_matrix = calculate_similarity_matrix(body_embeddings)

        save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_llama8b_ft_dpo.csv")
        save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_llama8b_ft_dpo.csv")
        save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_llama8b_ft_dpo.csv")
    finally:
        # Drop the model references first so the collector can reclaim them,
        # then release cached CUDA blocks.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()


No GPU found. Using CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved similarity matrix to sender_similarity_matrix_llama8b_ft_dpo.csv
Saved similarity matrix to subject_similarity_matrix_llama8b_ft_dpo.csv
Saved similarity matrix to body_similarity_matrix_llama8b_ft_dpo.csv


In [4]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer,AutoModelForSequenceClassification
import gc


from transformers import LlamaForSequenceClassification, AutoTokenizer

def load_fine_tuned_model(model_dir, device):
    """Load the tokenizer and model from ``model_dir`` for inference.

    The model is loaded with ``output_hidden_states=True``, moved to
    ``device`` and switched to eval mode.

    Returns:
        ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Padding is requested downstream; tokenizers that ship without a pad
    # token would raise there, so fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer


def get_embeddings(model, tokenizer, text, device, max_length=512):
    """Embed one text as the mean of its last-layer hidden states.

    The text is tokenized (padded/truncated to ``max_length``), run through
    ``model`` without gradients, and the final hidden layer is averaged over
    the sequence axis. Padding positions are excluded from the average via
    the tokenizer's attention mask; without that, short texts would be
    dominated by padding vectors.

    Returns:
        A NumPy array with the batch axis removed.
    """
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        # Request hidden states explicitly instead of relying on load-time config.
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1].float()
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean.
            pooled = hidden_states.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-masked row.
            pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        # squeeze(0) drops only the batch axis.
        embedding = pooled.squeeze(0).cpu().numpy()
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    """Embed the Sender, Subject and Email columns of a spreadsheet.

    Despite the name, ``csv_path`` is read with ``pd.read_excel``. Each cell
    is converted with ``str()`` and embedded via ``get_embeddings``.

    Returns:
        Three lists (sender, subject, body embeddings), one entry per row.
    """
    frame = pd.read_excel(csv_path)
    columns = ('Sender', 'Subject', 'Email')
    collected = {column: [] for column in columns}

    with torch.no_grad():
        for _, record in frame.iterrows():
            # Read all three cells first, then embed them in column order.
            texts = [str(record[column]) for column in columns]
            for column, text in zip(columns, texts):
                collected[column].append(get_embeddings(model, tokenizer, text, device, max_length))

    return collected['Sender'], collected['Subject'], collected['Email']


def calculate_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``."""
    similarity = cosine_similarity(embeddings)
    return similarity



def save_matrix_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV (no index column) and log it."""
    pd.DataFrame(matrix).to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    """Embed the email dataset with the DPO-tuned Mistral model and save
    sender/subject/body cosine-similarity matrices as CSV files.

    NOTE: ``setup_environment`` is not defined in this cell; it must already
    exist in the kernel from an earlier cell.

    Memory cleanup runs in a ``finally`` block so it happens after the work
    is done (and also on failure), instead of at module level before
    ``main()`` is even called.
    """
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_Mistral")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

    model = tokenizer = None
    try:
        model, tokenizer = load_fine_tuned_model(model_dir, device)

        sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)

        sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
        subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
        body_similarity_matrix = calculate_similarity_matrix(body_embeddings)

        save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_mistral_ft_dpo.csv")
        save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_mistral_ft_dpo.csv")
        save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_mistral_ft_dpo.csv")
    finally:
        # Drop the model references first so the collector can reclaim them,
        # then release cached CUDA blocks.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()


No GPU found. Using CPU.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved similarity matrix to sender_similarity_matrix_mistral_ft_dpo.csv
Saved similarity matrix to subject_similarity_matrix_mistral_ft_dpo.csv
Saved similarity matrix to body_similarity_matrix_mistral_ft_dpo.csv


In [5]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer,AutoModelForSequenceClassification
import gc


from transformers import LlamaForSequenceClassification, AutoTokenizer

def load_fine_tuned_model(model_dir, device):
    """Load the tokenizer and model from ``model_dir`` for inference.

    The model is loaded with ``output_hidden_states=True``, moved to
    ``device`` and switched to eval mode.

    Returns:
        ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Padding is requested downstream; tokenizers that ship without a pad
    # token would raise there, so fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer


def get_embeddings(model, tokenizer, text, device, max_length=512):
    """Embed one text as the mean of its last-layer hidden states.

    The text is tokenized (padded/truncated to ``max_length``), run through
    ``model`` without gradients, and the final hidden layer is averaged over
    the sequence axis. Padding positions are excluded from the average via
    the tokenizer's attention mask; without that, short texts would be
    dominated by padding vectors.

    Returns:
        A NumPy array with the batch axis removed.
    """
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        # Request hidden states explicitly instead of relying on load-time config.
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1].float()
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean.
            pooled = hidden_states.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-masked row.
            pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        # squeeze(0) drops only the batch axis.
        embedding = pooled.squeeze(0).cpu().numpy()
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    """Embed the Sender, Subject and Email columns of a spreadsheet.

    Despite the name, ``csv_path`` is read with ``pd.read_excel``. Each cell
    is converted with ``str()`` and embedded via ``get_embeddings``.

    Returns:
        Three lists (sender, subject, body embeddings), one entry per row.
    """
    frame = pd.read_excel(csv_path)
    columns = ('Sender', 'Subject', 'Email')
    collected = {column: [] for column in columns}

    with torch.no_grad():
        for _, record in frame.iterrows():
            # Read all three cells first, then embed them in column order.
            texts = [str(record[column]) for column in columns]
            for column, text in zip(columns, texts):
                collected[column].append(get_embeddings(model, tokenizer, text, device, max_length))

    return collected['Sender'], collected['Subject'], collected['Email']


def calculate_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``."""
    similarity = cosine_similarity(embeddings)
    return similarity



def save_matrix_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV (no index column) and log it."""
    pd.DataFrame(matrix).to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    """Embed the email dataset with the DPO-tuned WizardLM model and save
    sender/subject/body cosine-similarity matrices as CSV files.

    NOTE: ``setup_environment`` is not defined in this cell; it must already
    exist in the kernel from an earlier cell.

    Memory cleanup runs in a ``finally`` block so it happens after the work
    is done (and also on failure), instead of at module level before
    ``main()`` is even called.
    """
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_7B_Wizard")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

    model = tokenizer = None
    try:
        model, tokenizer = load_fine_tuned_model(model_dir, device)

        sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)

        sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
        subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
        body_similarity_matrix = calculate_similarity_matrix(body_embeddings)

        save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_wizard_ft_dpo.csv")
        save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_wizard_ft_dpo.csv")
        save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_wizard_ft_dpo.csv")
    finally:
        # Drop the model references first so the collector can reclaim them,
        # then release cached CUDA blocks.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()


No GPU found. Using CPU.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved similarity matrix to sender_similarity_matrix_wizard_ft_dpo.csv
Saved similarity matrix to subject_similarity_matrix_wizard_ft_dpo.csv
Saved similarity matrix to body_similarity_matrix_wizard_ft_dpo.csv


In [6]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer,AutoModelForSequenceClassification
import gc


from transformers import LlamaForSequenceClassification, AutoTokenizer

def load_fine_tuned_model(model_dir, device):
    """Load the tokenizer and model from ``model_dir`` for inference.

    The model is loaded with ``output_hidden_states=True``, moved to
    ``device`` and switched to eval mode.

    Returns:
        ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Padding is requested downstream; tokenizers that ship without a pad
    # token would raise there, so fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer


def get_embeddings(model, tokenizer, text, device, max_length=512):
    """Embed one text as the mean of its last-layer hidden states.

    The text is tokenized (padded/truncated to ``max_length``), run through
    ``model`` without gradients, and the final hidden layer is averaged over
    the sequence axis. Padding positions are excluded from the average via
    the tokenizer's attention mask; without that, short texts would be
    dominated by padding vectors.

    Returns:
        A NumPy array with the batch axis removed.
    """
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        # Request hidden states explicitly instead of relying on load-time config.
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1].float()
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean.
            pooled = hidden_states.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-masked row.
            pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        # squeeze(0) drops only the batch axis.
        embedding = pooled.squeeze(0).cpu().numpy()
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    """Embed the Sender, Subject and Email columns of a spreadsheet.

    Despite the name, ``csv_path`` is read with ``pd.read_excel``. Each cell
    is converted with ``str()`` and embedded via ``get_embeddings``.

    Returns:
        Three lists (sender, subject, body embeddings), one entry per row.
    """
    frame = pd.read_excel(csv_path)
    columns = ('Sender', 'Subject', 'Email')
    collected = {column: [] for column in columns}

    with torch.no_grad():
        for _, record in frame.iterrows():
            # Read all three cells first, then embed them in column order.
            texts = [str(record[column]) for column in columns]
            for column, text in zip(columns, texts):
                collected[column].append(get_embeddings(model, tokenizer, text, device, max_length))

    return collected['Sender'], collected['Subject'], collected['Email']


def calculate_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``."""
    similarity = cosine_similarity(embeddings)
    return similarity



def save_matrix_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV (no index column) and log it."""
    pd.DataFrame(matrix).to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    """Embed the email dataset with the DPO-tuned Qwen model and save
    sender/subject/body cosine-similarity matrices as CSV files.

    NOTE: ``setup_environment`` is not defined in this cell; it must already
    exist in the kernel from an earlier cell.

    Memory cleanup runs in a ``finally`` block so it happens after the work
    is done (and also on failure), instead of at module level before
    ``main()`` is even called.
    """
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_Qwen")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

    model = tokenizer = None
    try:
        model, tokenizer = load_fine_tuned_model(model_dir, device)

        sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)

        sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
        subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
        body_similarity_matrix = calculate_similarity_matrix(body_embeddings)

        save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_qwen_ft_dpo.csv")
        save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_qwen_ft_dpo.csv")
        save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_qwen_ft_dpo.csv")
    finally:
        # Drop the model references first so the collector can reclaim them,
        # then release cached CUDA blocks.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()


No GPU found. Using CPU.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved similarity matrix to sender_similarity_matrix_qwen_ft_dpo.csv
Saved similarity matrix to subject_similarity_matrix_qwen_ft_dpo.csv
Saved similarity matrix to body_similarity_matrix_qwen_ft_dpo.csv


In [7]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer,AutoModelForSequenceClassification
import gc


from transformers import LlamaForSequenceClassification, AutoTokenizer

def load_fine_tuned_model(model_dir, device):
    """Load the tokenizer and model from ``model_dir`` for inference.

    The model is loaded with ``output_hidden_states=True``, moved to
    ``device`` and switched to eval mode.

    Returns:
        ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Padding is requested downstream; tokenizers that ship without a pad
    # token would raise there, so fall back to EOS when none is defined.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer


def get_embeddings(model, tokenizer, text, device, max_length=512):
    """Embed one text as the mean of its last-layer hidden states.

    The text is tokenized (padded/truncated to ``max_length``), run through
    ``model`` without gradients, and the final hidden layer is averaged over
    the sequence axis. Padding positions are excluded from the average via
    the tokenizer's attention mask; without that, short texts would be
    dominated by padding vectors.

    Returns:
        A NumPy array with the batch axis removed.
    """
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        # Request hidden states explicitly instead of relying on load-time config.
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1].float()
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            # No mask available: fall back to the plain mean.
            pooled = hidden_states.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp guards against division by zero for an all-masked row.
            pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        # squeeze(0) drops only the batch axis.
        embedding = pooled.squeeze(0).cpu().numpy()
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    """Embed the Sender, Subject and Email columns of a spreadsheet.

    Despite the name, ``csv_path`` is read with ``pd.read_excel``. Each cell
    is converted with ``str()`` and embedded via ``get_embeddings``.

    Returns:
        Three lists (sender, subject, body embeddings), one entry per row.
    """
    frame = pd.read_excel(csv_path)
    columns = ('Sender', 'Subject', 'Email')
    collected = {column: [] for column in columns}

    with torch.no_grad():
        for _, record in frame.iterrows():
            # Read all three cells first, then embed them in column order.
            texts = [str(record[column]) for column in columns]
            for column, text in zip(columns, texts):
                collected[column].append(get_embeddings(model, tokenizer, text, device, max_length))

    return collected['Sender'], collected['Subject'], collected['Email']


def calculate_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``."""
    similarity = cosine_similarity(embeddings)
    return similarity



def save_matrix_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as CSV (no index column) and log it."""
    pd.DataFrame(matrix).to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    """Embed the email dataset with the DPO-tuned BERT model and save
    sender/subject/body cosine-similarity matrices as CSV files.

    NOTE: ``setup_environment`` is not defined in this cell; it must already
    exist in the kernel from an earlier cell.

    Memory cleanup runs in a ``finally`` block so it happens after the work
    is done (and also on failure), instead of at module level before
    ``main()`` is even called.
    """
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_bert_uncased")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

    model = tokenizer = None
    try:
        model, tokenizer = load_fine_tuned_model(model_dir, device)

        sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)

        sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
        subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
        body_similarity_matrix = calculate_similarity_matrix(body_embeddings)

        save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_bert_ft_dpo.csv")
        save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_bert_ft_dpo.csv")
        save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_bert_ft_dpo.csv")
    finally:
        # Drop the model references first so the collector can reclaim them,
        # then release cached CUDA blocks.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()


No GPU found. Using CPU.
Saved similarity matrix to sender_similarity_matrix_bert_ft_dpo.csv
Saved similarity matrix to subject_similarity_matrix_bert_ft_dpo.csv
Saved similarity matrix to body_similarity_matrix_bert_ft_dpo.csv


In [11]:
# pip install -U "transformers>=4.41" "peft>=0.11.1" pandas numpy scikit-learn torch tqdm

import os, gc
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM


# --- Model / data locations ---
# MODEL_DIR may hold a full model or an adapter folder (detected via adapter_config.json).
MODEL_DIR        = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_7B")  # full model OR adapter folder
# Base checkpoint id, overridable through the BASE_MODEL_ID environment variable.
BASE_MODEL_ID    = os.environ.get("BASE_MODEL_ID", "meta-llama/Llama-2-7b-hf")     # used only if MODEL_DIR is an adapter
# Optional local base path; takes precedence over BASE_MODEL_ID when set.
BASE_MODEL_LOCAL = os.environ.get("BASE_MODEL_LOCAL")                               # local base path (preferred if you have it)
# Spreadsheet read by main(); it must contain Sender, Subject and Email columns.
XLSX_PATH        = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

# --- Batch sizes: main() picks the GPU value on CUDA, otherwise the CPU value ---
BATCH_SIZE_GPU   = 32
BATCH_SIZE_CPU   = 4

# --- Per-field tokenizer max_length (longer inputs are truncated) ---
MAXLEN_SENDER    = 48
MAXLEN_SUBJECT   = 64
MAXLEN_BODY      = 384
# Requested number of PCA components; pca_reduce() clips it to the data shape.
TARGET_DIM       = 200

# --- Output CSV files for the three similarity matrices ---
OUT_SENDER       = "sender_similarity_matrix_llama7b_ft_pca_dpo.csv"
OUT_SUBJECT      = "subject_similarity_matrix_llama7b_ft_pca_dpo.csv"
OUT_BODY         = "body_similarity_matrix_llama7b_ft_pca_dpo.csv"

# -------------- SETUP --------------
def setup_device():
    """Return the CUDA device (with TF32 enabled) if available, else CPU.

    Logs the chosen device and, on CUDA, the number of visible devices.
    """
    if not torch.cuda.is_available():
        print("No CUDA device available. Falling back to CPU.")
        return torch.device("cpu")

    cuda_device = torch.device("cuda")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print(f"Using device: {torch.cuda.get_device_name(None)}")
    print(f"Total CUDA devices: {torch.cuda.device_count()}")
    return cuda_device

def _is_adapter(path: str) -> bool:
    return os.path.isfile(os.path.join(path, "adapter_config.json"))

def _load_tokenizer(src: str):
    """Load a fast tokenizer from ``src``, using EOS as the pad token when
    the tokenizer defines none."""
    tokenizer = AutoTokenizer.from_pretrained(src, use_fast=True)
    if tokenizer.pad_token is not None:
        return tokenizer
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def load_model_and_tokenizer(device):
    """Load the model and tokenizer described by the module-level config.

    If ``MODEL_DIR`` is an adapter folder, the base model
    (``BASE_MODEL_LOCAL`` if set, else ``BASE_MODEL_ID``) is loaded as a
    causal LM and the adapter is attached with PEFT. Otherwise ``MODEL_DIR``
    is loaded directly with ``AutoModel``. Weights are float16 on CUDA and
    float32 elsewhere; the model is returned in eval mode.

    Returns:
        ``(model, tokenizer)``.
    """
    weights_dtype = torch.float16 if device.type == "cuda" else torch.float32

    if not _is_adapter(MODEL_DIR):
        print(f"Loading full model from: {MODEL_DIR}")
        tokenizer = _load_tokenizer(MODEL_DIR)
        full_model = AutoModel.from_pretrained(MODEL_DIR, torch_dtype=weights_dtype)
        full_model = full_model.to(device).eval()
        print("Loaded full model.")
        return full_model, tokenizer

    base_src = BASE_MODEL_LOCAL or BASE_MODEL_ID
    print(f"Detected adapter in: {MODEL_DIR}")
    print(f"Loading base from:   {base_src}")
    tokenizer = _load_tokenizer(base_src)
    base_model = AutoModelForCausalLM.from_pretrained(base_src, torch_dtype=weights_dtype)
    base_model = base_model.to(device).eval()

    # Imported lazily so PEFT is only required on the adapter path.
    from peft import PeftModel
    adapted = PeftModel.from_pretrained(base_model, MODEL_DIR).eval()
    print("Loaded base + adapter.")
    return adapted, tokenizer

@torch.inference_mode()
def embed_batch(model, tok, texts, device, max_length):
    """Embed a batch of texts by attention-masked mean pooling.

    Args:
        model: Callable model accepting the tokenizer output as keyword
            arguments plus ``output_hidden_states`` and ``use_cache``.
        tok: Tokenizer; ``None`` entries in ``texts`` are sent as ``""``.
        texts: Iterable of texts for one batch.
        device: ``torch.device`` the inputs are moved to.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        A float32 NumPy array with one pooled vector per input text.

    Raises:
        RuntimeError: If the model output exposes neither
            ``last_hidden_state`` nor ``hidden_states``.
    """
    enc = tok(
        ["" if t is None else str(t) for t in texts],
        padding=True,                 # pad to longest in batch
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    if device.type == "cuda":
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            out = model(**enc, output_hidden_states=True, use_cache=False)
    else:
        out = model(**enc, output_hidden_states=True, use_cache=False)

    # Prefer last_hidden_state; causal-LM outputs only carry hidden_states.
    last = getattr(out, "last_hidden_state", None)
    if last is None:
        hidden_states = getattr(out, "hidden_states", None)
        if hidden_states is None:
            raise RuntimeError("Model did not return hidden states. Ensure output_hidden_states=True.")
        last = hidden_states[-1]

    # Pool in float32: summing many half-precision activations over the
    # sequence axis can overflow or lose precision.
    last = last.float()
    mask = enc["attention_mask"].unsqueeze(-1).to(last.dtype)
    pooled = (last * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.detach().cpu().numpy()

def embed_series(model, tok, series: pd.Series, device, max_length, batch_size, desc="Embedding"):
    """Embed every value of ``series`` and return the vectors in row order.

    Distinct texts are embedded once (in batches of ``batch_size`` via
    ``embed_batch``) and then mapped back to the original positions.

    Returns:
        A NumPy array stacked along axis 0, one row per series entry.
    """
    # Fill missing values BEFORE casting: astype(str) would otherwise turn
    # NaN into the literal text "nan", leaving nothing for fillna to replace.
    vals = series.fillna("").astype(str).tolist()
    # De-dup for speed (dict preserves first-seen order)
    uniq = list(dict.fromkeys(vals))
    cache = {}

    for i in tqdm(range(0, len(uniq), batch_size), desc=desc, leave=False):
        batch = uniq[i:i+batch_size]
        vecs  = embed_batch(model, tok, batch, device, max_length)
        for t, v in zip(batch, vecs):
            cache[t] = v

    # Map back to original order
    return np.stack([cache[v] for v in vals], axis=0)

# -------------- PCA + SIM --------------
def pca_reduce(X: np.ndarray, target_dim: int) -> np.ndarray:
    """Project ``X`` onto at most ``target_dim`` principal components.

    The component count is clipped to the number of samples and features;
    a message is printed when clipping occurs.
    """
    rows, cols = X.shape
    k = min(target_dim, rows, cols)
    if k < target_dim:
        print(f"PCA components clipped to {k} (samples={rows}, features={cols}).")
    reducer = PCA(n_components=k, random_state=42)
    return reducer.fit_transform(X)

def save_csv(matrix: np.ndarray, path: str):
    """Write ``matrix`` to ``path`` as CSV without an index column and log it."""
    frame = pd.DataFrame(matrix)
    frame.to_csv(path, index=False)
    print(f"Saved: {path}")

# -------------- MAIN --------------
def main():
    """Run the full pipeline: load model, embed the three text fields,
    PCA-reduce each embedding matrix, and save cosine-similarity CSVs."""
    device = setup_device()

    # Choose the batch size for the device and free memory before loading.
    on_gpu = device.type == "cuda"
    batch_size = BATCH_SIZE_GPU if on_gpu else BATCH_SIZE_CPU
    if on_gpu:
        torch.cuda.empty_cache()
    gc.collect()

    model, tok = load_model_and_tokenizer(device)

    # Read data (retry with an explicit engine if the default read fails)
    try:
        df = pd.read_excel(XLSX_PATH)
    except Exception:
        df = pd.read_excel(XLSX_PATH, engine="openpyxl")

    absent = next((c for c in ("Sender", "Subject", "Email") if c not in df.columns), None)
    if absent is not None:
        raise ValueError(f"Required column '{absent}' not found. Got: {list(df.columns)}")

    # (label, spreadsheet column, tokenizer max length, output file)
    plan = (
        ("Sender",  "Sender",  MAXLEN_SENDER,  OUT_SENDER),
        ("Subject", "Subject", MAXLEN_SUBJECT, OUT_SUBJECT),
        ("Body",    "Email",   MAXLEN_BODY,    OUT_BODY),
    )

    embedded = []
    for label, column, limit, _ in plan:
        print(f"Embedding {label}…")
        embedded.append(embed_series(model, tok, df[column], device, limit, batch_size, desc=label))

    print("PCA reducing…")
    reduced = [pca_reduce(matrix, TARGET_DIM) for matrix in embedded]

    print("Computing cosine similarity & saving…")
    for matrix, (_, _, _, out_path) in zip(reduced, plan):
        save_csv(cosine_similarity(matrix), out_path)

    gc.collect()
    if on_gpu:
        torch.cuda.empty_cache()

if __name__ == "__main__":
    main()


No CUDA device available. Falling back to CPU.
Detected adapter in: /home/users/skuikel/Downloads/Tune/FineTune/dpo_7B
Loading base from:   meta-llama/Llama-2-7b-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded base + adapter.
Embedding Sender…


Sender:   0%|          | 0/51 [00:00<?, ?it/s]

Embedding Subject…


Subject:   0%|          | 0/55 [00:00<?, ?it/s]

Embedding Body…


Body:   0%|          | 0/61 [00:00<?, ?it/s]

PCA reducing…
Computing cosine similarity & saving…
Saved: sender_similarity_matrix_llama7b_ft_pca_dpo.csv
Saved: subject_similarity_matrix_llama7b_ft_pca_dpo.csv
Saved: body_similarity_matrix_llama7b_ft_pca_dpo.csv


In [4]:


import os, gc
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM


# --- Model / data locations ---
# MODEL_DIR may hold a full model or an adapter folder (detected via adapter_config.json).
MODEL_DIR        = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_8B")  # full model OR adapter folder
# Base checkpoint id, overridable through the BASE_MODEL_ID environment variable.
BASE_MODEL_ID    = os.environ.get("BASE_MODEL_ID", "meta-llama/Meta-Llama-3-8B")     # used only if MODEL_DIR is an adapter
# Optional local base path; takes precedence over BASE_MODEL_ID when set.
BASE_MODEL_LOCAL = os.environ.get("BASE_MODEL_LOCAL")                               # local base path (preferred if you have it)
# Input spreadsheet path.
XLSX_PATH        = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

# Batch sizes (presumably chosen by device type in main() — TODO confirm).
BATCH_SIZE_GPU   = 32
BATCH_SIZE_CPU   = 4

# Per-field length limits (presumably tokenizer max_length values — TODO confirm).
MAXLEN_SENDER    = 48
MAXLEN_SUBJECT   = 64
MAXLEN_BODY      = 384
# NOTE(review): looks like the target PCA dimensionality — confirm against its consumer.
TARGET_DIM       = 200

# Output CSV file names for the three similarity matrices.
OUT_SENDER       = "sender_similarity_matrix_llama8b_ft_pca_dpo.csv"
OUT_SUBJECT      = "subject_similarity_matrix_llama8b_ft_pca_dpo.csv"
OUT_BODY         = "body_similarity_matrix_llama8b_ft_pca_dpo.csv"

def setup_device(visible_devices="3"):
    """Pick the compute device for this run.

    Writes ``visible_devices`` to the ``CUDA_VISIBLE_DEVICES`` environment
    variable, then returns ``cuda:0`` if CUDA is available and the CPU device
    otherwise. On the CUDA path it also sets ``PYTORCH_CUDA_ALLOC_CONF`` and
    enables TF32 for matmul and cuDNN.

    Args:
        visible_devices: Value stored in ``CUDA_VISIBLE_DEVICES`` (converted
            with ``str``). Defaults to ``"3"``, the index that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        torch.device: ``cuda:0`` or ``cpu``.
    """
    # NOTE(review): CUDA_VISIBLE_DEVICES is presumably only honoured if it is set
    # before CUDA is initialised in this process; if an earlier cell already used
    # the GPU this assignment may have no effect -- confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_devices)

    if not torch.cuda.is_available():
        return torch.device("cpu")

    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # With the visibility mask applied, index 0 is the first visible GPU.
    return torch.device("cuda:0")

def _is_adapter(path: str) -> bool:
    return os.path.isfile(os.path.join(path, "adapter_config.json"))

def _load_tokenizer(src: str):
    tok = AutoTokenizer.from_pretrained(src, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

def load_model_and_tokenizer(device):
    """Load the model found under ``MODEL_DIR`` together with a tokenizer.

    When ``MODEL_DIR`` is an adapter folder, the base model is loaded as a
    causal LM from ``BASE_MODEL_LOCAL`` (if set) or ``BASE_MODEL_ID`` and the
    adapter is attached on top; the tokenizer is taken from the base source.
    Otherwise ``MODEL_DIR`` is loaded directly with ``AutoModel``.

    Weights are requested as float16 on CUDA and float32 elsewhere; the model
    is moved to ``device`` and put in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    if not _is_adapter(MODEL_DIR):
        print(f"Loading full model from: {MODEL_DIR}")
        tokenizer = _load_tokenizer(MODEL_DIR)
        weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
        full_model = AutoModel.from_pretrained(MODEL_DIR, torch_dtype=weights_dtype)
        full_model = full_model.to(device).eval()
        print("Loaded full model.")
        return full_model, tokenizer

    base_src = BASE_MODEL_LOCAL or BASE_MODEL_ID
    print(f"Detected adapter in: {MODEL_DIR}")
    print(f"Loading base from:   {base_src}")
    tokenizer = _load_tokenizer(base_src)
    weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
    base_model = AutoModelForCausalLM.from_pretrained(base_src, torch_dtype=weights_dtype)
    base_model = base_model.to(device).eval()

    # Imported at this point so peft is only needed for adapter checkpoints.
    from peft import PeftModel
    adapted = PeftModel.from_pretrained(base_model, MODEL_DIR).eval()
    print("Loaded base + adapter.")
    return adapted, tokenizer

@torch.inference_mode()
def embed_batch(model, tok, texts, device, max_length):
    """Embed a batch of texts by mask-weighted mean pooling of the last hidden states.

    Args:
        model: Callable model; invoked with the tokenizer output plus
            ``output_hidden_states=True`` and ``use_cache=False``.
        tok: Tokenizer callable whose result contains ``"attention_mask"``.
        texts: Iterable of values; ``None`` becomes ``""`` and everything else
            is passed through ``str``.
        device: ``torch.device`` the inputs are moved to.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        numpy.ndarray: float32 array of shape ``[B, H]``.

    Raises:
        RuntimeError: If the model output exposes neither ``last_hidden_state``
            nor ``hidden_states``.
    """
    cleaned = ["" if t is None else str(t) for t in texts]
    enc = tok(
        cleaned,
        padding=True,                 # pad to longest in batch
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    if device.type == "cuda":
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast spelling.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            out = model(**enc, output_hidden_states=True, use_cache=False)
    else:
        out = model(**enc, output_hidden_states=True, use_cache=False)

    # Encoder-style outputs carry last_hidden_state; causal-LM outputs only
    # carry the hidden_states tuple, whose final entry is used instead.
    last = getattr(out, "last_hidden_state", None)
    if last is None:
        if getattr(out, "hidden_states", None) is None:
            raise RuntimeError("Model did not return hidden states. Ensure output_hidden_states=True.")
        last = out.hidden_states[-1]  # [B, T, H]

    # Pool in float32: summing float16 activations over the sequence can
    # overflow or lose precision, and it keeps the CPU and GPU paths returning
    # the same dtype.
    mask = enc["attention_mask"].unsqueeze(-1).to(torch.float32)   # [B, T, 1]
    summed = (last.to(torch.float32) * mask).sum(dim=1)            # [B, H]
    token_counts = mask.sum(dim=1).clamp(min=1)                    # avoid 0-division for empty rows
    pooled = summed / token_counts
    return pooled.detach().cpu().numpy()                           # [B, H]

def embed_series(model, tok, series: pd.Series, device, max_length, batch_size, desc="Embedding"):
    """Embed every entry of ``series``, computing each distinct text only once.

    Args:
        model, tok, device, max_length: Forwarded to ``embed_batch``.
        series: Column of values to embed; missing entries are embedded as ``""``.
        batch_size: Number of distinct texts per ``embed_batch`` call.
        desc: Label shown on the progress bar.

    Returns:
        numpy.ndarray: One embedding row per entry of ``series``, in input order.
    """
    # Stringify first, then blank the entries that were missing in the input.
    # The previous astype(str).fillna("") ran fillna too late: on pandas versions
    # where astype(str) stringifies missing values, they had already become the
    # literal text "nan"/"None" and were embedded as such.
    text = series.astype(str).mask(series.isna(), "")
    vals = text.tolist()

    # De-dup for speed (dict.fromkeys keeps first-seen order).
    uniq = list(dict.fromkeys(vals))
    cache = {}

    for i in tqdm(range(0, len(uniq), batch_size), desc=desc, leave=False):
        batch = uniq[i:i+batch_size]
        vecs = embed_batch(model, tok, batch, device, max_length)
        cache.update(zip(batch, vecs))

    # Expand back to one row per original entry.
    return np.stack([cache[v] for v in vals], axis=0)


def pca_reduce(X: np.ndarray, target_dim: int) -> np.ndarray:
    """Project ``X`` onto at most ``target_dim`` principal components.

    The component count is capped by both dimensions of ``X``; a message is
    printed whenever that cap is lower than ``target_dim``.
    """
    rows, cols = X.shape
    k = min(target_dim, rows, cols)
    if k < target_dim:
        print(f"PCA components clipped to {k} (samples={rows}, features={cols}).")
    reducer = PCA(n_components=k, random_state=42)
    return reducer.fit_transform(X)

def save_csv(matrix: np.ndarray, path: str):
    """Write ``matrix`` to ``path`` as CSV without an index column and report the path."""
    frame = pd.DataFrame(matrix)
    frame.to_csv(path, index=False)
    print(f"Saved: {path}")

def main():
    """Embed the Sender/Subject/Email columns, PCA-reduce them and save cosine-similarity CSVs.

    Raises:
        ValueError: If the spreadsheet lacks a 'Sender', 'Subject' or 'Email' column.
    """
    device = setup_device()

    # free mem
    if device.type == "cuda":
        batch_size = BATCH_SIZE_GPU
        torch.cuda.empty_cache()
    else:
        batch_size = BATCH_SIZE_CPU
    gc.collect()

    # Read and validate the data before loading the model, so a missing file
    # or column fails immediately instead of after the expensive model load.
    try:
        df = pd.read_excel(XLSX_PATH)
    except Exception:
        # Retry with an explicit engine; a second failure propagates.
        df = pd.read_excel(XLSX_PATH, engine="openpyxl")

    for col in ["Sender", "Subject", "Email"]:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found. Got: {list(df.columns)}")

    model, tok = load_model_and_tokenizer(device)

    print("Embedding Sender…")
    sender_emb  = embed_series(model, tok, df["Sender"],  device, MAXLEN_SENDER,  batch_size, desc="Sender")
    print("Embedding Subject…")
    subject_emb = embed_series(model, tok, df["Subject"], device, MAXLEN_SUBJECT, batch_size, desc="Subject")
    print("Embedding Body…")
    body_emb    = embed_series(model, tok, df["Email"],   device, MAXLEN_BODY,    batch_size, desc="Body")

    # The model is no longer needed. Drop the references first: while they are
    # alive, gc.collect()/empty_cache() cannot release the weights.
    del model, tok
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    print("PCA reducing…")
    sender_r  = pca_reduce(sender_emb,  TARGET_DIM)
    subject_r = pca_reduce(subject_emb, TARGET_DIM)
    body_r    = pca_reduce(body_emb,    TARGET_DIM)

    print("Computing cosine similarity & saving…")
    save_csv(cosine_similarity(sender_r),  OUT_SENDER)
    save_csv(cosine_similarity(subject_r), OUT_SUBJECT)
    save_csv(cosine_similarity(body_r),    OUT_BODY)

if __name__ == "__main__":
    main()


Detected adapter in: /home/users/skuikel/Downloads/Tune/FineTune/dpo_8B
Loading base from:   meta-llama/Meta-Llama-3-8B


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loaded base + adapter.
Embedding Sender…


Sender:   0%|          | 0/51 [00:00<?, ?it/s]

Embedding Subject…


Subject:   0%|          | 0/55 [00:00<?, ?it/s]

Embedding Body…


Body:   0%|          | 0/61 [00:00<?, ?it/s]

PCA reducing…
Computing cosine similarity & saving…
Saved: sender_similarity_matrix_llama8b_ft_pca_dpo.csv
Saved: subject_similarity_matrix_llama8b_ft_pca_dpo.csv
Saved: body_similarity_matrix_llama8b_ft_pca_dpo.csv


In [3]:


import os, gc
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM


# Fine-tuned checkpoint: either a full model directory or an adapter folder.
MODEL_DIR = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_Mistral")
# Base model id, consulted only when MODEL_DIR holds an adapter.
BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "mistralai/Mistral-7B-v0.1")
# Optional local path to the base model (preferred over BASE_MODEL_ID when set).
BASE_MODEL_LOCAL = os.getenv("BASE_MODEL_LOCAL")
# Spreadsheet providing the text columns to embed.
XLSX_PATH = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

# Number of texts per forward pass on GPU / CPU.
BATCH_SIZE_GPU, BATCH_SIZE_CPU = 32, 4

# Tokenizer truncation length per field.
MAXLEN_SENDER, MAXLEN_SUBJECT, MAXLEN_BODY = 48, 64, 384
# Requested number of PCA components.
TARGET_DIM = 200

# Output files for the three similarity matrices.
OUT_SENDER = "sender_similarity_matrix_mistral_ft_pca_dpo.csv"
OUT_SUBJECT = "subject_similarity_matrix_mistral_ft_pca_dpo.csv"
OUT_BODY = "body_similarity_matrix_mistral_ft_pca_dpo.csv"

def setup_device(visible_devices="3"):
    """Pick the compute device for this run.

    Writes ``visible_devices`` to the ``CUDA_VISIBLE_DEVICES`` environment
    variable, then returns ``cuda:0`` if CUDA is available and the CPU device
    otherwise. On the CUDA path it also sets ``PYTORCH_CUDA_ALLOC_CONF`` and
    enables TF32 for matmul and cuDNN.

    Args:
        visible_devices: Value stored in ``CUDA_VISIBLE_DEVICES`` (converted
            with ``str``). Defaults to ``"3"``, the index that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        torch.device: ``cuda:0`` or ``cpu``.
    """
    # NOTE(review): CUDA_VISIBLE_DEVICES is presumably only honoured if it is set
    # before CUDA is initialised in this process; if an earlier cell already used
    # the GPU this assignment may have no effect -- confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_devices)

    if not torch.cuda.is_available():
        return torch.device("cpu")

    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # With the visibility mask applied, index 0 is the first visible GPU.
    return torch.device("cuda:0")

def _is_adapter(path: str) -> bool:
    return os.path.isfile(os.path.join(path, "adapter_config.json"))

def _load_tokenizer(src: str):
    tok = AutoTokenizer.from_pretrained(src, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

def load_model_and_tokenizer(device):
    """Load the model found under ``MODEL_DIR`` together with a tokenizer.

    When ``MODEL_DIR`` is an adapter folder, the base model is loaded as a
    causal LM from ``BASE_MODEL_LOCAL`` (if set) or ``BASE_MODEL_ID`` and the
    adapter is attached on top; the tokenizer is taken from the base source.
    Otherwise ``MODEL_DIR`` is loaded directly with ``AutoModel``.

    Weights are requested as float16 on CUDA and float32 elsewhere; the model
    is moved to ``device`` and put in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    if not _is_adapter(MODEL_DIR):
        print(f"Loading full model from: {MODEL_DIR}")
        tokenizer = _load_tokenizer(MODEL_DIR)
        weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
        full_model = AutoModel.from_pretrained(MODEL_DIR, torch_dtype=weights_dtype)
        full_model = full_model.to(device).eval()
        print("Loaded full model.")
        return full_model, tokenizer

    base_src = BASE_MODEL_LOCAL or BASE_MODEL_ID
    print(f"Detected adapter in: {MODEL_DIR}")
    print(f"Loading base from:   {base_src}")
    tokenizer = _load_tokenizer(base_src)
    weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
    base_model = AutoModelForCausalLM.from_pretrained(base_src, torch_dtype=weights_dtype)
    base_model = base_model.to(device).eval()

    # Imported at this point so peft is only needed for adapter checkpoints.
    from peft import PeftModel
    adapted = PeftModel.from_pretrained(base_model, MODEL_DIR).eval()
    print("Loaded base + adapter.")
    return adapted, tokenizer

@torch.inference_mode()
def embed_batch(model, tok, texts, device, max_length):
    """Embed a batch of texts by mask-weighted mean pooling of the last hidden states.

    Args:
        model: Callable model; invoked with the tokenizer output plus
            ``output_hidden_states=True`` and ``use_cache=False``.
        tok: Tokenizer callable whose result contains ``"attention_mask"``.
        texts: Iterable of values; ``None`` becomes ``""`` and everything else
            is passed through ``str``.
        device: ``torch.device`` the inputs are moved to.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        numpy.ndarray: float32 array of shape ``[B, H]``.

    Raises:
        RuntimeError: If the model output exposes neither ``last_hidden_state``
            nor ``hidden_states``.
    """
    cleaned = ["" if t is None else str(t) for t in texts]
    enc = tok(
        cleaned,
        padding=True,                 # pad to longest in batch
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    if device.type == "cuda":
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast spelling.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            out = model(**enc, output_hidden_states=True, use_cache=False)
    else:
        out = model(**enc, output_hidden_states=True, use_cache=False)

    # Encoder-style outputs carry last_hidden_state; causal-LM outputs only
    # carry the hidden_states tuple, whose final entry is used instead.
    last = getattr(out, "last_hidden_state", None)
    if last is None:
        if getattr(out, "hidden_states", None) is None:
            raise RuntimeError("Model did not return hidden states. Ensure output_hidden_states=True.")
        last = out.hidden_states[-1]  # [B, T, H]

    # Pool in float32: summing float16 activations over the sequence can
    # overflow or lose precision, and it keeps the CPU and GPU paths returning
    # the same dtype.
    mask = enc["attention_mask"].unsqueeze(-1).to(torch.float32)   # [B, T, 1]
    summed = (last.to(torch.float32) * mask).sum(dim=1)            # [B, H]
    token_counts = mask.sum(dim=1).clamp(min=1)                    # avoid 0-division for empty rows
    pooled = summed / token_counts
    return pooled.detach().cpu().numpy()                           # [B, H]

def embed_series(model, tok, series: pd.Series, device, max_length, batch_size, desc="Embedding"):
    """Embed every entry of ``series``, computing each distinct text only once.

    Args:
        model, tok, device, max_length: Forwarded to ``embed_batch``.
        series: Column of values to embed; missing entries are embedded as ``""``.
        batch_size: Number of distinct texts per ``embed_batch`` call.
        desc: Label shown on the progress bar.

    Returns:
        numpy.ndarray: One embedding row per entry of ``series``, in input order.
    """
    # Stringify first, then blank the entries that were missing in the input.
    # The previous astype(str).fillna("") ran fillna too late: on pandas versions
    # where astype(str) stringifies missing values, they had already become the
    # literal text "nan"/"None" and were embedded as such.
    text = series.astype(str).mask(series.isna(), "")
    vals = text.tolist()

    # De-dup for speed (dict.fromkeys keeps first-seen order).
    uniq = list(dict.fromkeys(vals))
    cache = {}

    for i in tqdm(range(0, len(uniq), batch_size), desc=desc, leave=False):
        batch = uniq[i:i+batch_size]
        vecs = embed_batch(model, tok, batch, device, max_length)
        cache.update(zip(batch, vecs))

    # Expand back to one row per original entry.
    return np.stack([cache[v] for v in vals], axis=0)


def pca_reduce(X: np.ndarray, target_dim: int) -> np.ndarray:
    """Project ``X`` onto at most ``target_dim`` principal components.

    The component count is capped by both dimensions of ``X``; a message is
    printed whenever that cap is lower than ``target_dim``.
    """
    rows, cols = X.shape
    k = min(target_dim, rows, cols)
    if k < target_dim:
        print(f"PCA components clipped to {k} (samples={rows}, features={cols}).")
    reducer = PCA(n_components=k, random_state=42)
    return reducer.fit_transform(X)

def save_csv(matrix: np.ndarray, path: str):
    """Write ``matrix`` to ``path`` as CSV without an index column and report the path."""
    frame = pd.DataFrame(matrix)
    frame.to_csv(path, index=False)
    print(f"Saved: {path}")

def main():
    """Embed the Sender/Subject/Email columns, PCA-reduce them and save cosine-similarity CSVs.

    Raises:
        ValueError: If the spreadsheet lacks a 'Sender', 'Subject' or 'Email' column.
    """
    device = setup_device()

    # free mem
    if device.type == "cuda":
        batch_size = BATCH_SIZE_GPU
        torch.cuda.empty_cache()
    else:
        batch_size = BATCH_SIZE_CPU
    gc.collect()

    # Read and validate the data before loading the model, so a missing file
    # or column fails immediately instead of after the expensive model load.
    try:
        df = pd.read_excel(XLSX_PATH)
    except Exception:
        # Retry with an explicit engine; a second failure propagates.
        df = pd.read_excel(XLSX_PATH, engine="openpyxl")

    for col in ["Sender", "Subject", "Email"]:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found. Got: {list(df.columns)}")

    model, tok = load_model_and_tokenizer(device)

    print("Embedding Sender…")
    sender_emb  = embed_series(model, tok, df["Sender"],  device, MAXLEN_SENDER,  batch_size, desc="Sender")
    print("Embedding Subject…")
    subject_emb = embed_series(model, tok, df["Subject"], device, MAXLEN_SUBJECT, batch_size, desc="Subject")
    print("Embedding Body…")
    body_emb    = embed_series(model, tok, df["Email"],   device, MAXLEN_BODY,    batch_size, desc="Body")

    # The model is no longer needed. Drop the references first: while they are
    # alive, gc.collect()/empty_cache() cannot release the weights.
    del model, tok
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    print("PCA reducing…")
    sender_r  = pca_reduce(sender_emb,  TARGET_DIM)
    subject_r = pca_reduce(subject_emb, TARGET_DIM)
    body_r    = pca_reduce(body_emb,    TARGET_DIM)

    print("Computing cosine similarity & saving…")
    save_csv(cosine_similarity(sender_r),  OUT_SENDER)
    save_csv(cosine_similarity(subject_r), OUT_SUBJECT)
    save_csv(cosine_similarity(body_r),    OUT_BODY)

if __name__ == "__main__":
    main()


Detected adapter in: /home/users/skuikel/Downloads/Tune/FineTune/dpo_Mistral
Loading base from:   mistralai/Mistral-7B-v0.1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded base + adapter.
Embedding Sender…


Sender:   0%|          | 0/51 [00:00<?, ?it/s]

Embedding Subject…


Subject:   0%|          | 0/55 [00:00<?, ?it/s]

Embedding Body…


Body:   0%|          | 0/61 [00:00<?, ?it/s]

PCA reducing…
Computing cosine similarity & saving…
Saved: sender_similarity_matrix_mistral_ft_pca_dpo.csv
Saved: subject_similarity_matrix_mistral_ft_pca_dpo.csv
Saved: body_similarity_matrix_mistral_ft_pca_dpo.csv


In [2]:


import os, gc
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM


MODEL_DIR        = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_7B_Wizard")  # full model OR adapter folder
BASE_MODEL_ID    = os.environ.get("BASE_MODEL_ID", "dreamgen/WizardLM-2-7B")     # used only if MODEL_DIR is an adapter
BASE_MODEL_LOCAL = os.environ.get("BASE_MODEL_LOCAL")                               # local base path (preferred if you have it)
XLSX_PATH        = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

# BATCH_SIZE_GPU was missing from this cell although main() reads it on the
# CUDA path, which raised NameError on a fresh kernel with a GPU.
BATCH_SIZE_GPU   = 32
BATCH_SIZE_CPU   = 4

# Tokenizer truncation length per field, and requested PCA component count.
MAXLEN_SENDER    = 48
MAXLEN_SUBJECT   = 64
MAXLEN_BODY      = 384
TARGET_DIM       = 200

# Output files for the three similarity matrices.
OUT_SENDER       = "sender_similarity_matrix_wizard_ft_pca_dpo.csv"
OUT_SUBJECT      = "subject_similarity_matrix_wizard_ft_pca_dpo.csv"
OUT_BODY         = "body_similarity_matrix_wizard_ft_pca_dpo.csv"

def setup_device(visible_devices="3"):
    """Pick the compute device for this run.

    Writes ``visible_devices`` to the ``CUDA_VISIBLE_DEVICES`` environment
    variable, then returns ``cuda:0`` if CUDA is available and the CPU device
    otherwise. On the CUDA path it also sets ``PYTORCH_CUDA_ALLOC_CONF`` and
    enables TF32 for matmul and cuDNN.

    Args:
        visible_devices: Value stored in ``CUDA_VISIBLE_DEVICES`` (converted
            with ``str``). Defaults to ``"3"``, the index that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        torch.device: ``cuda:0`` or ``cpu``.
    """
    # NOTE(review): CUDA_VISIBLE_DEVICES is presumably only honoured if it is set
    # before CUDA is initialised in this process; if an earlier cell already used
    # the GPU this assignment may have no effect -- confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_devices)

    if not torch.cuda.is_available():
        return torch.device("cpu")

    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # With the visibility mask applied, index 0 is the first visible GPU.
    return torch.device("cuda:0")

def _is_adapter(path: str) -> bool:
    return os.path.isfile(os.path.join(path, "adapter_config.json"))

def _load_tokenizer(src: str):
    tok = AutoTokenizer.from_pretrained(src, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

def load_model_and_tokenizer(device):
    """Load the model found under ``MODEL_DIR`` together with a tokenizer.

    When ``MODEL_DIR`` is an adapter folder, the base model is loaded as a
    causal LM from ``BASE_MODEL_LOCAL`` (if set) or ``BASE_MODEL_ID`` and the
    adapter is attached on top; the tokenizer is taken from the base source.
    Otherwise ``MODEL_DIR`` is loaded directly with ``AutoModel``.

    Weights are requested as float16 on CUDA and float32 elsewhere; the model
    is moved to ``device`` and put in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    if not _is_adapter(MODEL_DIR):
        print(f"Loading full model from: {MODEL_DIR}")
        tokenizer = _load_tokenizer(MODEL_DIR)
        weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
        full_model = AutoModel.from_pretrained(MODEL_DIR, torch_dtype=weights_dtype)
        full_model = full_model.to(device).eval()
        print("Loaded full model.")
        return full_model, tokenizer

    base_src = BASE_MODEL_LOCAL or BASE_MODEL_ID
    print(f"Detected adapter in: {MODEL_DIR}")
    print(f"Loading base from:   {base_src}")
    tokenizer = _load_tokenizer(base_src)
    weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
    base_model = AutoModelForCausalLM.from_pretrained(base_src, torch_dtype=weights_dtype)
    base_model = base_model.to(device).eval()

    # Imported at this point so peft is only needed for adapter checkpoints.
    from peft import PeftModel
    adapted = PeftModel.from_pretrained(base_model, MODEL_DIR).eval()
    print("Loaded base + adapter.")
    return adapted, tokenizer

@torch.inference_mode()
def embed_batch(model, tok, texts, device, max_length):
    """Embed a batch of texts by mask-weighted mean pooling of the last hidden states.

    Args:
        model: Callable model; invoked with the tokenizer output plus
            ``output_hidden_states=True`` and ``use_cache=False``.
        tok: Tokenizer callable whose result contains ``"attention_mask"``.
        texts: Iterable of values; ``None`` becomes ``""`` and everything else
            is passed through ``str``.
        device: ``torch.device`` the inputs are moved to.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        numpy.ndarray: float32 array of shape ``[B, H]``.

    Raises:
        RuntimeError: If the model output exposes neither ``last_hidden_state``
            nor ``hidden_states``.
    """
    cleaned = ["" if t is None else str(t) for t in texts]
    enc = tok(
        cleaned,
        padding=True,                 # pad to longest in batch
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    if device.type == "cuda":
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast spelling.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            out = model(**enc, output_hidden_states=True, use_cache=False)
    else:
        out = model(**enc, output_hidden_states=True, use_cache=False)

    # Encoder-style outputs carry last_hidden_state; causal-LM outputs only
    # carry the hidden_states tuple, whose final entry is used instead.
    last = getattr(out, "last_hidden_state", None)
    if last is None:
        if getattr(out, "hidden_states", None) is None:
            raise RuntimeError("Model did not return hidden states. Ensure output_hidden_states=True.")
        last = out.hidden_states[-1]  # [B, T, H]

    # Pool in float32: summing float16 activations over the sequence can
    # overflow or lose precision, and it keeps the CPU and GPU paths returning
    # the same dtype.
    mask = enc["attention_mask"].unsqueeze(-1).to(torch.float32)   # [B, T, 1]
    summed = (last.to(torch.float32) * mask).sum(dim=1)            # [B, H]
    token_counts = mask.sum(dim=1).clamp(min=1)                    # avoid 0-division for empty rows
    pooled = summed / token_counts
    return pooled.detach().cpu().numpy()                           # [B, H]

def embed_series(model, tok, series: pd.Series, device, max_length, batch_size, desc="Embedding"):
    """Embed every entry of ``series``, computing each distinct text only once.

    Args:
        model, tok, device, max_length: Forwarded to ``embed_batch``.
        series: Column of values to embed; missing entries are embedded as ``""``.
        batch_size: Number of distinct texts per ``embed_batch`` call.
        desc: Label shown on the progress bar.

    Returns:
        numpy.ndarray: One embedding row per entry of ``series``, in input order.
    """
    # Stringify first, then blank the entries that were missing in the input.
    # The previous astype(str).fillna("") ran fillna too late: on pandas versions
    # where astype(str) stringifies missing values, they had already become the
    # literal text "nan"/"None" and were embedded as such.
    text = series.astype(str).mask(series.isna(), "")
    vals = text.tolist()

    # De-dup for speed (dict.fromkeys keeps first-seen order).
    uniq = list(dict.fromkeys(vals))
    cache = {}

    for i in tqdm(range(0, len(uniq), batch_size), desc=desc, leave=False):
        batch = uniq[i:i+batch_size]
        vecs = embed_batch(model, tok, batch, device, max_length)
        cache.update(zip(batch, vecs))

    # Expand back to one row per original entry.
    return np.stack([cache[v] for v in vals], axis=0)


def pca_reduce(X: np.ndarray, target_dim: int) -> np.ndarray:
    """Project ``X`` onto at most ``target_dim`` principal components.

    The component count is capped by both dimensions of ``X``; a message is
    printed whenever that cap is lower than ``target_dim``.
    """
    rows, cols = X.shape
    k = min(target_dim, rows, cols)
    if k < target_dim:
        print(f"PCA components clipped to {k} (samples={rows}, features={cols}).")
    reducer = PCA(n_components=k, random_state=42)
    return reducer.fit_transform(X)

def save_csv(matrix: np.ndarray, path: str):
    """Write ``matrix`` to ``path`` as CSV without an index column and report the path."""
    frame = pd.DataFrame(matrix)
    frame.to_csv(path, index=False)
    print(f"Saved: {path}")

def main():
    """Embed the Sender/Subject/Email columns, PCA-reduce them and save cosine-similarity CSVs.

    Raises:
        ValueError: If the spreadsheet lacks a 'Sender', 'Subject' or 'Email' column.
    """
    device = setup_device()

    # free mem
    if device.type == "cuda":
        batch_size = BATCH_SIZE_GPU
        torch.cuda.empty_cache()
    else:
        batch_size = BATCH_SIZE_CPU
    gc.collect()

    # Read and validate the data before loading the model, so a missing file
    # or column fails immediately instead of after the expensive model load.
    try:
        df = pd.read_excel(XLSX_PATH)
    except Exception:
        # Retry with an explicit engine; a second failure propagates.
        df = pd.read_excel(XLSX_PATH, engine="openpyxl")

    for col in ["Sender", "Subject", "Email"]:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found. Got: {list(df.columns)}")

    model, tok = load_model_and_tokenizer(device)

    print("Embedding Sender…")
    sender_emb  = embed_series(model, tok, df["Sender"],  device, MAXLEN_SENDER,  batch_size, desc="Sender")
    print("Embedding Subject…")
    subject_emb = embed_series(model, tok, df["Subject"], device, MAXLEN_SUBJECT, batch_size, desc="Subject")
    print("Embedding Body…")
    body_emb    = embed_series(model, tok, df["Email"],   device, MAXLEN_BODY,    batch_size, desc="Body")

    # The model is no longer needed. Drop the references first: while they are
    # alive, gc.collect()/empty_cache() cannot release the weights.
    del model, tok
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    print("PCA reducing…")
    sender_r  = pca_reduce(sender_emb,  TARGET_DIM)
    subject_r = pca_reduce(subject_emb, TARGET_DIM)
    body_r    = pca_reduce(body_emb,    TARGET_DIM)

    print("Computing cosine similarity & saving…")
    save_csv(cosine_similarity(sender_r),  OUT_SENDER)
    save_csv(cosine_similarity(subject_r), OUT_SUBJECT)
    save_csv(cosine_similarity(body_r),    OUT_BODY)

if __name__ == "__main__":
    main()


Detected adapter in: /home/users/skuikel/Downloads/Tune/FineTune/dpo_7B_Wizard
Loading base from:   dreamgen/WizardLM-2-7B


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loaded base + adapter.
Embedding Sender…


Sender:   0%|          | 0/51 [00:00<?, ?it/s]

Embedding Subject…


Subject:   0%|          | 0/55 [00:00<?, ?it/s]

Embedding Body…


Body:   0%|          | 0/61 [00:00<?, ?it/s]

PCA reducing…
Computing cosine similarity & saving…
Saved: sender_similarity_matrix_wizard_ft_pca_dpo.csv
Saved: subject_similarity_matrix_wizard_ft_pca_dpo.csv
Saved: body_similarity_matrix_wizard_ft_pca_dpo.csv


In [1]:


import os, gc
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM


# Fine-tuned checkpoint: either a full model directory or an adapter folder.
MODEL_DIR = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_Qwen")
# Base model id, consulted only when MODEL_DIR holds an adapter.
BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "Qwen/Qwen3-8B")
# Optional local path to the base model (preferred over BASE_MODEL_ID when set).
BASE_MODEL_LOCAL = os.getenv("BASE_MODEL_LOCAL")
# Spreadsheet providing the text columns to embed.
XLSX_PATH = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

# Number of texts per forward pass on GPU / CPU.
BATCH_SIZE_GPU, BATCH_SIZE_CPU = 32, 4

# Tokenizer truncation length per field.
MAXLEN_SENDER, MAXLEN_SUBJECT, MAXLEN_BODY = 48, 64, 384
# Requested number of PCA components.
TARGET_DIM = 200

# Output files for the three similarity matrices.
OUT_SENDER = "sender_similarity_matrix_qwen_ft_pca_dpo.csv"
OUT_SUBJECT = "subject_similarity_matrix_qwen_ft_pca_dpo.csv"
OUT_BODY = "body_similarity_matrix_qwen_ft_pca_dpo.csv"

def setup_device(visible_devices="3"):
    """Pick the compute device for this run.

    Writes ``visible_devices`` to the ``CUDA_VISIBLE_DEVICES`` environment
    variable, then returns ``cuda:0`` if CUDA is available and the CPU device
    otherwise. On the CUDA path it also sets ``PYTORCH_CUDA_ALLOC_CONF`` and
    enables TF32 for matmul and cuDNN.

    Args:
        visible_devices: Value stored in ``CUDA_VISIBLE_DEVICES`` (converted
            with ``str``). Defaults to ``"3"``, the index that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        torch.device: ``cuda:0`` or ``cpu``.
    """
    # NOTE(review): CUDA_VISIBLE_DEVICES is presumably only honoured if it is set
    # before CUDA is initialised in this process; if an earlier cell already used
    # the GPU this assignment may have no effect -- confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(visible_devices)

    if not torch.cuda.is_available():
        return torch.device("cpu")

    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # With the visibility mask applied, index 0 is the first visible GPU.
    return torch.device("cuda:0")

def _is_adapter(path: str) -> bool:
    return os.path.isfile(os.path.join(path, "adapter_config.json"))

def _load_tokenizer(src: str):
    tok = AutoTokenizer.from_pretrained(src, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

def load_model_and_tokenizer(device):
    """Load the model found under ``MODEL_DIR`` together with a tokenizer.

    When ``MODEL_DIR`` is an adapter folder, the base model is loaded as a
    causal LM from ``BASE_MODEL_LOCAL`` (if set) or ``BASE_MODEL_ID`` and the
    adapter is attached on top; the tokenizer is taken from the base source.
    Otherwise ``MODEL_DIR`` is loaded directly with ``AutoModel``.

    Weights are requested as float16 on CUDA and float32 elsewhere; the model
    is moved to ``device`` and put in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    if not _is_adapter(MODEL_DIR):
        print(f"Loading full model from: {MODEL_DIR}")
        tokenizer = _load_tokenizer(MODEL_DIR)
        weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
        full_model = AutoModel.from_pretrained(MODEL_DIR, torch_dtype=weights_dtype)
        full_model = full_model.to(device).eval()
        print("Loaded full model.")
        return full_model, tokenizer

    base_src = BASE_MODEL_LOCAL or BASE_MODEL_ID
    print(f"Detected adapter in: {MODEL_DIR}")
    print(f"Loading base from:   {base_src}")
    tokenizer = _load_tokenizer(base_src)
    weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
    base_model = AutoModelForCausalLM.from_pretrained(base_src, torch_dtype=weights_dtype)
    base_model = base_model.to(device).eval()

    # Imported at this point so peft is only needed for adapter checkpoints.
    from peft import PeftModel
    adapted = PeftModel.from_pretrained(base_model, MODEL_DIR).eval()
    print("Loaded base + adapter.")
    return adapted, tokenizer

@torch.inference_mode()
def embed_batch(model, tok, texts, device, max_length):
    """Embed a batch of texts by mask-weighted mean pooling of the last hidden states.

    Args:
        model: Callable model; invoked with the tokenizer output plus
            ``output_hidden_states=True`` and ``use_cache=False``.
        tok: Tokenizer callable whose result contains ``"attention_mask"``.
        texts: Iterable of values; ``None`` becomes ``""`` and everything else
            is passed through ``str``.
        device: ``torch.device`` the inputs are moved to.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        numpy.ndarray: float32 array of shape ``[B, H]``.

    Raises:
        RuntimeError: If the model output exposes neither ``last_hidden_state``
            nor ``hidden_states``.
    """
    cleaned = ["" if t is None else str(t) for t in texts]
    enc = tok(
        cleaned,
        padding=True,                 # pad to longest in batch
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    if device.type == "cuda":
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast spelling.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            out = model(**enc, output_hidden_states=True, use_cache=False)
    else:
        out = model(**enc, output_hidden_states=True, use_cache=False)

    # Encoder-style outputs carry last_hidden_state; causal-LM outputs only
    # carry the hidden_states tuple, whose final entry is used instead.
    last = getattr(out, "last_hidden_state", None)
    if last is None:
        if getattr(out, "hidden_states", None) is None:
            raise RuntimeError("Model did not return hidden states. Ensure output_hidden_states=True.")
        last = out.hidden_states[-1]  # [B, T, H]

    # Pool in float32: summing float16 activations over the sequence can
    # overflow or lose precision, and it keeps the CPU and GPU paths returning
    # the same dtype.
    mask = enc["attention_mask"].unsqueeze(-1).to(torch.float32)   # [B, T, 1]
    summed = (last.to(torch.float32) * mask).sum(dim=1)            # [B, H]
    token_counts = mask.sum(dim=1).clamp(min=1)                    # avoid 0-division for empty rows
    pooled = summed / token_counts
    return pooled.detach().cpu().numpy()                           # [B, H]

def embed_series(model, tok, series: pd.Series, device, max_length, batch_size, desc="Embedding"):
    """Embed every entry of ``series``, computing each distinct text only once.

    Args:
        model, tok, device, max_length: Forwarded to ``embed_batch``.
        series: Column of values to embed; missing entries are embedded as ``""``.
        batch_size: Number of distinct texts per ``embed_batch`` call.
        desc: Label shown on the progress bar.

    Returns:
        numpy.ndarray: One embedding row per entry of ``series``, in input order.
    """
    # Stringify first, then blank the entries that were missing in the input.
    # The previous astype(str).fillna("") ran fillna too late: on pandas versions
    # where astype(str) stringifies missing values, they had already become the
    # literal text "nan"/"None" and were embedded as such.
    text = series.astype(str).mask(series.isna(), "")
    vals = text.tolist()

    # De-dup for speed (dict.fromkeys keeps first-seen order).
    uniq = list(dict.fromkeys(vals))
    cache = {}

    for i in tqdm(range(0, len(uniq), batch_size), desc=desc, leave=False):
        batch = uniq[i:i+batch_size]
        vecs = embed_batch(model, tok, batch, device, max_length)
        cache.update(zip(batch, vecs))

    # Expand back to one row per original entry.
    return np.stack([cache[v] for v in vals], axis=0)


def pca_reduce(X: np.ndarray, target_dim: int) -> np.ndarray:
    """Project ``X`` onto at most ``target_dim`` principal components.

    The component count is capped by both dimensions of ``X``; a message is
    printed whenever that cap is lower than ``target_dim``.
    """
    rows, cols = X.shape
    k = min(target_dim, rows, cols)
    if k < target_dim:
        print(f"PCA components clipped to {k} (samples={rows}, features={cols}).")
    reducer = PCA(n_components=k, random_state=42)
    return reducer.fit_transform(X)

def save_csv(matrix: np.ndarray, path: str):
    """Write ``matrix`` to ``path`` as CSV without an index column and report the path."""
    frame = pd.DataFrame(matrix)
    frame.to_csv(path, index=False)
    print(f"Saved: {path}")

def main():
    """Embed the Sender/Subject/Email columns, PCA-reduce them and save cosine-similarity CSVs.

    Raises:
        ValueError: If the spreadsheet lacks a 'Sender', 'Subject' or 'Email' column.
    """
    device = setup_device()

    # free mem
    if device.type == "cuda":
        batch_size = BATCH_SIZE_GPU
        torch.cuda.empty_cache()
    else:
        batch_size = BATCH_SIZE_CPU
    gc.collect()

    # Read and validate the data before loading the model, so a missing file
    # or column fails immediately instead of after the expensive model load.
    try:
        df = pd.read_excel(XLSX_PATH)
    except Exception:
        # Retry with an explicit engine; a second failure propagates.
        df = pd.read_excel(XLSX_PATH, engine="openpyxl")

    for col in ["Sender", "Subject", "Email"]:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found. Got: {list(df.columns)}")

    model, tok = load_model_and_tokenizer(device)

    print("Embedding Sender…")
    sender_emb  = embed_series(model, tok, df["Sender"],  device, MAXLEN_SENDER,  batch_size, desc="Sender")
    print("Embedding Subject…")
    subject_emb = embed_series(model, tok, df["Subject"], device, MAXLEN_SUBJECT, batch_size, desc="Subject")
    print("Embedding Body…")
    body_emb    = embed_series(model, tok, df["Email"],   device, MAXLEN_BODY,    batch_size, desc="Body")

    # The model is no longer needed. Drop the references first: while they are
    # alive, gc.collect()/empty_cache() cannot release the weights.
    del model, tok
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    print("PCA reducing…")
    sender_r  = pca_reduce(sender_emb,  TARGET_DIM)
    subject_r = pca_reduce(subject_emb, TARGET_DIM)
    body_r    = pca_reduce(body_emb,    TARGET_DIM)

    print("Computing cosine similarity & saving…")
    save_csv(cosine_similarity(sender_r),  OUT_SENDER)
    save_csv(cosine_similarity(subject_r), OUT_SUBJECT)
    save_csv(cosine_similarity(body_r),    OUT_BODY)

if __name__ == "__main__":
    main()


Detected adapter in: /home/users/skuikel/Downloads/Tune/FineTune/dpo_Qwen
Loading base from:   Qwen/Qwen3-8B


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Loaded base + adapter.
Embedding Sender…


Sender:   0%|          | 0/51 [00:00<?, ?it/s]

Embedding Subject…


Subject:   0%|          | 0/55 [00:00<?, ?it/s]

Embedding Body…


Body:   0%|          | 0/61 [00:00<?, ?it/s]

PCA reducing…
Computing cosine similarity & saving…
Saved: sender_similarity_matrix_qwen_ft_pca_dpo.csv
Saved: subject_similarity_matrix_qwen_ft_pca_dpo.csv
Saved: body_similarity_matrix_qwen_ft_pca_dpo.csv


In [6]:


import os, gc
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM


# Fine-tuned checkpoint: either a full model directory or an adapter folder.
MODEL_DIR = os.path.expanduser("~/Downloads/Tune/FineTune/dpo_bert_uncased")
# Base model id, consulted only when MODEL_DIR holds an adapter.
BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "bert-base-uncased")
# Optional local path to the base model (preferred over BASE_MODEL_ID when set).
BASE_MODEL_LOCAL = os.getenv("BASE_MODEL_LOCAL")
# Spreadsheet providing the text columns to embed.
XLSX_PATH = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

# Number of texts per forward pass on GPU / CPU.
BATCH_SIZE_GPU, BATCH_SIZE_CPU = 32, 4

# Tokenizer truncation length per field.
MAXLEN_SENDER, MAXLEN_SUBJECT, MAXLEN_BODY = 48, 64, 384
# Requested number of PCA components.
TARGET_DIM = 200

# Output files for the three similarity matrices.
OUT_SENDER = "sender_similarity_matrix_bert_ft_pca_dpo.csv"
OUT_SUBJECT = "subject_similarity_matrix_bert_ft_pca_dpo.csv"
OUT_BODY = "body_similarity_matrix_bert_ft_pca_dpo.csv"

def setup_device():
    """Return the CUDA device when one is available, otherwise the CPU device.

    On the CUDA path TF32 is enabled for matmul and cuDNN, and the device name
    and device count are printed; on the CPU path a fallback notice is printed.
    """
    if not torch.cuda.is_available():
        print("No CUDA device available. Falling back to CPU.")
        return torch.device("cpu")

    gpu = torch.device("cuda")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print(f"Using device: {torch.cuda.get_device_name(None)}")
    print(f"Total CUDA devices: {torch.cuda.device_count()}")
    return gpu

def _is_adapter(path: str) -> bool:
    return os.path.isfile(os.path.join(path, "adapter_config.json"))

def _load_tokenizer(src: str):
    tok = AutoTokenizer.from_pretrained(src, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

def load_model_and_tokenizer(device):
    """Load the model found under ``MODEL_DIR`` together with a tokenizer.

    When ``MODEL_DIR`` is an adapter folder, the base model is loaded as a
    causal LM from ``BASE_MODEL_LOCAL`` (if set) or ``BASE_MODEL_ID`` and the
    adapter is attached on top; the tokenizer is taken from the base source.
    Otherwise ``MODEL_DIR`` is loaded directly with ``AutoModel``.

    Weights are requested as float16 on CUDA and float32 elsewhere; the model
    is moved to ``device`` and put in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    if not _is_adapter(MODEL_DIR):
        print(f"Loading full model from: {MODEL_DIR}")
        tokenizer = _load_tokenizer(MODEL_DIR)
        weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
        full_model = AutoModel.from_pretrained(MODEL_DIR, torch_dtype=weights_dtype)
        full_model = full_model.to(device).eval()
        print("Loaded full model.")
        return full_model, tokenizer

    base_src = BASE_MODEL_LOCAL or BASE_MODEL_ID
    print(f"Detected adapter in: {MODEL_DIR}")
    print(f"Loading base from:   {base_src}")
    tokenizer = _load_tokenizer(base_src)
    weights_dtype = torch.float16 if device.type == "cuda" else torch.float32
    base_model = AutoModelForCausalLM.from_pretrained(base_src, torch_dtype=weights_dtype)
    base_model = base_model.to(device).eval()

    # Imported at this point so peft is only needed for adapter checkpoints.
    from peft import PeftModel
    adapted = PeftModel.from_pretrained(base_model, MODEL_DIR).eval()
    print("Loaded base + adapter.")
    return adapted, tokenizer

@torch.inference_mode()
def embed_batch(model, tok, texts, device, max_length):
    """Embed a batch of texts by mask-weighted mean pooling of the last hidden states.

    Args:
        model: Callable model; invoked with the tokenizer output plus
            ``output_hidden_states=True`` and ``use_cache=False``.
        tok: Tokenizer callable whose result contains ``"attention_mask"``.
        texts: Iterable of values; ``None`` becomes ``""`` and everything else
            is passed through ``str``.
        device: ``torch.device`` the inputs are moved to.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        numpy.ndarray: float32 array of shape ``[B, H]``.

    Raises:
        RuntimeError: If the model output exposes neither ``last_hidden_state``
            nor ``hidden_states``.
    """
    cleaned = ["" if t is None else str(t) for t in texts]
    enc = tok(
        cleaned,
        padding=True,                 # pad to longest in batch
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    enc = {k: v.to(device, non_blocking=True) for k, v in enc.items()}

    if device.type == "cuda":
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast spelling.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            out = model(**enc, output_hidden_states=True, use_cache=False)
    else:
        out = model(**enc, output_hidden_states=True, use_cache=False)

    # Encoder-style outputs carry last_hidden_state; causal-LM outputs only
    # carry the hidden_states tuple, whose final entry is used instead.
    last = getattr(out, "last_hidden_state", None)
    if last is None:
        if getattr(out, "hidden_states", None) is None:
            raise RuntimeError("Model did not return hidden states. Ensure output_hidden_states=True.")
        last = out.hidden_states[-1]  # [B, T, H]

    # Pool in float32: summing float16 activations over the sequence can
    # overflow or lose precision, and it keeps the CPU and GPU paths returning
    # the same dtype.
    mask = enc["attention_mask"].unsqueeze(-1).to(torch.float32)   # [B, T, 1]
    summed = (last.to(torch.float32) * mask).sum(dim=1)            # [B, H]
    token_counts = mask.sum(dim=1).clamp(min=1)                    # avoid 0-division for empty rows
    pooled = summed / token_counts
    return pooled.detach().cpu().numpy()                           # [B, H]

def embed_series(model, tok, series: pd.Series, device, max_length, batch_size, desc="Embedding"):
    """Embed every value of ``series`` and return one row per original entry.

    Distinct strings are embedded only once (in batches of ``batch_size`` through
    ``embed_batch``) and the resulting vectors are fanned back out in the original
    row order.

    Args:
        model: Model forwarded to ``embed_batch``.
        tok: Tokenizer forwarded to ``embed_batch``.
        series: Values to embed; missing cells are embedded as the empty string.
        device: Device forwarded to ``embed_batch``.
        max_length: Truncation length forwarded to ``embed_batch``.
        batch_size: Number of distinct strings per ``embed_batch`` call.
        desc: Label shown on the progress bar.

    Returns:
        ``numpy.ndarray`` with one embedding row per element of ``series``.

    Raises:
        ValueError: If ``series`` is empty (there is nothing to stack).
    """
    # Blank out missing cells BEFORE stringifying: astype(str) turns NaN/None into
    # the literal text "nan"/"None", after which fillna("") has nothing left to fill
    # and the placeholder text would be embedded as if it were real content.
    vals = series.astype(object).where(series.notna(), "").astype(str).tolist()
    if not vals:
        raise ValueError(f"Cannot embed an empty series ({desc}).")

    # De-dup for speed (dict.fromkeys keeps first-seen order)
    uniq = list(dict.fromkeys(vals))
    cache = {}

    for i in tqdm(range(0, len(uniq), batch_size), desc=desc, leave=False):
        batch = uniq[i:i+batch_size]
        vecs  = embed_batch(model, tok, batch, device, max_length)
        cache.update(zip(batch, vecs))

    return np.stack([cache[v] for v in vals], axis=0)


def pca_reduce(X: np.ndarray, target_dim: int) -> np.ndarray:
    """Project ``X`` onto at most ``target_dim`` principal components.

    The component count is capped at the smaller of the sample count and the
    feature count; a message is printed whenever that cap lowers the request.

    Args:
        X: 2-D array of shape ``(n_samples, n_features)``.
        target_dim: Requested number of output dimensions.

    Returns:
        The PCA-transformed array produced by ``fit_transform``.
    """
    rows, cols = X.shape
    k = min(target_dim, rows, cols)
    if k < target_dim:
        print(f"PCA components clipped to {k} (samples={rows}, features={cols}).")
    # Fixed seed keeps any randomized solver path reproducible.
    reducer = PCA(n_components=k, random_state=42)
    return reducer.fit_transform(X)

def save_csv(matrix: np.ndarray, path: str):
    """Write ``matrix`` to ``path`` as CSV (no index column) and report the location.

    Args:
        matrix: Array to persist; it is wrapped in a ``DataFrame`` first, so the
            header row holds the default integer column labels.
        path: Destination file path.
    """
    frame = pd.DataFrame(matrix)
    frame.to_csv(path, index=False)
    print(f"Saved: {path}")

def main():
    """Run the embedding -> PCA -> cosine-similarity pipeline end to end.

    Steps:
        1. Pick the device and the matching batch size.
        2. Load the model and tokenizer.
        3. Read the spreadsheet at ``XLSX_PATH`` and check that the ``Sender``,
           ``Subject`` and ``Email`` columns exist.
        4. Embed each of the three columns.
        5. Release the model, then PCA-reduce each embedding matrix to at most
           ``TARGET_DIM`` dimensions.
        6. Write one cosine-similarity matrix per column to ``OUT_SENDER``,
           ``OUT_SUBJECT`` and ``OUT_BODY``.

    Raises:
        ValueError: If a required column is missing from the spreadsheet.
    """
    device = setup_device()

    # free mem
    if device.type == "cuda":
        batch_size = BATCH_SIZE_GPU
        torch.cuda.empty_cache()
    else:
        batch_size = BATCH_SIZE_CPU
    gc.collect()

    model, tok = load_model_and_tokenizer(device)

    # Read data
    try:
        df = pd.read_excel(XLSX_PATH)
    except Exception:
        # Best-effort retry with an explicit engine; if this fails too, the error propagates.
        df = pd.read_excel(XLSX_PATH, engine="openpyxl")

    for col in ["Sender", "Subject", "Email"]:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found. Got: {list(df.columns)}")

    print("Embedding Sender…")
    sender_emb  = embed_series(model, tok, df["Sender"],  device, MAXLEN_SENDER,  batch_size, desc="Sender")
    print("Embedding Subject…")
    subject_emb = embed_series(model, tok, df["Subject"], device, MAXLEN_SUBJECT, batch_size, desc="Subject")
    print("Embedding Body…")
    body_emb    = embed_series(model, tok, df["Email"],   device, MAXLEN_BODY,    batch_size, desc="Body")

    # The model is not needed past this point. Drop the references BEFORE collecting:
    # empty_cache() cannot release memory that a live object still holds, so cleaning up
    # only at the very end (with `model` still bound) would free nothing.
    del model, tok
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    print("PCA reducing…")
    sender_r  = pca_reduce(sender_emb,  TARGET_DIM)
    subject_r = pca_reduce(subject_emb, TARGET_DIM)
    body_r    = pca_reduce(body_emb,    TARGET_DIM)

    print("Computing cosine similarity & saving…")
    save_csv(cosine_similarity(sender_r),  OUT_SENDER)
    save_csv(cosine_similarity(subject_r), OUT_SUBJECT)
    save_csv(cosine_similarity(body_r),    OUT_BODY)

    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

# Entry point: run the full pipeline only when this code is executed directly
# (i.e. when __name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Using device: NVIDIA RTX A5000
Total CUDA devices: 3
Loading full model from: /home/users/skuikel/Downloads/Tune/FineTune/dpo_bert_uncased
Loaded full model.
Embedding Sender…


Sender:   0%|          | 0/7 [00:00<?, ?it/s]

Embedding Subject…


  with torch.cuda.amp.autocast(dtype=torch.float16):


Subject:   0%|          | 0/7 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=torch.float16):


Embedding Body…


Body:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=torch.float16):


PCA reducing…
Computing cosine similarity & saving…
Saved: sender_similarity_matrix_bert_ft_pca_dpo.csv
Saved: subject_similarity_matrix_bert_ft_pca_dpo.csv
Saved: body_similarity_matrix_bert_ft_pca_dpo.csv
