In [7]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gc


def load_fine_tuned_model(model_dir, device):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = LlamaForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer

def get_embeddings(model, tokenizer, text, device, max_length=512):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]  
        embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()  
    return embedding

def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    data = pd.read_excel(csv_path)
    sender_embeddings, subject_embeddings, body_embeddings = [], [], []
    
    with torch.no_grad():
        for _, row in data.iterrows():
            sender = str(row['Sender'])
            subject = str(row['Subject'])
            body = str(row['Email'])
            
           
            sender_embeddings.append(get_embeddings(model, tokenizer, sender, device, max_length))
            subject_embeddings.append(get_embeddings(model, tokenizer, subject, device, max_length))
            body_embeddings.append(get_embeddings(model, tokenizer, body, device, max_length))
    
    return sender_embeddings, subject_embeddings, body_embeddings


def calculate_similarity_matrix(embeddings):
    return cosine_similarity(embeddings)


def save_matrix_to_csv(matrix, filename):
    df = pd.DataFrame(matrix)
    df.to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/bert_dpo123_classification_model")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

    model, tokenizer = load_fine_tuned_model(model_dir, device)

    sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)

    sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
    subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
    body_similarity_matrix = calculate_similarity_matrix(body_embeddings)
    

    save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_bert_dpo1.csv")
    save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_bert_dpo1.csv")
    save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_bert_dpo1.csv")

    # Clean up memory using cache 
gc.collect()
torch.cuda.empty_cache()

if __name__ == "__main__":
    main()


Using CPU


You are using a model of type bert to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /home/users/skuikel/Downloads/Tune/FineTune/bert_dpo123_classification_model and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v

TypeError: forward() got an unexpected keyword argument 'token_type_ids'

In [2]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer
import gc





def setup_environment():
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
    else:
        device = torch.device("cpu")
    return device

def load_fine_tuned_model(model_dir, device):
    tokenizer = LlamaTokenizer.from_pretrained(model_dir)
    model = LlamaForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer



def get_embeddings(model, tokenizer, text, device, max_length=512):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.hidden_states[-1] 
        embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()  
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    data = pd.read_excel(csv_path)
    sender_embeddings, subject_embeddings, body_embeddings = [], [], []
    
    with torch.no_grad():
        for _, row in data.iterrows():
            sender = str(row['Sender'])
            subject = str(row['Subject'])
            body = str(row['Email'])
            
            
            sender_embeddings.append(get_embeddings(model, tokenizer, sender, device, max_length))
            subject_embeddings.append(get_embeddings(model, tokenizer, subject, device, max_length))
            body_embeddings.append(get_embeddings(model, tokenizer, body, device, max_length))
    
    return sender_embeddings, subject_embeddings, body_embeddings


def calculate_similarity_matrix(embeddings):
    return cosine_similarity(embeddings)


def save_matrix_to_csv(matrix, filename):
    df = pd.DataFrame(matrix)
    df.to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_7b_dpo123_classification_model")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    
 
    model, tokenizer = load_fine_tuned_model(model_dir, device)
    
   
    sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)
    
  
    sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
    subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
    body_similarity_matrix = calculate_similarity_matrix(body_embeddings)
    
   
    save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_llama_7b_dpo.csv")
    save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_llama_7b_dpo.csv")
    save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_llama_7b_dpo.csv")

   
gc.collect()
torch.cuda.empty_cache()

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/users/skuikel/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
Saved similarity matrix to sender_similarity_matrix_llama_7b_dpo.csv
Saved similarity matrix to subject_similarity_matrix_llama_7b_dpo.csv
Saved similarity matrix to body_similarity_matrix_llama_7b_dpo.csv


In [3]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer
import gc


from transformers import LlamaForSequenceClassification, AutoTokenizer

def load_fine_tuned_model(model_dir, device):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = LlamaForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer


def get_embeddings(model, tokenizer, text, device, max_length=512):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.hidden_states[-1] 
        embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()  
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    data = pd.read_excel(csv_path)
    sender_embeddings, subject_embeddings, body_embeddings = [], [], []
    
    with torch.no_grad():
        for _, row in data.iterrows():
            sender = str(row['Sender'])
            subject = str(row['Subject'])
            body = str(row['Email'])
            
            
            sender_embeddings.append(get_embeddings(model, tokenizer, sender, device, max_length))
            subject_embeddings.append(get_embeddings(model, tokenizer, subject, device, max_length))
            body_embeddings.append(get_embeddings(model, tokenizer, body, device, max_length))
    
    return sender_embeddings, subject_embeddings, body_embeddings


def calculate_similarity_matrix(embeddings):
    return cosine_similarity(embeddings)



def save_matrix_to_csv(matrix, filename):
    df = pd.DataFrame(matrix)
    df.to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/llama_8b_dpo123_classification_model")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    
 
    model, tokenizer = load_fine_tuned_model(model_dir, device)
    
   
    sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)
    
  
    sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
    subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
    body_similarity_matrix = calculate_similarity_matrix(body_embeddings)
    
   
    save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_llama_8b_dpo.csv")
    save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_llama_8b_dpo.csv")
    save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_llama_8b_dpo.csv")

 #  
gc.collect()
torch.cuda.empty_cache()

if __name__ == "__main__":
    main()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved similarity matrix to sender_similarity_matrix_llama_8b_dpo.csv
Saved similarity matrix to subject_similarity_matrix_llama_8b_dpo.csv
Saved similarity matrix to body_similarity_matrix_llama_8b_dpo.csv


In [4]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import LlamaForSequenceClassification, LlamaTokenizer, AutoConfig, AutoTokenizer
import gc


from transformers import LlamaForSequenceClassification, AutoTokenizer

def load_fine_tuned_model(model_dir, device):
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = LlamaForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    model.to(device)
    model.eval()
    return model, tokenizer


def get_embeddings(model, tokenizer, text, device, max_length=512):
    inputs = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.hidden_states[-1] 
        embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()  
    return embedding


def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    data = pd.read_excel(csv_path)
    sender_embeddings, subject_embeddings, body_embeddings = [], [], []
    
    with torch.no_grad():
        for _, row in data.iterrows():
            sender = str(row['Sender'])
            subject = str(row['Subject'])
            body = str(row['Email'])
            
            
            sender_embeddings.append(get_embeddings(model, tokenizer, sender, device, max_length))
            subject_embeddings.append(get_embeddings(model, tokenizer, subject, device, max_length))
            body_embeddings.append(get_embeddings(model, tokenizer, body, device, max_length))
    
    return sender_embeddings, subject_embeddings, body_embeddings


def calculate_similarity_matrix(embeddings):
    return cosine_similarity(embeddings)



def save_matrix_to_csv(matrix, filename):
    df = pd.DataFrame(matrix)
    df.to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")


def main():
    device = setup_environment()
    model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/wizard_7b_dpo_classification_model")
    csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")
    
 
    model, tokenizer = load_fine_tuned_model(model_dir, device)
    
   
    sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(model, tokenizer, csv_path, device)
    
  
    sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
    subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
    body_similarity_matrix = calculate_similarity_matrix(body_embeddings)
    
   
    save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_wizard_dpo.csv")
    save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_wizard_dpo.csv")
    save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_wizard_dpo.csv")

 #  
gc.collect()
torch.cuda.empty_cache()

if __name__ == "__main__":
    main()


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at dreamgen/WizardLM-2-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Saved similarity matrix to sender_similarity_matrix_wizard_dpo.csv
Saved similarity matrix to subject_similarity_matrix_wizard_dpo.csv
Saved similarity matrix to body_similarity_matrix_wizard_dpo.csv


In [8]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gc

def setup_environment():
    """Set up the GPU environment and return the appropriate device."""
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
    return device

def load_fine_tuned_model(model_dir, device):
    """Load the fine-tuned BERT model and tokenizer."""
    # Use AutoModelForSequenceClassification instead of LlamaForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_dir, 
        output_hidden_states=True,
        num_labels=2
    )
    model.to(device)
    model.eval()
    return model, tokenizer

def get_embeddings(model, tokenizer, text, device, max_length=512):
    """Get embeddings for a given text using the model."""
    # Remove token_type_ids from inputs if using BERT
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    
    # Remove token_type_ids if present
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']
    
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        # Get the last hidden state
        hidden_states = outputs.hidden_states[-1]
        # Mean pooling with attention mask
        attention_mask = inputs['attention_mask']
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_embeddings = torch.sum(hidden_states * mask, dim=1)
        sum_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
        embedding = (sum_embeddings / sum_mask).squeeze().cpu().numpy()
    return embedding

def extract_embeddings_from_csv(model, tokenizer, csv_path, device, max_length=512):
    """Extract embeddings from CSV file."""
    try:
        data = pd.read_excel(csv_path)
        sender_embeddings, subject_embeddings, body_embeddings = [], [], []
        
        total_rows = len(data)
        print(f"Processing {total_rows} emails...")
        
        with torch.no_grad():
            for idx, row in data.iterrows():
                if idx % 10 == 0:
                    print(f"Processing email {idx}/{total_rows}")
                
                try:
                    sender = str(row['Sender'])
                    subject = str(row['Subject'])
                    body = str(row['Email'])
                    
                    sender_embeddings.append(get_embeddings(model, tokenizer, sender, device, max_length))
                    subject_embeddings.append(get_embeddings(model, tokenizer, subject, device, max_length))
                    body_embeddings.append(get_embeddings(model, tokenizer, body, device, max_length))
                except Exception as e:
                    print(f"Error processing row {idx}: {str(e)}")
                    # Use zero embeddings for failed rows
                    zero_emb = np.zeros(model.config.hidden_size)
                    sender_embeddings.append(zero_emb)
                    subject_embeddings.append(zero_emb)
                    body_embeddings.append(zero_emb)
        
        return sender_embeddings, subject_embeddings, body_embeddings
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        raise

def calculate_similarity_matrix(embeddings):
    """Calculate cosine similarity matrix."""
    return cosine_similarity(embeddings)

def save_matrix_to_csv(matrix, filename):
    """Save similarity matrix to CSV."""
    df = pd.DataFrame(matrix)
    df.to_csv(filename, index=False)
    print(f"Saved similarity matrix to {filename}")

def main():
    try:
        device = setup_environment()
        model_dir = os.path.expanduser("~/Downloads/Tune/FineTune/bert_dpo123_classification_model")
        csv_path = os.path.expanduser("~/Downloads/Tune/FineTune/Original_data.xlsx")

        print("Loading model and tokenizer...")
        model, tokenizer = load_fine_tuned_model(model_dir, device)

        print("Extracting embeddings...")
        sender_embeddings, subject_embeddings, body_embeddings = extract_embeddings_from_csv(
            model, tokenizer, csv_path, device
        )

        print("Calculating similarity matrices...")
        sender_similarity_matrix = calculate_similarity_matrix(sender_embeddings)
        subject_similarity_matrix = calculate_similarity_matrix(subject_embeddings)
        body_similarity_matrix = calculate_similarity_matrix(body_embeddings)

        print("Saving matrices...")
        save_matrix_to_csv(sender_similarity_matrix, "sender_similarity_matrix_bert_dpo1.csv")
        save_matrix_to_csv(subject_similarity_matrix, "subject_similarity_matrix_bert_dpo1.csv")
        save_matrix_to_csv(body_similarity_matrix, "body_similarity_matrix_bert_dpo1.csv")

        # Clean up
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        print("Processing completed successfully!")
        
    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Using CPU
Loading model and tokenizer...
Extracting embeddings...
Processing 241 emails...
Processing email 0/241
Processing email 10/241
Processing email 20/241
Processing email 30/241
Processing email 40/241
Processing email 50/241
Processing email 60/241
Processing email 70/241
Processing email 80/241
Processing email 90/241
Processing email 100/241
Processing email 110/241
Processing email 120/241
Processing email 130/241
Processing email 140/241
Processing email 150/241
Processing email 160/241
Processing email 170/241
Processing email 180/241
Processing email 190/241
Processing email 200/241
Processing email 210/241
Processing email 220/241
Processing email 230/241
Processing email 240/241
Calculating similarity matrices...
Saving matrices...
Saved similarity matrix to sender_similarity_matrix_bert_dpo1.csv
Saved similarity matrix to subject_similarity_matrix_bert_dpo1.csv
Saved similarity matrix to body_similarity_matrix_bert_dpo1.csv
Processing completed successfully!
