In [1]:
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import  models
from sklearn.metrics import accuracy_score, f1_score


import logging
import torch.nn as nn
from sentence_transformers import SentenceTransformer,  models
from scipy.stats import pearsonr, spearmanr
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

2024-08-12 12:02:14.310004: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-12 12:02:14.322026: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-12 12:02:14.335814: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-12 12:02:14.339900: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-12 12:02:14.350474: I tensorflow/core/platform/cpu_feature_guar

In [7]:

class Sent_Embed_Dense(nn.Module):
    def __init__(self, model_name, pooling_mode, input_dim, output_dim=1):
        super(Sent_Embed_Dense, self).__init__()
        self.sbert = SentenceTransformer(model_name)
        self.pooling = models.Pooling(self.sbert.get_sentence_embedding_dimension(), pooling_mode)
        self.dense = nn.Linear(2 * input_dim, output_dim)  # input_dim = input_dim for element wise subtraction
    
    def forward(self, features):
        text1 = features['text1']
        text2 = features['text2']

        # Get embeddings for both texts
        embeddings1 = self.sbert.encode(text1, convert_to_tensor=True)
        embeddings2 = self.sbert.encode(text2, convert_to_tensor=True)

        # Concatenate embeddings
        combined_embeddings = torch.concat((embeddings1, embeddings2), dim=1)

        # element wise subtraction embeddings
        #combined_embeddings = (embeddings1- embeddings2)
        output = self.dense(combined_embeddings)
        return output


class CustomDataset(Dataset):
    def __init__(self, text1, text2, labels):
        self.text1 = text1
        self.text2 = text2
        self.labels = labels
 
    
    def __len__(self):
        return len(self.text1)
    
    def __getitem__(self, idx):
        text1 = self.text1[idx]
        text2 = self.text2[idx]
        label = self.labels[idx]
        return {'text1': text1, 'text2': text2}, torch.tensor(label, dtype=torch.float32)
    
def make_dataloader(df,batch_size):
    texts1 = df['user_thoughts_and _feelings'].values
    texts2 = df['designer_guess'].values
    labels = df['Avg_EA'].values
    
    dataset = CustomDataset(texts1, texts2, labels)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size, collate_fn=custom_collate_fn)
    #print(zip(*dataloader))
    return dataloader

def custom_collate_fn(batch):
    texts, labels = zip(*batch)
    combined_inputs = {}
    for key in texts[0].keys():
        combined_inputs[key] = [text[key] for text in texts]
    
    # Convert labels to tensors and move to GPU
    labels = (labels)
    
    return combined_inputs, labels

def calculate_correlation(predictions, labels):
    # Flatten tensors and convert to numpy arrays
    predictions = torch.cat(predictions).cpu().detach().numpy().flatten()
    labels = torch.cat(labels).cpu().detach().numpy().flatten()
    #print(predictions)
    
    if len(predictions) < 2 or len(labels) < 2:
        raise ValueError("Both predictions and labels must have at least 2 data points")
    
    # Compute Pearson correlation
    pearson, _ = pearsonr(predictions, labels)  
    rmse = np.sqrt(np.mean((labels - predictions)**2))
    spearman, _ = spearmanr(predictions, labels)
    return pearson,spearman, rmse

In [8]:


# Define your SBERT model with additional dense layer
model_name = "sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1"  
pooling_mode = "mean" 
input_dim = 768  
model = Sent_Embed_Dense(model_name, pooling_mode, input_dim)

model.to('cuda')
tokenizer_ = model.sbert.tokenizer 

# Define loss and optimizer
loss_fn = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=2e-5)

# Load your data
data = pd.read_excel(r"full data-EA-preprocessed.xlsx" )

data['Avg_EA'] = data["Average EA"]/2
User= data["Inferences"]
Des= data["Users' thoughts or feeling"]
User = User.str.lower().str.replace(':','').str.replace('i was', '').str.replace('she / he was', '').str.replace('s/he was', '').str.replace('she was', '').str.replace('he was', '').str.replace('they were', '')
Des = Des.str.lower().str.replace(':','').str.replace('i was', '').str.replace('she / he was', '').str.replace('s/he was', '').str.replace('she was', '').str.replace('he was', '').str.replace('they were', '')

data['user_thoughts_and _feelings'] = User
data['designer_guess'] = Des
data['Avg_EA'] = data["Average EA"] / 2

k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=123)

# Training loop
epochs = 50
results = []

All_pred = []
Train_predictions = []
Val_predictions = []
for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
    train_df = data.iloc[train_idx]
    val_df = data.iloc[val_idx]
    
    train_dataloader = make_dataloader(train_df,  batch_size=1)
    val_dataloader = make_dataloader(val_df, batch_size=1)

    for epoch in range(epochs):
        all_predictions_train = []
        all_labels_train = []
        model.train()
        for batch in train_dataloader:
            inputs, labels = batch
            inputs = {k: v for k, v in inputs.items()}  # Move inputs to GPU
            labels = torch.stack(labels).to('cuda')
            labels = labels.view(-1, 1)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)

            all_predictions_train.append(outputs)
            all_labels_train.append(labels)

       
            loss.backward()
            optimizer.step()
        train_pearson, train_spearman, train_rmse = calculate_correlation(all_predictions_train, all_labels_train)
        Train_predictions.append([train_pearson, train_spearman, train_rmse ])
        #print(f"Fold {fold + 1}, Epoch {epoch + 1}, Val Loss: {val_loss / len(val_dataloader)}")
        # Validation step
        model.eval()
        val_loss = 0
        all_predictions = []
        all_labels = []
        
        with torch.no_grad():
            for batch in val_dataloader:
                inputs, labels = batch
                inputs = {k: v for k, v in inputs.items()}  # Move inputs to GPU
                labels = torch.stack(labels).to('cuda')
                labels = labels.view(-1, 1)
                
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
                val_loss += loss.item()
                
                all_predictions.append(outputs)
                all_labels.append(labels)
                
                #print(len(all_predictions))
                #print(all_predictions)
            val_pearson,val_spearman, val_rmse  = calculate_correlation(all_predictions, all_labels)
            Val_predictions.append([val_pearson,val_spearman, val_rmse])
        print(f"Fold {fold + 1}, Epoch {epoch + 1}, Val Loss: {val_loss / len(val_dataloader)}")
    
    results.append({
                'fold': fold,
                'train_pearson': train_pearson, 'train_spearman': train_spearman,'train_rmse': train_rmse,
                'val_pearson': val_pearson, 'val_spearman': val_spearman,'val_rmse': val_rmse,
                #'baseline_pearson':baseline_pearson, 'baseline_spearman':baseline_spearman, 'baseline_rmse':baseline_rmse
            })
    All_pred.append({
        'fold': fold,
        'train_pearson': [i[0] for i in Train_predictions], 'train_spearman': [i[1] for i in Train_predictions],'train_rmse': [i[2] for i in Train_predictions],
         'val_pearson': [i[0] for i in Val_predictions], 'val_spearman': [i[1] for i in Val_predictions],'val_rmse': [i[2] for i in Val_predictions],
    
    })
    
        
# Save the results
results_df = pd.DataFrame(results)

results_df.to_csv("Dense_layer_experiment_with_SBERT Concatination evaluation.csv", index=False)




Fold 1, Epoch 1, Val Loss: 0.0841490375453658
Fold 1, Epoch 2, Val Loss: 0.08341630381918574
Fold 1, Epoch 3, Val Loss: 0.08320680377565977
Fold 1, Epoch 4, Val Loss: 0.08361910475745431
Fold 1, Epoch 5, Val Loss: 0.08329329764237628
Fold 1, Epoch 6, Val Loss: 0.0836639200513471
Fold 1, Epoch 7, Val Loss: 0.0838973924349476
Fold 1, Epoch 8, Val Loss: 0.08370037173380196
Fold 1, Epoch 9, Val Loss: 0.08342840529088183
Fold 1, Epoch 10, Val Loss: 0.08456017305208105
Fold 1, Epoch 11, Val Loss: 0.08451349695785919
Fold 1, Epoch 12, Val Loss: 0.08519762037762525
Fold 1, Epoch 13, Val Loss: 0.0851822223226514
Fold 1, Epoch 14, Val Loss: 0.08519039881745509
Fold 1, Epoch 15, Val Loss: 0.08647027837076975
Fold 1, Epoch 16, Val Loss: 0.08637963502163377
Fold 1, Epoch 17, Val Loss: 0.08717418283793248
Fold 1, Epoch 18, Val Loss: 0.08724933824269102
Fold 1, Epoch 19, Val Loss: 0.08854503751693604
Fold 1, Epoch 20, Val Loss: 0.08846234194481642
Fold 1, Epoch 21, Val Loss: 0.08827874321205956
Fold 

Fold 4, Epoch 23, Val Loss: 0.02672941023105192
Fold 4, Epoch 24, Val Loss: 0.027842554444620502
Fold 4, Epoch 25, Val Loss: 0.027285002978136286
Fold 4, Epoch 26, Val Loss: 0.028837831689078153
Fold 4, Epoch 27, Val Loss: 0.029337229482073533
Fold 4, Epoch 28, Val Loss: 0.029833091203148466
Fold 4, Epoch 29, Val Loss: 0.030130797817181785
Fold 4, Epoch 30, Val Loss: 0.02993427580270882
Fold 4, Epoch 31, Val Loss: 0.030807317675473976
Fold 4, Epoch 32, Val Loss: 0.030093618380233653
Fold 4, Epoch 33, Val Loss: 0.03174664764165672
Fold 4, Epoch 34, Val Loss: 0.032069495655191946
Fold 4, Epoch 35, Val Loss: 0.03214895934347662
Fold 4, Epoch 36, Val Loss: 0.03147570405185847
Fold 4, Epoch 37, Val Loss: 0.03378253420198069
Fold 4, Epoch 38, Val Loss: 0.03568193331582072
Fold 4, Epoch 39, Val Loss: 0.03415493198516825
Fold 4, Epoch 40, Val Loss: 0.034756638277837155
Fold 4, Epoch 41, Val Loss: 0.03476056904393555
Fold 4, Epoch 42, Val Loss: 0.03682978931614012
Fold 4, Epoch 43, Val Loss: 0.

Fold 7, Epoch 43, Val Loss: 0.025781625691892754
Fold 7, Epoch 44, Val Loss: 0.02688139560979669
Fold 7, Epoch 45, Val Loss: 0.026044412319565294
Fold 7, Epoch 46, Val Loss: 0.026164974194326597
Fold 7, Epoch 47, Val Loss: 0.026982840643974916
Fold 7, Epoch 48, Val Loss: 0.02828281737079537
Fold 7, Epoch 49, Val Loss: 0.027851185242401367
Fold 7, Epoch 50, Val Loss: 0.027260894255425164
Fold 8, Epoch 1, Val Loss: 0.00579370520339172
Fold 8, Epoch 2, Val Loss: 0.005444158017326117
Fold 8, Epoch 3, Val Loss: 0.006710628563144961
Fold 8, Epoch 4, Val Loss: 0.007053612569011007
Fold 8, Epoch 5, Val Loss: 0.0074171345044801865
Fold 8, Epoch 6, Val Loss: 0.008097666746008063
Fold 8, Epoch 7, Val Loss: 0.007389459457372864
Fold 8, Epoch 8, Val Loss: 0.00858183106515753
Fold 8, Epoch 9, Val Loss: 0.00891250876958855
Fold 8, Epoch 10, Val Loss: 0.00880386626985538
Fold 8, Epoch 11, Val Loss: 0.009000009279506312
Fold 8, Epoch 12, Val Loss: 0.008946404186396896
Fold 8, Epoch 13, Val Loss: 0.0098