In [None]:
import torch
# Quick sanity check: as the last expression of the cell, this displays
# whether PyTorch can see a CUDA device before the training cell is run.
torch.cuda.is_available()

In [None]:
import logging
import pandas as pd
import numpy as np
import torch.nn as nn
from sentence_transformers import SentenceTransformer, losses, InputExample
from scipy.stats import pearsonr, spearmanr
from scipy.spatial import distance
from torch.utils.data import  DataLoader
from sklearn.model_selection import KFold
from sentence_transformers import SentenceTransformer, models


In [None]:
def make_dataloader(df, batch_size):
    """Build a shuffling DataLoader of sentence-pair examples from ``df``.

    Each row of ``df`` becomes one ``InputExample`` whose two texts are the
    ``'user_thoughts_and _feelings'`` and ``'designer_guess'`` columns (in
    that order) and whose label is the row's ``'Avg_EA'`` value.

    Args:
        df: DataFrame providing the three columns named above.
        batch_size: Batch size forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the examples with ``shuffle=True``.
    """
    examples = [
        InputExample(
            texts=[row['user_thoughts_and _feelings'], row['designer_guess']],
            label=row['Avg_EA'],
        )
        for _, row in df.iterrows()
    ]

    return DataLoader(examples, shuffle=True, batch_size=batch_size)

def predict_scores(Designer, User):
    """Return the row-wise cosine similarity between two embedding matrices.

    Args:
        Designer: Indexable collection of vectors (row ``i`` is paired with
            row ``i`` of ``User``).
        User: Array of vectors; its ``shape[0]`` decides how many pairs are
            scored.

    Returns:
        A list with one ``1 - cosine_distance`` value per row of ``User``.
    """
    similarities = []
    for row in range(User.shape[0]):
        # scipy returns a cosine *distance*; flip it into a similarity.
        cosine_dist = distance.cosine(Designer[row], User[row])
        similarities.append(1 - cosine_dist)
    return similarities

def evaluate(actual, predicted):
    """Print and return Pearson, Spearman and RMSE between two score vectors.

    Both inputs are coerced to float NumPy arrays first, so plain Python
    lists (which is what ``predict_scores`` produces) are accepted for either
    argument; previously the subtraction only worked when at least one
    operand already was an array.

    Args:
        actual: Sequence of reference scores.
        predicted: Sequence of predicted scores, same length as ``actual``.

    Returns:
        Tuple ``(pearson, spearman, rmse)``.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    rmse = np.sqrt(np.mean((actual - predicted) ** 2))
    # Index [0] keeps only the correlation coefficient, discarding the p-value.
    pearson = pearsonr(actual, predicted)[0]
    spearman = spearmanr(actual, predicted)[0]
    print("Pearson:", pearson)
    print("spearman:", spearman)
    print("RMSE:", rmse)
    return pearson, spearman, rmse

def model_evaluate(model,sent_1,sent_2, actual):
    """Score sentence pairs with ``model`` and compare against ``actual``.

    Both sentence collections are embedded via ``model.encode``; the
    embeddings are turned into per-pair cosine similarities by
    ``predict_scores`` and those are compared with ``actual`` by ``evaluate``
    (which also prints the metrics).

    Args:
        model: Object exposing ``encode(sentences)``.
        sent_1: First sentence of every pair.
        sent_2: Second sentence of every pair, aligned with ``sent_1``.
        actual: Reference scores, one per pair.

    Returns:
        Tuple ``(pearson, spearman, rmse)``.
    """
    first_embeddings = model.encode(sent_1)
    second_embeddings = model.encode(sent_2)
    similarities = predict_scores(first_embeddings, second_embeddings)
    corr_pearson, corr_spearman, error_rmse = evaluate(actual, similarities)
    return corr_pearson, corr_spearman, error_rmse



In [None]:

# Pooling strategies and pretrained checkpoints to compare. Every
# (checkpoint, pooling) combination is fine-tuned and scored with k-fold CV,
# and its per-fold metrics are written to one CSV file.
pool = [ 'mean', 'cls','max','lasttoken','weightedmean']
model_ = [ "sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1",
          "princeton-nlp/sup-simcse-roberta-base",
          "princeton-nlp/sup-simcse-bert-base-uncased",
          "Contrastive-Tension/RoBerta-Large-CT-STSb",
          "Contrastive-Tension/BERT-Base-CT-STSb",
          "voidism/diffcse-roberta-base-sts",
          "voidism/diffcse-bert-base-uncased-sts",
          "kwang2049/TSDAE-askubuntu2nli_stsb",
           "thenlper/gte-base"
          ]

# Settings shared by every run.
k = 10  # number of cross-validation folds
epochs = 5
batch_size = 1

# Fall back to CPU instead of crashing when no CUDA device is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def _clean_text(series):
    """Lower-case a text column and strip ':' plus the leading-phrase fillers.

    The replacements are applied in this exact order because the shorter
    phrases are substrings of the longer ones ('he was' is inside 'she was').
    ``regex=False`` makes the literal matching explicit across pandas versions.
    """
    cleaned = series.str.lower()
    for fragment in (':', 'i was', 's/he was', 'she was', 'he was', 'they were'):
        cleaned = cleaned.str.replace(fragment, '', regex=False)
    return cleaned


# The dataset does not depend on the model or pooling mode, so it is loaded
# and cleaned once instead of once per combination.
EA = (r"ondemand/data/full data-EA-preprocessed.xlsx")
data = pd.read_excel(EA)

# Training target: half of the "Average EA" column
# (presumably rescales it into the cosine-similarity range -- TODO confirm).
data['Avg_EA'] = data["Average EA"]/2
User = _clean_text(data["Users' thoughts or feeling"])
Des = _clean_text(data["Inferences"])

data['user_thoughts_and _feelings'] = User
data['designer_guess'] = Des

for i in model_:
    for j in pool:
        print(k)
        # Fixed random_state: every combination is evaluated on the same folds.
        cross_val = KFold(n_splits=k, shuffle=True, random_state=123)

        results = []
        # `fold` (not `k`) so the fold index no longer overwrites the fold count.
        for fold, (train, val) in enumerate(cross_val.split(data)):

            # Fresh model per fold so no fine-tuning leaks between folds.
            word_embedding_model = models.Transformer(i)
            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), j)
            model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
            model.to(device)
            # Alternative loss that was considered here: AnglELoss.
            train_loss = losses.CosineSimilarityLoss(model=model)

            train_df = data.iloc[train]
            val_df = data.iloc[val]
            train_dataloader = make_dataloader(train_df, batch_size=batch_size)

            # Baseline: validation metrics of the model before any fine-tuning.
            baseline_pearson, baseline_spearman, baseline_rmse = model_evaluate(
                model,
                val_df['user_thoughts_and _feelings'].values,
                val_df['designer_guess'].values,
                val_df['Avg_EA'].values,
            )

            model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epochs)

            train_pearson, train_spearman, train_rmse = model_evaluate(
                model,
                train_df['user_thoughts_and _feelings'].values,
                train_df['designer_guess'].values,
                train_df['Avg_EA'].values,
            )
            val_pearson, val_spearman, val_rmse = model_evaluate(
                model,
                val_df['user_thoughts_and _feelings'].values,
                val_df['designer_guess'].values,
                val_df['Avg_EA'].values,
            )

            results.append({
                'k': fold,
                'train_pearson': train_pearson, 'train_spearman': train_spearman,'train_rmse': train_rmse,
                'val_pearson': val_pearson, 'val_spearman': val_spearman,'val_rmse': val_rmse,
                'baseline_pearson':baseline_pearson, 'baseline_spearman':baseline_spearman, 'baseline_rmse':baseline_rmse
            })

        # The file name used an undefined name `x` (NameError after all the
        # training was done). Use the checkpoint name instead, with '/' replaced
        # so it is not interpreted as a directory separator.
        model_tag = i.replace('/', '_')
        pd.DataFrame(results).to_csv('SBERT preprocessed Supervised results' + model_tag + " extra " + j + ' pooling (' + str(epochs) + ' Epoch).csv')